Diffstat (limited to 'sys/kern'): 119 files changed, 84052 insertions, 574 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc new file mode 100644 index 0000000..a09e484 --- /dev/null +++ b/sys/kern/Make.tags.inc @@ -0,0 +1,19 @@ +# @(#)Make.tags.inc 8.1 (Berkeley) 6/11/93 +# $Id$ + +# Common files for "make tags". +# Included by the Makefile for each architecture. + +# Put the ../sys stuff near the end so that subroutine definitions win when +# there is a struct tag with the same name (eg., vmmeter). The real +# solution would probably be for ctags to generate "struct vmmeter" tags. + +COMM= /sys/conf/*.[ch] \ + /sys/dev/*.[ch] /sys/dev/scsi/*.[ch] \ + /sys/kern/*.[ch] /sys/libkern/*.[ch] \ + /sys/miscfs/*/*.[ch] \ + /sys/net/*.[ch] /sys/netccitt/*.[ch] /sys/netinet/*.[ch] \ + /sys/netiso/*.[ch] /sys/netns/*.[ch] \ + /sys/nfs/*.[ch] /sys/sys/*.[ch] \ + /sys/ufs/*/*.[ch] \ + /sys/vm/*.[ch] diff --git a/sys/kern/Makefile b/sys/kern/Makefile new file mode 100644 index 0000000..f42a44e --- /dev/null +++ b/sys/kern/Makefile @@ -0,0 +1,53 @@ +# @(#)Makefile 8.2 (Berkeley) 3/21/94 + +# Makefile for kernel tags files, init_sysent, etc. + +ARCH= i386 # luna68k news3400 pmax sparc tahoe vax + +all: + @echo "make tags, make links or init_sysent.c only" + +init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscall-hide.h \ +../sys/sysproto.h: makesyscalls.sh syscalls.master + -mv -f init_sysent.c init_sysent.c.bak + -mv -f syscalls.c syscalls.c.bak + -mv -f ../sys/syscall.h ../sys/syscall.h.bak + -mv -f ../sys/syscall-hide.h ../sys/syscall-hide.h.bak + -mv -f ../sys/sysproto.h ../sys/sysproto.h.bak + sh makesyscalls.sh syscalls.master + +# Kernel tags: +# Tags files are built in the top-level directory for each architecture, +# with a makefile listing the architecture-dependent files, etc. The list +# of common files is in ./Make.tags.inc. Links to the correct tags file +# are placed in each source directory. We need to have links to tags files +# from the generic directories that are relative to the machine type, even +# via remote mounts; therefore we use symlinks to $SYSTAGS, which points at +# ${SYSDIR}/${MACHINE}/tags. + +SYSTAGS=/var/db/sys_tags +SYSDIR=/sys + +# Directories in which to place tags links (other than machine-dependent) +DGEN= conf \ + dev dev/scsi \ + hp hp/dev hp/hpux \ + kern libkern \ + miscfs miscfs/deadfs miscfs/fdesc miscfs/fifofs miscfs/kernfs \ + miscfs/lofs miscfs/nullfs miscfs/portal miscfs/procfs \ + miscfs/specfs miscfs/umapfs miscfs/union \ + net netccitt netinet netiso netns nfs scripts sys \ + ufs ufs/ffs ufs/lfs ufs/mfs ufs/ufs \ + vm + +tags:: + -for i in ${ARCH}; do \ + (cd ../$$i && make ${MFLAGS} tags); done + +links:: + rm -f ${SYSTAGS} + ln -s ${SYSDIR}/${MACHINE}/tags ${SYSTAGS} + -for i in ${DGEN}; do \ + (cd ../$$i && { rm -f tags; ln -s ${SYSTAGS} tags; }) done + -for i in ${ARCH}; do \ + (cd ../$$i && make ${MFLAGS} SYSTAGS=${SYSTAGS} links); done diff --git a/sys/kern/bus_if.m b/sys/kern/bus_if.m new file mode 100644 index 0000000..fd4f648 --- /dev/null +++ b/sys/kern/bus_if.m @@ -0,0 +1,141 @@ +# +# Copyright (c) 1998 Doug Rabson +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. 
Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $Id: bus_if.m,v 1.4 1998/11/08 18:51:38 nsouch Exp $ +# + +INTERFACE bus; + +# +# This is called from system code which prints out a description of a +# device. It should describe the attachment that the child has with +# the parent. For instance the TurboLaser bus prints which node the +# device is attached to. +# +METHOD void print_child { + device_t dev; + device_t child; +}; + +# +# These two methods manage a bus specific set of instance variables of +# a child device. The intention is that each different type of bus +# defines a set of appropriate instance variables (such as ports and +# irqs for ISA bus etc.) +# +# This information could be given to the child device as a struct but +# that makes it hard for a bus to add or remove variables without +# forcing an edit and recompile for all drivers which may not be +# possible for vendor supplied binary drivers. + +# +# Read an instance variable. Return 0 on success. +# +METHOD int read_ivar { + device_t dev; + device_t child; + int index; + uintptr_t *result; +}; + +# +# Write an instance variable. Return 0 on success. +# +METHOD int write_ivar { + device_t dev; + device_t child; + int index; + uintptr_t value; +}; + +# +# Allocate a system resource attached to `dev' on behalf of `child'. +# The types are defined in <machine/resource.h>; the meaning of the +# resource-ID field varies from bus to bus (but *rid == 0 is always +# valid if the resource type is). start and end reflect the allowable +# range, and should be passed as `0UL' and `~0UL', respectively, if +# the client has no range restriction. count is the number of consecutive +# indices in the resource required. flags is a set of sharing flags +# as defined in <sys/rman.h>. +# +# Returns a resource or a null pointer on failure. The caller is +# responsible for calling rman_activate_resource() when it actually +# uses the resource. +# +METHOD struct resource * alloc_resource { + device_t dev; + device_t child; + int type; + int *rid; + u_long start; + u_long end; + u_long count; + u_int flags; +}; + +METHOD int activate_resource { + device_t dev; + device_t child; + int type; + int rid; + struct resource *r; +}; + +METHOD int deactivate_resource { + device_t dev; + device_t child; + int type; + int rid; + struct resource *r; +}; + +# +# Free a resource allocated by the preceding method. The `rid' value +# must be the same as the one returned by BUS_ALLOC_RESOURCE (which +# is not necessarily the same as the one the client passed). 
+# +METHOD int release_resource { + device_t dev; + device_t child; + int type; + int rid; + struct resource *res; +}; + +METHOD int setup_intr { + device_t dev; + device_t child; + struct resource *irq; + driver_intr_t *intr; + void *arg; + void **cookiep; +}; + +METHOD int teardown_intr { + device_t dev; + device_t child; + struct resource *irq; + void *cookie; +}; diff --git a/sys/kern/device_if.m b/sys/kern/device_if.m new file mode 100644 index 0000000..f429e67 --- /dev/null +++ b/sys/kern/device_if.m @@ -0,0 +1,83 @@ +# +# Copyright (c) 1998 Doug Rabson +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# $Id: device_if.m,v 1.2 1998/11/08 18:35:53 nsouch Exp $ +# + +INTERFACE device; + +# +# Probe to see if the device is present. Return 0 if the device exists, +# ENXIO if it cannot be found. +# +# Devices which implement busses should use this method to probe for +# the existence of devices attached to the bus and add them as +# children. If this is combined with the use of bus_generic_attach, +# the child devices will be automatically probed and attached. +# +METHOD int probe { + device_t dev; +}; + +# +# Attach a device to the system. The probe method will have been +# called and will have indicated that the device exists. This routine +# should initialise the hardware and allocate other system resources +# (such as devfs entries). Returns 0 on success. +# +METHOD int attach { + device_t dev; +}; + +# +# Detach a device. This can be called if the user is replacing the +# driver software or if a device is about to be physically removed +# from the system (e.g. for pccard devices). Returns 0 on success. +# +METHOD int detach { + device_t dev; +}; + +# +# This is called during system shutdown to allow the driver to put the +# hardware into a consistent state for rebooting the computer. +# +METHOD int shutdown { + device_t dev; +}; + +# +# This is called by the power-management subsystem when a suspend has been +# requested by the user or by some automatic mechanism. This gives +# drivers a chance to veto the suspend or save their configuration before +# power is removed. 
+# +METHOD int suspend { + device_t dev; +}; + +METHOD int resume { + device_t dev; +}; diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c new file mode 100644 index 0000000..9fbd203 --- /dev/null +++ b/sys/kern/imgact_aout.c @@ -0,0 +1,304 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: imgact_aout.c,v 1.43 1998/10/16 03:55:00 peter Exp $ + */ + +#include <sys/param.h> +#include <sys/acct.h> +#include <sys/resourcevar.h> +#include <sys/exec.h> +#include <sys/fcntl.h> +#include <sys/imgact.h> +#include <sys/imgact_aout.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/namei.h> +#include <sys/pioctl.h> +#include <sys/proc.h> +#include <sys/signalvar.h> +#include <sys/stat.h> +#include <sys/sysent.h> +#include <sys/syscall.h> +#include <sys/vnode.h> +#include <sys/systm.h> +#include <machine/md_var.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <sys/user.h> + +static int exec_aout_imgact __P((struct image_params *imgp)); + +struct sysentvec aout_sysvec = { + SYS_MAXSYSCALL, + sysent, + 0, + 0, + 0, + 0, + 0, + 0, + 0, + sendsig, + sigcode, + &szsigcode, + 0, + "FreeBSD a.out", + aout_coredump +}; + +static int +exec_aout_imgact(imgp) + struct image_params *imgp; +{ + const struct exec *a_out = (const struct exec *) imgp->image_header; + struct vmspace *vmspace; + struct vnode *vp; + vm_object_t object; + vm_offset_t text_end, data_end; + unsigned long virtual_offset; + unsigned long file_offset; + unsigned long bss_size; + int error; + + /* + * Linux and *BSD binaries look very much alike, + * only the machine id is different: + * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI. + * NetBSD is in network byte order.. ugh. + */ + if (((a_out->a_magic >> 16) & 0xff) != 0x86 && + ((a_out->a_magic >> 16) & 0xff) != 0 && + ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86) + return -1; + + /* + * Set file/virtual offset based on a.out variant. 
+ * We do two cases: host byte order and network byte order + * (for NetBSD compatibility) + */ + switch ((int)(a_out->a_magic & 0xffff)) { + case ZMAGIC: + virtual_offset = 0; + if (a_out->a_text) { + file_offset = PAGE_SIZE; + } else { + /* Bill's "screwball mode" */ + file_offset = 0; + } + break; + case QMAGIC: + virtual_offset = PAGE_SIZE; + file_offset = 0; + break; + default: + /* NetBSD compatibility */ + switch ((int)(ntohl(a_out->a_magic) & 0xffff)) { + case ZMAGIC: + case QMAGIC: + virtual_offset = PAGE_SIZE; + file_offset = 0; + break; + default: + return (-1); + } + } + + bss_size = roundup(a_out->a_bss, PAGE_SIZE); + + /* + * Check various fields in header for validity/bounds. + */ + if (/* entry point must lay with text region */ + a_out->a_entry < virtual_offset || + a_out->a_entry >= virtual_offset + a_out->a_text || + + /* text and data size must each be page rounded */ + a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) + return (-1); + + /* text + data can't exceed file size */ + if (a_out->a_data + a_out->a_text > imgp->attr->va_size) + return (EFAULT); + + /* + * text/data/bss must not exceed limits + */ + if (/* text can't exceed maximum text size */ + a_out->a_text > MAXTSIZ || + + /* data + bss can't exceed rlimit */ + a_out->a_data + bss_size > + imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur) + return (ENOMEM); + + /* copy in arguments and/or environment from old process */ + error = exec_extract_strings(imgp); + if (error) + return (error); + + /* + * Destroy old process VM and create a new one (with a new stack) + */ + exec_new_vmspace(imgp); + + /* + * The vm space can be changed by exec_new_vmspace + */ + vmspace = imgp->proc->p_vmspace; + + vp = imgp->vp; + object = vp->v_object; + vm_object_reference(object); + + text_end = virtual_offset + a_out->a_text; + error = vm_map_insert(&vmspace->vm_map, object, + file_offset, + virtual_offset, text_end, + VM_PROT_READ | VM_PROT_EXECUTE, VM_PROT_ALL, + MAP_COPY_NEEDED | MAP_COPY_ON_WRITE); + if (error) + return (error); + + data_end = text_end + a_out->a_data; + if (a_out->a_data) { + vm_object_reference(object); + error = vm_map_insert(&vmspace->vm_map, object, + file_offset + a_out->a_text, + text_end, data_end, + VM_PROT_ALL, VM_PROT_ALL, + MAP_COPY_NEEDED | MAP_COPY_ON_WRITE); + if (error) + return (error); + } + + pmap_object_init_pt(&vmspace->vm_pmap, virtual_offset, + object, (vm_pindex_t) OFF_TO_IDX(file_offset), + a_out->a_text + a_out->a_data, 0); + + if (bss_size) { + error = vm_map_insert(&vmspace->vm_map, NULL, 0, + data_end, data_end + bss_size, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) + return (error); + } + + /* Fill in process VM information */ + vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; + vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t) (uintptr_t) virtual_offset; + vmspace->vm_daddr = (caddr_t) (uintptr_t) + (virtual_offset + a_out->a_text); + + /* Fill in image_params */ + imgp->interpreted = 0; + imgp->entry_addr = a_out->a_entry; + + imgp->proc->p_sysent = &aout_sysvec; + + /* Indicate that this file should not be modified */ + imgp->vp->v_flag |= VTEXT; + + return (0); +} + +/* + * Dump core, into a file named as described in the comments for + * expand_name(), unless the process was setuid/setgid. 
+ */ +int +aout_coredump(p) + register struct proc *p; +{ + register struct vnode *vp; + register struct ucred *cred = p->p_cred->pc_ucred; + register struct vmspace *vm = p->p_vmspace; + struct nameidata nd; + struct vattr vattr; + int error, error1; + char *name; /* name of corefile */ + + STOPEVENT(p, S_CORE, 0); + if (sugid_coredump == 0 && p->p_flag & P_SUGID) + return (EFAULT); + if (ctob(UPAGES + vm->vm_dsize + vm->vm_ssize) >= + p->p_rlimit[RLIMIT_CORE].rlim_cur) + return (EFAULT); + name = expand_name(p->p_comm, p->p_ucred->cr_uid, p->p_pid); + if (name == NULL) + return (EFAULT); /* XXX -- not the best error */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, name, p); + error = vn_open(&nd, O_CREAT | FWRITE, S_IRUSR | S_IWUSR); + free(name, M_TEMP); + if (error) + return (error); + vp = nd.ni_vp; + + /* Don't dump to non-regular files or files with links. */ + if (vp->v_type != VREG || + VOP_GETATTR(vp, &vattr, cred, p) || vattr.va_nlink != 1) { + error = EFAULT; + goto out; + } + VATTR_NULL(&vattr); + vattr.va_size = 0; + VOP_LEASE(vp, p, cred, LEASE_WRITE); + VOP_SETATTR(vp, &vattr, cred, p); + p->p_acflag |= ACORE; + bcopy(p, &p->p_addr->u_kproc.kp_proc, sizeof(struct proc)); + fill_eproc(p, &p->p_addr->u_kproc.kp_eproc); + error = cpu_coredump(p, vp, cred); + if (error == 0) + error = vn_rdwr(UIO_WRITE, vp, vm->vm_daddr, + (int)ctob(vm->vm_dsize), (off_t)ctob(UPAGES), UIO_USERSPACE, + IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p); + if (error == 0) + error = vn_rdwr(UIO_WRITE, vp, + (caddr_t) trunc_page(USRSTACK - ctob(vm->vm_ssize)), + round_page(ctob(vm->vm_ssize)), + (off_t)ctob(UPAGES) + ctob(vm->vm_dsize), UIO_USERSPACE, + IO_NODELOCKED|IO_UNIT, cred, (int *) NULL, p); +out: + VOP_UNLOCK(vp, 0, p); + error1 = vn_close(vp, FWRITE, cred, p); + if (error == 0) + error = error1; + return (error); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ +static const struct execsw aout_execsw = { exec_aout_imgact, "a.out" }; +EXEC_SET(aout, aout_execsw); diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c new file mode 100644 index 0000000..a0a2284 --- /dev/null +++ b/sys/kern/imgact_elf.c @@ -0,0 +1,992 @@ +/*- + * Copyright (c) 1995-1996 Søren Schmidt + * Copyright (c) 1996 Peter Wemm + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software withough specific prior written permission + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $Id: imgact_elf.c,v 1.43 1998/12/04 22:54:51 archie Exp $ + */ + +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/acct.h> +#include <sys/exec.h> +#include <sys/fcntl.h> +#include <sys/imgact.h> +#include <sys/imgact_elf.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/namei.h> +#include <sys/pioctl.h> +#include <sys/proc.h> +#include <sys/procfs.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/stat.h> +#include <sys/syscall.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/systm.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <sys/lock.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_prot.h> +#include <vm/vm_extern.h> + +#include <machine/elf.h> +#include <machine/md_var.h> + +__ElfType(Brandinfo); +__ElfType(Auxargs); + +static int elf_check_header __P((const Elf_Ehdr *hdr, int type)); +static int elf_freebsd_fixup __P((long **stack_base, + struct image_params *imgp)); +static int elf_load_file __P((struct proc *p, char *file, u_long *addr, + u_long *entry)); +static int elf_load_section __P((struct proc *p, + struct vmspace *vmspace, struct vnode *vp, + vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, + vm_prot_t prot)); +static int exec_elf_imgact __P((struct image_params *imgp)); + +static int elf_trace = 0; +SYSCTL_INT(_debug, OID_AUTO, elf_trace, CTLFLAG_RW, &elf_trace, 0, ""); + +static struct sysentvec elf_freebsd_sysvec = { + SYS_MAXSYSCALL, + sysent, + 0, + 0, + 0, + 0, + 0, + 0, + elf_freebsd_fixup, + sendsig, + sigcode, + &szsigcode, + 0, + "FreeBSD ELF", + elf_coredump +}; + +static Elf_Brandinfo freebsd_brand_info = { + "FreeBSD", + "", + "/usr/libexec/ld-elf.so.1", + &elf_freebsd_sysvec + }; +static Elf_Brandinfo *elf_brand_list[MAX_BRANDS] = { + &freebsd_brand_info, + NULL, NULL, NULL, + NULL, NULL, NULL, NULL + }; + +int +elf_insert_brand_entry(Elf_Brandinfo *entry) +{ + int i; + + for (i=1; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] == NULL) { + elf_brand_list[i] = entry; + break; + } + } + if (i == MAX_BRANDS) + return -1; + return 0; +} + +int +elf_remove_brand_entry(Elf_Brandinfo *entry) +{ + int i; + + for (i=1; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] == entry) { + elf_brand_list[i] = NULL; + break; + } + } + if (i == MAX_BRANDS) + return -1; + return 0; +} + +static int +elf_check_header(const Elf_Ehdr *hdr, int type) +{ + if (!IS_ELF(*hdr) || + hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS || + hdr->e_ident[EI_DATA] != ELF_TARG_DATA || + hdr->e_ident[EI_VERSION] != EV_CURRENT) + return ENOEXEC; + + if (!ELF_MACHINE_OK(hdr->e_machine)) + return ENOEXEC; + + if (hdr->e_type != type || hdr->e_version != ELF_TARG_VER) + return ENOEXEC; + + return 0; +} + +static int +elf_load_section(struct proc *p, struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot) +{ + size_t 
map_len; + vm_offset_t map_addr; + int error, rv; + size_t copy_len; + vm_object_t object; + vm_offset_t file_addr; + vm_offset_t data_buf = 0; + + object = vp->v_object; + error = 0; + + map_addr = trunc_page((vm_offset_t)vmaddr); + file_addr = trunc_page(offset); + + /* + * We have two choices. We can either clear the data in the last page + * of an oversized mapping, or we can start the anon mapping a page + * early and copy the initialized data into that first page. We + * choose the second.. + */ + if (memsz > filsz) + map_len = trunc_page(offset+filsz) - file_addr; + else + map_len = round_page(offset+filsz) - file_addr; + + if (map_len != 0) { + vm_object_reference(object); + vm_map_lock(&vmspace->vm_map); + rv = vm_map_insert(&vmspace->vm_map, + object, + file_addr, /* file offset */ + map_addr, /* virtual start */ + map_addr + map_len,/* virtual end */ + prot, + VM_PROT_ALL, + MAP_COPY_NEEDED | MAP_COPY_ON_WRITE); + vm_map_unlock(&vmspace->vm_map); + if (rv != KERN_SUCCESS) + return EINVAL; + + /* prefault the page tables */ + pmap_object_init_pt(&vmspace->vm_pmap, + map_addr, + object, + (vm_pindex_t) OFF_TO_IDX(file_addr), + map_len, + 0); + + /* we can stop now if we've covered it all */ + if (memsz == filsz) + return 0; + } + + + /* + * We have to get the remaining bit of the file into the first part + * of the oversized map segment. This is normally because the .data + * segment in the file is extended to provide bss. It's a neat idea + * to try and save a page, but it's a pain in the behind to implement. + */ + copy_len = (offset + filsz) - trunc_page(offset + filsz); + map_addr = trunc_page((vm_offset_t)vmaddr + filsz); + map_len = round_page((vm_offset_t)vmaddr + memsz) - map_addr; + + /* This had damn well better be true! */ + if (map_len != 0) { + vm_map_lock(&vmspace->vm_map); + rv = vm_map_insert(&vmspace->vm_map, NULL, 0, + map_addr, map_addr + map_len, + VM_PROT_ALL, VM_PROT_ALL, 0); + vm_map_unlock(&vmspace->vm_map); + if (rv != KERN_SUCCESS) + return EINVAL; + } + + if (copy_len != 0) { + vm_object_reference(object); + rv = vm_map_find(exec_map, + object, + trunc_page(offset + filsz), + &data_buf, + PAGE_SIZE, + TRUE, + VM_PROT_READ, + VM_PROT_ALL, + MAP_COPY_ON_WRITE | MAP_COPY_NEEDED); + if (rv != KERN_SUCCESS) { + vm_object_deallocate(object); + return EINVAL; + } + pmap_object_init_pt(exec_map->pmap, data_buf, object, + (vm_pindex_t) OFF_TO_IDX(trunc_page(offset + filsz)), + PAGE_SIZE, 1); + + /* send the page fragment to user space */ + error = copyout((caddr_t)data_buf, (caddr_t)map_addr, copy_len); + vm_map_remove(exec_map, data_buf, data_buf + PAGE_SIZE); + if (error) + return (error); + } + + /* + * set it to the specified protection + */ + vm_map_protect(&vmspace->vm_map, map_addr, map_addr + map_len, prot, + FALSE); + + return error; +} + +static int +elf_load_file(struct proc *p, char *file, u_long *addr, u_long *entry) +{ + Elf_Ehdr *hdr = NULL; + Elf_Phdr *phdr = NULL; + struct nameidata nd; + struct vmspace *vmspace = p->p_vmspace; + struct vattr attr; + struct image_params image_params, *imgp; + vm_prot_t prot; + unsigned long text_size = 0, data_size = 0; + unsigned long text_addr = 0, data_addr = 0; + int error, i; + + imgp = &image_params; + /* + * Initialize part of the common data + */ + imgp->proc = p; + imgp->uap = NULL; + imgp->attr = &attr; + imgp->firstpage = NULL; + imgp->image_header = (char *)kmem_alloc_wait(exec_map, PAGE_SIZE); + + if (imgp->image_header == NULL) { + nd.ni_vp = NULL; + error = ENOMEM; + goto fail; + } + + NDINIT(&nd, 
LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, p); + + if (error = namei(&nd)) { + nd.ni_vp = NULL; + goto fail; + } + + imgp->vp = nd.ni_vp; + + /* + * Check permissions, modes, uid, etc on the file, and "open" it. + */ + error = exec_check_permissions(imgp); + if (error) { + VOP_UNLOCK(nd.ni_vp, 0, p); + goto fail; + } + + error = exec_map_first_page(imgp); + VOP_UNLOCK(nd.ni_vp, 0, p); + if (error) + goto fail; + + hdr = (Elf_Ehdr *)imgp->image_header; + if (error = elf_check_header(hdr, ET_DYN)) + goto fail; + + /* Only support headers that fit within first page for now */ + if ((hdr->e_phoff > PAGE_SIZE) || + (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) { + error = ENOEXEC; + goto fail; + } + + phdr = (Elf_Phdr *)(imgp->image_header + hdr->e_phoff); + + for (i = 0; i < hdr->e_phnum; i++) { + if (phdr[i].p_type == PT_LOAD) { /* Loadable segment */ + prot = 0; + if (phdr[i].p_flags & PF_X) + prot |= VM_PROT_EXECUTE; + if (phdr[i].p_flags & PF_W) + prot |= VM_PROT_WRITE; + if (phdr[i].p_flags & PF_R) + prot |= VM_PROT_READ; + + if (error = elf_load_section(p, vmspace, nd.ni_vp, + phdr[i].p_offset, + (caddr_t)phdr[i].p_vaddr + + (*addr), + phdr[i].p_memsz, + phdr[i].p_filesz, prot)) + goto fail; + + /* + * Is this .text or .data ?? + * + * We only handle one each of those yet XXX + */ + if (hdr->e_entry >= phdr[i].p_vaddr && + hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) { + text_addr = trunc_page(phdr[i].p_vaddr+(*addr)); + text_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + trunc_page(phdr[i].p_vaddr)); + *entry=(unsigned long)hdr->e_entry+(*addr); + } else { + data_addr = trunc_page(phdr[i].p_vaddr+(*addr)); + data_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + trunc_page(phdr[i].p_vaddr)); + } + } + } + +fail: + if (imgp->firstpage) + exec_unmap_first_page(imgp); + if (imgp->image_header) + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->image_header, + PAGE_SIZE); + if (nd.ni_vp) + vrele(nd.ni_vp); + + return error; +} + +static int +exec_elf_imgact(struct image_params *imgp) +{ + const Elf_Ehdr *hdr = (const Elf_Ehdr *) imgp->image_header; + const Elf_Phdr *phdr; + Elf_Auxargs *elf_auxargs = NULL; + struct vmspace *vmspace; + vm_prot_t prot; + u_long text_size = 0, data_size = 0; + u_long text_addr = 0, data_addr = 0; + u_long addr, entry = 0, proghdr = 0; + int error, i; + const char *interp = NULL; + Elf_Brandinfo *brand_info; + char *brand; + char path[MAXPATHLEN]; + + /* + * Do we have a valid ELF header ? + */ + if (elf_check_header(hdr, ET_EXEC)) + return -1; + + /* + * From here on down, we return an errno, not -1, as we've + * detected an ELF file. + */ + + if ((hdr->e_phoff > PAGE_SIZE) || + (hdr->e_phoff + hdr->e_phentsize * hdr->e_phnum) > PAGE_SIZE) { + /* Only support headers in first page for now */ + return ENOEXEC; + } + phdr = (const Elf_Phdr*)(imgp->image_header + hdr->e_phoff); + + /* + * From this point on, we may have resources that need to be freed. 
+ */ + if (error = exec_extract_strings(imgp)) + goto fail; + + exec_new_vmspace(imgp); + + vmspace = imgp->proc->p_vmspace; + + for (i = 0; i < hdr->e_phnum; i++) { + switch(phdr[i].p_type) { + + case PT_LOAD: /* Loadable segment */ + prot = 0; + if (phdr[i].p_flags & PF_X) + prot |= VM_PROT_EXECUTE; + if (phdr[i].p_flags & PF_W) + prot |= VM_PROT_WRITE; + if (phdr[i].p_flags & PF_R) + prot |= VM_PROT_READ; + + if (error = elf_load_section(imgp->proc, + vmspace, imgp->vp, + phdr[i].p_offset, + (caddr_t)phdr[i].p_vaddr, + phdr[i].p_memsz, + phdr[i].p_filesz, prot)) + goto fail; + + /* + * Is this .text or .data ?? + * + * We only handle one each of those yet XXX + */ + if (hdr->e_entry >= phdr[i].p_vaddr && + hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) { + text_addr = trunc_page(phdr[i].p_vaddr); + text_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + text_addr); + entry = (u_long)hdr->e_entry; + } else { + data_addr = trunc_page(phdr[i].p_vaddr); + data_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + data_addr); + } + break; + case PT_INTERP: /* Path to interpreter */ + if (phdr[i].p_filesz > MAXPATHLEN || + phdr[i].p_offset + phdr[i].p_filesz > PAGE_SIZE) { + error = ENOEXEC; + goto fail; + } + interp = imgp->image_header + phdr[i].p_offset; + break; + case PT_PHDR: /* Program header table info */ + proghdr = phdr[i].p_vaddr; + break; + default: + break; + } + } + + vmspace->vm_tsize = text_size >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t)(uintptr_t)text_addr; + vmspace->vm_dsize = data_size >> PAGE_SHIFT; + vmspace->vm_daddr = (caddr_t)(uintptr_t)data_addr; + + addr = 2L*MAXDSIZ; /* May depend on OS type XXX */ + + imgp->entry_addr = entry; + + /* If the executable has a brand, search for it in the brand list. */ + brand_info = NULL; + brand = (char *)&hdr->e_ident[EI_BRAND]; + if (brand[0] != '\0') { + for (i = 0; i < MAX_BRANDS; i++) { + Elf_Brandinfo *bi = elf_brand_list[i]; + + if (bi != NULL && strcmp(brand, bi->brand) == 0) { + brand_info = bi; + break; + } + } + } + + /* Lacking a known brand, search for a recognized interpreter. */ + if (brand_info == NULL && interp != NULL) { + for (i = 0; i < MAX_BRANDS; i++) { + Elf_Brandinfo *bi = elf_brand_list[i]; + + if (bi != NULL && + strcmp(interp, bi->interp_path) == 0) { + brand_info = bi; + break; + } + } + } + +#ifdef __alpha__ + /* XXX - Assume FreeBSD on the alpha. */ + if (brand_info == NULL) + brand_info = &freebsd_brand_info; +#endif + + if (brand_info == NULL) { + if (brand[0] == 0) + uprintf("ELF binary type not known." 
+ " Use \"brandelf\" to brand it.\n"); + else + uprintf("ELF binary type \"%.*s\" not known.\n", + EI_NIDENT - EI_BRAND, brand); + error = ENOEXEC; + goto fail; + } + + imgp->proc->p_sysent = brand_info->sysvec; + if (interp != NULL) { + snprintf(path, sizeof(path), "%s%s", + brand_info->emul_path, interp); + if ((error = elf_load_file(imgp->proc, path, &addr, + &imgp->entry_addr)) != 0) { + uprintf("ELF interpreter %s not found\n", path); + goto fail; + } + } + + /* + * Construct auxargs table (used by the fixup routine) + */ + elf_auxargs = malloc(sizeof(Elf_Auxargs), M_TEMP, M_WAITOK); + elf_auxargs->execfd = -1; + elf_auxargs->phdr = proghdr; + elf_auxargs->phent = hdr->e_phentsize; + elf_auxargs->phnum = hdr->e_phnum; + elf_auxargs->pagesz = PAGE_SIZE; + elf_auxargs->base = addr; + elf_auxargs->flags = 0; + elf_auxargs->entry = entry; + elf_auxargs->trace = elf_trace; + + imgp->auxargs = elf_auxargs; + imgp->interpreted = 0; + + /* don't allow modifying the file while we run it */ + imgp->vp->v_flag |= VTEXT; + +fail: + return error; +} + +static int +elf_freebsd_fixup(long **stack_base, struct image_params *imgp) +{ + Elf_Auxargs *args = (Elf_Auxargs *)imgp->auxargs; + long *pos; + + pos = *stack_base + (imgp->argc + imgp->envc + 2); + + if (args->trace) { + AUXARGS_ENTRY(pos, AT_DEBUG, 1); + } + if (args->execfd != -1) { + AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); + } + AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); + AUXARGS_ENTRY(pos, AT_PHENT, args->phent); + AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); + AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); + AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); + AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); + AUXARGS_ENTRY(pos, AT_BASE, args->base); + AUXARGS_ENTRY(pos, AT_NULL, 0); + + free(imgp->auxargs, M_TEMP); + imgp->auxargs = NULL; + + (*stack_base)--; + suword(*stack_base, (long) imgp->argc); + return 0; +} + +/* + * Code for generating ELF core dumps. + */ + +typedef void (*segment_callback) __P((vm_map_entry_t, void *)); + +/* Closure for cb_put_phdr(). */ +struct phdr_closure { + Elf_Phdr *phdr; /* Program header to fill in */ + Elf_Off offset; /* Offset of segment in core file */ +}; + +/* Closure for cb_size_segment(). */ +struct sseg_closure { + int count; /* Count of writable segments. */ + size_t size; /* Total size of all writable segments. */ +}; + +static void cb_put_phdr __P((vm_map_entry_t, void *)); +static void cb_size_segment __P((vm_map_entry_t, void *)); +static void each_writable_segment __P((struct proc *, segment_callback, + void *)); +static int elf_corehdr __P((struct proc *, struct vnode *, struct ucred *, + int, void *, size_t)); +static void elf_puthdr __P((struct proc *, void *, size_t *, + const prstatus_t *, const prfpregset_t *, const prpsinfo_t *, int)); +static void elf_putnote __P((void *, size_t *, const char *, int, + const void *, size_t)); + +extern int osreldate; + +int +elf_coredump(p) + register struct proc *p; +{ + register struct vnode *vp; + register struct ucred *cred = p->p_cred->pc_ucred; + struct nameidata nd; + struct vattr vattr; + int error, error1; + char *name; /* name of corefile */ + struct sseg_closure seginfo; + void *hdr; + size_t hdrsize; + + STOPEVENT(p, S_CORE, 0); + + if (sugid_coredump == 0 && p->p_flag & P_SUGID) + return (EFAULT); + + /* Size the program segments. */ + seginfo.count = 0; + seginfo.size = 0; + each_writable_segment(p, cb_size_segment, &seginfo); + + /* + * Calculate the size of the core file header area by making + * a dry run of generating it. 
Nothing is written, but the + * size is calculated. + */ + hdrsize = 0; + elf_puthdr((struct proc *)NULL, (void *)NULL, &hdrsize, + (const prstatus_t *)NULL, (const prfpregset_t *)NULL, + (const prpsinfo_t *)NULL, seginfo.count); + + if (hdrsize + seginfo.size >= p->p_rlimit[RLIMIT_CORE].rlim_cur) + return (EFAULT); + name = expand_name(p->p_comm, p->p_ucred->cr_uid, p->p_pid); + if (name == NULL) + return (EFAULT); /* XXX -- not the best error */ + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, name, p); + error = vn_open(&nd, O_CREAT | FWRITE, S_IRUSR | S_IWUSR); + free(name, M_TEMP); + if (error) + return (error); + vp = nd.ni_vp; + + /* Don't dump to non-regular files or files with links. */ + if (vp->v_type != VREG || + VOP_GETATTR(vp, &vattr, cred, p) || vattr.va_nlink != 1) { + error = EFAULT; + goto out; + } + VATTR_NULL(&vattr); + vattr.va_size = 0; + VOP_LEASE(vp, p, cred, LEASE_WRITE); + VOP_SETATTR(vp, &vattr, cred, p); + p->p_acflag |= ACORE; + + + /* + * Allocate memory for building the header, fill it up, + * and write it out. + */ + hdr = malloc(hdrsize, M_TEMP, M_WAITOK); + if (hdr == NULL) { + error = EINVAL; + goto out; + } + error = elf_corehdr(p, vp, cred, seginfo.count, hdr, hdrsize); + + /* Write the contents of all of the writable segments. */ + if (error == 0) { + Elf_Phdr *php; + off_t offset; + int i; + + php = (Elf_Phdr *)((char *)hdr + sizeof(Elf_Ehdr)) + 1; + offset = hdrsize; + for (i = 0; i < seginfo.count; i++) { + error = vn_rdwr(UIO_WRITE, vp, (caddr_t)php->p_vaddr, + php->p_filesz, offset, UIO_USERSPACE, + IO_NODELOCKED|IO_UNIT, cred, (int *)NULL, p); + if (error != 0) + break; + offset += php->p_filesz; + php++; + } + } + free(hdr, M_TEMP); + +out: + VOP_UNLOCK(vp, 0, p); + error1 = vn_close(vp, FWRITE, cred, p); + if (error == 0) + error = error1; + return (error); +} + +/* + * A callback for each_writable_segment() to write out the segment's + * program header entry. + */ +static void +cb_put_phdr(entry, closure) + vm_map_entry_t entry; + void *closure; +{ + struct phdr_closure *phc = (struct phdr_closure *)closure; + Elf_Phdr *phdr = phc->phdr; + + phc->offset = round_page(phc->offset); + + phdr->p_type = PT_LOAD; + phdr->p_offset = phc->offset; + phdr->p_vaddr = entry->start; + phdr->p_paddr = 0; + phdr->p_filesz = phdr->p_memsz = entry->end - entry->start; + phdr->p_align = PAGE_SIZE; + phdr->p_flags = 0; + if (entry->protection & VM_PROT_READ) + phdr->p_flags |= PF_R; + if (entry->protection & VM_PROT_WRITE) + phdr->p_flags |= PF_W; + if (entry->protection & VM_PROT_EXECUTE) + phdr->p_flags |= PF_X; + + phc->offset += phdr->p_filesz; + phc->phdr++; +} + +/* + * A callback for each_writable_segment() to gather information about + * the number of segments and their total size. + */ +static void +cb_size_segment(entry, closure) + vm_map_entry_t entry; + void *closure; +{ + struct sseg_closure *ssc = (struct sseg_closure *)closure; + + ssc->count++; + ssc->size += entry->end - entry->start; +} + +/* + * For each writable segment in the process's memory map, call the given + * function with a pointer to the map entry and some arbitrary + * caller-supplied data. 
+ */ +static void +each_writable_segment(p, func, closure) + struct proc *p; + segment_callback func; + void *closure; +{ + vm_map_t map = &p->p_vmspace->vm_map; + vm_map_entry_t entry; + + for (entry = map->header.next; entry != &map->header; + entry = entry->next) { + vm_object_t obj; + + if (entry->eflags & (MAP_ENTRY_IS_A_MAP|MAP_ENTRY_IS_SUB_MAP) || + (entry->protection & (VM_PROT_READ|VM_PROT_WRITE)) != + (VM_PROT_READ|VM_PROT_WRITE)) + continue; + + if ((obj = entry->object.vm_object) == NULL) + continue; + + /* Find the deepest backing object. */ + while (obj->backing_object != NULL) + obj = obj->backing_object; + + /* Ignore memory-mapped devices and such things. */ + if (obj->type != OBJT_DEFAULT && + obj->type != OBJT_SWAP && + obj->type != OBJT_VNODE) + continue; + + (*func)(entry, closure); + } +} + +/* + * Write the core file header to the file, including padding up to + * the page boundary. + */ +static int +elf_corehdr(p, vp, cred, numsegs, hdr, hdrsize) + struct proc *p; + struct vnode *vp; + struct ucred *cred; + int numsegs; + size_t hdrsize; + void *hdr; +{ + size_t off; + prstatus_t status; + prfpregset_t fpregset; + prpsinfo_t psinfo; + + /* Gather the information for the header. */ + bzero(&status, sizeof status); + status.pr_version = PRSTATUS_VERSION; + status.pr_statussz = sizeof(prstatus_t); + status.pr_gregsetsz = sizeof(gregset_t); + status.pr_fpregsetsz = sizeof(fpregset_t); + status.pr_osreldate = osreldate; +#ifndef COMPAT_LINUX_THREADS + status.pr_cursig = p->p_sigacts->ps_sig; +#else + status.pr_cursig = p->p_sig; +#endif /* COMPAT_LINUX_THREADS */ + status.pr_pid = p->p_pid; + fill_regs(p, &status.pr_reg); + + fill_fpregs(p, &fpregset); + + bzero(&psinfo, sizeof psinfo); + psinfo.pr_version = PRPSINFO_VERSION; + psinfo.pr_psinfosz = sizeof(prpsinfo_t); + strncpy(psinfo.pr_fname, p->p_comm, MAXCOMLEN); + /* XXX - We don't fill in the command line arguments properly yet. */ + strncpy(psinfo.pr_psargs, p->p_comm, PRARGSZ); + + /* Fill in the header. */ + bzero(hdr, hdrsize); + off = 0; + elf_puthdr(p, hdr, &off, &status, &fpregset, &psinfo, numsegs); + + /* Write it to the core file. */ + return vn_rdwr(UIO_WRITE, vp, hdr, hdrsize, (off_t)0, + UIO_SYSSPACE, IO_NODELOCKED|IO_UNIT, cred, NULL, p); +} + +static void +elf_puthdr(struct proc *p, void *dst, size_t *off, const prstatus_t *status, + const prfpregset_t *fpregset, const prpsinfo_t *psinfo, int numsegs) +{ + size_t ehoff; + size_t phoff; + size_t noteoff; + size_t notesz; + + ehoff = *off; + *off += sizeof(Elf_Ehdr); + + phoff = *off; + *off += (numsegs + 1) * sizeof(Elf_Phdr); + + noteoff = *off; + elf_putnote(dst, off, "FreeBSD", NT_PRSTATUS, status, + sizeof *status); + elf_putnote(dst, off, "FreeBSD", NT_FPREGSET, fpregset, + sizeof *fpregset); + elf_putnote(dst, off, "FreeBSD", NT_PRPSINFO, psinfo, + sizeof *psinfo); + notesz = *off - noteoff; + + /* Align up to a page boundary for the program segments. */ + *off = round_page(*off); + + if (dst != NULL) { + Elf_Ehdr *ehdr; + Elf_Phdr *phdr; + struct phdr_closure phc; + + /* + * Fill in the ELF header. 
+ */ + ehdr = (Elf_Ehdr *)((char *)dst + ehoff); + ehdr->e_ident[EI_MAG0] = ELFMAG0; + ehdr->e_ident[EI_MAG1] = ELFMAG1; + ehdr->e_ident[EI_MAG2] = ELFMAG2; + ehdr->e_ident[EI_MAG3] = ELFMAG3; + ehdr->e_ident[EI_CLASS] = ELF_CLASS; + ehdr->e_ident[EI_DATA] = ELF_DATA; + ehdr->e_ident[EI_VERSION] = EV_CURRENT; + ehdr->e_ident[EI_PAD] = 0; + strncpy(ehdr->e_ident + EI_BRAND, "FreeBSD", + EI_NIDENT - EI_BRAND); + ehdr->e_type = ET_CORE; + ehdr->e_machine = ELF_ARCH; + ehdr->e_version = EV_CURRENT; + ehdr->e_entry = 0; + ehdr->e_phoff = phoff; + ehdr->e_flags = 0; + ehdr->e_ehsize = sizeof(Elf_Ehdr); + ehdr->e_phentsize = sizeof(Elf_Phdr); + ehdr->e_phnum = numsegs + 1; + ehdr->e_shentsize = sizeof(Elf_Shdr); + ehdr->e_shnum = 0; + ehdr->e_shstrndx = SHN_UNDEF; + + /* + * Fill in the program header entries. + */ + phdr = (Elf_Phdr *)((char *)dst + phoff); + + /* The note segement. */ + phdr->p_type = PT_NOTE; + phdr->p_offset = noteoff; + phdr->p_vaddr = 0; + phdr->p_paddr = 0; + phdr->p_filesz = notesz; + phdr->p_memsz = 0; + phdr->p_flags = 0; + phdr->p_align = 0; + phdr++; + + /* All the writable segments from the program. */ + phc.phdr = phdr; + phc.offset = *off; + each_writable_segment(p, cb_put_phdr, &phc); + } +} + +static void +elf_putnote(void *dst, size_t *off, const char *name, int type, + const void *desc, size_t descsz) +{ + Elf_Note note; + + note.n_namesz = strlen(name) + 1; + note.n_descsz = descsz; + note.n_type = type; + if (dst != NULL) + bcopy(¬e, (char *)dst + *off, sizeof note); + *off += sizeof note; + if (dst != NULL) + bcopy(name, (char *)dst + *off, note.n_namesz); + *off += roundup2(note.n_namesz, sizeof(Elf_Size)); + if (dst != NULL) + bcopy(desc, (char *)dst + *off, note.n_descsz); + *off += roundup2(note.n_descsz, sizeof(Elf_Size)); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ +static const struct execsw elf_execsw = {exec_elf_imgact, "ELF"}; +EXEC_SET(elf, elf_execsw); diff --git a/sys/kern/imgact_gzip.c b/sys/kern/imgact_gzip.c new file mode 100644 index 0000000..d666a87 --- /dev/null +++ b/sys/kern/imgact_gzip.c @@ -0,0 +1,378 @@ +/* + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@login.dkuug.dk> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $Id: imgact_gzip.c,v 1.34 1998/07/15 05:00:26 bde Exp $ + * + * This module handles execution of a.out files which have been run through + * "gzip". This saves diskspace, but wastes cpu-cycles and VM. + * + * TODO: + * text-segments should be made R/O after being filled + * is the vm-stuff safe ? + * should handle the entire header of gzip'ed stuff. + * inflate isn't quite reentrant yet... + * error-handling is a mess... + * so is the rest... 
+ * tidy up unnecesary includes + */ + +#include <sys/param.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/imgact_aout.h> +#include <sys/kernel.h> +#include <sys/mman.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sysent.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/inflate.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +struct imgact_gzip { + struct image_params *ip; + struct exec a_out; + int error; + int where; + u_char *inbuf; + u_long offset; + u_long output; + u_long len; + int idx; + u_long virtual_offset, file_offset, file_end, bss_size; +}; + +static int exec_gzip_imgact __P((struct image_params *imgp)); +static int NextByte __P((void *vp)); +static int do_aout_hdr __P((struct imgact_gzip *)); +static int Flush __P((void *vp, u_char *, u_long siz)); + +static int +exec_gzip_imgact(imgp) + struct image_params *imgp; +{ + int error, error2 = 0; + const u_char *p = (const u_char *) imgp->image_header; + struct imgact_gzip igz; + struct inflate infl; + struct vmspace *vmspace; + + /* If these four are not OK, it isn't a gzip file */ + if (p[0] != 0x1f) + return -1; /* 0 Simply magic */ + if (p[1] != 0x8b) + return -1; /* 1 Simply magic */ + if (p[2] != 0x08) + return -1; /* 2 Compression method */ + if (p[9] != 0x03) + return -1; /* 9 OS compressed on */ + + /* + * If this one contains anything but a comment or a filename marker, + * we don't want to chew on it + */ + if (p[3] & ~(0x18)) + return ENOEXEC; /* 3 Flags */ + + /* These are of no use to us */ + /* 4-7 Timestamp */ + /* 8 Extra flags */ + + bzero(&igz, sizeof igz); + bzero(&infl, sizeof infl); + infl.gz_private = (void *) &igz; + infl.gz_input = NextByte; + infl.gz_output = Flush; + + igz.ip = imgp; + igz.idx = 10; + + if (p[3] & 0x08) { /* skip a filename */ + while (p[igz.idx++]) + if (igz.idx >= PAGE_SIZE) + return ENOEXEC; + } + if (p[3] & 0x10) { /* skip a comment */ + while (p[igz.idx++]) + if (igz.idx >= PAGE_SIZE) + return ENOEXEC; + } + igz.len = imgp->attr->va_size; + + error = inflate(&infl); + + if ( !error ) { + vmspace = imgp->proc->p_vmspace; + error = vm_map_protect(&vmspace->vm_map, + (vm_offset_t) vmspace->vm_taddr, + (vm_offset_t) (vmspace->vm_taddr + + (vmspace->vm_tsize << PAGE_SHIFT)) , + VM_PROT_READ|VM_PROT_EXECUTE,0); + } + + if (igz.inbuf) { + error2 = + vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf, + (vm_offset_t) igz.inbuf + PAGE_SIZE); + } + if (igz.error || error || error2) { + printf("Output=%lu ", igz.output); + printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n", + error, igz.error, error2, igz.where); + } + if (igz.error) + return igz.error; + if (error) + return ENOEXEC; + if (error2) + return error2; + return 0; +} + +static int +do_aout_hdr(struct imgact_gzip * gz) +{ + int error; + struct vmspace *vmspace; + vm_offset_t vmaddr; + + /* + * Set file/virtual offset based on a.out variant. 
We do two cases: + * host byte order and network byte order (for NetBSD compatibility) + */ + switch ((int) (gz->a_out.a_magic & 0xffff)) { + case ZMAGIC: + gz->virtual_offset = 0; + if (gz->a_out.a_text) { + gz->file_offset = PAGE_SIZE; + } else { + /* Bill's "screwball mode" */ + gz->file_offset = 0; + } + break; + case QMAGIC: + gz->virtual_offset = PAGE_SIZE; + gz->file_offset = 0; + break; + default: + /* NetBSD compatibility */ + switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) { + case ZMAGIC: + case QMAGIC: + gz->virtual_offset = PAGE_SIZE; + gz->file_offset = 0; + break; + default: + gz->where = __LINE__; + return (-1); + } + } + + gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE); + + /* + * Check various fields in header for validity/bounds. + */ + if ( /* entry point must lay with text region */ + gz->a_out.a_entry < gz->virtual_offset || + gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text || + + /* text and data size must each be page rounded */ + gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) { + gz->where = __LINE__; + return (-1); + } + /* + * text/data/bss must not exceed limits + */ + if ( /* text can't exceed maximum text size */ + gz->a_out.a_text > MAXTSIZ || + + /* data + bss can't exceed rlimit */ + gz->a_out.a_data + gz->bss_size > + gz->ip->proc->p_rlimit[RLIMIT_DATA].rlim_cur) { + gz->where = __LINE__; + return (ENOMEM); + } + /* Find out how far we should go */ + gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data; + + /* copy in arguments and/or environment from old process */ + error = exec_extract_strings(gz->ip); + if (error) { + gz->where = __LINE__; + return (error); + } + /* + * Destroy old process VM and create a new one (with a new stack) + */ + exec_new_vmspace(gz->ip); + + vmspace = gz->ip->proc->p_vmspace; + + vmaddr = gz->virtual_offset; + + error = vm_mmap(&vmspace->vm_map, + &vmaddr, + gz->a_out.a_text + gz->a_out.a_data, + VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED, + 0, + 0); + + if (error) { + gz->where = __LINE__; + return (error); + } + + if (gz->bss_size != 0) { + /* + * Allocate demand-zeroed area for uninitialized data. + * "bss" = 'block started by symbol' - named after the + * IBM 7090 instruction of the same name. 
+ */ + vmaddr = gz->virtual_offset + gz->a_out.a_text + + gz->a_out.a_data; + error = vm_map_find(&vmspace->vm_map, + NULL, + 0, + &vmaddr, + gz->bss_size, + FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + gz->where = __LINE__; + return (error); + } + } + /* Fill in process VM information */ + vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT; + vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t) (uintptr_t) gz->virtual_offset; + vmspace->vm_daddr = (caddr_t) (uintptr_t) + (gz->virtual_offset + gz->a_out.a_text); + + /* Fill in image_params */ + gz->ip->interpreted = 0; + gz->ip->entry_addr = gz->a_out.a_entry; + + gz->ip->proc->p_sysent = &aout_sysvec; + + return 0; +} + +static int +NextByte(void *vp) +{ + int error; + struct imgact_gzip *igz = (struct imgact_gzip *) vp; + + if (igz->idx >= igz->len) { + igz->where = __LINE__; + return GZ_EOF; + } + if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) { + return igz->inbuf[(igz->idx++) - igz->offset]; + } + if (igz->inbuf) { + error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf, + (vm_offset_t) igz->inbuf + PAGE_SIZE); + if (error) { + igz->where = __LINE__; + igz->error = error; + return GZ_EOF; + } + } + igz->offset = igz->idx & ~PAGE_MASK; + + error = vm_mmap(kernel_map, /* map */ + (vm_offset_t *) & igz->inbuf, /* address */ + PAGE_SIZE, /* size */ + VM_PROT_READ, /* protection */ + VM_PROT_READ, /* max protection */ + 0, /* flags */ + (caddr_t) igz->ip->vp, /* vnode */ + igz->offset); /* offset */ + if (error) { + igz->where = __LINE__; + igz->error = error; + return GZ_EOF; + } + return igz->inbuf[(igz->idx++) - igz->offset]; +} + +static int +Flush(void *vp, u_char * ptr, u_long siz) +{ + struct imgact_gzip *gz = (struct imgact_gzip *) vp; + u_char *p = ptr, *q; + int i; + + /* First, find a a.out-header */ + if (gz->output < sizeof gz->a_out) { + q = (u_char *) & gz->a_out; + i = min(siz, sizeof gz->a_out - gz->output); + bcopy(p, q + gz->output, i); + gz->output += i; + p += i; + siz -= i; + if (gz->output == sizeof gz->a_out) { + i = do_aout_hdr(gz); + if (i == -1) { + if (!gz->where) + gz->where = __LINE__; + gz->error = ENOEXEC; + return ENOEXEC; + } else if (i) { + gz->where = __LINE__; + gz->error = i; + return ENOEXEC; + } + if (gz->file_offset == 0) { + q = (u_char *) (uintptr_t) gz->virtual_offset; + copyout(&gz->a_out, q, sizeof gz->a_out); + } + } + } + /* Skip over zero-padded first PAGE if needed */ + if (gz->output < gz->file_offset && + gz->output + siz > gz->file_offset) { + i = min(siz, gz->file_offset - gz->output); + gz->output += i; + p += i; + siz -= i; + } + if (gz->output >= gz->file_offset && gz->output < gz->file_end) { + i = min(siz, gz->file_end - gz->output); + q = (u_char *) (uintptr_t) + (gz->virtual_offset + gz->output - gz->file_offset); + copyout(p, q, i); + gz->output += i; + p += i; + siz -= i; + } + gz->output += siz; + return 0; +} + + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ + +static const struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"}; +EXEC_SET(execgzip, gzip_execsw); diff --git a/sys/kern/imgact_shell.c b/sys/kern/imgact_shell.c new file mode 100644 index 0000000..e72b86d --- /dev/null +++ b/sys/kern/imgact_shell.c @@ -0,0 +1,138 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: imgact_shell.c,v 1.16 1997/08/02 14:31:23 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/kernel.h> + +#if BYTE_ORDER == LITTLE_ENDIAN +#define SHELLMAGIC 0x2123 /* #! */ +#else +#define SHELLMAGIC 0x2321 +#endif + +#define MAXSHELLCMDLEN 64 + +static int exec_shell_imgact __P((struct image_params *imgp)); + +/* + * Shell interpreter image activator. A interpreter name beginning + * at imgp->stringbase is the minimal successful exit requirement. + */ +static int +exec_shell_imgact(imgp) + struct image_params *imgp; +{ + const char *image_header = imgp->image_header; + const char *ihp, *line_endp; + char *interp; + + /* a shell script? */ + if (((const short *) image_header)[0] != SHELLMAGIC) + return(-1); + + /* + * Don't allow a shell script to be the shell for a shell + * script. :-) + */ + if (imgp->interpreted) + return(ENOEXEC); + + imgp->interpreted = 1; + + /* + * Copy shell name and arguments from image_header into string + * buffer. + */ + + /* + * Find end of line; return if the line > MAXSHELLCMDLEN long. + */ + for (ihp = &image_header[2]; *ihp != '\n'; ++ihp) { + if (ihp >= &image_header[MAXSHELLCMDLEN]) + return(ENOEXEC); + } + line_endp = ihp; + + /* reset for another pass */ + ihp = &image_header[2]; + + /* Skip over leading spaces - until the interpreter name */ + while ((*ihp == ' ') || (*ihp == '\t')) ihp++; + + /* copy the interpreter name */ + interp = imgp->interpreter_name; + while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) + *interp++ = *ihp++; + *interp = '\0'; + + /* Disallow a null interpreter filename */ + if (*imgp->interpreter_name == '\0') + return(ENOEXEC); + + /* reset for another pass */ + ihp = &image_header[2]; + + /* copy the interpreter name and arguments */ + while (ihp < line_endp) { + /* Skip over leading spaces */ + while ((*ihp == ' ') || (*ihp == '\t')) ihp++; + + if (ihp < line_endp) { + /* + * Copy to end of token. No need to watch stringspace + * because this is at the front of the string buffer + * and the maximum shell command length is tiny. 
+ */ + while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) { + *imgp->stringp++ = *ihp++; + imgp->stringspace--; + } + + *imgp->stringp++ = 0; + imgp->stringspace--; + + imgp->argc++; + } + } + + imgp->argv0 = imgp->uap->fname; + + return(0); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ +static const struct execsw shell_execsw = { exec_shell_imgact, "#!" }; +EXEC_SET(shell, shell_execsw); diff --git a/sys/kern/inflate.c b/sys/kern/inflate.c new file mode 100644 index 0000000..1db9b2c --- /dev/null +++ b/sys/kern/inflate.c @@ -0,0 +1,1078 @@ +/* + * Most parts of this file are not covered by: + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@login.dknet.dk> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $Id: inflate.c,v 1.11 1997/10/12 20:23:40 phk Exp $ + * + * + */ + +#include <sys/param.h> +#include <sys/inflate.h> +#ifdef KERNEL +#include <sys/systm.h> +#include <sys/kernel.h> +#endif +#include <sys/malloc.h> + +#ifdef KERNEL +static MALLOC_DEFINE(M_GZIP, "Gzip trees", "Gzip trees"); +#endif + +/* needed to make inflate() work */ +#define uch u_char +#define ush u_short +#define ulg u_long + +/* Stuff to make inflate() work */ +#ifdef KERNEL +#define memzero(dest,len) bzero(dest,len) +#endif +#define NOMEMCPY +#ifdef KERNEL +#define FPRINTF printf +#else +extern void putstr (char *); +#define FPRINTF putstr +#endif + +#define FLUSH(x,y) { \ + int foo = (*x->gz_output)(x->gz_private,x->gz_slide,y); \ + if (foo) \ + return foo; \ + } + +static const int qflag = 0; + +#ifndef KERNEL /* want to use this file in kzip also */ +extern unsigned char *kzipmalloc (int); +extern void kzipfree (void*); +#define malloc(x, y, z) kzipmalloc((x)) +#define free(x, y) kzipfree((x)) +#endif + +/* + * This came from unzip-5.12. I have changed it the flow to pass + * a structure pointer around, thus hopefully making it re-entrant. + * Poul-Henning + */ + +/* inflate.c -- put in the public domain by Mark Adler + version c14o, 23 August 1994 */ + +/* You can do whatever you like with this source file, though I would + prefer that if you modify it and redistribute it that you include + comments to that effect with your name and the date. Thank you. + + History: + vers date who what + ---- --------- -------------- ------------------------------------ + a ~~ Feb 92 M. Adler used full (large, one-step) lookup table + b1 21 Mar 92 M. Adler first version with partial lookup tables + b2 21 Mar 92 M. Adler fixed bug in fixed-code blocks + b3 22 Mar 92 M. Adler sped up match copies, cleaned up some + b4 25 Mar 92 M. Adler added prototypes; removed window[] (now + is the responsibility of unzip.h--also + changed name to slide[]), so needs diffs + for unzip.c and unzip.h (this allows + compiling in the small model on MSDOS); + fixed cast of q in huft_build(); + b5 26 Mar 92 M. Adler got rid of unintended macro recursion. + b6 27 Mar 92 M. Adler got rid of nextbyte() routine. fixed + bug in inflate_fixed(). + c1 30 Mar 92 M. Adler removed lbits, dbits environment variables. + changed BMAX to 16 for explode. 
Removed + OUTB usage, and replaced it with flush()-- + this was a 20% speed improvement! Added + an explode.c (to replace unimplod.c) that + uses the huft routines here. Removed + register union. + c2 4 Apr 92 M. Adler fixed bug for file sizes a multiple of 32k. + c3 10 Apr 92 M. Adler reduced memory of code tables made by + huft_build significantly (factor of two to + three). + c4 15 Apr 92 M. Adler added NOMEMCPY do kill use of memcpy(). + worked around a Turbo C optimization bug. + c5 21 Apr 92 M. Adler added the GZ_WSIZE #define to allow reducing + the 32K window size for specialized + applications. + c6 31 May 92 M. Adler added some typecasts to eliminate warnings + c7 27 Jun 92 G. Roelofs added some more typecasts (444: MSC bug). + c8 5 Oct 92 J-l. Gailly added ifdef'd code to deal with PKZIP bug. + c9 9 Oct 92 M. Adler removed a memory error message (~line 416). + c10 17 Oct 92 G. Roelofs changed ULONG/UWORD/byte to ulg/ush/uch, + removed old inflate, renamed inflate_entry + to inflate, added Mark's fix to a comment. + c10.5 14 Dec 92 M. Adler fix up error messages for incomplete trees. + c11 2 Jan 93 M. Adler fixed bug in detection of incomplete + tables, and removed assumption that EOB is + the longest code (bad assumption). + c12 3 Jan 93 M. Adler make tables for fixed blocks only once. + c13 5 Jan 93 M. Adler allow all zero length codes (pkzip 2.04c + outputs one zero length code for an empty + distance tree). + c14 12 Mar 93 M. Adler made inflate.c standalone with the + introduction of inflate.h. + c14b 16 Jul 93 G. Roelofs added (unsigned) typecast to w at 470. + c14c 19 Jul 93 J. Bush changed v[N_MAX], l[288], ll[28x+3x] arrays + to static for Amiga. + c14d 13 Aug 93 J-l. Gailly de-complicatified Mark's c[*p++]++ thing. + c14e 8 Oct 93 G. Roelofs changed memset() to memzero(). + c14f 22 Oct 93 G. Roelofs renamed quietflg to qflag; made Trace() + conditional; added inflate_free(). + c14g 28 Oct 93 G. Roelofs changed l/(lx+1) macro to pointer (Cray bug) + c14h 7 Dec 93 C. Ghisler huft_build() optimizations. + c14i 9 Jan 94 A. Verheijen set fixed_t{d,l} to NULL after freeing; + G. Roelofs check NEXTBYTE macro for GZ_EOF. + c14j 23 Jan 94 G. Roelofs removed Ghisler "optimizations"; ifdef'd + GZ_EOF check. + c14k 27 Feb 94 G. Roelofs added some typecasts to avoid warnings. + c14l 9 Apr 94 G. Roelofs fixed split comments on preprocessor lines + to avoid bug in Encore compiler. + c14m 7 Jul 94 P. Kienitz modified to allow assembler version of + inflate_codes() (define ASM_INFLATECODES) + c14n 22 Jul 94 G. Roelofs changed fprintf to FPRINTF for DLL versions + c14o 23 Aug 94 C. Spieler added a newline to a debug statement; + G. Roelofs added another typecast to avoid MSC warning + */ + + +/* + Inflate deflated (PKZIP's method 8 compressed) data. The compression + method searches for as much of the current string of bytes (up to a + length of 258) in the previous 32K bytes. If it doesn't find any + matches (of at least length 3), it codes the next byte. Otherwise, it + codes the length of the matched string and its distance backwards from + the current position. There is a single Huffman code that codes both + single bytes (called "literals") and match lengths. A second Huffman + code codes the distance information, which follows a length code. Each + length or distance code actually represents a base value and a number + of "extra" (sometimes zero) bits to get to add to the base value. At + the end of each deflated block is a special end-of-block (EOB) literal/ + length code. 
The decoding process is basically: get a literal/length
+ code; if EOB then done; if a literal, emit the decoded byte; if a
+ length then get the distance and emit the referred-to bytes from the
+ sliding window of previously emitted data.
+
+ There are (currently) three kinds of inflate blocks: stored, fixed, and
+ dynamic. The compressor outputs a chunk of data at a time and decides
+ which method to use on a chunk-by-chunk basis. A chunk might typically
+ be 32K to 64K, uncompressed. If the chunk is uncompressible, then the
+ "stored" method is used. In this case, the bytes are simply stored as
+ is, eight bits per byte, with none of the above coding. The bytes are
+ preceded by a count, since there is no longer an EOB code.
+
+ If the data is compressible, then either the fixed or dynamic methods
+ are used. In the dynamic method, the compressed data is preceded by
+ an encoding of the literal/length and distance Huffman codes that are
+ to be used to decode this block. The representation is itself Huffman
+ coded, and so is preceded by a description of that code. These code
+ descriptions take up a little space, and so for small blocks, there is
+ a predefined set of codes, called the fixed codes. The fixed method is
+ used if the block ends up smaller that way (usually for quite small
+ chunks); otherwise the dynamic method is used. In the latter case, the
+ codes are customized to the probabilities in the current block and so
+ can code it much better than the pre-determined fixed codes can.
+
+ The Huffman codes themselves are decoded using a multi-level table
+ lookup, in order to maximize the speed of decoding plus the speed of
+ building the decoding tables. See the comments below that precede the
+ lbits and dbits tuning parameters.
+ */
+
+
+/*
+ Notes beyond the 1.93a appnote.txt:
+
+ 1. Distance pointers never point before the beginning of the output
+ stream.
+ 2. Distance pointers can point back across blocks, up to 32k away.
+ 3. There is an implied maximum of 7 bits for the bit length table and
+ 15 bits for the actual data.
+ 4. If only one code exists, then it is encoded using one bit. (Zero
+ would be more efficient, but perhaps a little confusing.) If two
+ codes exist, they are coded using one bit each (0 and 1).
+ 5. There is no way of sending zero distance codes--a dummy must be
+ sent if there are none. (History: a pre 2.0 version of PKZIP would
+ store blocks with no distance codes, but this was discovered to be
+ too harsh a criterion.) Valid only for 1.93a. 2.04c does allow
+ zero distance codes, which is sent as one code of zero bits in
+ length.
+ 6. There are up to 286 literal/length codes. Code 256 represents the
+ end-of-block. Note however that the static length tree defines
+ 288 codes just to fill out the Huffman codes. Codes 286 and 287
+ cannot be used though, since there is no length base or extra bits
+ defined for them. Similarly, there are up to 30 distance codes.
+ However, static trees define 32 codes (all 5 bits) to fill out the
+ Huffman codes, but the last two had better not show up in the data.
+ 7. Unzip can check dynamic Huffman blocks for complete code sets.
+ The exception is that a single code would not be complete (see #4).
+ 8. The five bits following the block type are really the number of
+ literal codes sent minus 257.
+ 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits
+ (1+6+6).
Therefore, to output three times the length, you output + three codes (1+1+1), whereas to output four times the same length, + you only need two codes (1+3). Hmm. + 10. In the tree reconstruction algorithm, Code = Code + Increment + only if BitLength(i) is not zero. (Pretty obvious.) + 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19) + 12. Note: length code 284 can represent 227-258, but length code 285 + really is 258. The last length deserves its own, short code + since it gets used a lot in very redundant files. The length + 258 is special since 258 - 3 (the min match length) is 255. + 13. The literal/length and distance code bit lengths are read as a + single stream of lengths. It is possible (and advantageous) for + a repeat code (16, 17, or 18) to go across the boundary between + the two sets of lengths. + */ + + +#define PKZIP_BUG_WORKAROUND /* PKZIP 1.93a problem--live with it */ + +/* + inflate.h must supply the uch slide[GZ_WSIZE] array and the NEXTBYTE, + FLUSH() and memzero macros. If the window size is not 32K, it + should also define GZ_WSIZE. If INFMOD is defined, it can include + compiled functions to support the NEXTBYTE and/or FLUSH() macros. + There are defaults for NEXTBYTE and FLUSH() below for use as + examples of what those functions need to do. Normally, you would + also want FLUSH() to compute a crc on the data. inflate.h also + needs to provide these typedefs: + + typedef unsigned char uch; + typedef unsigned short ush; + typedef unsigned long ulg; + + This module uses the external functions malloc() and free() (and + probably memset() or bzero() in the memzero() macro). Their + prototypes are normally found in <string.h> and <stdlib.h>. + */ +#define INFMOD /* tell inflate.h to include code to be + * compiled */ + +/* Huffman code lookup table entry--this entry is four bytes for machines + that have 16-bit pointers (e.g. PC's in the small or medium model). + Valid extra bits are 0..13. e == 15 is EOB (end of block), e == 16 + means that v is a literal, 16 < e < 32 means that v is a pointer to + the next table, which codes e - 16 bits, and lastly e == 99 indicates + an unused code. If a code with e == 99 is looked up, this implies an + error in the data. */ +struct huft { + uch e; /* number of extra bits or operation */ + uch b; /* number of bits in this code or subcode */ + union { + ush n; /* literal, length base, or distance + * base */ + struct huft *t; /* pointer to next level of table */ + } v; +}; + + +/* Function prototypes */ +static int huft_build __P((struct inflate *, unsigned *, unsigned, unsigned, const ush *, const ush *, struct huft **, int *)); +static int huft_free __P((struct inflate *, struct huft *)); +static int inflate_codes __P((struct inflate *, struct huft *, struct huft *, int, int)); +static int inflate_stored __P((struct inflate *)); +static int xinflate __P((struct inflate *)); +static int inflate_fixed __P((struct inflate *)); +static int inflate_dynamic __P((struct inflate *)); +static int inflate_block __P((struct inflate *, int *)); + +/* The inflate algorithm uses a sliding 32K byte window on the uncompressed + stream to find repeated byte strings. This is implemented here as a + circular buffer. The index is updated simply by incrementing and then + and'ing with 0x7fff (32K-1). */ +/* It is left to other modules to supply the 32K area. It is assumed + to be usable as if it were declared "uch slide[32768];" or as just + "uch *slide;" and then malloc'ed in the latter case. 
The definition + must be in unzip.h, included above. */ + + +/* Tables for deflate from PKZIP's appnote.txt. */ + +/* Order of the bit length code lengths */ +static const unsigned border[] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + +static const ush cplens[] = { /* Copy lengths for literal codes 257..285 */ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; + /* note: see note #13 above about the 258 in this list. */ + +static const ush cplext[] = { /* Extra bits for literal codes 257..285 */ + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */ + +static const ush cpdist[] = { /* Copy offsets for distance codes 0..29 */ + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, + 8193, 12289, 16385, 24577}; + +static const ush cpdext[] = { /* Extra bits for distance codes */ + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 12, 13, 13}; + +/* And'ing with mask[n] masks the lower n bits */ +static const ush mask[] = { + 0x0000, + 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, + 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff +}; + + +/* Macros for inflate() bit peeking and grabbing. + The usage is: + + NEEDBITS(glbl,j) + x = b & mask[j]; + DUMPBITS(j) + + where NEEDBITS makes sure that b has at least j bits in it, and + DUMPBITS removes the bits from b. The macros use the variable k + for the number of bits in b. Normally, b and k are register + variables for speed, and are initialized at the begining of a + routine that uses these macros from a global bit buffer and count. + + In order to not ask for more bits than there are in the compressed + stream, the Huffman tables are constructed to only ask for just + enough bits to make up the end-of-block code (value 256). Then no + bytes need to be "returned" to the buffer at the end of the last + block. See the huft_build() routine. + */ + +/* + * The following 2 were global variables. + * They are now fields of the inflate structure. + */ + +#define NEEDBITS(glbl,n) { \ + while(k<(n)) { \ + int c=(*glbl->gz_input)(glbl->gz_private); \ + if(c==GZ_EOF) \ + return 1; \ + b|=((ulg)c)<<k; \ + k+=8; \ + } \ + } + +#define DUMPBITS(n) {b>>=(n);k-=(n);} + +/* + Huffman code decoding is performed using a multi-level table lookup. + The fastest way to decode is to simply build a lookup table whose + size is determined by the longest code. However, the time it takes + to build this table can also be a factor if the data being decoded + is not very long. The most common codes are necessarily the + shortest codes, so those codes dominate the decoding time, and hence + the speed. The idea is you can have a shorter table that decodes the + shorter, more probable codes, and then point to subsidiary tables for + the longer codes. The time it costs to decode the longer codes is + then traded against the time it takes to make longer tables. + + This results of this trade are in the variables lbits and dbits + below. lbits is the number of bits the first level table for literal/ + length codes can decode in one step, and dbits is the same thing for + the distance codes. Subsequent tables are also less than or equal to + those sizes. 
These values may be adjusted either when all of the + codes are shorter than that, in which case the longest code length in + bits is used, or when the shortest code is *longer* than the requested + table size, in which case the length of the shortest code in bits is + used. + + There are two different values for the two tables, since they code a + different number of possibilities each. The literal/length table + codes 286 possible values, or in a flat code, a little over eight + bits. The distance table codes 30 possible values, or a little less + than five bits, flat. The optimum values for speed end up being + about one bit more than those, so lbits is 8+1 and dbits is 5+1. + The optimum values may differ though from machine to machine, and + possibly even between compilers. Your mileage may vary. + */ + +static const int lbits = 9; /* bits in base literal/length lookup table */ +static const int dbits = 6; /* bits in base distance lookup table */ + + +/* If BMAX needs to be larger than 16, then h and x[] should be ulg. */ +#define BMAX 16 /* maximum bit length of any code (16 for + * explode) */ +#define N_MAX 288 /* maximum number of codes in any set */ + +/* Given a list of code lengths and a maximum table size, make a set of + tables to decode that set of codes. Return zero on success, one if + the given code set is incomplete (the tables are still built in this + case), two if the input is invalid (all zero length codes or an + oversubscribed set of lengths), and three if not enough memory. + The code with value 256 is special, and the tables are constructed + so that no bits beyond that code are fetched when that code is + decoded. */ +static int +huft_build(glbl, b, n, s, d, e, t, m) + struct inflate *glbl; + unsigned *b; /* code lengths in bits (all assumed <= BMAX) */ + unsigned n; /* number of codes (assumed <= N_MAX) */ + unsigned s; /* number of simple-valued codes (0..s-1) */ + const ush *d; /* list of base values for non-simple codes */ + const ush *e; /* list of extra bits for non-simple codes */ + struct huft **t; /* result: starting table */ + int *m; /* maximum lookup bits, returns actual */ +{ + unsigned a; /* counter for codes of length k */ + unsigned c[BMAX + 1]; /* bit length count table */ + unsigned el; /* length of EOB code (value 256) */ + unsigned f; /* i repeats in table every f entries */ + int g; /* maximum code length */ + int h; /* table level */ + register unsigned i; /* counter, current code */ + register unsigned j; /* counter */ + register int k; /* number of bits in current code */ + int lx[BMAX + 1]; /* memory for l[-1..BMAX-1] */ + int *l = lx + 1; /* stack of bits per table */ + register unsigned *p; /* pointer into c[], b[], or v[] */ + register struct huft *q;/* points to current table */ + struct huft r; /* table entry for structure assignment */ + struct huft *u[BMAX];/* table stack */ + unsigned v[N_MAX]; /* values in order of bit length */ + register int w; /* bits before this table == (l * h) */ + unsigned x[BMAX + 1]; /* bit offsets, then code stack */ + unsigned *xp; /* pointer into x */ + int y; /* number of dummy codes added */ + unsigned z; /* number of entries in current table */ + + /* Generate counts for each bit length */ + el = n > 256 ? 
b[256] : BMAX; /* set length of EOB code, if any */ +#ifdef KERNEL + memzero((char *) c, sizeof(c)); +#else + for (i = 0; i < BMAX+1; i++) + c [i] = 0; +#endif + p = b; + i = n; + do { + c[*p]++; + p++; /* assume all entries <= BMAX */ + } while (--i); + if (c[0] == n) { /* null input--all zero length codes */ + *t = (struct huft *) NULL; + *m = 0; + return 0; + } + /* Find minimum and maximum length, bound *m by those */ + for (j = 1; j <= BMAX; j++) + if (c[j]) + break; + k = j; /* minimum code length */ + if ((unsigned) *m < j) + *m = j; + for (i = BMAX; i; i--) + if (c[i]) + break; + g = i; /* maximum code length */ + if ((unsigned) *m > i) + *m = i; + + /* Adjust last length count to fill out codes, if needed */ + for (y = 1 << j; j < i; j++, y <<= 1) + if ((y -= c[j]) < 0) + return 2; /* bad input: more codes than bits */ + if ((y -= c[i]) < 0) + return 2; + c[i] += y; + + /* Generate starting offsets into the value table for each length */ + x[1] = j = 0; + p = c + 1; + xp = x + 2; + while (--i) { /* note that i == g from above */ + *xp++ = (j += *p++); + } + + /* Make a table of values in order of bit lengths */ + p = b; + i = 0; + do { + if ((j = *p++) != 0) + v[x[j]++] = i; + } while (++i < n); + + /* Generate the Huffman codes and for each, make the table entries */ + x[0] = i = 0; /* first Huffman code is zero */ + p = v; /* grab values in bit order */ + h = -1; /* no tables yet--level -1 */ + w = l[-1] = 0; /* no bits decoded yet */ + u[0] = (struct huft *) NULL; /* just to keep compilers happy */ + q = (struct huft *) NULL; /* ditto */ + z = 0; /* ditto */ + + /* go through the bit lengths (k already is bits in shortest code) */ + for (; k <= g; k++) { + a = c[k]; + while (a--) { + /* + * here i is the Huffman code of length k bits for + * value *p + */ + /* make tables up to required level */ + while (k > w + l[h]) { + w += l[h++]; /* add bits already decoded */ + + /* + * compute minimum size table less than or + * equal to *m bits + */ + z = (z = g - w) > (unsigned) *m ? 
*m : z; /* upper limit */ + if ((f = 1 << (j = k - w)) > a + 1) { /* try a k-w bit table *//* t + * oo few codes for k-w + * bit table */ + f -= a + 1; /* deduct codes from + * patterns left */ + xp = c + k; + while (++j < z) { /* try smaller tables up + * to z bits */ + if ((f <<= 1) <= *++xp) + break; /* enough codes to use + * up j bits */ + f -= *xp; /* else deduct codes + * from patterns */ + } + } + if ((unsigned) w + j > el && (unsigned) w < el) + j = el - w; /* make EOB code end at + * table */ + z = 1 << j; /* table entries for j-bit + * table */ + l[h] = j; /* set table size in stack */ + + /* allocate and link in new table */ + if ((q = (struct huft *) malloc((z + 1) * sizeof(struct huft), M_GZIP, M_WAITOK)) == + (struct huft *) NULL) { + if (h) + huft_free(glbl, u[0]); + return 3; /* not enough memory */ + } + glbl->gz_hufts += z + 1; /* track memory usage */ + *t = q + 1; /* link to list for + * huft_free() */ + *(t = &(q->v.t)) = (struct huft *) NULL; + u[h] = ++q; /* table starts after link */ + + /* connect to last table, if there is one */ + if (h) { + x[h] = i; /* save pattern for + * backing up */ + r.b = (uch) l[h - 1]; /* bits to dump before + * this table */ + r.e = (uch) (16 + j); /* bits in this table */ + r.v.t = q; /* pointer to this table */ + j = (i & ((1 << w) - 1)) >> (w - l[h - 1]); + u[h - 1][j] = r; /* connect to last table */ + } + } + + /* set up table entry in r */ + r.b = (uch) (k - w); + if (p >= v + n) + r.e = 99; /* out of values--invalid + * code */ + else if (*p < s) { + r.e = (uch) (*p < 256 ? 16 : 15); /* 256 is end-of-block + * code */ + r.v.n = *p++; /* simple code is just the + * value */ + } else { + r.e = (uch) e[*p - s]; /* non-simple--look up + * in lists */ + r.v.n = d[*p++ - s]; + } + + /* fill code-like entries with r */ + f = 1 << (k - w); + for (j = i >> w; j < z; j += f) + q[j] = r; + + /* backwards increment the k-bit code i */ + for (j = 1 << (k - 1); i & j; j >>= 1) + i ^= j; + i ^= j; + + /* backup over finished tables */ + while ((i & ((1 << w) - 1)) != x[h]) + w -= l[--h]; /* don't need to update q */ + } + } + + /* return actual size of base table */ + *m = l[0]; + + /* Return true (1) if we were given an incomplete table */ + return y != 0 && g != 1; +} + +static int +huft_free(glbl, t) + struct inflate *glbl; + struct huft *t; /* table to free */ +/* Free the malloc'ed tables built by huft_build(), which makes a linked + list of the tables it made, with the links in a dummy first entry of + each table. */ +{ + register struct huft *p, *q; + + /* Go through linked list, freeing from the malloced (t[-1]) address. */ + p = t; + while (p != (struct huft *) NULL) { + q = (--p)->v.t; + free(p, M_GZIP); + p = q; + } + return 0; +} + +/* inflate (decompress) the codes in a deflated (compressed) block. + Return an error code or zero if it all goes ok. 
*/ +static int +inflate_codes(glbl, tl, td, bl, bd) + struct inflate *glbl; + struct huft *tl, *td;/* literal/length and distance decoder tables */ + int bl, bd; /* number of bits decoded by tl[] and td[] */ +{ + register unsigned e; /* table entry flag/number of extra bits */ + unsigned n, d; /* length and index for copy */ + unsigned w; /* current window position */ + struct huft *t; /* pointer to table entry */ + unsigned ml, md; /* masks for bl and bd bits */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local copies of globals */ + b = glbl->gz_bb; /* initialize bit buffer */ + k = glbl->gz_bk; + w = glbl->gz_wp; /* initialize window position */ + + /* inflate the coded data */ + ml = mask[bl]; /* precompute masks for speed */ + md = mask[bd]; + while (1) { /* do until end of block */ + NEEDBITS(glbl, (unsigned) bl) + if ((e = (t = tl + ((unsigned) b & ml))->e) > 16) + do { + if (e == 99) + return 1; + DUMPBITS(t->b) + e -= 16; + NEEDBITS(glbl, e) + } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16); + DUMPBITS(t->b) + if (e == 16) { /* then it's a literal */ + glbl->gz_slide[w++] = (uch) t->v.n; + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + } else { /* it's an EOB or a length */ + /* exit if end of block */ + if (e == 15) + break; + + /* get length of block to copy */ + NEEDBITS(glbl, e) + n = t->v.n + ((unsigned) b & mask[e]); + DUMPBITS(e); + + /* decode distance of block to copy */ + NEEDBITS(glbl, (unsigned) bd) + if ((e = (t = td + ((unsigned) b & md))->e) > 16) + do { + if (e == 99) + return 1; + DUMPBITS(t->b) + e -= 16; + NEEDBITS(glbl, e) + } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16); + DUMPBITS(t->b) + NEEDBITS(glbl, e) + d = w - t->v.n - ((unsigned) b & mask[e]); + DUMPBITS(e) + /* do the copy */ + do { + n -= (e = (e = GZ_WSIZE - ((d &= GZ_WSIZE - 1) > w ? d : w)) > n ? n : e); +#ifndef NOMEMCPY + if (w - d >= e) { /* (this test assumes + * unsigned comparison) */ + memcpy(glbl->gz_slide + w, glbl->gz_slide + d, e); + w += e; + d += e; + } else /* do it slow to avoid memcpy() + * overlap */ +#endif /* !NOMEMCPY */ + do { + glbl->gz_slide[w++] = glbl->gz_slide[d++]; + } while (--e); + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + } while (n); + } + } + + /* restore the globals from the locals */ + glbl->gz_wp = w; /* restore global window pointer */ + glbl->gz_bb = b; /* restore global bit buffer */ + glbl->gz_bk = k; + + /* done */ + return 0; +} + +/* "decompress" an inflated type 0 (stored) block. 
*/ +static int +inflate_stored(glbl) + struct inflate *glbl; +{ + unsigned n; /* number of bytes in block */ + unsigned w; /* current window position */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local copies of globals */ + b = glbl->gz_bb; /* initialize bit buffer */ + k = glbl->gz_bk; + w = glbl->gz_wp; /* initialize window position */ + + /* go to byte boundary */ + n = k & 7; + DUMPBITS(n); + + /* get the length and its complement */ + NEEDBITS(glbl, 16) + n = ((unsigned) b & 0xffff); + DUMPBITS(16) + NEEDBITS(glbl, 16) + if (n != (unsigned) ((~b) & 0xffff)) + return 1; /* error in compressed data */ + DUMPBITS(16) + /* read and output the compressed data */ + while (n--) { + NEEDBITS(glbl, 8) + glbl->gz_slide[w++] = (uch) b; + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + DUMPBITS(8) + } + + /* restore the globals from the locals */ + glbl->gz_wp = w; /* restore global window pointer */ + glbl->gz_bb = b; /* restore global bit buffer */ + glbl->gz_bk = k; + return 0; +} + +/* decompress an inflated type 1 (fixed Huffman codes) block. We should + either replace this with a custom decoder, or at least precompute the + Huffman tables. */ +static int +inflate_fixed(glbl) + struct inflate *glbl; +{ + /* if first time, set up tables for fixed blocks */ + if (glbl->gz_fixed_tl == (struct huft *) NULL) { + int i; /* temporary variable */ + static unsigned l[288]; /* length list for huft_build */ + + /* literal table */ + for (i = 0; i < 144; i++) + l[i] = 8; + for (; i < 256; i++) + l[i] = 9; + for (; i < 280; i++) + l[i] = 7; + for (; i < 288; i++) /* make a complete, but wrong code + * set */ + l[i] = 8; + glbl->gz_fixed_bl = 7; + if ((i = huft_build(glbl, l, 288, 257, cplens, cplext, + &glbl->gz_fixed_tl, &glbl->gz_fixed_bl)) != 0) { + glbl->gz_fixed_tl = (struct huft *) NULL; + return i; + } + /* distance table */ + for (i = 0; i < 30; i++) /* make an incomplete code + * set */ + l[i] = 5; + glbl->gz_fixed_bd = 5; + if ((i = huft_build(glbl, l, 30, 0, cpdist, cpdext, + &glbl->gz_fixed_td, &glbl->gz_fixed_bd)) > 1) { + huft_free(glbl, glbl->gz_fixed_tl); + glbl->gz_fixed_tl = (struct huft *) NULL; + return i; + } + } + /* decompress until an end-of-block code */ + return inflate_codes(glbl, glbl->gz_fixed_tl, glbl->gz_fixed_td, glbl->gz_fixed_bl, glbl->gz_fixed_bd) != 0; +} + +/* decompress an inflated type 2 (dynamic Huffman codes) block. 
*/ +static int +inflate_dynamic(glbl) + struct inflate *glbl; +{ + int i; /* temporary variables */ + unsigned j; + unsigned l; /* last length */ + unsigned m; /* mask for bit lengths table */ + unsigned n; /* number of lengths to get */ + struct huft *tl; /* literal/length code table */ + struct huft *td; /* distance code table */ + int bl; /* lookup bits for tl */ + int bd; /* lookup bits for td */ + unsigned nb; /* number of bit length codes */ + unsigned nl; /* number of literal/length codes */ + unsigned nd; /* number of distance codes */ +#ifdef PKZIP_BUG_WORKAROUND + unsigned ll[288 + 32]; /* literal/length and distance code + * lengths */ +#else + unsigned ll[286 + 30]; /* literal/length and distance code + * lengths */ +#endif + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local bit buffer */ + b = glbl->gz_bb; + k = glbl->gz_bk; + + /* read in table lengths */ + NEEDBITS(glbl, 5) + nl = 257 + ((unsigned) b & 0x1f); /* number of + * literal/length codes */ + DUMPBITS(5) + NEEDBITS(glbl, 5) + nd = 1 + ((unsigned) b & 0x1f); /* number of distance codes */ + DUMPBITS(5) + NEEDBITS(glbl, 4) + nb = 4 + ((unsigned) b & 0xf); /* number of bit length codes */ + DUMPBITS(4) +#ifdef PKZIP_BUG_WORKAROUND + if (nl > 288 || nd > 32) +#else + if (nl > 286 || nd > 30) +#endif + return 1; /* bad lengths */ + /* read in bit-length-code lengths */ + for (j = 0; j < nb; j++) { + NEEDBITS(glbl, 3) + ll[border[j]] = (unsigned) b & 7; + DUMPBITS(3) + } + for (; j < 19; j++) + ll[border[j]] = 0; + + /* build decoding table for trees--single level, 7 bit lookup */ + bl = 7; + if ((i = huft_build(glbl, ll, 19, 19, NULL, NULL, &tl, &bl)) != 0) { + if (i == 1) + huft_free(glbl, tl); + return i; /* incomplete code set */ + } + /* read in literal and distance code lengths */ + n = nl + nd; + m = mask[bl]; + i = l = 0; + while ((unsigned) i < n) { + NEEDBITS(glbl, (unsigned) bl) + j = (td = tl + ((unsigned) b & m))->b; + DUMPBITS(j) + j = td->v.n; + if (j < 16) /* length of code in bits (0..15) */ + ll[i++] = l = j; /* save last length in l */ + else if (j == 16) { /* repeat last length 3 to 6 times */ + NEEDBITS(glbl, 2) + j = 3 + ((unsigned) b & 3); + DUMPBITS(2) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = l; + } else if (j == 17) { /* 3 to 10 zero length codes */ + NEEDBITS(glbl, 3) + j = 3 + ((unsigned) b & 7); + DUMPBITS(3) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = 0; + l = 0; + } else { /* j == 18: 11 to 138 zero length codes */ + NEEDBITS(glbl, 7) + j = 11 + ((unsigned) b & 0x7f); + DUMPBITS(7) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = 0; + l = 0; + } + } + + /* free decoding table for trees */ + huft_free(glbl, tl); + + /* restore the global bit buffer */ + glbl->gz_bb = b; + glbl->gz_bk = k; + + /* build the decoding tables for literal/length and distance codes */ + bl = lbits; + i = huft_build(glbl, ll, nl, 257, cplens, cplext, &tl, &bl); + if (i != 0) { + if (i == 1 && !qflag) { + FPRINTF("(incomplete l-tree) "); + huft_free(glbl, tl); + } + return i; /* incomplete code set */ + } + bd = dbits; + i = huft_build(glbl, ll + nl, nd, 0, cpdist, cpdext, &td, &bd); + if (i != 0) { + if (i == 1 && !qflag) { + FPRINTF("(incomplete d-tree) "); +#ifdef PKZIP_BUG_WORKAROUND + i = 0; + } +#else + huft_free(glbl, td); + } + huft_free(glbl, tl); + return i; /* incomplete code set */ +#endif + } + /* decompress until an end-of-block code */ + if (inflate_codes(glbl, tl, td, bl, bd)) + 
return 1; + + /* free the decoding tables, return */ + huft_free(glbl, tl); + huft_free(glbl, td); + return 0; +} + +/* decompress an inflated block */ +static int +inflate_block(glbl, e) + struct inflate *glbl; + int *e; /* last block flag */ +{ + unsigned t; /* block type */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local bit buffer */ + b = glbl->gz_bb; + k = glbl->gz_bk; + + /* read in last block bit */ + NEEDBITS(glbl, 1) + * e = (int) b & 1; + DUMPBITS(1) + /* read in block type */ + NEEDBITS(glbl, 2) + t = (unsigned) b & 3; + DUMPBITS(2) + /* restore the global bit buffer */ + glbl->gz_bb = b; + glbl->gz_bk = k; + + /* inflate that block type */ + if (t == 2) + return inflate_dynamic(glbl); + if (t == 0) + return inflate_stored(glbl); + if (t == 1) + return inflate_fixed(glbl); + /* bad block type */ + return 2; +} + + + +/* decompress an inflated entry */ +static int +xinflate(glbl) + struct inflate *glbl; +{ + int e; /* last block flag */ + int r; /* result code */ + unsigned h; /* maximum struct huft's malloc'ed */ + + glbl->gz_fixed_tl = (struct huft *) NULL; + + /* initialize window, bit buffer */ + glbl->gz_wp = 0; + glbl->gz_bk = 0; + glbl->gz_bb = 0; + + /* decompress until the last block */ + h = 0; + do { + glbl->gz_hufts = 0; + if ((r = inflate_block(glbl, &e)) != 0) + return r; + if (glbl->gz_hufts > h) + h = glbl->gz_hufts; + } while (!e); + + /* flush out slide */ + FLUSH(glbl, glbl->gz_wp); + + /* return success */ + return 0; +} + +/* Nobody uses this - why not? */ +int +inflate(glbl) + struct inflate *glbl; +{ + int i; +#ifdef KERNEL + u_char *p = NULL; + + if (!glbl->gz_slide) + p = glbl->gz_slide = malloc(GZ_WSIZE, M_GZIP, M_WAITOK); +#endif + if (!glbl->gz_slide) +#ifdef KERNEL + return(ENOMEM); +#else + return 3; /* kzip expects 3 */ +#endif + i = xinflate(glbl); + + if (glbl->gz_fixed_td != (struct huft *) NULL) { + huft_free(glbl, glbl->gz_fixed_td); + glbl->gz_fixed_td = (struct huft *) NULL; + } + if (glbl->gz_fixed_tl != (struct huft *) NULL) { + huft_free(glbl, glbl->gz_fixed_tl); + glbl->gz_fixed_tl = (struct huft *) NULL; + } +#ifdef KERNEL + if (p == glbl->gz_slide) { + free(glbl->gz_slide, M_GZIP); + glbl->gz_slide = NULL; + } +#endif + return i; +} +/* ----------------------- END INFLATE.C */ diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c new file mode 100644 index 0000000..246684f --- /dev/null +++ b/sys/kern/init_main.c @@ -0,0 +1,698 @@ +/* + * Copyright (c) 1995 Terrence R. Lambert + * All rights reserved. + * + * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)init_main.c 8.9 (Berkeley) 1/21/94
+ * $Id: init_main.c,v 1.102 1998/12/30 10:38:58 dfr Exp $
+ */
+
+#include "opt_devfs.h"
+
+#include <sys/param.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/sysent.h>
+#include <sys/reboot.h>
+#include <sys/sysproto.h>
+#include <sys/vmmeter.h>
+#include <sys/unistd.h>
+#include <sys/malloc.h>
+
+#include <machine/cpu.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/user.h>
+#include <sys/copyright.h>
+
+extern struct linker_set sysinit_set; /* XXX */
+
+extern void __main __P((void));
+extern void main __P((void *framep));
+
+/* Components of the first process -- never freed. */
+static struct session session0;
+static struct pgrp pgrp0;
+struct proc proc0;
+static struct pcred cred0;
+#ifdef COMPAT_LINUX_THREADS
+static struct procsig procsig0;
+#endif /* COMPAT_LINUX_THREADS */
+static struct filedesc0 filedesc0;
+static struct plimit limit0;
+static struct vmspace vmspace0;
+struct proc *initproc;
+
+int cmask = CMASK;
+extern struct user *proc0paddr;
+
+struct vnode *rootvp;
+int boothowto = 0; /* initialized so that it can be patched */
+
+struct timeval boottime;
+SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime,
+ CTLFLAG_RD, &boottime, timeval, "");
+
+static int shutdowntimeout = 120;
+SYSCTL_INT(_kern, OID_AUTO, shutdown_timeout,
+ CTLFLAG_RW, &shutdowntimeout, 0, "");
+
+/*
+ * Promiscuous argument pass for start_init()
+ *
+ * This is a kludge because we use a return from main() rather than a call
+ * to a new routine in locore.s to kick the kernel alive from locore.s.
+ */
+static void *init_framep;
+
+
+#if __GNUC__ >= 2
+void __main() {}
+#endif
+
+
+/*
+ * This ensures that there is at least one entry so that the sysinit_set
+ * symbol is not undefined. A subsystem ID of SI_SUB_DUMMY is never
+ * executed.
+ */
+SYSINIT(placeholder, SI_SUB_DUMMY,SI_ORDER_ANY, NULL, NULL)
+
+/*
+ * The sysinit table itself. Items are checked off as they are run.
+ * If we want to register new sysinit types, add them to newsysinit. + */ +struct sysinit **sysinit = (struct sysinit **)sysinit_set.ls_items; +struct sysinit **newsysinit; + +/* + * Merge a new sysinit set into the current set, reallocating it if + * necessary. This can only be called after malloc is running. + */ +void +sysinit_add(set) + struct sysinit **set; +{ + struct sysinit **newset; + struct sysinit **sipp; + struct sysinit **xipp; + int count = 0; + + if (newsysinit) + for (sipp = newsysinit; *sipp; sipp++) + count++; + else + for (sipp = sysinit; *sipp; sipp++) + count++; + for (sipp = set; *sipp; sipp++) + count++; + count++; /* Trailing NULL */ + newset = malloc(count * sizeof(*sipp), M_TEMP, M_NOWAIT); + if (newset == NULL) + panic("cannot malloc for sysinit"); + xipp = newset; + if (newsysinit) + for (sipp = newsysinit; *sipp; sipp++) + *xipp++ = *sipp; + else + for (sipp = sysinit; *sipp; sipp++) + *xipp++ = *sipp; + for (sipp = set; *sipp; sipp++) + *xipp++ = *sipp; + *xipp = NULL; + if (newsysinit) + free(newsysinit, M_TEMP); + newsysinit = newset; +} + +/* + * System startup; initialize the world, create process 0, mount root + * filesystem, and fork to create init and pagedaemon. Most of the + * hard work is done in the lower-level initialization routines including + * startup(), which does memory initialization and autoconfiguration. + * + * This allows simple addition of new kernel subsystems that require + * boot time initialization. It also allows substitution of subsystem + * (for instance, a scheduler, kernel profiler, or VM system) by object + * module. Finally, it allows for optional "kernel threads". + */ +void +main(framep) + void *framep; +{ + + register struct sysinit **sipp; /* system initialization*/ + register struct sysinit **xipp; /* interior loop of sort*/ + register struct sysinit *save; /* bubble*/ + + /* + * Copy the locore.s frame pointer for proc0, this is forked into + * all other processes. + */ + init_framep = framep; + +restart: + /* + * Perform a bubble sort of the system initialization objects by + * their subsystem (primary key) and order (secondary key). + */ + for (sipp = sysinit; *sipp; sipp++) { + for (xipp = sipp + 1; *xipp; xipp++) { + if ((*sipp)->subsystem < (*xipp)->subsystem || + ((*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order < (*xipp)->order)) + continue; /* skip*/ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. + * + * The last item on the list is expected to be the scheduler, + * which will not return. 
+ */ + for (sipp = sysinit; *sipp; sipp++) { + + if ((*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s)*/ + + if ((*sipp)->subsystem == SI_SUB_DONE) + continue; + + switch( (*sipp)->type) { + case SI_TYPE_DEFAULT: + /* no special processing*/ + (*((*sipp)->func))((*sipp)->udata); + break; + + case SI_TYPE_KTHREAD: +#if !defined(SMP) + /* kernel thread*/ + if (fork1(&proc0, RFMEM|RFFDG|RFPROC)) + panic("fork kernel thread"); + cpu_set_fork_handler(pfind(proc0.p_retval[0]), + (*sipp)->func, (*sipp)->udata); + break; +#endif + + case SI_TYPE_KPROCESS: + if (fork1(&proc0, RFFDG|RFPROC)) + panic("fork kernel process"); + cpu_set_fork_handler(pfind(proc0.p_retval[0]), + (*sipp)->func, (*sipp)->udata); + break; + + default: + panic("init_main: unrecognized init type"); + } + + /* Check off the one we're just done */ + (*sipp)->subsystem = SI_SUB_DONE; + + /* Check if we've installed more sysinit items via KLD */ + if (newsysinit != NULL) { + if (sysinit != (struct sysinit **)sysinit_set.ls_items) + free(sysinit, M_TEMP); + sysinit = newsysinit; + newsysinit = NULL; + goto restart; + } + } + + panic("Shouldn't get here!"); + /* NOTREACHED*/ +} + + +/* + * Start a kernel process. This is called after a fork() call in + * main() in the file kern/init_main.c. + * + * This function is used to start "internal" daemons. + */ +/* ARGSUSED*/ +void +kproc_start(udata) + void *udata; +{ + struct kproc_desc *kp = udata; + struct proc *p = curproc; + +#ifdef DIAGNOSTIC + printf("Start pid=%d <%s>\n",p->p_pid, kp->arg0); +#endif + + /* save a global descriptor, if desired*/ + if( kp->global_procpp != NULL) + *kp->global_procpp = p; + + /* this is a non-swapped system process*/ + p->p_flag |= P_INMEM | P_SYSTEM; + + /* set up arg0 for 'ps', et al*/ + strcpy( p->p_comm, kp->arg0); + + /* call the processes' main()...*/ + (*kp->func)(); + + /* NOTREACHED */ + panic("kproc_start: %s", kp->arg0); +} + + +/* + *************************************************************************** + **** + **** The following SYSINIT's belong elsewhere, but have not yet + **** been moved. + **** + *************************************************************************** + */ +#ifdef OMIT +/* + * Handled by vfs_mountroot (bad idea) at this time... should be + * done the same as 4.4Lite2. + */ +SYSINIT(swapinit, SI_SUB_SWAP, SI_ORDER_FIRST, swapinit, NULL) +#endif /* OMIT*/ + +static void print_caddr_t __P((void *data)); +static void +print_caddr_t(data) + void *data; +{ + printf("%s", (char *)data); +} +SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) + + +/* + *************************************************************************** + **** + **** The two following SYSINT's are proc0 specific glue code. I am not + **** convinced that they can not be safely combined, but their order of + **** operation has been maintained as the same as the original init_main.c + **** for right now. + **** + **** These probably belong in init_proc.c or kern_proc.c, since they + **** deal with proc0 (the fork template process). + **** + *************************************************************************** + */ +/* ARGSUSED*/ +static void proc0_init __P((void *dummy)); +static void +proc0_init(dummy) + void *dummy; +{ + register struct proc *p; + register struct filedesc0 *fdp; + register unsigned i; + + /* + * Initialize the current process pointer (curproc) before + * any possible traps/probes to simplify trap processing. 
+ */ + p = &proc0; + curproc = p; /* XXX redundant*/ + + /* + * Initialize process and pgrp structures. + */ + procinit(); + + /* + * Initialize sleep queue hash table + */ + sleepinit(); + + /* + * additional VM structures + */ + vm_init2(); + + /* + * Create process 0 (the swapper). + */ + LIST_INSERT_HEAD(&allproc, p, p_list); + p->p_pgrp = &pgrp0; + LIST_INSERT_HEAD(PGRPHASH(0), &pgrp0, pg_hash); + LIST_INIT(&pgrp0.pg_members); + LIST_INSERT_HEAD(&pgrp0.pg_members, p, p_pglist); + + pgrp0.pg_session = &session0; + session0.s_count = 1; + session0.s_leader = p; + + p->p_sysent = &aout_sysvec; + + p->p_flag = P_INMEM | P_SYSTEM; + p->p_stat = SRUN; + p->p_nice = NZERO; + p->p_rtprio.type = RTP_PRIO_NORMAL; + p->p_rtprio.prio = 0; + +/* + * Link for kernel based threads + */ + p->p_peers = 0; + p->p_leader = p; + + bcopy("swapper", p->p_comm, sizeof ("swapper")); + + /* Create credentials. */ + cred0.p_refcnt = 1; + p->p_cred = &cred0; + p->p_ucred = crget(); + p->p_ucred->cr_ngroups = 1; /* group 0 */ + +#ifdef COMPAT_LINUX_THREADS + /* Create procsig. */ + p->p_procsig = &procsig0; + p->p_procsig->ps_refcnt = 2; + +#endif /* COMPAT_LINUX_THREADS */ + /* Create the file descriptor table. */ + fdp = &filedesc0; + p->p_fd = &fdp->fd_fd; + fdp->fd_fd.fd_refcnt = 1; + fdp->fd_fd.fd_cmask = cmask; + fdp->fd_fd.fd_ofiles = fdp->fd_dfiles; + fdp->fd_fd.fd_ofileflags = fdp->fd_dfileflags; + fdp->fd_fd.fd_nfiles = NDFILE; + + /* Create the limits structures. */ + p->p_limit = &limit0; + for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) + limit0.pl_rlimit[i].rlim_cur = + limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; + limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = + limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; + i = ptoa(cnt.v_free_count); + limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; + limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; + limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_cur = i / 3; + limit0.p_cpulimit = RLIM_INFINITY; + limit0.p_refcnt = 1; + + + /* Allocate a prototype map so we have something to fork. */ + pmap_pinit0(&vmspace0.vm_pmap); + p->p_vmspace = &vmspace0; + vmspace0.vm_refcnt = 1; + vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), + trunc_page(VM_MAXUSER_ADDRESS)); + vmspace0.vm_map.pmap = &vmspace0.vm_pmap; + p->p_addr = proc0paddr; /* XXX */ + +#ifndef __alpha__ /* XXX what is this? */ +#define INCOMPAT_LITES2 +#ifdef INCOMPAT_LITES2 + /* + * proc0 needs to have a coherent frame base in its stack. + */ + cpu_set_init_frame(p, init_framep); /* XXX! */ +#endif /* INCOMPAT_LITES2*/ +#endif + + /* + * We continue to place resource usage info and signal + * actions in the user struct so they're pageable. + */ + p->p_stats = &p->p_addr->u_stats; + p->p_sigacts = &p->p_addr->u_sigacts; + + /* + * Charge root for one process. + */ + (void)chgproccnt(0, 1); + + /* + * Initialize the procfs flags (to 0, of course) + */ + p->p_stops = p->p_stype = p->p_step = 0; + +} +SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) + +/* ARGSUSED*/ +static void proc0_post __P((void *dummy)); +static void +proc0_post(dummy) + void *dummy; +{ + struct timespec ts; + + /* + * Now can look at time, having had a chance to verify the time + * from the file system. Reset p->p_runtime as it may have been + * munched in mi_switch() after the time got set. Set + * p->p_switchtime to be consistent with this unmunching. 
+ */ + microtime(&proc0.p_stats->p_start); + proc0.p_runtime = 0; + microuptime(&proc0.p_switchtime); + + /* + * Give the ``random'' number generator a thump. + * XXX: Does read_random() contain enough bits to be used here ? + */ + nanotime(&ts); + srandom(ts.tv_sec ^ ts.tv_nsec); + + /* Initialize signal state for process 0. */ + siginit(&proc0); +} +SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) + + + + +/* + *************************************************************************** + **** + **** The following SYSINIT's and glue code should be moved to the + **** respective files on a per subsystem basis. + **** + *************************************************************************** + */ + +/* ARGSUSED */ +static void root_conf __P((void *dummy)); +static void +root_conf(dummy) + void *dummy; +{ + cpu_rootconf(); +} +SYSINIT(root_conf, SI_SUB_ROOT_CONF, SI_ORDER_FIRST, root_conf, NULL) + +/* ARGSUSED*/ +static void xxx_vfs_root_fdtab __P((void *dummy)); +static void +xxx_vfs_root_fdtab(dummy) + void *dummy; +{ + register struct filedesc0 *fdp = &filedesc0; + + /* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. */ + if (VFS_ROOT(mountlist.cqh_first, &rootvnode)) + panic("cannot find root vnode"); + fdp->fd_fd.fd_cdir = rootvnode; + VREF(fdp->fd_fd.fd_cdir); + VOP_UNLOCK(rootvnode, 0, &proc0); + fdp->fd_fd.fd_rdir = rootvnode; +} +SYSINIT(retrofit, SI_SUB_ROOT_FDTAB, SI_ORDER_FIRST, xxx_vfs_root_fdtab, NULL) + + +/* + *************************************************************************** + **** + **** The following code probably belongs in another file, like + **** kern/init_init.c. It is here for two reasons only: + **** + **** 1) This code returns to startup the system; this is + **** abnormal for a kernel thread. + **** 2) This code promiscuously uses init_frame + **** + *************************************************************************** + */ + +static void kthread_init __P((void *dummy)); +SYSINIT_KP(init,SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kthread_init, NULL) + + +extern void prepare_usermode __P((void)); +static void start_init __P((struct proc *p)); + +/* ARGSUSED*/ +static void +kthread_init(dummy) + void *dummy; +{ + /* Create process 1 (init(8)). */ + start_init(curproc); + + prepare_usermode(); + + /* + * This returns to the fork trampoline, then to user mode. + */ + return; +} + + +/* + * List of paths to try when searching for "init". + */ +static char *initpaths[] = { + "/sbin/init", + "/sbin/oinit", + "/sbin/init.bak", + "/stand/sysinstall", + NULL, +}; + +/* + * Start the initial user process; try exec'ing each pathname in "initpaths". + * The program is invoked with one argument containing the boot flags. + */ +static void +start_init(p) + struct proc *p; +{ + vm_offset_t addr; + struct execve_args args; + int options, i, error; + char **pathp, *path, *ucp, **uap, *arg0, *arg1; + + initproc = p; + + /* + * Need just enough stack to hold the faked-up "execve()" arguments. + */ + addr = trunc_page(USRSTACK - PAGE_SIZE); + if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) + panic("init: couldn't allocate argument space"); + p->p_vmspace->vm_maxsaddr = (caddr_t)addr; + p->p_vmspace->vm_ssize = 1; + + for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) { + /* + * Move out the boot flag argument. 
+ */ + options = 0; + ucp = (char *)USRSTACK; + (void)subyte(--ucp, 0); /* trailing zero */ + if (boothowto & RB_SINGLE) { + (void)subyte(--ucp, 's'); + options = 1; + } +#ifdef notyet + if (boothowto & RB_FASTBOOT) { + (void)subyte(--ucp, 'f'); + options = 1; + } +#endif + +#ifdef BOOTCDROM + (void)subyte(--ucp, 'C'); + options = 1; +#endif + if (options == 0) + (void)subyte(--ucp, '-'); + (void)subyte(--ucp, '-'); /* leading hyphen */ + arg1 = ucp; + + /* + * Move out the file name (also arg 0). + */ + for (i = strlen(path) + 1; i >= 0; i--) + (void)subyte(--ucp, path[i]); + arg0 = ucp; + + /* + * Move out the arg pointers. + */ + uap = (char **)((intptr_t)ucp & ~(sizeof(intptr_t)-1)); + (void)suword((caddr_t)--uap, (long)0); /* terminator */ + (void)suword((caddr_t)--uap, (long)(intptr_t)arg1); + (void)suword((caddr_t)--uap, (long)(intptr_t)arg0); + + /* + * Point at the arguments. + */ + args.fname = arg0; + args.argv = uap; + args.envv = NULL; + + /* + * Now try to exec the program. If can't for any reason + * other than it doesn't exist, complain. + * + * Otherwise return to main() which returns to btext + * which completes the system startup. + */ + if ((error = execve(p, &args)) == 0) + return; + if (error != ENOENT) + printf("exec %s: error %d\n", path, error); + } + printf("init: not found\n"); + panic("no init"); +} diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c new file mode 100644 index 0000000..c31ed46 --- /dev/null +++ b/sys/kern/init_sysent.c @@ -0,0 +1,360 @@ +/* + * System call switch table. + * + * DO NOT EDIT-- this file is automatically generated. + * created from Id: syscalls.master,v 1.55 1998/11/11 12:45:14 peter Exp + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> + +#ifdef COMPAT_43 +#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name) +#else +#define compat(n, name) 0, (sy_call_t *)nosys +#endif + +/* The casts are bogus but will do for now. 
*/ +struct sysent sysent[] = { + { 0, (sy_call_t *)nosys }, /* 0 = syscall */ + { 1, (sy_call_t *)exit }, /* 1 = exit */ + { 0, (sy_call_t *)fork }, /* 2 = fork */ + { 3, (sy_call_t *)read }, /* 3 = read */ + { 3, (sy_call_t *)write }, /* 4 = write */ + { 3, (sy_call_t *)open }, /* 5 = open */ + { 1, (sy_call_t *)close }, /* 6 = close */ + { 4, (sy_call_t *)wait4 }, /* 7 = wait4 */ + { compat(2,creat) }, /* 8 = old creat */ + { 2, (sy_call_t *)link }, /* 9 = link */ + { 1, (sy_call_t *)unlink }, /* 10 = unlink */ + { 0, (sy_call_t *)nosys }, /* 11 = obsolete execv */ + { 1, (sy_call_t *)chdir }, /* 12 = chdir */ + { 1, (sy_call_t *)fchdir }, /* 13 = fchdir */ + { 3, (sy_call_t *)mknod }, /* 14 = mknod */ + { 2, (sy_call_t *)chmod }, /* 15 = chmod */ + { 3, (sy_call_t *)chown }, /* 16 = chown */ + { 1, (sy_call_t *)obreak }, /* 17 = break */ + { 3, (sy_call_t *)getfsstat }, /* 18 = getfsstat */ + { compat(3,lseek) }, /* 19 = old lseek */ + { 0, (sy_call_t *)getpid }, /* 20 = getpid */ + { 4, (sy_call_t *)mount }, /* 21 = mount */ + { 2, (sy_call_t *)unmount }, /* 22 = unmount */ + { 1, (sy_call_t *)setuid }, /* 23 = setuid */ + { 0, (sy_call_t *)getuid }, /* 24 = getuid */ + { 0, (sy_call_t *)geteuid }, /* 25 = geteuid */ + { 4, (sy_call_t *)ptrace }, /* 26 = ptrace */ + { 3, (sy_call_t *)recvmsg }, /* 27 = recvmsg */ + { 3, (sy_call_t *)sendmsg }, /* 28 = sendmsg */ + { 6, (sy_call_t *)recvfrom }, /* 29 = recvfrom */ + { 3, (sy_call_t *)accept }, /* 30 = accept */ + { 3, (sy_call_t *)getpeername }, /* 31 = getpeername */ + { 3, (sy_call_t *)getsockname }, /* 32 = getsockname */ + { 2, (sy_call_t *)access }, /* 33 = access */ + { 2, (sy_call_t *)chflags }, /* 34 = chflags */ + { 2, (sy_call_t *)fchflags }, /* 35 = fchflags */ + { 0, (sy_call_t *)sync }, /* 36 = sync */ + { 2, (sy_call_t *)kill }, /* 37 = kill */ + { compat(2,stat) }, /* 38 = old stat */ + { 0, (sy_call_t *)getppid }, /* 39 = getppid */ + { compat(2,lstat) }, /* 40 = old lstat */ + { 1, (sy_call_t *)dup }, /* 41 = dup */ + { 0, (sy_call_t *)pipe }, /* 42 = pipe */ + { 0, (sy_call_t *)getegid }, /* 43 = getegid */ + { 4, (sy_call_t *)profil }, /* 44 = profil */ + { 4, (sy_call_t *)ktrace }, /* 45 = ktrace */ + { 3, (sy_call_t *)sigaction }, /* 46 = sigaction */ + { 0, (sy_call_t *)getgid }, /* 47 = getgid */ + { 2, (sy_call_t *)sigprocmask }, /* 48 = sigprocmask */ + { 2, (sy_call_t *)getlogin }, /* 49 = getlogin */ + { 1, (sy_call_t *)setlogin }, /* 50 = setlogin */ + { 1, (sy_call_t *)acct }, /* 51 = acct */ + { 0, (sy_call_t *)sigpending }, /* 52 = sigpending */ + { 2, (sy_call_t *)sigaltstack }, /* 53 = sigaltstack */ + { 3, (sy_call_t *)ioctl }, /* 54 = ioctl */ + { 1, (sy_call_t *)reboot }, /* 55 = reboot */ + { 1, (sy_call_t *)revoke }, /* 56 = revoke */ + { 2, (sy_call_t *)symlink }, /* 57 = symlink */ + { 3, (sy_call_t *)readlink }, /* 58 = readlink */ + { 3, (sy_call_t *)execve }, /* 59 = execve */ + { 1, (sy_call_t *)umask }, /* 60 = umask */ + { 1, (sy_call_t *)chroot }, /* 61 = chroot */ + { compat(2,fstat) }, /* 62 = old fstat */ + { compat(4,getkerninfo) }, /* 63 = old getkerninfo */ + { compat(0,getpagesize) }, /* 64 = old getpagesize */ + { 3, (sy_call_t *)msync }, /* 65 = msync */ + { 0, (sy_call_t *)vfork }, /* 66 = vfork */ + { 0, (sy_call_t *)nosys }, /* 67 = obsolete vread */ + { 0, (sy_call_t *)nosys }, /* 68 = obsolete vwrite */ + { 1, (sy_call_t *)sbrk }, /* 69 = sbrk */ + { 1, (sy_call_t *)sstk }, /* 70 = sstk */ + { compat(6,mmap) }, /* 71 = old mmap */ + { 1, (sy_call_t *)ovadvise }, /* 72 = 
vadvise */ + { 2, (sy_call_t *)munmap }, /* 73 = munmap */ + { 3, (sy_call_t *)mprotect }, /* 74 = mprotect */ + { 3, (sy_call_t *)madvise }, /* 75 = madvise */ + { 0, (sy_call_t *)nosys }, /* 76 = obsolete vhangup */ + { 0, (sy_call_t *)nosys }, /* 77 = obsolete vlimit */ + { 3, (sy_call_t *)mincore }, /* 78 = mincore */ + { 2, (sy_call_t *)getgroups }, /* 79 = getgroups */ + { 2, (sy_call_t *)setgroups }, /* 80 = setgroups */ + { 0, (sy_call_t *)getpgrp }, /* 81 = getpgrp */ + { 2, (sy_call_t *)setpgid }, /* 82 = setpgid */ + { 3, (sy_call_t *)setitimer }, /* 83 = setitimer */ + { compat(0,wait) }, /* 84 = old wait */ + { 1, (sy_call_t *)swapon }, /* 85 = swapon */ + { 2, (sy_call_t *)getitimer }, /* 86 = getitimer */ + { compat(2,gethostname) }, /* 87 = old gethostname */ + { compat(2,sethostname) }, /* 88 = old sethostname */ + { 0, (sy_call_t *)getdtablesize }, /* 89 = getdtablesize */ + { 2, (sy_call_t *)dup2 }, /* 90 = dup2 */ + { 0, (sy_call_t *)nosys }, /* 91 = getdopt */ + { 3, (sy_call_t *)fcntl }, /* 92 = fcntl */ + { 5, (sy_call_t *)select }, /* 93 = select */ + { 0, (sy_call_t *)nosys }, /* 94 = setdopt */ + { 1, (sy_call_t *)fsync }, /* 95 = fsync */ + { 3, (sy_call_t *)setpriority }, /* 96 = setpriority */ + { 3, (sy_call_t *)socket }, /* 97 = socket */ + { 3, (sy_call_t *)connect }, /* 98 = connect */ + { compat(3,accept) }, /* 99 = old accept */ + { 2, (sy_call_t *)getpriority }, /* 100 = getpriority */ + { compat(4,send) }, /* 101 = old send */ + { compat(4,recv) }, /* 102 = old recv */ + { 1, (sy_call_t *)sigreturn }, /* 103 = sigreturn */ + { 3, (sy_call_t *)bind }, /* 104 = bind */ + { 5, (sy_call_t *)setsockopt }, /* 105 = setsockopt */ + { 2, (sy_call_t *)listen }, /* 106 = listen */ + { 0, (sy_call_t *)nosys }, /* 107 = obsolete vtimes */ + { compat(3,sigvec) }, /* 108 = old sigvec */ + { compat(1,sigblock) }, /* 109 = old sigblock */ + { compat(1,sigsetmask) }, /* 110 = old sigsetmask */ + { 1, (sy_call_t *)sigsuspend }, /* 111 = sigsuspend */ + { compat(2,sigstack) }, /* 112 = old sigstack */ + { compat(3,recvmsg) }, /* 113 = old recvmsg */ + { compat(3,sendmsg) }, /* 114 = old sendmsg */ + { 0, (sy_call_t *)nosys }, /* 115 = obsolete vtrace */ + { 2, (sy_call_t *)gettimeofday }, /* 116 = gettimeofday */ + { 2, (sy_call_t *)getrusage }, /* 117 = getrusage */ + { 5, (sy_call_t *)getsockopt }, /* 118 = getsockopt */ + { 0, (sy_call_t *)nosys }, /* 119 = resuba */ + { 3, (sy_call_t *)readv }, /* 120 = readv */ + { 3, (sy_call_t *)writev }, /* 121 = writev */ + { 2, (sy_call_t *)settimeofday }, /* 122 = settimeofday */ + { 3, (sy_call_t *)fchown }, /* 123 = fchown */ + { 2, (sy_call_t *)fchmod }, /* 124 = fchmod */ + { compat(6,recvfrom) }, /* 125 = old recvfrom */ + { 2, (sy_call_t *)setreuid }, /* 126 = setreuid */ + { 2, (sy_call_t *)setregid }, /* 127 = setregid */ + { 2, (sy_call_t *)rename }, /* 128 = rename */ + { compat(2,truncate) }, /* 129 = old truncate */ + { compat(2,ftruncate) }, /* 130 = old ftruncate */ + { 2, (sy_call_t *)flock }, /* 131 = flock */ + { 2, (sy_call_t *)mkfifo }, /* 132 = mkfifo */ + { 6, (sy_call_t *)sendto }, /* 133 = sendto */ + { 2, (sy_call_t *)shutdown }, /* 134 = shutdown */ + { 4, (sy_call_t *)socketpair }, /* 135 = socketpair */ + { 2, (sy_call_t *)mkdir }, /* 136 = mkdir */ + { 1, (sy_call_t *)rmdir }, /* 137 = rmdir */ + { 2, (sy_call_t *)utimes }, /* 138 = utimes */ + { 0, (sy_call_t *)nosys }, /* 139 = obsolete 4.2 sigreturn */ + { 2, (sy_call_t *)adjtime }, /* 140 = adjtime */ + { compat(3,getpeername) }, /* 141 = old 
getpeername */ + { compat(0,gethostid) }, /* 142 = old gethostid */ + { compat(1,sethostid) }, /* 143 = old sethostid */ + { compat(2,getrlimit) }, /* 144 = old getrlimit */ + { compat(2,setrlimit) }, /* 145 = old setrlimit */ + { compat(2,killpg) }, /* 146 = old killpg */ + { 0, (sy_call_t *)setsid }, /* 147 = setsid */ + { 4, (sy_call_t *)quotactl }, /* 148 = quotactl */ + { compat(0,quota) }, /* 149 = old quota */ + { compat(3,getsockname) }, /* 150 = old getsockname */ + { 0, (sy_call_t *)nosys }, /* 151 = sem_lock */ + { 0, (sy_call_t *)nosys }, /* 152 = sem_wakeup */ + { 0, (sy_call_t *)nosys }, /* 153 = asyncdaemon */ + { 0, (sy_call_t *)nosys }, /* 154 = nosys */ + { 2, (sy_call_t *)nosys }, /* 155 = nfssvc */ + { compat(4,getdirentries) }, /* 156 = old getdirentries */ + { 2, (sy_call_t *)statfs }, /* 157 = statfs */ + { 2, (sy_call_t *)fstatfs }, /* 158 = fstatfs */ + { 0, (sy_call_t *)nosys }, /* 159 = nosys */ + { 0, (sy_call_t *)nosys }, /* 160 = nosys */ + { 2, (sy_call_t *)nosys }, /* 161 = getfh */ + { 2, (sy_call_t *)getdomainname }, /* 162 = getdomainname */ + { 2, (sy_call_t *)setdomainname }, /* 163 = setdomainname */ + { 1, (sy_call_t *)uname }, /* 164 = uname */ + { 2, (sy_call_t *)sysarch }, /* 165 = sysarch */ + { 3, (sy_call_t *)rtprio }, /* 166 = rtprio */ + { 0, (sy_call_t *)nosys }, /* 167 = nosys */ + { 0, (sy_call_t *)nosys }, /* 168 = nosys */ + { 5, (sy_call_t *)semsys }, /* 169 = semsys */ + { 6, (sy_call_t *)msgsys }, /* 170 = msgsys */ + { 4, (sy_call_t *)shmsys }, /* 171 = shmsys */ + { 0, (sy_call_t *)nosys }, /* 172 = nosys */ + { 0, (sy_call_t *)nosys }, /* 173 = nosys */ + { 0, (sy_call_t *)nosys }, /* 174 = nosys */ + { 0, (sy_call_t *)nosys }, /* 175 = nosys */ + { 1, (sy_call_t *)ntp_adjtime }, /* 176 = ntp_adjtime */ + { 0, (sy_call_t *)nosys }, /* 177 = sfork */ + { 0, (sy_call_t *)nosys }, /* 178 = getdescriptor */ + { 0, (sy_call_t *)nosys }, /* 179 = setdescriptor */ + { 0, (sy_call_t *)nosys }, /* 180 = nosys */ + { 1, (sy_call_t *)setgid }, /* 181 = setgid */ + { 1, (sy_call_t *)setegid }, /* 182 = setegid */ + { 1, (sy_call_t *)seteuid }, /* 183 = seteuid */ + { 0, (sy_call_t *)nosys }, /* 184 = lfs_bmapv */ + { 0, (sy_call_t *)nosys }, /* 185 = lfs_markv */ + { 0, (sy_call_t *)nosys }, /* 186 = lfs_segclean */ + { 0, (sy_call_t *)nosys }, /* 187 = lfs_segwait */ + { 2, (sy_call_t *)stat }, /* 188 = stat */ + { 2, (sy_call_t *)fstat }, /* 189 = fstat */ + { 2, (sy_call_t *)lstat }, /* 190 = lstat */ + { 2, (sy_call_t *)pathconf }, /* 191 = pathconf */ + { 2, (sy_call_t *)fpathconf }, /* 192 = fpathconf */ + { 0, (sy_call_t *)nosys }, /* 193 = nosys */ + { 2, (sy_call_t *)getrlimit }, /* 194 = getrlimit */ + { 2, (sy_call_t *)setrlimit }, /* 195 = setrlimit */ + { 4, (sy_call_t *)getdirentries }, /* 196 = getdirentries */ + { 8, (sy_call_t *)mmap }, /* 197 = mmap */ + { 0, (sy_call_t *)nosys }, /* 198 = __syscall */ + { 5, (sy_call_t *)lseek }, /* 199 = lseek */ + { 4, (sy_call_t *)truncate }, /* 200 = truncate */ + { 4, (sy_call_t *)ftruncate }, /* 201 = ftruncate */ + { 6, (sy_call_t *)__sysctl }, /* 202 = __sysctl */ + { 2, (sy_call_t *)mlock }, /* 203 = mlock */ + { 2, (sy_call_t *)munlock }, /* 204 = munlock */ + { 1, (sy_call_t *)undelete }, /* 205 = undelete */ + { 2, (sy_call_t *)futimes }, /* 206 = futimes */ + { 1, (sy_call_t *)getpgid }, /* 207 = getpgid */ + { 0, (sy_call_t *)nosys }, /* 208 = newreboot */ + { 3, (sy_call_t *)poll }, /* 209 = poll */ + { 0, (sy_call_t *)lkmnosys }, /* 210 = lkmnosys */ + { 0, (sy_call_t 
*)lkmnosys }, /* 211 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 212 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 213 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 214 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 215 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 216 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 217 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 218 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 219 = lkmnosys */ + { 4, (sy_call_t *)__semctl }, /* 220 = __semctl */ + { 3, (sy_call_t *)semget }, /* 221 = semget */ + { 3, (sy_call_t *)semop }, /* 222 = semop */ + { 1, (sy_call_t *)semconfig }, /* 223 = semconfig */ + { 3, (sy_call_t *)msgctl }, /* 224 = msgctl */ + { 2, (sy_call_t *)msgget }, /* 225 = msgget */ + { 4, (sy_call_t *)msgsnd }, /* 226 = msgsnd */ + { 5, (sy_call_t *)msgrcv }, /* 227 = msgrcv */ + { 3, (sy_call_t *)shmat }, /* 228 = shmat */ + { 3, (sy_call_t *)shmctl }, /* 229 = shmctl */ + { 1, (sy_call_t *)shmdt }, /* 230 = shmdt */ + { 3, (sy_call_t *)shmget }, /* 231 = shmget */ + { 2, (sy_call_t *)clock_gettime }, /* 232 = clock_gettime */ + { 2, (sy_call_t *)clock_settime }, /* 233 = clock_settime */ + { 2, (sy_call_t *)clock_getres }, /* 234 = clock_getres */ + { 0, (sy_call_t *)nosys }, /* 235 = timer_create */ + { 0, (sy_call_t *)nosys }, /* 236 = timer_delete */ + { 0, (sy_call_t *)nosys }, /* 237 = timer_settime */ + { 0, (sy_call_t *)nosys }, /* 238 = timer_gettime */ + { 0, (sy_call_t *)nosys }, /* 239 = timer_getoverrun */ + { 2, (sy_call_t *)nanosleep }, /* 240 = nanosleep */ + { 0, (sy_call_t *)nosys }, /* 241 = nosys */ + { 0, (sy_call_t *)nosys }, /* 242 = nosys */ + { 0, (sy_call_t *)nosys }, /* 243 = nosys */ + { 0, (sy_call_t *)nosys }, /* 244 = nosys */ + { 0, (sy_call_t *)nosys }, /* 245 = nosys */ + { 0, (sy_call_t *)nosys }, /* 246 = nosys */ + { 0, (sy_call_t *)nosys }, /* 247 = nosys */ + { 0, (sy_call_t *)nosys }, /* 248 = nosys */ + { 0, (sy_call_t *)nosys }, /* 249 = nosys */ + { 3, (sy_call_t *)minherit }, /* 250 = minherit */ + { 1, (sy_call_t *)rfork }, /* 251 = rfork */ + { 3, (sy_call_t *)openbsd_poll }, /* 252 = openbsd_poll */ + { 0, (sy_call_t *)issetugid }, /* 253 = issetugid */ + { 3, (sy_call_t *)lchown }, /* 254 = lchown */ + { 0, (sy_call_t *)nosys }, /* 255 = nosys */ + { 0, (sy_call_t *)nosys }, /* 256 = nosys */ + { 0, (sy_call_t *)nosys }, /* 257 = nosys */ + { 0, (sy_call_t *)nosys }, /* 258 = nosys */ + { 0, (sy_call_t *)nosys }, /* 259 = nosys */ + { 0, (sy_call_t *)nosys }, /* 260 = nosys */ + { 0, (sy_call_t *)nosys }, /* 261 = nosys */ + { 0, (sy_call_t *)nosys }, /* 262 = nosys */ + { 0, (sy_call_t *)nosys }, /* 263 = nosys */ + { 0, (sy_call_t *)nosys }, /* 264 = nosys */ + { 0, (sy_call_t *)nosys }, /* 265 = nosys */ + { 0, (sy_call_t *)nosys }, /* 266 = nosys */ + { 0, (sy_call_t *)nosys }, /* 267 = nosys */ + { 0, (sy_call_t *)nosys }, /* 268 = nosys */ + { 0, (sy_call_t *)nosys }, /* 269 = nosys */ + { 0, (sy_call_t *)nosys }, /* 270 = nosys */ + { 0, (sy_call_t *)nosys }, /* 271 = nosys */ + { 3, (sy_call_t *)getdents }, /* 272 = getdents */ + { 0, (sy_call_t *)nosys }, /* 273 = nosys */ + { 2, (sy_call_t *)lchmod }, /* 274 = lchmod */ + { 3, (sy_call_t *)lchown }, /* 275 = netbsd_lchown */ + { 2, (sy_call_t *)lutimes }, /* 276 = lutimes */ + { 3, (sy_call_t *)msync }, /* 277 = netbsd_msync */ + { 2, (sy_call_t *)nstat }, /* 278 = nstat */ + { 2, (sy_call_t *)nfstat }, /* 279 = nfstat */ + { 2, (sy_call_t *)nlstat }, /* 280 = nlstat */ + { 0, (sy_call_t *)nosys }, /* 281 = nosys 
*/ + { 0, (sy_call_t *)nosys }, /* 282 = nosys */ + { 0, (sy_call_t *)nosys }, /* 283 = nosys */ + { 0, (sy_call_t *)nosys }, /* 284 = nosys */ + { 0, (sy_call_t *)nosys }, /* 285 = nosys */ + { 0, (sy_call_t *)nosys }, /* 286 = nosys */ + { 0, (sy_call_t *)nosys }, /* 287 = nosys */ + { 0, (sy_call_t *)nosys }, /* 288 = nosys */ + { 0, (sy_call_t *)nosys }, /* 289 = nosys */ + { 0, (sy_call_t *)nosys }, /* 290 = nosys */ + { 0, (sy_call_t *)nosys }, /* 291 = nosys */ + { 0, (sy_call_t *)nosys }, /* 292 = nosys */ + { 0, (sy_call_t *)nosys }, /* 293 = nosys */ + { 0, (sy_call_t *)nosys }, /* 294 = nosys */ + { 0, (sy_call_t *)nosys }, /* 295 = nosys */ + { 0, (sy_call_t *)nosys }, /* 296 = nosys */ + { 0, (sy_call_t *)nosys }, /* 297 = nosys */ + { 0, (sy_call_t *)nosys }, /* 298 = nosys */ + { 0, (sy_call_t *)nosys }, /* 299 = nosys */ + { 1, (sy_call_t *)modnext }, /* 300 = modnext */ + { 2, (sy_call_t *)modstat }, /* 301 = modstat */ + { 1, (sy_call_t *)modfnext }, /* 302 = modfnext */ + { 1, (sy_call_t *)modfind }, /* 303 = modfind */ + { 1, (sy_call_t *)kldload }, /* 304 = kldload */ + { 1, (sy_call_t *)kldunload }, /* 305 = kldunload */ + { 1, (sy_call_t *)kldfind }, /* 306 = kldfind */ + { 1, (sy_call_t *)kldnext }, /* 307 = kldnext */ + { 2, (sy_call_t *)kldstat }, /* 308 = kldstat */ + { 1, (sy_call_t *)kldfirstmod }, /* 309 = kldfirstmod */ + { 1, (sy_call_t *)getsid }, /* 310 = getsid */ + { 0, (sy_call_t *)nosys }, /* 311 = setresuid */ + { 0, (sy_call_t *)nosys }, /* 312 = setresgid */ + { 0, (sy_call_t *)nosys }, /* 313 = obsolete signanosleep */ + { 1, (sy_call_t *)aio_return }, /* 314 = aio_return */ + { 3, (sy_call_t *)aio_suspend }, /* 315 = aio_suspend */ + { 2, (sy_call_t *)aio_cancel }, /* 316 = aio_cancel */ + { 1, (sy_call_t *)aio_error }, /* 317 = aio_error */ + { 1, (sy_call_t *)aio_read }, /* 318 = aio_read */ + { 1, (sy_call_t *)aio_write }, /* 319 = aio_write */ + { 4, (sy_call_t *)lio_listio }, /* 320 = lio_listio */ + { 0, (sy_call_t *)yield }, /* 321 = yield */ + { 1, (sy_call_t *)thr_sleep }, /* 322 = thr_sleep */ + { 1, (sy_call_t *)thr_wakeup }, /* 323 = thr_wakeup */ + { 1, (sy_call_t *)mlockall }, /* 324 = mlockall */ + { 0, (sy_call_t *)munlockall }, /* 325 = munlockall */ + { 2, (sy_call_t *)__getcwd }, /* 326 = __getcwd */ + { 2, (sy_call_t *)sched_setparam }, /* 327 = sched_setparam */ + { 2, (sy_call_t *)sched_getparam }, /* 328 = sched_getparam */ + { 3, (sy_call_t *)sched_setscheduler }, /* 329 = sched_setscheduler */ + { 1, (sy_call_t *)sched_getscheduler }, /* 330 = sched_getscheduler */ + { 0, (sy_call_t *)sched_yield }, /* 331 = sched_yield */ + { 1, (sy_call_t *)sched_get_priority_max }, /* 332 = sched_get_priority_max */ + { 1, (sy_call_t *)sched_get_priority_min }, /* 333 = sched_get_priority_min */ + { 2, (sy_call_t *)sched_rr_get_interval }, /* 334 = sched_rr_get_interval */ + { 2, (sy_call_t *)utrace }, /* 335 = utrace */ + { 8, (sy_call_t *)sendfile }, /* 336 = sendfile */ + { 3, (sy_call_t *)kldsym }, /* 337 = kldsym */ +}; diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c new file mode 100644 index 0000000..11db4e9 --- /dev/null +++ b/sys/kern/kern_acct.c @@ -0,0 +1,330 @@ +/*- + * Copyright (c) 1994 Christopher G. Demetriou + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. 
or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93 + * $Id: kern_acct.c,v 1.18 1997/11/06 19:29:07 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/proc.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/fcntl.h> +#include <sys/syslog.h> +#include <sys/kernel.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/namei.h> +#include <sys/acct.h> +#include <sys/resourcevar.h> +#include <sys/tty.h> + +/* + * The routines implemented in this file are described in: + * Leffler, et al.: The Design and Implementation of the 4.3BSD + * UNIX Operating System (Addison Welley, 1989) + * on pages 62-63. + * + * Arguably, to simplify accounting operations, this mechanism should + * be replaced by one in which an accounting log file (similar to /dev/klog) + * is read by a user process, etc. However, that has its own problems. + */ + +/* + * Internal accounting functions. + * The former's operation is described in Leffler, et al., and the latter + * was provided by UCB with the 4.4BSD-Lite release + */ +static comp_t encode_comp_t __P((u_long, u_long)); +static void acctwatch __P((void *)); + +/* + * Accounting callout handle used for periodic scheduling of + * acctwatch. + */ +static struct callout_handle acctwatch_handle + = CALLOUT_HANDLE_INITIALIZER(&acctwatch_handle); + +/* + * Accounting vnode pointer, and saved vnode pointer. 
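For orientation, the switch that these pointers implement is driven from userland through the acct(2) system call defined a little further down; a minimal, hedged usage sketch (the path is only an example, and the call requires root privilege):

#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	/* Begin appending accounting records to an example file... */
	if (acct("/var/account/acct") == -1)
		perror("acct enable");

	/* ...and a NULL path switches accounting off again. */
	if (acct(NULL) == -1)
		perror("acct disable");
	return (0);
}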
+ */ +static struct vnode *acctp; +static struct vnode *savacctp; + +/* + * Values associated with enabling and disabling accounting + */ +static int acctsuspend = 2; /* stop accounting when < 2% free space left */ +SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW, + &acctsuspend, 0, ""); + +static int acctresume = 4; /* resume when free space risen to > 4% */ +SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW, + &acctresume, 0, ""); + +static int acctchkfreq = 15; /* frequency (in seconds) to check space */ +SYSCTL_INT(_kern, OID_AUTO, acct_chkfreq, CTLFLAG_RW, + &acctchkfreq, 0, ""); + +/* + * Accounting system call. Written based on the specification and + * previous implementation done by Mark Tinguely. + */ +int +acct(a1, uap) + struct proc *a1; + struct acct_args /* { + syscallarg(char *) path; + } */ *uap; +{ + struct proc *p = curproc; /* XXX */ + struct nameidata nd; + int error; + + /* Make sure that the caller is root. */ + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + + /* + * If accounting is to be started to a file, open that file for + * writing and make sure it's a 'normal'. + */ + if (SCARG(uap, path) != NULL) { + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), + p); + error = vn_open(&nd, FWRITE, 0); + if (error) + return (error); + VOP_UNLOCK(nd.ni_vp, 0, p); + if (nd.ni_vp->v_type != VREG) { + vn_close(nd.ni_vp, FWRITE, p->p_ucred, p); + return (EACCES); + } + } + + /* + * If accounting was previously enabled, kill the old space-watcher, + * close the file, and (if no new file was specified, leave). + */ + if (acctp != NULLVP || savacctp != NULLVP) { + untimeout(acctwatch, NULL, acctwatch_handle); + error = vn_close((acctp != NULLVP ? acctp : savacctp), FWRITE, + p->p_ucred, p); + acctp = savacctp = NULLVP; + } + if (SCARG(uap, path) == NULL) + return (error); + + /* + * Save the new accounting file vnode, and schedule the new + * free space watcher. + */ + acctp = nd.ni_vp; + acctwatch(NULL); + return (error); +} + +/* + * Write out process accounting information, on process exit. + * Data to be written out is specified in Leffler, et al. + * and are enumerated below. (They're also noted in the system + * "acct.h" header file.) + */ + +int +acct_process(p) + struct proc *p; +{ + struct acct acct; + struct rusage *r; + struct timeval ut, st, tmp; + int t; + struct vnode *vp; + + /* If accounting isn't enabled, don't bother */ + vp = acctp; + if (vp == NULLVP) + return (0); + + /* + * Get process accounting information. 
+ */ + + /* (1) The name of the command that ran */ + bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); + + /* (2) The amount of user and system time that was used */ + calcru(p, &ut, &st, NULL); + acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec); + acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec); + + /* (3) The elapsed time the commmand ran (and its starting time) */ + acct.ac_btime = p->p_stats->p_start.tv_sec; + microtime(&tmp); + timevalsub(&tmp, &p->p_stats->p_start); + acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec); + + /* (4) The average amount of memory used */ + r = &p->p_stats->p_ru; + tmp = ut; + timevaladd(&tmp, &st); + t = tmp.tv_sec * hz + tmp.tv_usec / tick; + if (t) + acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t; + else + acct.ac_mem = 0; + + /* (5) The number of disk I/O operations done */ + acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0); + + /* (6) The UID and GID of the process */ + acct.ac_uid = p->p_cred->p_ruid; + acct.ac_gid = p->p_cred->p_rgid; + + /* (7) The terminal from which the process was started */ + if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp) + acct.ac_tty = p->p_pgrp->pg_session->s_ttyp->t_dev; + else + acct.ac_tty = NODEV; + + /* (8) The boolean flags that tell how the process terminated, etc. */ + acct.ac_flag = p->p_acflag; + + /* + * Eliminate any file size rlimit. + */ + if (p->p_limit->p_refcnt > 1 && + (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { + p->p_limit->p_refcnt--; + p->p_limit = limcopy(p->p_limit); + } + p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; + + /* + * Write the accounting information to the file. + */ + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + return (vn_rdwr(UIO_WRITE, vp, (caddr_t)&acct, sizeof (acct), + (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, p->p_ucred, + (int *)0, p)); +} + +/* + * Encode_comp_t converts from ticks in seconds and microseconds + * to ticks in 1/AHZ seconds. The encoding is described in + * Leffler, et al., on page 63. + */ + +#define MANTSIZE 13 /* 13 bit mantissa. */ +#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ +#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ + +static comp_t +encode_comp_t(s, us) + u_long s, us; +{ + int exp, rnd; + + exp = 0; + rnd = 0; + s *= AHZ; + s += us / (1000000 / AHZ); /* Maximize precision. */ + + while (s > MAXFRACT) { + rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */ + s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ + exp++; + } + + /* If we need to round up, do it (and handle overflow correctly). */ + if (rnd && (++s > MAXFRACT)) { + s >>= EXPSIZE; + exp++; + } + + /* Clean it up and polish it off. */ + exp <<= MANTSIZE; /* Shift the exponent into place */ + exp += s; /* and add on the mantissa. */ + return (exp); +} + +/* + * Periodically check the file system to see if accounting + * should be turned on or off. Beware the case where the vnode + * has been vgone()'d out from underneath us, e.g. when the file + * system containing the accounting file has been forcibly unmounted. 
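The comp_t encoding produced by encode_comp_t() a few lines up is easiest to see by running it backwards; a standalone, hedged sketch of the inverse (AHZ is assumed to be 64, its usual value in <sys/acct.h>):

#include <stdio.h>
#include <stdint.h>

#define AHZ      64	/* assumed; must match <sys/acct.h> */
#define MANTSIZE 13	/* 13 bit mantissa, as above */
#define EXPSIZE  3	/* base 8 (3 bit) exponent, as above */

/* Recover seconds from the packed mantissa/exponent pair. */
static double
decode_comp_t(uint16_t c)
{
	unsigned long long t = c & ((1u << MANTSIZE) - 1);
	int exp = c >> MANTSIZE;

	while (exp-- > 0)
		t <<= EXPSIZE;			/* multiply by 8 per step */
	return ((double)t / AHZ);		/* back to seconds */
}

int
main(void)
{
	/* 2.5 seconds is 160 ticks at AHZ=64; that fits in the mantissa,
	 * so it round-trips exactly and this prints 2.50. */
	printf("%.2f\n", decode_comp_t(160));
	return (0);
}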
+ */ +/* ARGSUSED */ +static void +acctwatch(a) + void *a; +{ + struct statfs sb; + + if (savacctp != NULLVP) { + if (savacctp->v_type == VBAD) { + (void) vn_close(savacctp, FWRITE, NOCRED, NULL); + savacctp = NULLVP; + return; + } + (void)VFS_STATFS(savacctp->v_mount, &sb, (struct proc *)0); + if (sb.f_bavail > acctresume * sb.f_blocks / 100) { + acctp = savacctp; + savacctp = NULLVP; + log(LOG_NOTICE, "Accounting resumed\n"); + } + } else { + if (acctp == NULLVP) + return; + if (acctp->v_type == VBAD) { + (void) vn_close(acctp, FWRITE, NOCRED, NULL); + acctp = NULLVP; + return; + } + (void)VFS_STATFS(acctp->v_mount, &sb, (struct proc *)0); + if (sb.f_bavail <= acctsuspend * sb.f_blocks / 100) { + savacctp = acctp; + acctp = NULLVP; + log(LOG_NOTICE, "Accounting suspended\n"); + } + } + acctwatch_handle = timeout(acctwatch, NULL, acctchkfreq * hz); +} diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c new file mode 100644 index 0000000..2ea378f --- /dev/null +++ b/sys/kern/kern_clock.c @@ -0,0 +1,870 @@ +/*- + * Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org> + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_clock.c,v 1.85 1998/11/23 09:58:53 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/dkstat.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/timex.h> +#include <vm/vm.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/sysctl.h> + +#include <machine/cpu.h> +#include <machine/limits.h> + +#ifdef GPROF +#include <sys/gmon.h> +#endif + +#if defined(SMP) && defined(BETTER_CLOCK) +#include <machine/smp.h> +#endif + +/* This is where the NTIMECOUNTER option hangs out */ +#include "opt_ntp.h" + +/* + * Number of timecounters used to implement stable storage + */ +#ifndef NTIMECOUNTER +#define NTIMECOUNTER 5 +#endif + +static MALLOC_DEFINE(M_TIMECOUNTER, "timecounter", + "Timecounter stable storage"); + +static void initclocks __P((void *dummy)); +SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) + +static void tco_forward __P((int force)); +static void tco_setscales __P((struct timecounter *tc)); +static __inline unsigned tco_delta __P((struct timecounter *tc)); + +/* Some of these don't belong here, but it's easiest to concentrate them. */ +#if defined(SMP) && defined(BETTER_CLOCK) +long cp_time[CPUSTATES]; +#else +static long cp_time[CPUSTATES]; +#endif + +long tk_cancc; +long tk_nin; +long tk_nout; +long tk_rawcc; + +time_t time_second; + +/* + * Which update policy to use. + * 0 - every tick, bad hardware may fail with "calcru negative..." + * 1 - more resistent to the above hardware, but less efficient. + */ +static int tco_method; + +/* + * Implement a dummy timecounter which we can use until we get a real one + * in the air. This allows the console and other early stuff to use + * timeservices. + */ + +static unsigned +dummy_get_timecount(struct timecounter *tc) +{ + static unsigned now; + return (++now); +} + +static struct timecounter dummy_timecounter = { + dummy_get_timecount, + 0, + ~0u, + 1000000, + "dummy" +}; + +struct timecounter *timecounter = &dummy_timecounter; + +/* + * Clock handling routines. + * + * This code is written to operate with two timers that run independently of + * each other. + * + * The main timer, running hz times per second, is used to trigger interval + * timers, timeouts and rescheduling as needed. + * + * The second timer handles kernel and user profiling, + * and does resource use estimation. If the second timer is programmable, + * it is randomized to avoid aliasing between the two clocks. For example, + * the randomization prevents an adversary from always giving up the cpu + * just before its quantum expires. Otherwise, it would never accumulate + * cpu ticks. The mean frequency of the second timer is stathz. + * + * If no second timer exists, stathz will be zero; in this case we drive + * profiling and statistics off the main clock. This WILL NOT be accurate; + * do not do it unless absolutely necessary. + * + * The statistics clock may (or may not) be run at a higher rate while + * profiling. This profile clock runs at profhz. We require that profhz + * be an integral multiple of stathz. + * + * If the statistics clock is running fast, it must be divided by the ratio + * profhz/stathz for statistics. (For profiling, every tick counts.) 
+ * + * Time-of-day is maintained using a "timecounter", which may or may + * not be related to the hardware generating the above mentioned + * interrupts. + */ + +int stathz; +int profhz; +static int profprocs; +int ticks; +static int psdiv, pscnt; /* prof => stat divider */ +int psratio; /* ratio: prof / stat */ + +/* + * Initialize clock frequencies and start both clocks running. + */ +/* ARGSUSED*/ +static void +initclocks(dummy) + void *dummy; +{ + register int i; + + /* + * Set divisors to 1 (normal case) and let the machine-specific + * code do its bit. + */ + psdiv = pscnt = 1; + cpu_initclocks(); + + /* + * Compute profhz/stathz, and fix profhz if needed. + */ + i = stathz ? stathz : hz; + if (profhz == 0) + profhz = i; + psratio = profhz / i; +} + +/* + * The real-time timer, interrupting hz times per second. + */ +void +hardclock(frame) + register struct clockframe *frame; +{ + register struct proc *p; + + p = curproc; + if (p) { + register struct pstats *pstats; + + /* + * Run current process's virtual and profile time, as needed. + */ + pstats = p->p_stats; + if (CLKF_USERMODE(frame) && + timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) + psignal(p, SIGVTALRM); + if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) + psignal(p, SIGPROF); + } + +#if defined(SMP) && defined(BETTER_CLOCK) + forward_hardclock(pscnt); +#endif + + /* + * If no separate statistics clock is available, run it from here. + */ + if (stathz == 0) + statclock(frame); + + tco_forward(0); + ticks++; + + /* + * Process callouts at a very low cpu priority, so we don't keep the + * relatively high clock interrupt priority any longer than necessary. + */ + if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) { + if (CLKF_BASEPRI(frame)) { + /* + * Save the overhead of a software interrupt; + * it will happen as soon as we return, so do it now. + */ + (void)splsoftclock(); + softclock(); + } else + setsoftclock(); + } else if (softticks + 1 == ticks) + ++softticks; +} + +/* + * Compute number of ticks in the specified amount of time. + */ +int +tvtohz(tv) + struct timeval *tv; +{ + register unsigned long ticks; + register long sec, usec; + + /* + * If the number of usecs in the whole seconds part of the time + * difference fits in a long, then the total number of usecs will + * fit in an unsigned long. Compute the total and convert it to + * ticks, rounding up and adding 1 to allow for the current tick + * to expire. Rounding also depends on unsigned long arithmetic + * to avoid overflow. + * + * Otherwise, if the number of ticks in the whole seconds part of + * the time difference fits in a long, then convert the parts to + * ticks separately and add, using similar rounding methods and + * overflow avoidance. This method would work in the previous + * case but it is slightly slower and assumes that hz is integral. + * + * Otherwise, round the time difference down to the maximum + * representable value. + * + * If ints have 32 bits, then the maximum value for any timeout in + * 10ms ticks is 248 days. 
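A worked instance of the rounding described above may help; a standalone, hedged sketch assuming hz = 100 (so tick = 10000 microseconds), a common i386 configuration:

#include <stdio.h>

int
main(void)
{
	long hz = 100, tick = 1000000 / hz;	/* assumed clock rate */
	long sec = 2, usec = 500000;		/* a 2.5 second timeout */
	unsigned long nticks;

	/* Same arithmetic as the first tvtohz() branch below: round the
	 * microsecond total up to a whole tick, then add 1 for the tick
	 * that is already in progress. */
	nticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) / tick + 1;
	printf("%lu\n", nticks);		/* prints 251 */
	return (0);
}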
+ */ + sec = tv->tv_sec; + usec = tv->tv_usec; + if (usec < 0) { + sec--; + usec += 1000000; + } + if (sec < 0) { +#ifdef DIAGNOSTIC + if (usec > 0) { + sec++; + usec -= 1000000; + } + printf("tvotohz: negative time difference %ld sec %ld usec\n", + sec, usec); +#endif + ticks = 1; + } else if (sec <= LONG_MAX / 1000000) + ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) + / tick + 1; + else if (sec <= LONG_MAX / hz) + ticks = sec * hz + + ((unsigned long)usec + (tick - 1)) / tick + 1; + else + ticks = LONG_MAX; + if (ticks > INT_MAX) + ticks = INT_MAX; + return ((int)ticks); +} + +/* + * Start profiling on a process. + * + * Kernel profiling passes proc0 which never exits and hence + * keeps the profile clock running constantly. + */ +void +startprofclock(p) + register struct proc *p; +{ + int s; + + if ((p->p_flag & P_PROFIL) == 0) { + p->p_flag |= P_PROFIL; + if (++profprocs == 1 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = psratio; + setstatclockrate(profhz); + splx(s); + } + } +} + +/* + * Stop profiling on a process. + */ +void +stopprofclock(p) + register struct proc *p; +{ + int s; + + if (p->p_flag & P_PROFIL) { + p->p_flag &= ~P_PROFIL; + if (--profprocs == 0 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = 1; + setstatclockrate(stathz); + splx(s); + } + } +} + +/* + * Statistics clock. Grab profile sample, and if divider reaches 0, + * do process and kernel statistics. + */ +void +statclock(frame) + register struct clockframe *frame; +{ +#ifdef GPROF + register struct gmonparam *g; + int i; +#endif + register struct proc *p; + struct pstats *pstats; + long rss; + struct rusage *ru; + struct vmspace *vm; + + if (curproc != NULL && CLKF_USERMODE(frame)) { + p = curproc; + if (p->p_flag & P_PROFIL) + addupc_intr(p, CLKF_PC(frame), 1); +#if defined(SMP) && defined(BETTER_CLOCK) + if (stathz != 0) + forward_statclock(pscnt); +#endif + if (--pscnt > 0) + return; + /* + * Came from user mode; CPU was in user state. + * If this process is being profiled record the tick. + */ + p->p_uticks++; + if (p->p_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + } else { +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = CLKF_PC(frame) - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif +#if defined(SMP) && defined(BETTER_CLOCK) + if (stathz != 0) + forward_statclock(pscnt); +#endif + if (--pscnt > 0) + return; + /* + * Came from kernel mode, so we were: + * - handling an interrupt, + * - doing syscall or trap work on behalf of the current + * user process, or + * - spinning in the idle loop. + * Whichever it is, charge the time as appropriate. + * Note that we charge interrupts to the current process, + * regardless of whether they are ``for'' that process, + * so that we know how much of its real time was spent + * in ``non-process'' (i.e., interrupt) work. + */ + p = curproc; + if (CLKF_INTR(frame)) { + if (p != NULL) + p->p_iticks++; + cp_time[CP_INTR]++; + } else if (p != NULL) { + p->p_sticks++; + cp_time[CP_SYS]++; + } else + cp_time[CP_IDLE]++; + } + pscnt = psdiv; + + /* + * We maintain statistics shown by user-level statistics + * programs: the amount of time in each cpu state. + */ + + /* + * We adjust the priority of the current process. The priority of + * a process gets worse as it accumulates CPU time. The cpu usage + * estimator (p_estcpu) is increased here. 
The formula for computing + * priorities (in kern_synch.c) will compute a different value each + * time p_estcpu increases by 4. The cpu usage estimator ramps up + * quite quickly when the process is running (linearly), and decays + * away exponentially, at a rate which is proportionally slower when + * the system is busy. The basic principal is that the system will + * 90% forget that the process used a lot of CPU time in 5 * loadav + * seconds. This causes the system to favor processes which haven't + * run much recently, and to round-robin among other processes. + */ + if (p != NULL) { + p->p_cpticks++; + if (++p->p_estcpu == 0) + p->p_estcpu--; + if ((p->p_estcpu & 3) == 0) { + resetpriority(p); + if (p->p_priority >= PUSER) + p->p_priority = p->p_usrpri; + } + + /* Update resource usage integrals and maximums. */ + if ((pstats = p->p_stats) != NULL && + (ru = &pstats->p_ru) != NULL && + (vm = p->p_vmspace) != NULL) { + ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024; + ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024; + ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024; + rss = vm->vm_pmap.pm_stats.resident_count * + PAGE_SIZE / 1024; + if (ru->ru_maxrss < rss) + ru->ru_maxrss = rss; + } + } +} + +/* + * Return information about system clocks. + */ +static int +sysctl_kern_clockrate SYSCTL_HANDLER_ARGS +{ + struct clockinfo clkinfo; + /* + * Construct clockinfo structure. + */ + clkinfo.hz = hz; + clkinfo.tick = tick; + clkinfo.tickadj = tickadj; + clkinfo.profhz = profhz; + clkinfo.stathz = stathz ? stathz : hz; + return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); +} + +SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, + 0, 0, sysctl_kern_clockrate, "S,clockinfo",""); + +static __inline unsigned +tco_delta(struct timecounter *tc) +{ + + return ((tc->tc_get_timecount(tc) - tc->tc_offset_count) & + tc->tc_counter_mask); +} + +/* + * We have four functions for looking at the clock, two for microseconds + * and two for nanoseconds. For each there is fast but less precise + * version "get{nano|micro}time" which will return a time which is up + * to 1/HZ previous to the call, whereas the raw version "{nano|micro}time" + * will return a timestamp which is as precise as possible. 
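As a usage note for the four flavours just described, a hedged kernel-side fragment (not a complete function; getmicrotime() and microuptime() are defined below, and timevalsub() is the helper already used in kern_acct.c above):

	struct timeval stamp, t0, t1;

	getmicrotime(&stamp);	/* cheap: may lag the clock by up to 1/hz */
	microuptime(&t0);
	/* ... short operation being measured ... */
	microuptime(&t1);
	timevalsub(&t1, &t0);	/* t1 now holds the precise elapsed time */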
+ */ + +void +getmicrotime(struct timeval *tvp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + *tvp = tc->tc_microtime; + } else { + microtime(tvp); + } +} + +void +getnanotime(struct timespec *tsp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + *tsp = tc->tc_nanotime; + } else { + nanotime(tsp); + } +} + +void +microtime(struct timeval *tv) +{ + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + tv->tv_sec = tc->tc_offset_sec; + tv->tv_usec = tc->tc_offset_micro; + tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32; + tv->tv_usec += boottime.tv_usec; + tv->tv_sec += boottime.tv_sec; + while (tv->tv_usec >= 1000000) { + tv->tv_usec -= 1000000; + tv->tv_sec++; + } +} + +void +nanotime(struct timespec *ts) +{ + unsigned count; + u_int64_t delta; + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + ts->tv_sec = tc->tc_offset_sec; + count = tco_delta(tc); + delta = tc->tc_offset_nano; + delta += ((u_int64_t)count * tc->tc_scale_nano_f); + delta >>= 32; + delta += ((u_int64_t)count * tc->tc_scale_nano_i); + delta += boottime.tv_usec * 1000; + ts->tv_sec += boottime.tv_sec; + while (delta >= 1000000000) { + delta -= 1000000000; + ts->tv_sec++; + } + ts->tv_nsec = delta; +} + +void +timecounter_timespec(unsigned count, struct timespec *ts) +{ + u_int64_t delta; + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + ts->tv_sec = tc->tc_offset_sec; + count -= tc->tc_offset_count; + count &= tc->tc_counter_mask; + delta = tc->tc_offset_nano; + delta += ((u_int64_t)count * tc->tc_scale_nano_f); + delta >>= 32; + delta += ((u_int64_t)count * tc->tc_scale_nano_i); + delta += boottime.tv_usec * 1000; + ts->tv_sec += boottime.tv_sec; + while (delta >= 1000000000) { + delta -= 1000000000; + ts->tv_sec++; + } + ts->tv_nsec = delta; +} + +void +getmicrouptime(struct timeval *tvp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + tvp->tv_sec = tc->tc_offset_sec; + tvp->tv_usec = tc->tc_offset_micro; + } else { + microuptime(tvp); + } +} + +void +getnanouptime(struct timespec *tsp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + tsp->tv_sec = tc->tc_offset_sec; + tsp->tv_nsec = tc->tc_offset_nano >> 32; + } else { + nanouptime(tsp); + } +} + +void +microuptime(struct timeval *tv) +{ + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + tv->tv_sec = tc->tc_offset_sec; + tv->tv_usec = tc->tc_offset_micro; + tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32; + if (tv->tv_usec >= 1000000) { + tv->tv_usec -= 1000000; + tv->tv_sec++; + } +} + +void +nanouptime(struct timespec *ts) +{ + unsigned count; + u_int64_t delta; + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + ts->tv_sec = tc->tc_offset_sec; + count = tco_delta(tc); + delta = tc->tc_offset_nano; + delta += ((u_int64_t)count * tc->tc_scale_nano_f); + delta >>= 32; + delta += ((u_int64_t)count * tc->tc_scale_nano_i); + if (delta >= 1000000000) { + delta -= 1000000000; + ts->tv_sec++; + } + ts->tv_nsec = delta; +} + +static void +tco_setscales(struct timecounter *tc) +{ + u_int64_t scale; + + scale = 1000000000LL << 32; + if (tc->tc_adjustment > 0) + scale += (tc->tc_adjustment * 1000LL) << 10; + else + scale -= (-tc->tc_adjustment * 1000LL) << 10; + scale /= tc->tc_frequency; + tc->tc_scale_micro = scale / 1000; + tc->tc_scale_nano_f = scale & 0xffffffff; + tc->tc_scale_nano_i = scale >> 32; +} + +void +init_timecounter(struct 
timecounter *tc) +{ + struct timespec ts1; + struct timecounter *t1, *t2, *t3; + int i; + + tc->tc_adjustment = 0; + tco_setscales(tc); + tc->tc_offset_count = tc->tc_get_timecount(tc); + tc->tc_tweak = tc; + MALLOC(t1, struct timecounter *, sizeof *t1, M_TIMECOUNTER, M_WAITOK); + *t1 = *tc; + t2 = t1; + for (i = 1; i < NTIMECOUNTER; i++) { + MALLOC(t3, struct timecounter *, sizeof *t3, + M_TIMECOUNTER, M_WAITOK); + *t3 = *tc; + t3->tc_other = t2; + t2 = t3; + } + t1->tc_other = t3; + tc = t1; + + printf("Timecounter \"%s\" frequency %lu Hz\n", + tc->tc_name, (u_long)tc->tc_frequency); + + /* XXX: For now always start using the counter. */ + tc->tc_offset_count = tc->tc_get_timecount(tc); + nanouptime(&ts1); + tc->tc_offset_nano = (u_int64_t)ts1.tv_nsec << 32; + tc->tc_offset_micro = ts1.tv_nsec / 1000; + tc->tc_offset_sec = ts1.tv_sec; + timecounter = tc; +} + +void +set_timecounter(struct timespec *ts) +{ + struct timespec ts2; + + nanouptime(&ts2); + boottime.tv_sec = ts->tv_sec - ts2.tv_sec; + boottime.tv_usec = (ts->tv_nsec - ts2.tv_nsec) / 1000; + if (boottime.tv_usec < 0) { + boottime.tv_usec += 1000000; + boottime.tv_sec--; + } + /* fiddle all the little crinkly bits around the fiords... */ + tco_forward(1); +} + + +#if 0 /* Currently unused */ +void +switch_timecounter(struct timecounter *newtc) +{ + int s; + struct timecounter *tc; + struct timespec ts; + + s = splclock(); + tc = timecounter; + if (newtc == tc || newtc == tc->tc_other) { + splx(s); + return; + } + nanouptime(&ts); + newtc->tc_offset_sec = ts.tv_sec; + newtc->tc_offset_nano = (u_int64_t)ts.tv_nsec << 32; + newtc->tc_offset_micro = ts.tv_nsec / 1000; + newtc->tc_offset_count = newtc->tc_get_timecount(newtc); + timecounter = newtc; + splx(s); +} +#endif + +static struct timecounter * +sync_other_counter(void) +{ + struct timecounter *tc, *tcn, *tco; + unsigned delta; + + tco = timecounter; + tc = tco->tc_other; + tcn = tc->tc_other; + *tc = *tco; + tc->tc_other = tcn; + delta = tco_delta(tc); + tc->tc_offset_count += delta; + tc->tc_offset_count &= tc->tc_counter_mask; + tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_f; + tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_i << 32; + return (tc); +} + +static void +tco_forward(int force) +{ + struct timecounter *tc, *tco; + + tco = timecounter; + tc = sync_other_counter(); + /* + * We may be inducing a tiny error here, the tc_poll_pps() may + * process a latched count which happens after the tco_delta() + * in sync_other_counter(), which would extend the previous + * counters parameters into the domain of this new one. + * Since the timewindow is very small for this, the error is + * going to be only a few weenieseconds (as Dave Mills would + * say), so lets just not talk more about it, OK ? 
+ */ + if (tco->tc_poll_pps) + tco->tc_poll_pps(tco); + if (timedelta != 0) { + tc->tc_offset_nano += (u_int64_t)(tickdelta * 1000) << 32; + timedelta -= tickdelta; + force++; + } + + while (tc->tc_offset_nano >= 1000000000ULL << 32) { + tc->tc_offset_nano -= 1000000000ULL << 32; + tc->tc_offset_sec++; + tc->tc_frequency = tc->tc_tweak->tc_frequency; + tc->tc_adjustment = tc->tc_tweak->tc_adjustment; + ntp_update_second(tc); /* XXX only needed if xntpd runs */ + tco_setscales(tc); + force++; + } + + if (tco_method && !force) + return; + + tc->tc_offset_micro = (tc->tc_offset_nano / 1000) >> 32; + + /* Figure out the wall-clock time */ + tc->tc_nanotime.tv_sec = tc->tc_offset_sec + boottime.tv_sec; + tc->tc_nanotime.tv_nsec = + (tc->tc_offset_nano >> 32) + boottime.tv_usec * 1000; + tc->tc_microtime.tv_usec = tc->tc_offset_micro + boottime.tv_usec; + if (tc->tc_nanotime.tv_nsec >= 1000000000) { + tc->tc_nanotime.tv_nsec -= 1000000000; + tc->tc_microtime.tv_usec -= 1000000; + tc->tc_nanotime.tv_sec++; + } + time_second = tc->tc_microtime.tv_sec = tc->tc_nanotime.tv_sec; + + timecounter = tc; +} + +static int +sysctl_kern_timecounter_frequency SYSCTL_HANDLER_ARGS +{ + + return (sysctl_handle_opaque(oidp, + &timecounter->tc_tweak->tc_frequency, + sizeof(timecounter->tc_tweak->tc_frequency), req)); +} + +static int +sysctl_kern_timecounter_adjustment SYSCTL_HANDLER_ARGS +{ + + return (sysctl_handle_opaque(oidp, + &timecounter->tc_tweak->tc_adjustment, + sizeof(timecounter->tc_tweak->tc_adjustment), req)); +} + +SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, ""); + +SYSCTL_INT(_kern_timecounter, KERN_ARGMAX, method, CTLFLAG_RW, &tco_method, 0, + "This variable determines the method used for updating timecounters. " + "If the default algorithm (0) fails with \"calcru negative...\" messages " + "try the alternate algorithm (1) which handles bad hardware better." + +); + +SYSCTL_PROC(_kern_timecounter, OID_AUTO, frequency, CTLTYPE_INT | CTLFLAG_RW, + 0, sizeof(u_int), sysctl_kern_timecounter_frequency, "I", ""); + +SYSCTL_PROC(_kern_timecounter, OID_AUTO, adjustment, CTLTYPE_INT | CTLFLAG_RW, + 0, sizeof(int), sysctl_kern_timecounter_adjustment, "I", ""); diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c new file mode 100644 index 0000000..df832f6 --- /dev/null +++ b/sys/kern/kern_conf.c @@ -0,0 +1,220 @@ +/*- + * Parts Copyright (c) 1995 Terrence R. Lambert + * Copyright (c) 1995 Julian R. Elischer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Terrence R. Lambert. + * 4. The name Terrence R. Lambert may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY Julian R. 
Elischer ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_conf.c,v 1.28 1998/10/25 17:44:50 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/module.h> +#include <sys/conf.h> +#include <sys/vnode.h> + +#define NUMBDEV 128 +#define NUMCDEV 256 +#define bdevsw_ALLOCSTART (NUMBDEV/2) +#define cdevsw_ALLOCSTART (NUMCDEV/2) + +struct cdevsw *bdevsw[NUMBDEV]; +int nblkdev = NUMBDEV; +struct cdevsw *cdevsw[NUMCDEV]; +int nchrdev = NUMCDEV; + +/* + * Routine to convert from character to block device number. + * + * A minimal stub routine can always return NODEV. + */ +dev_t +chrtoblk(dev_t dev) +{ + struct cdevsw *cd; + + if(cd = cdevsw[major(dev)]) { + if (cd->d_bmaj != -1) + return(makedev(cd->d_bmaj,minor(dev))); + } + return(NODEV); +} + +/* + * (re)place an entry in the bdevsw or cdevsw table + * return the slot used in major(*descrip) + */ +static int +bdevsw_add(dev_t *descrip, + struct cdevsw *newentry, + struct cdevsw **oldentry) +{ + int i ; + + if ( (int)*descrip == NODEV) { /* auto (0 is valid) */ + /* + * Search the table looking for a slot... + */ + for (i = bdevsw_ALLOCSTART; i < nblkdev; i++) + if (bdevsw[i] == NULL) + break; /* found one! */ + /* out of allocable slots? */ + if (i >= nblkdev) { + return ENFILE; + } + } else { /* assign */ + i = major(*descrip); + if (i < 0 || i >= nblkdev) { + return EINVAL; + } + } + + /* maybe save old */ + if (oldentry) { + *oldentry = bdevsw[i]; + } + if (newentry) { + newentry->d_bmaj = i; + } + /* replace with new */ + bdevsw[i] = newentry; + + /* done! let them know where we put it */ + *descrip = makedev(i,0); + return 0; +} + +int +cdevsw_add(dev_t *descrip, + struct cdevsw *newentry, + struct cdevsw **oldentry) +{ + int i ; + + if ( (int)*descrip == NODEV) { /* auto (0 is valid) */ + /* + * Search the table looking for a slot... + */ + for (i = cdevsw_ALLOCSTART; i < nchrdev; i++) + if (cdevsw[i] == NULL) + break; /* found one! */ + /* out of allocable slots? */ + if (i >= nchrdev) { + return ENFILE; + } + } else { /* assign */ + i = major(*descrip); + if (i < 0 || i >= nchrdev) { + return EINVAL; + } + } + + /* maybe save old */ + if (oldentry) { + *oldentry = cdevsw[i]; + } + if (newentry) { + newentry->d_bmaj = -1; + newentry->d_maj = i; + } + /* replace with new */ + cdevsw[i] = newentry; + + /* done! let them know where we put it */ + *descrip = makedev(i,0); + return 0; +} + +/* + * note must call cdevsw_add before bdevsw_add due to d_bmaj hack. 
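For orientation, a character driver of this era claims its slot through the cdevsw_add() defined above; a hedged registration sketch (the "mydev" names and major number 123 are placeholders, and mydev_cdevsw is assumed to be a fully initialized struct cdevsw declared elsewhere):

	extern struct cdevsw mydev_cdevsw;

	static void
	mydev_drvinit(void *unused)
	{
		dev_t dev = makedev(123, 0);	/* NODEV would request auto-assignment */

		if (cdevsw_add(&dev, &mydev_cdevsw, NULL) != 0)
			printf("mydev: cdevsw_add failed\n");
	}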
+ */ +void +cdevsw_add_generic(int bdev, int cdev, struct cdevsw *cdevsw) +{ + dev_t dev; + + dev = makedev(cdev, 0); + cdevsw_add(&dev, cdevsw, NULL); + dev = makedev(bdev, 0); + bdevsw_add(&dev, cdevsw, NULL); +} + +int +cdevsw_module_handler(module_t mod, int what, void *arg) +{ + struct cdevsw_module_data* data = (struct cdevsw_module_data*) arg; + int error; + + switch (what) { + case MOD_LOAD: + if (error = cdevsw_add(&data->dev, data->cdevsw, NULL)) + return error; + break; + + case MOD_UNLOAD: + if (error = cdevsw_add(&data->dev, NULL, NULL)) + return error; + break; + } + + if (data->chainevh) + return data->chainevh(mod, what, data->chainarg); + else + return 0; +} + +int +bdevsw_module_handler(module_t mod, int what, void* arg) +{ + struct bdevsw_module_data* data = (struct bdevsw_module_data*) arg; + int error; + + switch (what) { + case MOD_LOAD: + if (error = cdevsw_add(&data->cdev, data->cdevsw, NULL)) + return error; + if (error = bdevsw_add(&data->bdev, data->cdevsw, NULL)) { + cdevsw_add(&data->bdev, NULL, NULL); + return error; + } + break; + + case MOD_UNLOAD: + if (error = bdevsw_add(&data->bdev, NULL, NULL)) + return error; + if (error = cdevsw_add(&data->cdev, NULL, NULL)) + return error; + break; + } + + if (data->chainevh) + return data->chainevh(mod, what, data->chainarg); + else + return 0; +} diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c new file mode 100644 index 0000000..1d18a86 --- /dev/null +++ b/sys/kern/kern_descrip.c @@ -0,0 +1,1313 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 + * $Id: kern_descrip.c,v 1.57 1998/11/11 10:55:56 truckman Exp $ + */ + +#include "opt_compat.h" +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/conf.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/file.h> +#include <sys/socketvar.h> +#include <sys/stat.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/unistd.h> +#include <sys/resourcevar.h> +#include <sys/pipe.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> + +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +static MALLOC_DEFINE(M_FILEDESC, "file desc", "Open file descriptor table"); +MALLOC_DEFINE(M_FILE, "file", "Open file structure"); +static MALLOC_DEFINE(M_SIGIO, "sigio", "sigio structures"); + + +static d_open_t fdopen; +#define NUMFDESC 64 + +#define CDEV_MAJOR 22 +static struct cdevsw fildesc_cdevsw = + { fdopen, noclose, noread, nowrite, + noioc, nostop, nullreset, nodevtotty, + seltrue, nommap, nostrat }; + +static int finishdup __P((struct filedesc *fdp, int old, int new, register_t *retval)); +/* + * Descriptor management. + */ +struct filelist filehead; /* head of list of open files */ +int nfiles; /* actual number of open files */ +extern int cmask; + +/* + * System calls on descriptors. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getdtablesize_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +getdtablesize(p, uap) + struct proc *p; + struct getdtablesize_args *uap; +{ + + p->p_retval[0] = + min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + return (0); +} + +/* + * Duplicate a file descriptor to a particular value. + */ +#ifndef _SYS_SYSPROTO_H_ +struct dup2_args { + u_int from; + u_int to; +}; +#endif +/* ARGSUSED */ +int +dup2(p, uap) + struct proc *p; + struct dup2_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register u_int old = uap->from, new = uap->to; + int i, error; + + if (old >= fdp->fd_nfiles || + fdp->fd_ofiles[old] == NULL || + new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || + new >= maxfilesperproc) + return (EBADF); + if (old == new) { + p->p_retval[0] = new; + return (0); + } + if (new >= fdp->fd_nfiles) { + if ((error = fdalloc(p, new, &i))) + return (error); + if (new != i) + panic("dup2: fdalloc"); + } else if (fdp->fd_ofiles[new]) { + if (fdp->fd_ofileflags[new] & UF_MAPPED) + (void) munmapfd(p, new); + /* + * dup2() must succeed even if the close has an error. + */ + (void) closef(fdp->fd_ofiles[new], p); + } + return (finishdup(fdp, (int)old, (int)new, p->p_retval)); +} + +/* + * Duplicate a file descriptor. 
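The userland view of the dup2()/dup() pair implemented here, as a small hedged demo (redirecting stdout into a scratch file is only an example):

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("/tmp/dup2-demo.log", O_WRONLY | O_CREAT | O_TRUNC, 0644);

	if (fd == -1)
		return (1);
	/* dup2() closes descriptor 1 if it is open (the closef() call in
	 * the kernel code above) and makes it refer to the same open file
	 * as fd; dup() would instead pick the lowest free descriptor. */
	if (dup2(fd, STDOUT_FILENO) == -1)
		return (1);
	printf("this line lands in /tmp/dup2-demo.log\n");
	close(fd);
	return (0);
}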
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct dup_args { + u_int fd; +}; +#endif +/* ARGSUSED */ +int +dup(p, uap) + struct proc *p; + struct dup_args *uap; +{ + register struct filedesc *fdp; + u_int old; + int new, error; + + old = uap->fd; + +#if 0 + /* + * XXX Compatibility + */ + if (old &~ 077) { uap->fd &= 077; return (dup2(p, uap, p->p_retval)); } +#endif + + fdp = p->p_fd; + if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) + return (EBADF); + if ((error = fdalloc(p, 0, &new))) + return (error); + return (finishdup(fdp, (int)old, new, p->p_retval)); +} + +/* + * The file control system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fcntl_args { + int fd; + int cmd; + long arg; +}; +#endif +/* ARGSUSED */ +int +fcntl(p, uap) + struct proc *p; + register struct fcntl_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + register char *pop; + struct vnode *vp; + int i, tmp, error, flg = F_POSIX; + struct flock fl; + u_int newmin; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + pop = &fdp->fd_ofileflags[uap->fd]; + switch (uap->cmd) { + + case F_DUPFD: + newmin = uap->arg; + if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || + newmin >= maxfilesperproc) + return (EINVAL); + if ((error = fdalloc(p, newmin, &i))) + return (error); + return (finishdup(fdp, uap->fd, i, p->p_retval)); + + case F_GETFD: + p->p_retval[0] = *pop & 1; + return (0); + + case F_SETFD: + *pop = (*pop &~ 1) | (uap->arg & 1); + return (0); + + case F_GETFL: + p->p_retval[0] = OFLAGS(fp->f_flag); + return (0); + + case F_SETFL: + fp->f_flag &= ~FCNTLFLAGS; + fp->f_flag |= FFLAGS(uap->arg & ~O_ACCMODE) & FCNTLFLAGS; + tmp = fp->f_flag & FNONBLOCK; + error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p); + if (error) + return (error); + tmp = fp->f_flag & FASYNC; + error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p); + if (!error) + return (0); + fp->f_flag &= ~FNONBLOCK; + tmp = 0; + (void) (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p); + return (error); + + case F_GETOWN: + error = (*fp->f_ops->fo_ioctl) + (fp, FIOGETOWN, (caddr_t)p->p_retval, p); + return (error); + + case F_SETOWN: + return ((*fp->f_ops->fo_ioctl) + (fp, FIOSETOWN, (caddr_t)&uap->arg, p)); + + case F_SETLKW: + flg |= F_WAIT; + /* Fall into F_SETLK */ + + case F_SETLK: + if (fp->f_type != DTYPE_VNODE) + return (EBADF); + vp = (struct vnode *)fp->f_data; + /* Copy in the lock structure */ + error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl, + sizeof(fl)); + if (error) + return (error); + if (fl.l_whence == SEEK_CUR) + fl.l_start += fp->f_offset; + switch (fl.l_type) { + + case F_RDLCK: + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + p->p_flag |= P_ADVLOCK; + return (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg)); + + case F_WRLCK: + if ((fp->f_flag & FWRITE) == 0) + return (EBADF); + p->p_flag |= P_ADVLOCK; + return (VOP_ADVLOCK(vp, (caddr_t)p, F_SETLK, &fl, flg)); + + case F_UNLCK: + return (VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &fl, + F_POSIX)); + + default: + return (EINVAL); + } + + case F_GETLK: + if (fp->f_type != DTYPE_VNODE) + return (EBADF); + vp = (struct vnode *)fp->f_data; + /* Copy in the lock structure */ + error = copyin((caddr_t)(intptr_t)uap->arg, (caddr_t)&fl, + sizeof(fl)); + if (error) + return (error); + if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK && + fl.l_type != F_UNLCK) + return (EINVAL); + if (fl.l_whence == SEEK_CUR) + fl.l_start += fp->f_offset; + if ((error = 
VOP_ADVLOCK(vp,(caddr_t)p,F_GETLK,&fl,F_POSIX))) + return (error); + return (copyout((caddr_t)&fl, (caddr_t)(intptr_t)uap->arg, + sizeof(fl))); + + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Common code for dup, dup2, and fcntl(F_DUPFD). + */ +static int +finishdup(fdp, old, new, retval) + register struct filedesc *fdp; + register int old, new; + register_t *retval; +{ + register struct file *fp; + + fp = fdp->fd_ofiles[old]; + fdp->fd_ofiles[new] = fp; + fdp->fd_ofileflags[new] = fdp->fd_ofileflags[old] &~ UF_EXCLOSE; + fp->f_count++; + if (new > fdp->fd_lastfile) + fdp->fd_lastfile = new; + *retval = new; + return (0); +} + +/* + * If sigio is on the list associated with a process or process group, + * disable signalling from the device, remove sigio from the list and + * free sigio. + */ +void +funsetown(sigio) + struct sigio *sigio; +{ + int s; + + if (sigio == NULL) + return; + s = splhigh(); + *(sigio->sio_myref) = NULL; + splx(s); + if (sigio->sio_pgid < 0) { + SLIST_REMOVE(&sigio->sio_pgrp->pg_sigiolst, sigio, + sigio, sio_pgsigio); + } else /* if ((*sigiop)->sio_pgid > 0) */ { + SLIST_REMOVE(&sigio->sio_proc->p_sigiolst, sigio, + sigio, sio_pgsigio); + } + crfree(sigio->sio_ucred); + FREE(sigio, M_SIGIO); +} + +/* Free a list of sigio structures. */ +void +funsetownlst(sigiolst) + struct sigiolst *sigiolst; +{ + struct sigio *sigio; + + while ((sigio = sigiolst->slh_first) != NULL) + funsetown(sigio); +} + +/* + * This is common code for FIOSETOWN ioctl called by fcntl(fd, F_SETOWN, arg). + * + * After permission checking, add a sigio structure to the sigio list for + * the process or process group. + */ +int +fsetown(pgid, sigiop) + pid_t pgid; + struct sigio **sigiop; +{ + struct proc *proc; + struct pgrp *pgrp; + struct sigio *sigio; + int s; + + if (pgid == 0) { + funsetown(*sigiop); + return (0); + } + if (pgid > 0) { + proc = pfind(pgid); + if (proc == NULL) + return (ESRCH); + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + else if (proc->p_session != curproc->p_session) + return (EPERM); + pgrp = NULL; + } else /* if (pgid < 0) */ { + pgrp = pgfind(-pgid); + if (pgrp == NULL) + return (ESRCH); + /* + * Policy - Don't allow a process to FSETOWN a process + * in another session. + * + * Remove this test to allow maximum flexibility or + * restrict FSETOWN to the current process or process + * group for maximum safety. + */ + else if (pgrp->pg_session != curproc->p_session) + return (EPERM); + proc = NULL; + } + funsetown(*sigiop); + MALLOC(sigio, struct sigio *, sizeof(struct sigio), M_SIGIO, + M_WAITOK); + if (pgid > 0) { + SLIST_INSERT_HEAD(&proc->p_sigiolst, sigio, sio_pgsigio); + sigio->sio_proc = proc; + } else { + SLIST_INSERT_HEAD(&pgrp->pg_sigiolst, sigio, sio_pgsigio); + sigio->sio_pgrp = pgrp; + } + sigio->sio_pgid = pgid; + crhold(curproc->p_ucred); + sigio->sio_ucred = curproc->p_ucred; + /* It would be convenient if p_ruid was in ucred. */ + sigio->sio_ruid = curproc->p_cred->p_ruid; + sigio->sio_myref = sigiop; + s = splhigh(); + *sigiop = sigio; + splx(s); + return (0); +} + +/* + * This is common code for FIOGETOWN ioctl called by fcntl(fd, F_GETOWN, arg). + */ +pid_t +fgetown(sigio) + struct sigio *sigio; +{ + return (sigio != NULL ? sigio->sio_pgid : 0); +} + +/* + * Close a file descriptor. 
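+ *
+ * The slot is cleared and the fd_freefile/fd_lastfile hints are
+ * updated before closef() drops the reference on the underlying
+ * struct file, which also releases any POSIX locks this process
+ * holds on the vnode.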
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct close_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +close(p, uap) + struct proc *p; + struct close_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + register int fd = uap->fd; + register u_char *pf; + + if ((unsigned)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + return (EBADF); + pf = (u_char *)&fdp->fd_ofileflags[fd]; + if (*pf & UF_MAPPED) + (void) munmapfd(p, fd); + fdp->fd_ofiles[fd] = NULL; + while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; + if (fd < fdp->fd_freefile) + fdp->fd_freefile = fd; + *pf = 0; + return (closef(fp, p)); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ofstat_args { + int fd; + struct ostat *sb; +}; +#endif +/* ARGSUSED */ +int +ofstat(p, uap) + struct proc *p; + register struct ofstat_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct stat ub; + struct ostat oub; + int error; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + switch (fp->f_type) { + + case DTYPE_FIFO: + case DTYPE_VNODE: + error = vn_stat((struct vnode *)fp->f_data, &ub, p); + break; + + case DTYPE_SOCKET: + error = soo_stat((struct socket *)fp->f_data, &ub); + break; + + case DTYPE_PIPE: + error = pipe_stat((struct pipe *)fp->f_data, &ub); + break; + + default: + panic("ofstat"); + /*NOTREACHED*/ + } + cvtstat(&ub, &oub); + if (error == 0) + error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub)); + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Return status information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstat_args { + int fd; + struct stat *sb; +}; +#endif +/* ARGSUSED */ +int +fstat(p, uap) + struct proc *p; + register struct fstat_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct stat ub; + int error; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + switch (fp->f_type) { + + case DTYPE_FIFO: + case DTYPE_VNODE: + error = vn_stat((struct vnode *)fp->f_data, &ub, p); + break; + + case DTYPE_SOCKET: + error = soo_stat((struct socket *)fp->f_data, &ub); + break; + + case DTYPE_PIPE: + error = pipe_stat((struct pipe *)fp->f_data, &ub); + break; + + default: + panic("fstat"); + /*NOTREACHED*/ + } + if (error == 0) + error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub)); + return (error); +} + +/* + * Return status information about a file descriptor. 
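+ * This variant reports in the alternate "struct nstat" layout: the
+ * data is gathered into an ordinary struct stat and converted with
+ * cvtnstat() before the copyout.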
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct nfstat_args { + int fd; + struct nstat *sb; +}; +#endif +/* ARGSUSED */ +int +nfstat(p, uap) + struct proc *p; + register struct nfstat_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct stat ub; + struct nstat nub; + int error; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + switch (fp->f_type) { + + case DTYPE_FIFO: + case DTYPE_VNODE: + error = vn_stat((struct vnode *)fp->f_data, &ub, p); + break; + + case DTYPE_SOCKET: + error = soo_stat((struct socket *)fp->f_data, &ub); + break; + + case DTYPE_PIPE: + error = pipe_stat((struct pipe *)fp->f_data, &ub); + break; + + default: + panic("fstat"); + /*NOTREACHED*/ + } + if (error == 0) { + cvtnstat(&ub, &nub); + error = copyout((caddr_t)&nub, (caddr_t)uap->sb, sizeof (nub)); + } + return (error); +} + +/* + * Return pathconf information about a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fpathconf_args { + int fd; + int name; +}; +#endif +/* ARGSUSED */ +int +fpathconf(p, uap) + struct proc *p; + register struct fpathconf_args *uap; +{ + struct filedesc *fdp = p->p_fd; + struct file *fp; + struct vnode *vp; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + switch (fp->f_type) { + + case DTYPE_PIPE: + case DTYPE_SOCKET: + if (uap->name != _PC_PIPE_BUF) + return (EINVAL); + p->p_retval[0] = PIPE_BUF; + return (0); + + case DTYPE_FIFO: + case DTYPE_VNODE: + vp = (struct vnode *)fp->f_data; + return (VOP_PATHCONF(vp, uap->name, p->p_retval)); + + default: + panic("fpathconf"); + } + /*NOTREACHED*/ +} + +/* + * Allocate a file descriptor for the process. + */ +static int fdexpand; +SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, ""); + +int +fdalloc(p, want, result) + struct proc *p; + int want; + int *result; +{ + register struct filedesc *fdp = p->p_fd; + register int i; + int lim, last, nfiles; + struct file **newofile; + char *newofileflags; + + /* + * Search for a free descriptor starting at the higher + * of want or fd_freefile. If that fails, consider + * expanding the ofile array. + */ + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + for (;;) { + last = min(fdp->fd_nfiles, lim); + if ((i = want) < fdp->fd_freefile) + i = fdp->fd_freefile; + for (; i < last; i++) { + if (fdp->fd_ofiles[i] == NULL) { + fdp->fd_ofileflags[i] = 0; + if (i > fdp->fd_lastfile) + fdp->fd_lastfile = i; + if (want <= fdp->fd_freefile) + fdp->fd_freefile = i; + *result = i; + return (0); + } + } + + /* + * No space in current array. Expand? + */ + if (fdp->fd_nfiles >= lim) + return (EMFILE); + if (fdp->fd_nfiles < NDEXTENT) + nfiles = NDEXTENT; + else + nfiles = 2 * fdp->fd_nfiles; + MALLOC(newofile, struct file **, nfiles * OFILESIZE, + M_FILEDESC, M_WAITOK); + newofileflags = (char *) &newofile[nfiles]; + /* + * Copy the existing ofile and ofileflags arrays + * and zero the new portion of each array. 
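+ * Both arrays live in a single allocation; the flag bytes start
+ * immediately after the nfiles file pointers (hence the sizing in
+ * OFILESIZE units above), so one MALLOC/FREE pair covers each
+ * expansion.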
+ */ + bcopy(fdp->fd_ofiles, newofile, + (i = sizeof(struct file *) * fdp->fd_nfiles)); + bzero((char *)newofile + i, nfiles * sizeof(struct file *) - i); + bcopy(fdp->fd_ofileflags, newofileflags, + (i = sizeof(char) * fdp->fd_nfiles)); + bzero(newofileflags + i, nfiles * sizeof(char) - i); + if (fdp->fd_nfiles > NDFILE) + FREE(fdp->fd_ofiles, M_FILEDESC); + fdp->fd_ofiles = newofile; + fdp->fd_ofileflags = newofileflags; + fdp->fd_nfiles = nfiles; + fdexpand++; + } + return (0); +} + +/* + * Check to see whether n user file descriptors + * are available to the process p. + */ +int +fdavail(p, n) + struct proc *p; + register int n; +{ + register struct filedesc *fdp = p->p_fd; + register struct file **fpp; + register int i, lim, last; + + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); + if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) + return (1); + + last = min(fdp->fd_nfiles, lim); + fpp = &fdp->fd_ofiles[fdp->fd_freefile]; + for (i = last - fdp->fd_freefile; --i >= 0; fpp++) + if (*fpp == NULL && --n <= 0) + return (1); + return (0); +} + +/* + * Create a new open file structure and allocate + * a file decriptor for the process that refers to it. + */ +int +falloc(p, resultfp, resultfd) + register struct proc *p; + struct file **resultfp; + int *resultfd; +{ + register struct file *fp, *fq; + int error, i; + + if ((error = fdalloc(p, 0, &i))) + return (error); + if (nfiles >= maxfiles) { + tablefull("file"); + return (ENFILE); + } + /* + * Allocate a new file descriptor. + * If the process has file descriptor zero open, add to the list + * of open files at that point, otherwise put it at the front of + * the list of open files. + */ + nfiles++; + MALLOC(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK); + bzero(fp, sizeof(struct file)); + if ((fq = p->p_fd->fd_ofiles[0])) { + LIST_INSERT_AFTER(fq, fp, f_list); + } else { + LIST_INSERT_HEAD(&filehead, fp, f_list); + } + p->p_fd->fd_ofiles[i] = fp; + fp->f_count = 1; + fp->f_cred = p->p_ucred; + fp->f_seqcount = 1; + crhold(fp->f_cred); + if (resultfp) + *resultfp = fp; + if (resultfd) + *resultfd = i; + return (0); +} + +/* + * Free a file descriptor. + */ +void +ffree(fp) + register struct file *fp; +{ + LIST_REMOVE(fp, f_list); + crfree(fp->f_cred); +#if defined(DIAGNOSTIC) || defined(INVARIANTS) + fp->f_count = 0; +#endif + nfiles--; + FREE(fp, M_FILE); +} + +/* + * Build a new filedesc structure. + */ +struct filedesc * +fdinit(p) + struct proc *p; +{ + register struct filedesc0 *newfdp; + register struct filedesc *fdp = p->p_fd; + + MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0), + M_FILEDESC, M_WAITOK); + bzero(newfdp, sizeof(struct filedesc0)); + newfdp->fd_fd.fd_cdir = fdp->fd_cdir; + VREF(newfdp->fd_fd.fd_cdir); + newfdp->fd_fd.fd_rdir = fdp->fd_rdir; + VREF(newfdp->fd_fd.fd_rdir); + + /* Create the file descriptor table. */ + newfdp->fd_fd.fd_refcnt = 1; + newfdp->fd_fd.fd_cmask = cmask; + newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; + newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; + newfdp->fd_fd.fd_nfiles = NDFILE; + + newfdp->fd_fd.fd_freefile = 0; + newfdp->fd_fd.fd_lastfile = 0; + + return (&newfdp->fd_fd); +} + +/* + * Share a filedesc structure. + */ +struct filedesc * +fdshare(p) + struct proc *p; +{ + p->p_fd->fd_refcnt++; + return (p->p_fd); +} + +/* + * Copy a filedesc structure. 
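+ *
+ * The copy gets a private table: every open file it references has
+ * its f_count bumped, and the ofiles array is either the embedded
+ * NDFILE-sized one or a fresh allocation sized down toward the
+ * descriptors actually in use.  execve() uses this to un-share a
+ * table whose fd_refcnt is greater than one.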
+ */ +struct filedesc * +fdcopy(p) + struct proc *p; +{ + register struct filedesc *newfdp, *fdp = p->p_fd; + register struct file **fpp; + register int i; + +/* + * Certain daemons might not have file descriptors + */ + if (fdp == NULL) + return NULL; + + MALLOC(newfdp, struct filedesc *, sizeof(struct filedesc0), + M_FILEDESC, M_WAITOK); + bcopy(fdp, newfdp, sizeof(struct filedesc)); + VREF(newfdp->fd_cdir); + VREF(newfdp->fd_rdir); + newfdp->fd_refcnt = 1; + + /* + * If the number of open files fits in the internal arrays + * of the open file structure, use them, otherwise allocate + * additional memory for the number of descriptors currently + * in use. + */ + if (newfdp->fd_lastfile < NDFILE) { + newfdp->fd_ofiles = ((struct filedesc0 *) newfdp)->fd_dfiles; + newfdp->fd_ofileflags = + ((struct filedesc0 *) newfdp)->fd_dfileflags; + i = NDFILE; + } else { + /* + * Compute the smallest multiple of NDEXTENT needed + * for the file descriptors currently in use, + * allowing the table to shrink. + */ + i = newfdp->fd_nfiles; + while (i > 2 * NDEXTENT && i > newfdp->fd_lastfile * 2) + i /= 2; + MALLOC(newfdp->fd_ofiles, struct file **, i * OFILESIZE, + M_FILEDESC, M_WAITOK); + newfdp->fd_ofileflags = (char *) &newfdp->fd_ofiles[i]; + } + newfdp->fd_nfiles = i; + bcopy(fdp->fd_ofiles, newfdp->fd_ofiles, i * sizeof(struct file **)); + bcopy(fdp->fd_ofileflags, newfdp->fd_ofileflags, i * sizeof(char)); + fpp = newfdp->fd_ofiles; + for (i = newfdp->fd_lastfile; i-- >= 0; fpp++) + if (*fpp != NULL) + (*fpp)->f_count++; + return (newfdp); +} + +/* + * Release a filedesc structure. + */ +void +fdfree(p) + struct proc *p; +{ + register struct filedesc *fdp = p->p_fd; + struct file **fpp; + register int i; + +/* + * Certain daemons might not have file descriptors + */ + if (fdp == NULL) + return; + + if (--fdp->fd_refcnt > 0) + return; + fpp = fdp->fd_ofiles; + for (i = fdp->fd_lastfile; i-- >= 0; fpp++) + if (*fpp) + (void) closef(*fpp, p); + if (fdp->fd_nfiles > NDFILE) + FREE(fdp->fd_ofiles, M_FILEDESC); + vrele(fdp->fd_cdir); + vrele(fdp->fd_rdir); + FREE(fdp, M_FILEDESC); +} + +/* + * Close any files on exec? + */ +void +fdcloseexec(p) + struct proc *p; +{ + struct filedesc *fdp = p->p_fd; + struct file **fpp; + char *fdfp; + register int i; + +/* + * Certain daemons might not have file descriptors + */ + if (fdp == NULL) + return; + + fpp = fdp->fd_ofiles; + fdfp = fdp->fd_ofileflags; + for (i = 0; i <= fdp->fd_lastfile; i++, fpp++, fdfp++) + if (*fpp != NULL && (*fdfp & UF_EXCLOSE)) { + if (*fdfp & UF_MAPPED) + (void) munmapfd(p, i); + (void) closef(*fpp, p); + *fpp = NULL; + *fdfp = 0; + if (i < fdp->fd_freefile) + fdp->fd_freefile = i; + } + while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; +} + +/* + * Internal form of close. + * Decrement reference count on file structure. + * Note: p may be NULL when closing a file + * that was being passed in a message. + */ +int +closef(fp, p) + register struct file *fp; + register struct proc *p; +{ + struct vnode *vp; + struct flock lf; + int error; + + if (fp == NULL) + return (0); + /* + * POSIX record locking dictates that any close releases ALL + * locks owned by this process. This is handled by setting + * a flag in the unlock to free ONLY locks obeying POSIX + * semantics, and not to free BSD-style file locks. + * If the descriptor was in a message, POSIX-style locks + * aren't passed with the descriptor. 
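+ *
+ * Note the two different lock owners below: POSIX locks are owned by
+ * the process (the id handed to VOP_ADVLOCK is `p'), while
+ * flock()-style locks are owned by the open file itself (the id is
+ * `fp'), so the latter are only released once the last reference to
+ * the file goes away.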
+ */ + if (p && (p->p_flag & P_ADVLOCK) && fp->f_type == DTYPE_VNODE) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = (struct vnode *)fp->f_data; + (void) VOP_ADVLOCK(vp, (caddr_t)p, F_UNLCK, &lf, F_POSIX); + } + if (--fp->f_count > 0) + return (0); + if (fp->f_count < 0) + panic("closef: count < 0"); + if ((fp->f_flag & FHASLOCK) && fp->f_type == DTYPE_VNODE) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + lf.l_type = F_UNLCK; + vp = (struct vnode *)fp->f_data; + (void) VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK); + } + if (fp->f_ops) + error = (*fp->f_ops->fo_close)(fp, p); + else + error = 0; + ffree(fp); + return (error); +} + +/* + * Apply an advisory lock on a file descriptor. + * + * Just attempt to get a record lock of the requested type on + * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). + */ +#ifndef _SYS_SYSPROTO_H_ +struct flock_args { + int fd; + int how; +}; +#endif +/* ARGSUSED */ +int +flock(p, uap) + struct proc *p; + register struct flock_args *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vnode *vp; + struct flock lf; + + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (EOPNOTSUPP); + vp = (struct vnode *)fp->f_data; + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (uap->how & LOCK_UN) { + lf.l_type = F_UNLCK; + fp->f_flag &= ~FHASLOCK; + return (VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK)); + } + if (uap->how & LOCK_EX) + lf.l_type = F_WRLCK; + else if (uap->how & LOCK_SH) + lf.l_type = F_RDLCK; + else + return (EBADF); + fp->f_flag |= FHASLOCK; + if (uap->how & LOCK_NB) + return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK)); + return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT)); +} + +/* + * File Descriptor pseudo-device driver (/dev/fd/). + * + * Opening minor device N dup()s the file (if any) connected to file + * descriptor N belonging to the calling process. Note that this driver + * consists of only the ``open()'' routine, because all subsequent + * references to this file will be direct to the other driver. + */ +/* ARGSUSED */ +static int +fdopen(dev, mode, type, p) + dev_t dev; + int mode, type; + struct proc *p; +{ + + /* + * XXX Kludge: set curproc->p_dupfd to contain the value of the + * the file descriptor being sought for duplication. The error + * return ensures that the vnode for this device will be released + * by vn_open. Open will detect this special error and take the + * actions in dupfdopen below. Other callers of vn_open or VOP_OPEN + * will simply report the error. + */ + p->p_dupfd = minor(dev); + return (ENODEV); +} + +/* + * Duplicate the specified descriptor to a free descriptor. + */ +int +dupfdopen(fdp, indx, dfd, mode, error) + register struct filedesc *fdp; + register int indx, dfd; + int mode; + int error; +{ + register struct file *wfp; + struct file *fp; + + /* + * If the to-be-dup'd fd number is greater than the allowed number + * of file descriptors, or the fd to be dup'd has already been + * closed, reject. Note, check for new == old is necessary as + * falloc could allocate an already closed to-be-dup'd descriptor + * as the new descriptor. + */ + fp = fdp->fd_ofiles[indx]; + if ((u_int)dfd >= fdp->fd_nfiles || + (wfp = fdp->fd_ofiles[dfd]) == NULL || fp == wfp) + return (EBADF); + + /* + * There are two cases of interest here. 
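+ * (ENODEV is the sentinel that fdopen() above arranges to have
+ * returned from the attempted open of a /dev/fd node; ENXIO is
+ * presumed to come from drivers that want the descriptor handed over
+ * rather than shared.)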
+ * + * For ENODEV simply dup (dfd) to file descriptor + * (indx) and return. + * + * For ENXIO steal away the file structure from (dfd) and + * store it in (indx). (dfd) is effectively closed by + * this operation. + * + * Any other error code is just returned. + */ + switch (error) { + case ENODEV: + /* + * Check that the mode the file is being opened for is a + * subset of the mode of the existing descriptor. + */ + if (((mode & (FREAD|FWRITE)) | wfp->f_flag) != wfp->f_flag) + return (EACCES); + fdp->fd_ofiles[indx] = wfp; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + wfp->f_count++; + if (indx > fdp->fd_lastfile) + fdp->fd_lastfile = indx; + return (0); + + case ENXIO: + /* + * Steal away the file pointer from dfd, and stuff it into indx. + */ + fdp->fd_ofiles[indx] = fdp->fd_ofiles[dfd]; + fdp->fd_ofiles[dfd] = NULL; + fdp->fd_ofileflags[indx] = fdp->fd_ofileflags[dfd]; + fdp->fd_ofileflags[dfd] = 0; + /* + * Complete the clean up of the filedesc structure by + * recomputing the various hints. + */ + if (indx > fdp->fd_lastfile) + fdp->fd_lastfile = indx; + else + while (fdp->fd_lastfile > 0 && + fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; + if (dfd < fdp->fd_freefile) + fdp->fd_freefile = dfd; + return (0); + + default: + return (error); + } + /* NOTREACHED */ +} + +/* + * Get file structures. + */ +static int +sysctl_kern_file SYSCTL_HANDLER_ARGS +{ + int error; + struct file *fp; + + if (!req->oldptr) { + /* + * overestimate by 10 files + */ + return (SYSCTL_OUT(req, 0, sizeof(filehead) + + (nfiles + 10) * sizeof(struct file))); + } + + error = SYSCTL_OUT(req, (caddr_t)&filehead, sizeof(filehead)); + if (error) + return (error); + + /* + * followed by an array of file structures + */ + for (fp = filehead.lh_first; fp != NULL; fp = fp->f_list.le_next) { + error = SYSCTL_OUT(req, (caddr_t)fp, sizeof (struct file)); + if (error) + return (error); + } + return (0); +} + +SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_kern_file, "S,file", ""); + +SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, + CTLFLAG_RW, &maxfilesperproc, 0, ""); + +SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, ""); + +static fildesc_devsw_installed = 0; +#ifdef DEVFS +static void *devfs_token_stdin; +static void *devfs_token_stdout; +static void *devfs_token_stderr; +static void *devfs_token_fildesc[NUMFDESC]; +#endif + +static void fildesc_drvinit(void *unused) +{ + dev_t dev; +#ifdef DEVFS + int fd; +#endif + + if( ! fildesc_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&fildesc_cdevsw,NULL); + fildesc_devsw_installed = 1; +#ifdef DEVFS + for (fd = 0; fd < NUMFDESC; fd++) + devfs_token_fildesc[fd] = + devfs_add_devswf(&fildesc_cdevsw, fd, DV_CHR, + UID_BIN, GID_BIN, 0666, + "fd/%d", fd); + devfs_token_stdin = + devfs_add_devswf(&fildesc_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0666, + "stdin"); + devfs_token_stdout = + devfs_add_devswf(&fildesc_cdevsw, 1, DV_CHR, + UID_ROOT, GID_WHEEL, 0666, + "stdout"); + devfs_token_stderr = + devfs_add_devswf(&fildesc_cdevsw, 2, DV_CHR, + UID_ROOT, GID_WHEEL, 0666, + "stderr"); +#endif + } +} + +SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR, + fildesc_drvinit,NULL) + + diff --git a/sys/kern/kern_environment.c b/sys/kern/kern_environment.c new file mode 100644 index 0000000..2243e27 --- /dev/null +++ b/sys/kern/kern_environment.c @@ -0,0 +1,137 @@ +/*- + * Copyright (c) 1998 Michael Smith + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_environment.c,v 1.3 1998/10/09 21:21:34 msmith Exp $ + */ + +/* + * The unified bootloader passes us a pointer to a preserved copy of + * bootstrap/kernel environment variables. + * We make these available using sysctl for both in-kernel and + * out-of-kernel consumers. + * + * Note that the current sysctl infrastructure doesn't allow + * dynamic insertion or traversal through handled spaces. Grr. + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysctl.h> +#include <sys/libkern.h> +#include <machine/bootinfo.h> + +char *kern_envp; + +static char *kernenv_next(char *cp); + +char * +getenv(char *name) +{ + char *cp, *ep; + int len; + + for (cp = kern_envp; cp != NULL; cp = kernenv_next(cp)) { + for (ep = cp; (*ep != '=') && (*ep != 0); ep++) + ; + len = ep - cp; + if (*ep = '=') + ep++; + if (!strncmp(name, cp, len)) + return(ep); + } + return(NULL); +} + +/* + * Return an integer value from an environment variable. + */ +int +getenv_int(char *name, int *data) +{ + char *value, *vtp; + quad_t iv; + + if ((value = getenv(name)) == NULL) + return(0); + + iv = strtoq(value, &vtp, 0); + if ((vtp == value) || (*vtp != 0)) + return(0); + + *data = (int)iv; + return(1); +} + +static int +sysctl_kernenv SYSCTL_HANDLER_ARGS +{ + int *name = (int *)arg1; + u_int namelen = arg2; + char *cp; + int i, error; + + if (kern_envp == NULL) + return(ENOENT); + + name++; + namelen--; + + if (namelen != 1) + return(EINVAL); + + cp = kern_envp; + for (i = 0; i < name[0]; i++) { + cp = kernenv_next(cp); + if (cp == NULL) + break; + } + + if (cp == NULL) + return(ENOENT); + + error = SYSCTL_OUT(req, cp, strlen(cp) + 1); + return (error); +} + +SYSCTL_NODE(_kern, OID_AUTO, environment, CTLFLAG_RD, sysctl_kernenv, "kernel environment space"); + +/* + * Find the next entry after the one which (cp) falls within, return a + * pointer to its start or NULL if there are no more. 
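+ *
+ * The environment is assumed to be a packed block of NUL-terminated
+ * "name=value" strings ending with an empty string (two consecutive
+ * NULs); that is the layout the scan below depends on.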
+ */ +static char * +kernenv_next(char *cp) +{ + if (cp != NULL) { + while (*cp != 0) + cp++; + cp++; + if (*cp == 0) + cp = NULL; + } + return(cp); +} + diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c new file mode 100644 index 0000000..dd63672 --- /dev/null +++ b/sys/kern/kern_exec.c @@ -0,0 +1,778 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_exec.c,v 1.92 1998/12/30 10:38:59 dfr Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/filedesc.h> +#include <sys/fcntl.h> +#include <sys/acct.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/imgact_elf.h> +#include <sys/wait.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/malloc.h> +#include <sys/namei.h> +#include <sys/sysent.h> +#include <sys/shm.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> +#include <sys/buf.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/vm_object.h> +#include <vm/vm_zone.h> +#include <vm/vm_pager.h> + +#include <machine/reg.h> + +static long *exec_copyout_strings __P((struct image_params *)); + +static long ps_strings = PS_STRINGS; +SYSCTL_LONG(_kern, KERN_PS_STRINGS, ps_strings, CTLFLAG_RD, &ps_strings, ""); + +static long usrstack = USRSTACK; +SYSCTL_LONG(_kern, KERN_USRSTACK, usrstack, CTLFLAG_RD, &usrstack, ""); + +/* + * Each of the items is a pointer to a `const struct execsw', hence the + * double pointer here. + */ +static const struct execsw **execsw; + +#ifndef _SYS_SYSPROTO_H_ +struct execve_args { + char *fname; + char **argv; + char **envv; +}; +#endif + +/* + * execve() system call. 
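+ *
+ * Outline: set up an image_params and a temporary string buffer,
+ * translate the path, then walk the registered execsw image
+ * activators (restarting with the interpreter for interpreted
+ * images).  On success the strings are copied out onto the new stack
+ * by exec_copyout_strings(), the descriptor table is un-shared and
+ * close-on-exec descriptors are closed, set-id credentials are
+ * applied, and setregs() aims the process at its new entry point.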
+ */ +int +execve(p, uap) + struct proc *p; + register struct execve_args *uap; +{ + struct nameidata nd, *ndp; + long *stack_base; + int error, len, i; + struct image_params image_params, *imgp; + struct vattr attr; + + imgp = &image_params; + + /* + * Initialize part of the common data + */ + imgp->proc = p; + imgp->uap = uap; + imgp->attr = &attr; + imgp->argc = imgp->envc = 0; + imgp->argv0 = NULL; + imgp->entry_addr = 0; + imgp->vmspace_destroyed = 0; + imgp->interpreted = 0; + imgp->interpreter_name[0] = '\0'; + imgp->auxargs = NULL; + imgp->vp = NULL; + imgp->firstpage = NULL; + + /* + * Allocate temporary demand zeroed space for argument and + * environment strings + */ + imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX + PAGE_SIZE); + if (imgp->stringbase == NULL) { + error = ENOMEM; + goto exec_fail; + } + imgp->stringp = imgp->stringbase; + imgp->stringspace = ARG_MAX; + imgp->image_header = imgp->stringbase + ARG_MAX; + + /* + * Translate the file name. namei() returns a vnode pointer + * in ni_vp amoung other things. + */ + ndp = &nd; + NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, + UIO_USERSPACE, uap->fname, p); + +interpret: + + error = namei(ndp); + if (error) { + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, + ARG_MAX + PAGE_SIZE); + goto exec_fail; + } + + imgp->vp = ndp->ni_vp; + imgp->fname = uap->fname; + + /* + * Check file permissions (also 'opens' file) + */ + error = exec_check_permissions(imgp); + if (error) { + VOP_UNLOCK(imgp->vp, 0, p); + goto exec_fail_dealloc; + } + + error = exec_map_first_page(imgp); + VOP_UNLOCK(imgp->vp, 0, p); + if (error) + goto exec_fail_dealloc; + + /* + * Loop through list of image activators, calling each one. + * If there is no match, the activator returns -1. If there + * is a match, but there was an error during the activation, + * the error is returned. Otherwise 0 means success. If the + * image is interpreted, loop back up and try activating + * the interpreter. + */ + for (i = 0; execsw[i]; ++i) { + if (execsw[i]->ex_imgact) + error = (*execsw[i]->ex_imgact)(imgp); + else + continue; + if (error == -1) + continue; + if (error) + goto exec_fail_dealloc; + if (imgp->interpreted) { + exec_unmap_first_page(imgp); + /* free old vnode and name buffer */ + vrele(ndp->ni_vp); + zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); + /* set new name to that of the interpreter */ + NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, + UIO_SYSSPACE, imgp->interpreter_name, p); + goto interpret; + } + break; + } + /* If we made it through all the activators and none matched, exit. */ + if (error == -1) { + error = ENOEXEC; + goto exec_fail_dealloc; + } + + /* + * Copy out strings (args and env) and initialize stack base + */ + stack_base = exec_copyout_strings(imgp); + p->p_vmspace->vm_minsaddr = (char *)stack_base; + + /* + * If custom stack fixup routine present for this process + * let it do the stack setup. + * Else stuff argument count as first item on stack + */ + if (p->p_sysent->sv_fixup) + (*p->p_sysent->sv_fixup)(&stack_base, imgp); + else + suword(--stack_base, imgp->argc); + + /* + * For security and other reasons, the file descriptor table cannot + * be shared after an exec. 
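+ * When the table is shared (fd_refcnt > 1) a private copy is taken
+ * with fdcopy() and the shared reference is dropped with fdfree().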
+ */ + if (p->p_fd->fd_refcnt > 1) { + struct filedesc *tmp; + + tmp = fdcopy(p); + fdfree(p); + p->p_fd = tmp; + } + + /* close files on exec */ + fdcloseexec(p); + + /* reset caught signals */ + execsigs(p); + + /* name this process - nameiexec(p, ndp) */ + len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); + bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); + p->p_comm[len] = 0; + + /* + * mark as execed, wakeup the process that vforked (if any) and tell + * it that it now has its own resources back + */ + p->p_flag |= P_EXEC; + if (p->p_pptr && (p->p_flag & P_PPWAIT)) { + p->p_flag &= ~P_PPWAIT; + wakeup((caddr_t)p->p_pptr); + } + + /* + * Implement image setuid/setgid. + * + * Don't honor setuid/setgid if the filesystem prohibits it or if + * the process is being traced. + */ + if ((attr.va_mode & VSUID && p->p_ucred->cr_uid != attr.va_uid || + attr.va_mode & VSGID && p->p_ucred->cr_gid != attr.va_gid) && + (imgp->vp->v_mount->mnt_flag & MNT_NOSUID) == 0 && + (p->p_flag & P_TRACED) == 0) { + /* + * Turn off syscall tracing for set-id programs, except for + * root. + */ + if (p->p_tracep && suser(p->p_ucred, &p->p_acflag)) { + p->p_traceflag = 0; + vrele(p->p_tracep); + p->p_tracep = NULL; + } + /* + * Set the new credentials. + */ + p->p_ucred = crcopy(p->p_ucred); + if (attr.va_mode & VSUID) + p->p_ucred->cr_uid = attr.va_uid; + if (attr.va_mode & VSGID) + p->p_ucred->cr_gid = attr.va_gid; + setsugid(p); + } else { + if (p->p_ucred->cr_uid == p->p_cred->p_ruid && + p->p_ucred->cr_gid == p->p_cred->p_rgid) + p->p_flag &= ~P_SUGID; + } + + /* + * Implement correct POSIX saved-id behavior. + */ + p->p_cred->p_svuid = p->p_ucred->cr_uid; + p->p_cred->p_svgid = p->p_ucred->cr_gid; + + /* + * Store the vp for use in procfs + */ + if (p->p_textvp) /* release old reference */ + vrele(p->p_textvp); + VREF(ndp->ni_vp); + p->p_textvp = ndp->ni_vp; + + /* + * If tracing the process, trap to debugger so breakpoints + * can be set before the program executes. + */ + STOPEVENT(p, S_EXEC, 0); + + if (p->p_flag & P_TRACED) + psignal(p, SIGTRAP); + + /* clear "fork but no exec" flag, as we _are_ execing */ + p->p_acflag &= ~AFORK; + + /* Set entry address */ + setregs(p, imgp->entry_addr, (u_long)(uintptr_t)stack_base); + +exec_fail_dealloc: + + /* + * free various allocated resources + */ + if (imgp->firstpage) + exec_unmap_first_page(imgp); + + if (imgp->stringbase != NULL) + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, + ARG_MAX + PAGE_SIZE); + + if (imgp->vp) { + vrele(imgp->vp); + zfree(namei_zone, ndp->ni_cnd.cn_pnbuf); + } + + if (error == 0) + return (0); + +exec_fail: + if (imgp->vmspace_destroyed) { + /* sorry, no more process anymore. 
exit gracefully */ + exit1(p, W_EXITCODE(0, SIGABRT)); + /* NOT REACHED */ + return(0); + } else { + return(error); + } +} + +int +exec_map_first_page(imgp) + struct image_params *imgp; +{ + int s, rv, i; + int initial_pagein; + vm_page_t ma[VM_INITIAL_PAGEIN]; + vm_object_t object; + + + if (imgp->firstpage) { + exec_unmap_first_page(imgp); + } + + object = imgp->vp->v_object; + s = splvm(); + + ma[0] = vm_page_grab(object, 0, VM_ALLOC_NORMAL | VM_ALLOC_RETRY); + + if ((ma[0]->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { + initial_pagein = VM_INITIAL_PAGEIN; + if (initial_pagein > object->size) + initial_pagein = object->size; + for (i = 1; i < initial_pagein; i++) { + if (ma[i] = vm_page_lookup(object, i)) { + if ((ma[i]->flags & PG_BUSY) || ma[i]->busy) + break; + if (ma[i]->valid) + break; + vm_page_busy(ma[i]); + } else { + ma[i] = vm_page_alloc(object, i, VM_ALLOC_NORMAL); + if (ma[i] == NULL) + break; + } + } + initial_pagein = i; + + rv = vm_pager_get_pages(object, ma, initial_pagein, 0); + ma[0] = vm_page_lookup(object, 0); + + if ((rv != VM_PAGER_OK) || (ma[0] == NULL) || (ma[0]->valid == 0)) { + if (ma[0]) { + vm_page_protect(ma[0], VM_PROT_NONE); + vm_page_free(ma[0]); + } + splx(s); + return EIO; + } + } + + vm_page_wire(ma[0]); + vm_page_wakeup(ma[0]); + splx(s); + + pmap_kenter((vm_offset_t) imgp->image_header, VM_PAGE_TO_PHYS(ma[0])); + imgp->firstpage = ma[0]; + + return 0; +} + +void +exec_unmap_first_page(imgp) + struct image_params *imgp; +{ + if (imgp->firstpage) { + pmap_kremove((vm_offset_t) imgp->image_header); + vm_page_unwire(imgp->firstpage, 1); + imgp->firstpage = NULL; + } +} + +/* + * Destroy old address space, and allocate a new stack + * The new stack is only SGROWSIZ large because it is grown + * automatically in trap.c. + */ +int +exec_new_vmspace(imgp) + struct image_params *imgp; +{ + int error; + struct vmspace *vmspace = imgp->proc->p_vmspace; +#ifdef VM_STACK + caddr_t stack_addr = (caddr_t) (USRSTACK - MAXSSIZ); +#else + caddr_t stack_addr = (caddr_t) (USRSTACK - SGROWSIZ); +#endif + vm_map_t map = &vmspace->vm_map; + + imgp->vmspace_destroyed = 1; + + /* + * Blow away entire process VM, if address space not shared, + * otherwise, create a new VM space so that other threads are + * not disrupted + */ + if (vmspace->vm_refcnt == 1) { + if (vmspace->vm_shm) + shmexit(imgp->proc); + pmap_remove_pages(&vmspace->vm_pmap, 0, VM_MAXUSER_ADDRESS); + vm_map_remove(map, 0, VM_MAXUSER_ADDRESS); + } else { + vmspace_exec(imgp->proc); + vmspace = imgp->proc->p_vmspace; + map = &vmspace->vm_map; + } + + /* Allocate a new stack */ +#ifdef VM_STACK + error = vm_map_stack (&vmspace->vm_map, (vm_offset_t)stack_addr, + (vm_size_t)MAXSSIZ, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) + return (error); + + /* vm_ssize and vm_maxsaddr are somewhat antiquated concepts in the + * VM_STACK case, but they are still used to monitor the size of the + * process stack so we can check the stack rlimit. + */ + vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT; + vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ; +#else + error = vm_map_insert(&vmspace->vm_map, NULL, 0, + (vm_offset_t) stack_addr, (vm_offset_t) USRSTACK, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) + return (error); + + vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT; + + /* Initialize maximum stack address */ + vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ; +#endif + + return(0); +} + +/* + * Copy out argument and environment strings from the old process + * address space into the temporary string buffer. 
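+ *
+ * The pointer arrays are walked with fuword() and each string is
+ * brought in with copyinstr(); exhausting the ARG_MAX string space
+ * shows up as ENAMETOOLONG, which is reported to the caller as E2BIG.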
+ */ +int +exec_extract_strings(imgp) + struct image_params *imgp; +{ + char **argv, **envv; + char *argp, *envp; + int error; + size_t length; + + /* + * extract arguments first + */ + + argv = imgp->uap->argv; + + if (argv) { + argp = (caddr_t) (intptr_t) fuword(argv); + if (argp == (caddr_t) -1) + return (EFAULT); + if (argp) + argv++; + if (imgp->argv0) + argp = imgp->argv0; + if (argp) { + do { + if (argp == (caddr_t) -1) + return (EFAULT); + if ((error = copyinstr(argp, imgp->stringp, + imgp->stringspace, &length))) { + if (error == ENAMETOOLONG) + return(E2BIG); + return (error); + } + imgp->stringspace -= length; + imgp->stringp += length; + imgp->argc++; + } while ((argp = (caddr_t) (intptr_t) fuword(argv++))); + } + } + + /* + * extract environment strings + */ + + envv = imgp->uap->envv; + + if (envv) { + while ((envp = (caddr_t) (intptr_t) fuword(envv++))) { + if (envp == (caddr_t) -1) + return (EFAULT); + if ((error = copyinstr(envp, imgp->stringp, + imgp->stringspace, &length))) { + if (error == ENAMETOOLONG) + return(E2BIG); + return (error); + } + imgp->stringspace -= length; + imgp->stringp += length; + imgp->envc++; + } + } + + return (0); +} + +/* + * Copy strings out to the new process address space, constructing + * new arg and env vector tables. Return a pointer to the base + * so that it can be used as the initial stack pointer. + */ +long * +exec_copyout_strings(imgp) + struct image_params *imgp; +{ + int argc, envc; + char **vectp; + char *stringp, *destp; + long *stack_base; + struct ps_strings *arginfo; + int szsigcode; + + /* + * Calculate string base and vector table pointers. + * Also deal with signal trampoline code for this exec type. + */ + arginfo = (struct ps_strings *)PS_STRINGS; + szsigcode = *(imgp->proc->p_sysent->sv_szsigcode); + destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - + roundup((ARG_MAX - imgp->stringspace), sizeof(char *)); + + /* + * install sigcode + */ + if (szsigcode) + copyout(imgp->proc->p_sysent->sv_sigcode, + ((caddr_t)arginfo - szsigcode), szsigcode); + + /* + * If we have a valid auxargs ptr, prepare some room + * on the stack. + */ + if (imgp->auxargs) + /* + * The '+ 2' is for the null pointers at the end of each of the + * arg and env vector sets, and 'AT_COUNT*2' is room for the + * ELF Auxargs data. + */ + vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 + + AT_COUNT*2) * sizeof(char*)); + else + /* + * The '+ 2' is for the null pointers at the end of each of the + * arg and env vector sets + */ + vectp = (char **) + (destp - (imgp->argc + imgp->envc + 2) * sizeof(char*)); + + /* + * vectp also becomes our initial stack base + */ + stack_base = (long *)vectp; + + stringp = imgp->stringbase; + argc = imgp->argc; + envc = imgp->envc; + + /* + * Copy out strings - arguments and environment. + */ + copyout(stringp, destp, ARG_MAX - imgp->stringspace); + + /* + * Fill in "ps_strings" struct for ps, w, etc. + */ + suword(&arginfo->ps_argvstr, (long)(intptr_t)vectp); + suword(&arginfo->ps_nargvstr, argc); + + /* + * Fill in argument portion of vector table. + */ + for (; argc > 0; --argc) { + suword(vectp++, (long)(intptr_t)destp); + while (*stringp++ != 0) + destp++; + destp++; + } + + /* a null vector table pointer seperates the argp's from the envp's */ + suword(vectp++, 0); + + suword(&arginfo->ps_envstr, (long)(intptr_t)vectp); + suword(&arginfo->ps_nenvstr, envc); + + /* + * Fill in environment portion of vector table. 
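+ * As with the argument entries above, each slot receives the user
+ * address of its string; destp is advanced past each terminating NUL
+ * while stringp walks the block that was just copied out.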
+ */ + for (; envc > 0; --envc) { + suword(vectp++, (long)(intptr_t)destp); + while (*stringp++ != 0) + destp++; + destp++; + } + + /* end of vector table is a null pointer */ + suword(vectp, 0); + + return (stack_base); +} + +/* + * Check permissions of file to execute. + * Return 0 for success or error code on failure. + */ +int +exec_check_permissions(imgp) + struct image_params *imgp; +{ + struct proc *p = imgp->proc; + struct vnode *vp = imgp->vp; + struct vattr *attr = imgp->attr; + int error; + + /* Get file attributes */ + error = VOP_GETATTR(vp, attr, p->p_ucred, p); + if (error) + return (error); + + /* + * 1) Check if file execution is disabled for the filesystem that this + * file resides on. + * 2) Insure that at least one execute bit is on - otherwise root + * will always succeed, and we don't want to happen unless the + * file really is executable. + * 3) Insure that the file is a regular file. + */ + if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || + ((attr->va_mode & 0111) == 0) || + (attr->va_type != VREG)) { + return (EACCES); + } + + /* + * Zero length files can't be exec'd + */ + if (attr->va_size == 0) + return (ENOEXEC); + + /* + * Check for execute permission to file based on current credentials. + */ + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + if (error) + return (error); + + /* + * Check number of open-for-writes on the file and deny execution + * if there are any. + */ + if (vp->v_writecount) + return (ETXTBSY); + + /* + * Call filesystem specific open routine (which does nothing in the + * general case). + */ + error = VOP_OPEN(vp, FREAD, p->p_ucred, p); + if (error) + return (error); + + return (0); +} + +/* + * Exec handler registration + */ +int +exec_register(execsw_arg) + const struct execsw *execsw_arg; +{ + const struct execsw **es, **xs, **newexecsw; + int count = 2; /* New slot and trailing NULL */ + + if (execsw) + for (es = execsw; *es; es++) + count++; + newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); + if (newexecsw == NULL) + return ENOMEM; + xs = newexecsw; + if (execsw) + for (es = execsw; *es; es++) + *xs++ = *es; + *xs++ = execsw_arg; + *xs = NULL; + if (execsw) + free(execsw, M_TEMP); + execsw = newexecsw; + return 0; +} + +int +exec_unregister(execsw_arg) + const struct execsw *execsw_arg; +{ + const struct execsw **es, **xs, **newexecsw; + int count = 1; + + if (execsw == NULL) + panic("unregister with no handlers left?\n"); + + for (es = execsw; *es; es++) { + if (*es == execsw_arg) + break; + } + if (*es == NULL) + return ENOENT; + for (es = execsw; *es; es++) + if (*es != execsw_arg) + count++; + newexecsw = malloc(count * sizeof(*es), M_TEMP, M_WAITOK); + if (newexecsw == NULL) + return ENOMEM; + xs = newexecsw; + for (es = execsw; *es; es++) + if (*es != execsw_arg) + *xs++ = *es; + *xs = NULL; + if (execsw) + free(execsw, M_TEMP); + execsw = newexecsw; + return 0; +} diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c new file mode 100644 index 0000000..7be01af --- /dev/null +++ b/sys/kern/kern_exit.c @@ -0,0 +1,647 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 + * $Id: kern_exit.c,v 1.70 1998/12/19 02:55:33 julian Exp $ + */ + +#include "opt_compat.h" +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/tty.h> +#include <sys/wait.h> +#include <sys/vnode.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/ptrace.h> +#include <sys/acct.h> /* for acct_process() function prototype */ +#include <sys/filedesc.h> +#include <sys/shm.h> +#include <sys/sem.h> +#include <sys/aio.h> + +#ifdef COMPAT_43 +#include <machine/reg.h> +#include <machine/psl.h> +#endif +#include <machine/limits.h> /* for UCHAR_MAX = typeof(p_priority)_MAX */ + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_zone.h> +#ifdef COMPAT_LINUX_THREADS +#include <sys/user.h> +#endif + +static MALLOC_DEFINE(M_ZOMBIE, "zombie", "zombie proc status"); + +static int wait1 __P((struct proc *, struct wait_args *, int)); + +/* + * callout list for things to do at exit time + */ +typedef struct exit_list_element { + struct exit_list_element *next; + exitlist_fn function; +} *ele_p; + +static ele_p exit_list; + +/* + * exit -- + * Death of process. + */ +void +exit(p, uap) + struct proc *p; + struct rexit_args /* { + int rval; + } */ *uap; +{ + + exit1(p, W_EXITCODE(uap->rval, 0)); + /* NOTREACHED */ +} + +/* + * Exit: deallocate address space and other resources, change proc state + * to zombie, and unlink proc from allproc and parent's lists. Save exit + * status and rusage for wait(). Check for child processes and orphan them. 
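+ *
+ * Ordering matters: the exit callouts, descriptor and address-space
+ * teardown run first (they may sleep), then the proc is moved to
+ * zombproc, its children are reparented to init, the parent is
+ * notified with SIGCHLD (or the zombie is handed to init when the
+ * parent sets P_NOCLDWAIT), and cpu_exit() finally switches away;
+ * the zombie itself is reclaimed later in wait1().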
+ */ +void +exit1(p, rv) + register struct proc *p; + int rv; +{ + register struct proc *q, *nq; + register struct vmspace *vm; + ele_p ep = exit_list; + + if (p->p_pid == 1) { + printf("init died (signal %d, exit %d)\n", + WTERMSIG(rv), WEXITSTATUS(rv)); + panic("Going nowhere without my init!"); + } + + aio_proc_rundown(p); + + /* are we a task leader? */ + if(p == p->p_leader) { + struct kill_args killArgs; + killArgs.signum = SIGKILL; + q = p->p_peers; + while(q) { + killArgs.pid = q->p_pid; + /* + * The interface for kill is better + * than the internal signal + */ + kill(p, &killArgs); + nq = q; + q = q->p_peers; + /* + * orphan the threads so we don't mess up + * when they call exit + */ + nq->p_peers = 0; + nq->p_leader = nq; + } + + /* otherwise are we a peer? */ + } else if(p->p_peers) { + q = p->p_leader; + while(q->p_peers != p) + q = q->p_peers; + q->p_peers = p->p_peers; + } + +#ifdef PGINPROF + vmsizmon(); +#endif + STOPEVENT(p, S_EXIT, rv); + + /* + * Check if any LKMs need anything done at process exit. + * e.g. SYSV IPC stuff + * XXX what if one of these generates an error? + */ + while (ep) { + (*ep->function)(p); + ep = ep->next; + } + + if (p->p_flag & P_PROFIL) + stopprofclock(p); + MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), + M_ZOMBIE, M_WAITOK); + /* + * If parent is waiting for us to exit or exec, + * P_PPWAIT is set; we will wakeup the parent below. + */ + p->p_flag &= ~(P_TRACED | P_PPWAIT); + p->p_flag |= P_WEXIT; +#ifndef COMPAT_LINUX_THREADS + p->p_sigignore = ~0; +#endif /* COMPAT_LINUX_THREADS */ + p->p_siglist = 0; + if (timevalisset(&p->p_realtimer.it_value)) + untimeout(realitexpire, (caddr_t)p, p->p_ithandle); + + /* + * Reset any sigio structures pointing to us as a result of + * F_SETOWN with our pid. + */ + funsetownlst(&p->p_sigiolst); + + /* + * Close open files and release open-file table. + * This may block! + */ + fdfree(p); + + /* + * XXX Shutdown SYSV semaphores + */ + semexit(p); + + /* The next two chunks should probably be moved to vmspace_exit. */ + vm = p->p_vmspace; + /* + * Release user portion of address space. + * This releases references to vnodes, + * which could cause I/O if the file has been unlinked. + * Need to do this early enough that we can still sleep. + * Can't free the entire vmspace as the kernel stack + * may be mapped within that space also. + */ + if (vm->vm_refcnt == 1) { + if (vm->vm_shm) + shmexit(p); + pmap_remove_pages(&vm->vm_pmap, VM_MIN_ADDRESS, + VM_MAXUSER_ADDRESS); + (void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS, + VM_MAXUSER_ADDRESS); + } + + if (SESS_LEADER(p)) { + register struct session *sp = p->p_session; + + if (sp->s_ttyvp) { + /* + * Controlling process. + * Signal foreground pgrp, + * drain controlling terminal + * and revoke access to controlling terminal. + */ + if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) { + if (sp->s_ttyp->t_pgrp) + pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); + (void) ttywait(sp->s_ttyp); + /* + * The tty could have been revoked + * if we blocked. + */ + if (sp->s_ttyvp) + VOP_REVOKE(sp->s_ttyvp, REVOKEALL); + } + if (sp->s_ttyvp) + vrele(sp->s_ttyvp); + sp->s_ttyvp = NULL; + /* + * s_ttyp is not zero'd; we use this to indicate + * that the session once had a controlling terminal. 
+ * (for logging and informational purposes) + */ + } + sp->s_leader = NULL; + } + fixjobc(p, p->p_pgrp, 0); + (void)acct_process(p); +#ifdef KTRACE + /* + * release trace file + */ + p->p_traceflag = 0; /* don't trace the vrele() */ + if (p->p_tracep) + vrele(p->p_tracep); +#endif + /* + * Remove proc from allproc queue and pidhash chain. + * Place onto zombproc. Unlink from parent's child list. + */ + LIST_REMOVE(p, p_list); + LIST_INSERT_HEAD(&zombproc, p, p_list); + p->p_stat = SZOMB; + + LIST_REMOVE(p, p_hash); + + q = p->p_children.lh_first; + if (q) /* only need this if any child is S_ZOMB */ + wakeup((caddr_t) initproc); + for (; q != 0; q = nq) { + nq = q->p_sibling.le_next; + LIST_REMOVE(q, p_sibling); + LIST_INSERT_HEAD(&initproc->p_children, q, p_sibling); + q->p_pptr = initproc; +#ifdef COMPAT_LINUX_THREADS + q->p_sigparent = 0; +#endif /* COMPAT_LINUX_THREADS */ + /* + * Traced processes are killed + * since their existence means someone is screwing up. + */ + if (q->p_flag & P_TRACED) { + q->p_flag &= ~P_TRACED; + psignal(q, SIGKILL); + } + } + + /* + * Save exit status and final rusage info, adding in child rusage + * info and self times. + */ + p->p_xstat = rv; + *p->p_ru = p->p_stats->p_ru; + calcru(p, &p->p_ru->ru_utime, &p->p_ru->ru_stime, NULL); + ruadd(p->p_ru, &p->p_stats->p_cru); + + /* + * Notify parent that we're gone. If parent has the P_NOCLDWAIT + * flag set, notify process 1 instead (and hope it will handle + * this situation). + */ +#ifndef COMPAT_LINUX_THREADS + if (p->p_pptr->p_flag & P_NOCLDWAIT) { +#else + if (p->p_pptr->p_procsig->ps_flag & P_NOCLDWAIT) { +#endif /* COMPAT_LINUX_THREADS */ + struct proc *pp = p->p_pptr; + proc_reparent(p, initproc); + /* + * If this was the last child of our parent, notify + * parent, so in case he was wait(2)ing, he will + * continue. + */ + if (LIST_EMPTY(&pp->p_children)) + wakeup((caddr_t)pp); + } + +#ifndef COMPAT_LINUX_THREADS + psignal(p->p_pptr, SIGCHLD); +#else + if (p->p_sigparent && p->p_pptr != initproc) { + psignal(p->p_pptr, p->p_sigparent); + } else { + psignal(p->p_pptr, SIGCHLD); + } +#endif /* COMPAT_LINUX_THREADS */ + wakeup((caddr_t)p->p_pptr); +#if defined(tahoe) + /* move this to cpu_exit */ + p->p_addr->u_pcb.pcb_savacc.faddr = (float *)NULL; +#endif + /* + * Clear curproc after we've done all operations + * that could block, and before tearing down the rest + * of the process state that might be used from clock, etc. + * Also, can't clear curproc while we're still runnable, + * as we're not on a run queue (we are current, just not + * a proper proc any longer!). + * + * Other substructures are freed from wait(). + */ + curproc = NULL; + if (--p->p_limit->p_refcnt == 0) { + FREE(p->p_limit, M_SUBPROC); + p->p_limit = NULL; + } + + /* + * Finally, call machine-dependent code to release the remaining + * resources including address space, the kernel stack and pcb. + * The address space is released by "vmspace_free(p->p_vmspace)"; + * This is machine-dependent, as we may have to change stacks + * or ensure that the current one isn't reallocated before we + * finish. cpu_exit will end with a call to cpu_switch(), finishing + * our execution (pun intended). 
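+ *
+ * cpu_exit() does not return; what remains of the process is a
+ * zombie that wait1() will reap and hand back to proc_zone.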
+ */ + cpu_exit(p); +} + +#ifdef COMPAT_43 +#if defined(hp300) || defined(luna68k) +#include <machine/frame.h> +#define GETPS(rp) ((struct frame *)(rp))->f_sr +#else +#define GETPS(rp) (rp)[PS] +#endif + +int +owait(p, uap) + struct proc *p; + register struct owait_args /* { + int dummy; + } */ *uap; +{ + struct wait_args w; + +#ifdef PSL_ALLCC + if ((GETPS(p->p_md.md_regs) & PSL_ALLCC) != PSL_ALLCC) { + w.options = 0; + w.rusage = NULL; + } else { + w.options = p->p_md.md_regs[R0]; + w.rusage = (struct rusage *)p->p_md.md_regs[R1]; + } +#else + w.options = 0; + w.rusage = NULL; +#endif + w.pid = WAIT_ANY; + w.status = NULL; + return (wait1(p, &w, 1)); +} +#endif /* COMPAT_43 */ + +int +wait4(p, uap) + struct proc *p; + struct wait_args *uap; +{ + + return (wait1(p, uap, 0)); +} + +static int +wait1(q, uap, compat) + register struct proc *q; + register struct wait_args /* { + int pid; + int *status; + int options; + struct rusage *rusage; + } */ *uap; + int compat; +{ + register int nfound; + register struct proc *p, *t; + int status, error; + + if (uap->pid == 0) + uap->pid = -q->p_pgid; + if (uap->options &~ (WUNTRACED|WNOHANG)) + return (EINVAL); +loop: + nfound = 0; + for (p = q->p_children.lh_first; p != 0; p = p->p_sibling.le_next) { + if (uap->pid != WAIT_ANY && + p->p_pid != uap->pid && p->p_pgid != -uap->pid) + continue; + nfound++; + if (p->p_stat == SZOMB) { + /* charge childs scheduling cpu usage to parent */ + if (curproc->p_pid != 1) { + curproc->p_estcpu = min(curproc->p_estcpu + + p->p_estcpu, UCHAR_MAX); + } + + q->p_retval[0] = p->p_pid; +#ifdef COMPAT_43 + if (compat) + q->p_retval[1] = p->p_xstat; + else +#endif + if (uap->status) { + status = p->p_xstat; /* convert to int */ + if ((error = copyout((caddr_t)&status, + (caddr_t)uap->status, sizeof(status)))) + return (error); + } + if (uap->rusage && (error = copyout((caddr_t)p->p_ru, + (caddr_t)uap->rusage, sizeof (struct rusage)))) + return (error); + /* + * If we got the child via a ptrace 'attach', + * we need to give it back to the old parent. + */ + if (p->p_oppid && (t = pfind(p->p_oppid))) { + p->p_oppid = 0; + proc_reparent(p, t); + psignal(t, SIGCHLD); + wakeup((caddr_t)t); + return (0); + } + p->p_xstat = 0; + ruadd(&q->p_stats->p_cru, p->p_ru); + FREE(p->p_ru, M_ZOMBIE); + p->p_ru = NULL; + + /* + * Decrement the count of procs running with this uid. + */ + (void)chgproccnt(p->p_cred->p_ruid, -1); + + /* + * Release reference to text vnode + */ + if (p->p_textvp) + vrele(p->p_textvp); + + /* + * Free up credentials. + */ + if (--p->p_cred->p_refcnt == 0) { + crfree(p->p_cred->pc_ucred); + FREE(p->p_cred, M_SUBPROC); + p->p_cred = NULL; + } + + /* + * Finally finished with old proc entry. + * Unlink it from its process group and free it. + */ + leavepgrp(p); + LIST_REMOVE(p, p_list); /* off zombproc */ + LIST_REMOVE(p, p_sibling); + +#ifdef COMPAT_LINUX_THREADS + if (--p->p_procsig->ps_refcnt == 0) { + if (p->p_sigacts != &p->p_addr->u_sigacts) + FREE(p->p_sigacts, M_SUBPROC); + FREE(p->p_procsig, M_SUBPROC); + p->p_procsig = NULL; + } +#endif /* COMPAT_LINUX_THREADS */ + /* + * Give machine-dependent layer a chance + * to free anything that cpu_exit couldn't + * release while still running in process context. 
+ */ + cpu_wait(p); + zfree(proc_zone, p); + nprocs--; + return (0); + } + if (p->p_stat == SSTOP && (p->p_flag & P_WAITED) == 0 && + (p->p_flag & P_TRACED || uap->options & WUNTRACED)) { + p->p_flag |= P_WAITED; + q->p_retval[0] = p->p_pid; +#ifdef COMPAT_43 + if (compat) { + q->p_retval[1] = W_STOPCODE(p->p_xstat); + error = 0; + } else +#endif + if (uap->status) { + status = W_STOPCODE(p->p_xstat); + error = copyout((caddr_t)&status, + (caddr_t)uap->status, sizeof(status)); + } else + error = 0; + return (error); + } + } + if (nfound == 0) + return (ECHILD); + if (uap->options & WNOHANG) { + q->p_retval[0] = 0; + return (0); + } + if ((error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0))) + return (error); + goto loop; +} + +/* + * make process 'parent' the new parent of process 'child'. + */ +void +proc_reparent(child, parent) + register struct proc *child; + register struct proc *parent; +{ + + if (child->p_pptr == parent) + return; + + LIST_REMOVE(child, p_sibling); + LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); + child->p_pptr = parent; +} + +/* + * The next two functions are to handle adding/deleting items on the + * exit callout list + * + * at_exit(): + * Take the arguments given and put them onto the exit callout list, + * However first make sure that it's not already there. + * returns 0 on success. + */ +int +at_exit(function) + exitlist_fn function; +{ + ele_p ep; + + /* Be noisy if the programmer has lost track of things */ + if (rm_at_exit(function)) + printf("exit callout entry already present\n"); + ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->next = exit_list; + ep->function = function; + exit_list = ep; + return (0); +} +/* + * Scan the exit callout list for the given items and remove them. + * Returns the number of items removed. + * Logically this can only be 0 or 1. + */ +int +rm_at_exit(function) + exitlist_fn function; +{ + ele_p *epp, ep; + int count; + + count = 0; + epp = &exit_list; + ep = *epp; + while (ep) { + if (ep->function == function) { + *epp = ep->next; + free(ep, M_TEMP); + count++; + } else { + epp = &ep->next; + } + ep = *epp; + } + return (count); +} + +#ifdef COMPAT_LINUX_THREADS +void check_sigacts (void) +{ + struct proc *p = curproc; + struct sigacts *pss; + int s; + + if (p->p_procsig->ps_refcnt == 1 && + p->p_sigacts != &p->p_addr->u_sigacts) { + pss = p->p_sigacts; + s = splhigh(); + p->p_addr->u_sigacts = *pss; + p->p_sigacts = &p->p_addr->u_sigacts; + splx(s); + FREE(pss, M_SUBPROC); + } +} +#endif diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c new file mode 100644 index 0000000..732712b --- /dev/null +++ b/sys/kern/kern_fork.c @@ -0,0 +1,546 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 + * $Id: kern_fork.c,v 1.53 1998/12/19 02:55:33 julian Exp $ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/vnode.h> +#include <sys/acct.h> +#include <sys/ktrace.h> +#include <sys/unistd.h> + +#include <vm/vm.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/vm_zone.h> + +#ifdef COMPAT_LINUX_THREADS +#include <machine/frame.h> +#include <sys/user.h> +#endif /* COMPAT_LINUX_THREADS */ +#ifdef SMP +static int fast_vfork = 0; /* Doesn't work on SMP yet. */ +#else +static int fast_vfork = 1; +#endif +SYSCTL_INT(_kern, OID_AUTO, fast_vfork, CTLFLAG_RW, &fast_vfork, 0, ""); + +/* + * These are the stuctures used to create a callout list for things to do + * when forking a process + */ +typedef struct fork_list_element { + struct fork_list_element *next; + forklist_fn function; +} *fle_p; + +static fle_p fork_list; + +#ifndef _SYS_SYSPROTO_H_ +struct fork_args { + int dummy; +}; +#endif + +/* ARGSUSED */ +int +fork(p, uap) + struct proc *p; + struct fork_args *uap; +{ + + return (fork1(p, RFFDG | RFPROC)); +} + +/* ARGSUSED */ +int +vfork(p, uap) + struct proc *p; + struct vfork_args *uap; +{ + + return (fork1(p, RFFDG | RFPROC | RFPPWAIT | (fast_vfork ? 
RFMEM : 0))); +} + +/* ARGSUSED */ +int +rfork(p, uap) + struct proc *p; + struct rfork_args *uap; +{ + + return (fork1(p, uap->flags)); +} + + +int nprocs = 1; /* process 0 */ +static int nextpid = 0; + +int +fork1(p1, flags) + register struct proc *p1; + int flags; +{ + register struct proc *p2, *pptr; + register uid_t uid; + struct proc *newproc; + int count; + static int pidchecked = 0; + fle_p ep ; + + ep = fork_list; + + if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) + return (EINVAL); + +#ifdef SMP + /* + * FATAL now, we cannot have the same PTD on both cpus, the PTD + * needs to move out of PTmap and be per-process, even for shared + * page table processes. Unfortunately, this means either removing + * PTD[] as a fixed virtual address, or move it to the per-cpu map + * area for SMP mode. Both cases require seperate management of + * the per-process-even-if-PTmap-is-shared PTD. + */ + if (flags & RFMEM) { + printf("shared address space fork attempted: pid: %d\n", + p1->p_pid); + return (EOPNOTSUPP); + } +#endif + + /* + * Here we don't create a new process, but we divorce + * certain parts of a process from itself. + */ + if ((flags & RFPROC) == 0) { + + /* + * Divorce the memory, if it is shared, essentially + * this changes shared memory amongst threads, into + * COW locally. + */ + if ((flags & RFMEM) == 0) { + if (p1->p_vmspace->vm_refcnt > 1) { + vmspace_unshare(p1); + } + } + + /* + * Close all file descriptors. + */ + if (flags & RFCFDG) { + struct filedesc *fdtmp; + fdtmp = fdinit(p1); + fdfree(p1); + p1->p_fd = fdtmp; + } + + /* + * Unshare file descriptors (from parent.) + */ + if (flags & RFFDG) { + if (p1->p_fd->fd_refcnt > 1) { + struct filedesc *newfd; + newfd = fdcopy(p1); + fdfree(p1); + p1->p_fd = newfd; + } + } + return (0); + } + + /* + * Although process entries are dynamically created, we still keep + * a global limit on the maximum number we will create. Don't allow + * a nonprivileged user to use the last process; don't let root + * exceed the limit. The variable nprocs is the current number of + * processes, maxproc is the limit. + */ + uid = p1->p_cred->p_ruid; + if ((nprocs >= maxproc - 1 && uid != 0) || nprocs >= maxproc) { + tablefull("proc"); + return (EAGAIN); + } + /* + * Increment the nprocs resource before blocking can occur. There + * are hard-limits as to the number of processes that can run. + */ + nprocs++; + + /* + * Increment the count of procs running with this uid. Don't allow + * a nonprivileged user to exceed their current limit. + */ + count = chgproccnt(uid, 1); + if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) { + (void)chgproccnt(uid, -1); + /* + * Back out the process count + */ + nprocs--; + return (EAGAIN); + } + + /* Allocate new proc. */ + newproc = zalloc(proc_zone); + +/* + * Setup linkage for kernel based threading + */ + if((flags & RFTHREAD) != 0) { + newproc->p_peers = p1->p_peers; + p1->p_peers = newproc; + newproc->p_leader = p1->p_leader; + } else { + newproc->p_peers = 0; + newproc->p_leader = newproc; + } + + newproc->p_wakeup = 0; + + /* + * Find an unused process ID. We remember a range of unused IDs + * ready to use (from nextpid+1 through pidchecked-1). + */ + nextpid++; +retry: + /* + * If the process ID prototype has wrapped around, + * restart somewhat above 0, as the low-numbered procs + * tend to include daemons that don't exit. 
+ */ + if (nextpid >= PID_MAX) { + nextpid = 100; + pidchecked = 0; + } + if (nextpid >= pidchecked) { + int doingzomb = 0; + + pidchecked = PID_MAX; + /* + * Scan the active and zombie procs to check whether this pid + * is in use. Remember the lowest pid that's greater + * than nextpid, so we can avoid checking for a while. + */ + p2 = allproc.lh_first; +again: + for (; p2 != 0; p2 = p2->p_list.le_next) { + while (p2->p_pid == nextpid || + p2->p_pgrp->pg_id == nextpid || + p2->p_session->s_sid == nextpid) { + nextpid++; + if (nextpid >= pidchecked) + goto retry; + } + if (p2->p_pid > nextpid && pidchecked > p2->p_pid) + pidchecked = p2->p_pid; + if (p2->p_pgrp->pg_id > nextpid && + pidchecked > p2->p_pgrp->pg_id) + pidchecked = p2->p_pgrp->pg_id; + if (p2->p_session->s_sid > nextpid && + pidchecked > p2->p_session->s_sid) + pidchecked = p2->p_session->s_sid; + } + if (!doingzomb) { + doingzomb = 1; + p2 = zombproc.lh_first; + goto again; + } + } + + p2 = newproc; + p2->p_stat = SIDL; /* protect against others */ + p2->p_pid = nextpid; + LIST_INSERT_HEAD(&allproc, p2, p_list); + LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); + + /* + * Make a proc table entry for the new process. + * Start by zeroing the section of proc that is zero-initialized, + * then copy the section that is copied directly from the parent. + */ + bzero(&p2->p_startzero, + (unsigned) ((caddr_t)&p2->p_endzero - (caddr_t)&p2->p_startzero)); + bcopy(&p1->p_startcopy, &p2->p_startcopy, + (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy)); + + p2->p_aioinfo = NULL; + + /* + * Duplicate sub-structures as needed. + * Increase reference counts on shared objects. + * The p_stats and p_sigacts substructs are set in vm_fork. + */ + p2->p_flag = P_INMEM; + if (p1->p_flag & P_PROFIL) + startprofclock(p2); + MALLOC(p2->p_cred, struct pcred *, sizeof(struct pcred), + M_SUBPROC, M_WAITOK); + bcopy(p1->p_cred, p2->p_cred, sizeof(*p2->p_cred)); + p2->p_cred->p_refcnt = 1; + crhold(p1->p_ucred); + +#ifdef COMPAT_LINUX_THREADS + if (flags & RFSIGSHARE) { + p2->p_procsig = p1->p_procsig; + p2->p_procsig->ps_refcnt++; + if (p1->p_sigacts == &p1->p_addr->u_sigacts) { + struct sigacts *newsigacts; + int s; + + if (p2->p_procsig->ps_refcnt != 2) + printf ("PID:%d Creating shared sigacts with procsig->ps_refcnt %d\n", + p2->p_pid, p2->p_procsig->ps_refcnt); + /* Create the shared sigacts structure */ + MALLOC (newsigacts, struct sigacts *, sizeof (struct sigacts), + M_SUBPROC, M_WAITOK); + s = splhigh(); + /* Set p_sigacts to the new shared structure. Note that this + * is updating p1->p_sigacts at the same time, since p_sigacts + * is just a pointer to the shared p_procsig->ps_sigacts. 
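The search above advances nextpid, wraps it back to 100 once it reaches PID_MAX (so long-lived daemons in the low range are skipped), and uses pidchecked to remember how far the scan of allproc/zombproc remains valid. Reduced to a userland sketch, with a simple in-use table standing in for the proc, pgrp and session scans and with illustrative constants rather than the kernel's:

    #include <stdbool.h>
    #include <stdio.h>

    #define PID_MAX     1000    /* illustrative limit, not the kernel's */
    #define PID_RESTART  100    /* skip low pids: long-lived daemons live there */

    static bool in_use[PID_MAX];        /* stand-in for scanning allproc/zombproc */
    static int  nextpid;

    static int
    alloc_pid(void)
    {
        nextpid++;
        for (;;) {
            if (nextpid >= PID_MAX)
                nextpid = PID_RESTART;  /* wrapped: restart above the daemons */
            if (!in_use[nextpid]) {
                in_use[nextpid] = true;
                return (nextpid);
            }
            nextpid++;                  /* candidate taken, keep walking */
        }
    }

    int
    main(void)
    {
        in_use[1] = true;               /* init */
        printf("first pid: %d\n", alloc_pid());
        nextpid = PID_MAX - 1;          /* force a wrap */
        printf("after wrap: %d\n", alloc_pid());
        return (0);
    }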
+ */ + p2->p_sigacts = newsigacts; + /* Copy in the values from the u area */ + *p2->p_sigacts = p1->p_addr->u_sigacts; + splx (s); + } + } else { + MALLOC (p2->p_procsig, struct procsig *, sizeof(struct procsig), + M_SUBPROC, M_WAITOK); + bcopy(&p1->p_procsig->ps_begincopy, &p2->p_procsig->ps_begincopy, + (unsigned)&p1->p_procsig->ps_endcopy - + (unsigned)&p1->p_procsig->ps_begincopy); + p2->p_procsig->ps_refcnt = 1; + /* Note that we fill in the values of sigacts in vm_fork */ + p2->p_sigacts = NULL; + } + if (flags & RFLINUXTHPN) { + p2->p_sigparent = SIGUSR1; + } +#endif /* COMPAT_LINUX_THREADS */ + /* bump references to the text vnode (for procfs) */ + p2->p_textvp = p1->p_textvp; + if (p2->p_textvp) + VREF(p2->p_textvp); + + if (flags & RFCFDG) + p2->p_fd = fdinit(p1); + else if (flags & RFFDG) + p2->p_fd = fdcopy(p1); + else + p2->p_fd = fdshare(p1); + + /* + * If p_limit is still copy-on-write, bump refcnt, + * otherwise get a copy that won't be modified. + * (If PL_SHAREMOD is clear, the structure is shared + * copy-on-write.) + */ + if (p1->p_limit->p_lflags & PL_SHAREMOD) + p2->p_limit = limcopy(p1->p_limit); + else { + p2->p_limit = p1->p_limit; + p2->p_limit->p_refcnt++; + } + + /* + * Preserve some more flags in subprocess. P_PROFIL has already + * been preserved. + */ + p2->p_flag |= p1->p_flag & P_SUGID; + if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) + p2->p_flag |= P_CONTROLT; + if (flags & RFPPWAIT) + p2->p_flag |= P_PPWAIT; + + LIST_INSERT_AFTER(p1, p2, p_pglist); + + /* + * Attach the new process to its parent. + * + * If RFNOWAIT is set, the newly created process becomes a child + * of init. This effectively disassociates the child from the + * parent. + */ + if (flags & RFNOWAIT) + pptr = initproc; + else + pptr = p1; + p2->p_pptr = pptr; + LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); + LIST_INIT(&p2->p_children); + +#ifdef KTRACE + /* + * Copy traceflag and tracefile if enabled. + * If not inherited, these were zeroed above. + */ + if (p1->p_traceflag&KTRFAC_INHERIT) { + p2->p_traceflag = p1->p_traceflag; + if ((p2->p_tracep = p1->p_tracep) != NULL) + VREF(p2->p_tracep); + } +#endif + + /* + * set priority of child to be that of parent + */ + p2->p_estcpu = p1->p_estcpu; + + /* + * This begins the section where we must prevent the parent + * from being swapped. + */ + p1->p_flag |= P_NOSWAP; + + /* + * Finish creating the child process. It will return via a different + * execution path later. (ie: directly into user mode) + */ + vm_fork(p1, p2, flags); + + /* + * Both processes are set up, now check if any LKMs want + * to adjust anything. + * What if they have an error? XXX + */ + while (ep) { + (*ep->function)(p1, p2, flags); + ep = ep->next; + } + + /* + * Make child runnable and add to run queue. + */ + microtime(&(p2->p_stats->p_start)); + p2->p_acflag = AFORK; + (void) splhigh(); + p2->p_stat = SRUN; + setrunqueue(p2); + (void) spl0(); + + /* + * Now can be swapped. + */ + p1->p_flag &= ~P_NOSWAP; + + /* + * Preserve synchronization semantics of vfork. If waiting for + * child to exec or exit, set P_PPWAIT on child, and sleep on our + * proc (in case of exit). + */ + while (p2->p_flag & P_PPWAIT) + tsleep(p1, PWAIT, "ppwait", 0); + + /* + * Return child pid to parent process, + * marking us as parent via p1->p_retval[1]. + */ + p1->p_retval[0] = p2->p_pid; + p1->p_retval[1] = 0; + return (0); +} + +/* + * The next two functionms are general routines to handle adding/deleting + * items on the fork callout list. 
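The exit-list handling earlier (at_exit()/rm_at_exit()) and the fork-list handling that follows share one idiom: a singly linked list of function pointers whose removal walks a pointer to the link field, so unlinking the head and unlinking an interior entry are the same operation. A self-contained sketch of that pattern with a hypothetical callout type and plain malloc/free:

    #include <stdlib.h>

    typedef void (*callout_fn)(void);

    struct callout {
        struct callout *next;
        callout_fn      function;
    };

    static struct callout *callout_list;

    /* Prepend a handler, mirroring at_fork()/at_exit(). Returns 0 on success. */
    int
    add_callout(callout_fn function)
    {
        struct callout *ep;

        ep = malloc(sizeof(*ep));
        if (ep == NULL)
            return (-1);
        ep->function = function;
        ep->next = callout_list;
        callout_list = ep;
        return (0);
    }

    /*
     * Remove every entry for `function'. The pointer-to-pointer walk treats
     * the list head and interior nodes identically. Returns entries removed.
     */
    int
    remove_callout(callout_fn function)
    {
        struct callout **epp = &callout_list, *ep;
        int count = 0;

        while ((ep = *epp) != NULL) {
            if (ep->function == function) {
                *epp = ep->next;    /* unlink without touching a predecessor node */
                free(ep);
                count++;
            } else
                epp = &ep->next;
        }
        return (count);
    }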
+ * + * at_fork(): + * Take the arguments given and put them onto the fork callout list, + * However first make sure that it's not already there. + * Returns 0 on success or a standard error number. + */ +int +at_fork(function) + forklist_fn function; +{ + fle_p ep; + + /* let the programmer know if he's been stupid */ + if (rm_at_fork(function)) + printf("fork callout entry already present\n"); + ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->next = fork_list; + ep->function = function; + fork_list = ep; + return (0); +} + +/* + * Scan the exit callout list for the given items and remove them. + * Returns the number of items removed. + * Theoretically this value can only be 0 or 1. + */ +int +rm_at_fork(function) + forklist_fn function; +{ + fle_p *epp, ep; + int count; + + count= 0; + epp = &fork_list; + ep = *epp; + while (ep) { + if (ep->function == function) { + *epp = ep->next; + free(ep, M_TEMP); + count++; + } else { + epp = &ep->next; + } + ep = *epp; + } + return (count); +} diff --git a/sys/kern/kern_intr.c b/sys/kern/kern_intr.c new file mode 100644 index 0000000..1d6756c --- /dev/null +++ b/sys/kern/kern_intr.c @@ -0,0 +1,535 @@ +/* + * Copyright (c) 1997, Stefan Esser <se@freebsd.org> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ * + * $Id: kern_intr.c,v 1.20 1998/09/26 14:25:31 dfr Exp $ + * + */ + + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/errno.h> +#ifdef RESOURCE_CHECK +#include <sys/drvresource.h> +#endif /* RESOURCE_CHECK */ + +#include <machine/ipl.h> + +#ifdef __i386__ +#include <i386/isa/icu.h> +#include <i386/isa/intr_machdep.h> +#endif + +#include <sys/interrupt.h> + +#include <stddef.h> + +#ifdef __i386__ + +typedef struct intrec { + intrmask_t mask; + inthand2_t *handler; + void *argument; + struct intrec *next; + void *devdata; + int intr; + intrmask_t *maskptr; + int flags; +} intrec; + +static intrec *intreclist_head[NHWI]; + +#endif + +struct swilist { + swihand_t *sl_handler; + struct swilist *sl_next; +}; + +static struct swilist swilists[NSWI]; + +#ifdef __i386__ + +/* + * The interrupt multiplexer calls each of the handlers in turn, + * and applies the associated interrupt mask to "cpl", which is + * defined as a ".long" in /sys/i386/isa/ipl.s + */ + +#ifndef SMP +static __inline intrmask_t +splq(intrmask_t mask) +{ + intrmask_t tmp = cpl; + cpl |= mask; + return (tmp); +} +#endif /* SMP */ + +static void +intr_mux(void *arg) +{ + intrec *p = arg; + + while (p != NULL) { + int oldspl = splq(p->mask); + p->handler(p->argument); + splx(oldspl); + p = p->next; + } +} + +static intrec* +find_idesc(unsigned *maskptr, int irq) +{ + intrec *p = intreclist_head[irq]; + + while (p && p->maskptr != maskptr) + p = p->next; + + return (p); +} + +static intrec** +find_pred(intrec *idesc, int irq) +{ + intrec **pp = &intreclist_head[irq]; + intrec *p = *pp; + + while (p != idesc) { + if (p == NULL) + return (NULL); + pp = &p->next; + p = *pp; + } + return (pp); +} + +/* + * Both the low level handler and the shared interrupt multiplexer + * block out further interrupts as set in the handlers "mask", while + * the handler is running. In fact *maskptr should be used for this + * purpose, but since this requires one more pointer dereference on + * each interrupt, we rather bother update "mask" whenever *maskptr + * changes. The function "update_masks" should be called **after** + * all manipulation of the linked list of interrupt handlers hung + * off of intrdec_head[irq] is complete, since the chain of handlers + * will both determine the *maskptr values and the instances of mask + * that are fixed. This function should be called with the irq for + * which a new handler has been add blocked, since the masks may not + * yet know about the use of this irq for a device of a certain class. 
+ */ + +static void +update_mux_masks(void) +{ + int irq; + for (irq = 0; irq < ICU_LEN; irq++) { + intrec *idesc = intreclist_head[irq]; + while (idesc != NULL) { + if (idesc->maskptr != NULL) { + /* our copy of *maskptr may be stale, refresh */ + idesc->mask = *idesc->maskptr; + } + idesc = idesc->next; + } + } +} + +static void +update_masks(intrmask_t *maskptr, int irq) +{ + intrmask_t mask = 1 << irq; + + if (maskptr == NULL) + return; + + if (find_idesc(maskptr, irq) == NULL) { + /* no reference to this maskptr was found in this irq's chain */ + if ((*maskptr & mask) == 0) + return; + /* the irq was included in the classes mask, remove it */ + INTRUNMASK(*maskptr, mask); + } else { + /* a reference to this maskptr was found in this irq's chain */ + if ((*maskptr & mask) != 0) + return; + /* put the irq into the classes mask */ + INTRMASK(*maskptr, mask); + } + /* we need to update all values in the intr_mask[irq] array */ + update_intr_masks(); + /* update mask in chains of the interrupt multiplex handler as well */ + update_mux_masks(); +} + +/* + * Add interrupt handler to linked list hung off of intreclist_head[irq] + * and install shared interrupt multiplex handler, if necessary + */ + +static int +add_intrdesc(intrec *idesc) +{ + int irq = idesc->intr; + + intrec *head = intreclist_head[irq]; + + if (head == NULL) { + /* first handler for this irq, just install it */ + if (icu_setup(irq, idesc->handler, idesc->argument, + idesc->maskptr, idesc->flags) != 0) + return (-1); + + update_intrname(irq, (intptr_t)idesc->devdata); + /* keep reference */ + intreclist_head[irq] = idesc; + } else { + if ((idesc->flags & INTR_EXCL) != 0 + || (head->flags & INTR_EXCL) != 0) { + /* + * can't append new handler, if either list head or + * new handler do not allow interrupts to be shared + */ + if (bootverbose) + printf("\tdevice combination doesn't support " + "shared irq%d\n", irq); + return (-1); + } + if (head->next == NULL) { + /* + * second handler for this irq, replace device driver's + * handler by shared interrupt multiplexer function + */ + icu_unset(irq, head->handler); + if (icu_setup(irq, (inthand2_t*)intr_mux, head, 0, 0) != 0) + return (-1); + if (bootverbose) + printf("\tusing shared irq%d.\n", irq); + update_intrname(irq, -1); + } + /* just append to the end of the chain */ + while (head->next != NULL) + head = head->next; + head->next = idesc; + } + update_masks(idesc->maskptr, irq); + return (0); +} + +/* + * Add the interrupt handler descriptor data structure created by an + * earlier call of create_intr() to the linked list for its irq and + * adjust the interrupt masks if necessary. + * + * This function effectively activates the handler. + */ + +int +intr_connect(intrec *idesc) +{ + int errcode = -1; + int irq; + +#ifdef RESOURCE_CHECK + int resflag; +#endif /* RESOURCE_CHECK */ + + if (idesc == NULL) + return (-1); + + irq = idesc->intr; +#ifdef RESOURCE_CHECK + resflag = (idesc->flags & INTR_EXCL) ? 
RESF_NONE : RESF_SHARED; + if (resource_claim(idesc->devdata, REST_INT, resflag, irq, irq) == 0) +#endif /* RESOURCE_CHECK */ + { + /* block this irq */ + intrmask_t oldspl = splq(1 << irq); + + /* add irq to class selected by maskptr */ + errcode = add_intrdesc(idesc); + splx(oldspl); + } + if (errcode != 0 && bootverbose) + printf("\tintr_connect(irq%d) failed, result=%d\n", + irq, errcode); + + return (errcode); +} + +/* + * Remove the interrupt handler descriptor data connected created by an + * earlier call of intr_connect() from the linked list and adjust the + * interrupt masks if necessary. + * + * This function deactivates the handler. + */ + +int +intr_disconnect(intrec *idesc) +{ + intrec **hook, *head; + int irq; + int errcode = 0; + + if (idesc == NULL) + return (-1); + + irq = idesc->intr; + + /* find pointer that keeps the reference to this interrupt descriptor */ + hook = find_pred(idesc, irq); + if (hook == NULL) + return (-1); + + /* make copy of original list head, the line after may overwrite it */ + head = intreclist_head[irq]; + + /* unlink: make predecessor point to idesc->next instead of to idesc */ + *hook = idesc->next; + + /* now check whether the element we removed was the list head */ + if (idesc == head) { + intrmask_t oldspl = splq(1 << irq); + + /* we want to remove the list head, which was known to intr_mux */ + icu_unset(irq, (inthand2_t*)intr_mux); + + /* check whether the new list head is the only element on list */ + head = intreclist_head[irq]; + if (head != NULL) { + if (head->next != NULL) { + /* install the multiplex handler with new list head as argument */ + errcode = icu_setup(irq, (inthand2_t*)intr_mux, head, 0, 0); + if (errcode == 0) + update_intrname(irq, -1); + } else { + /* install the one remaining handler for this irq */ + errcode = icu_setup(irq, head->handler, + head->argument, + head->maskptr, head->flags); + if (errcode == 0) + update_intrname(irq, (intptr_t)head->devdata); + } + } + splx(oldspl); + } + update_masks(idesc->maskptr, irq); +#ifdef RESOURCE_CHECK + resource_free(idesc->devdata); +#endif /* RESOURCE_CHECK */ + return (0); +} + +/* + * Create an interrupt handler descriptor data structure, which later can + * be activated or deactivated at will by calls of [dis]connect(intrec*). + * + * The dev_instance pointer is required for resource management, and will + * only be passed through to resource_claim(). + * + * The interrupt handler takes an argument of type (void*), which is not + * what is currently used for ISA devices. But since the unit number passed + * to an ISA interrupt handler can be stored in a (void*) variable, this + * causes no problems. Eventually all the ISA interrupt handlers should be + * modified to accept the pointer to their private data, too, instead of + * an integer index. + * + * There will be functions that derive a driver and unit name from a + * dev_instance variable, and those functions will be used to maintain the + * interrupt counter label array referenced by systat and vmstat to report + * device interrupt rates (->update_intrlabels). 
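intr_mux() above lets several drivers share one interrupt line by walking a per-irq chain of intrec records and calling each handler in turn, and add_intrdesc() appends new records to the end of that chain. A stripped-down sketch of the chain and its dispatcher, leaving out the spl masking and ICU setup, with hypothetical names throughout:

    #include <stdio.h>
    #include <stdlib.h>

    typedef void (*handler_fn)(void *arg);

    /* One record per attached driver, chained per interrupt line (cf. struct intrec). */
    struct handler_rec {
        handler_fn          handler;
        void               *argument;
        struct handler_rec *next;
    };

    #define NLINES 16
    static struct handler_rec *line_head[NLINES];

    /* Append a handler to a line's chain, as add_intrdesc() does for shared irqs. */
    static int
    attach_handler(int line, handler_fn fn, void *arg)
    {
        struct handler_rec *rec, **tail;

        rec = malloc(sizeof(*rec));
        if (rec == NULL)
            return (-1);
        rec->handler = fn;
        rec->argument = arg;
        rec->next = NULL;
        for (tail = &line_head[line]; *tail != NULL; tail = &(*tail)->next)
            ;                               /* walk to the end of the chain */
        *tail = rec;
        return (0);
    }

    /* The multiplexer: call every handler on the chain in turn (cf. intr_mux()). */
    static void
    dispatch_line(int line)
    {
        struct handler_rec *rec;

        for (rec = line_head[line]; rec != NULL; rec = rec->next)
            rec->handler(rec->argument);
    }

    static void say(void *arg) { printf("%s handled the interrupt\n", (char *)arg); }

    int
    main(void)
    {
        attach_handler(5, say, "driver A");
        attach_handler(5, say, "driver B");
        dispatch_line(5);                   /* both drivers run, in attach order */
        return (0);
    }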
+ */ + +intrec * +intr_create(void *dev_instance, int irq, inthand2_t handler, void *arg, + intrmask_t *maskptr, int flags) +{ + intrec *idesc; + + if (ICU_LEN > 8 * sizeof *maskptr) { + printf("create_intr: ICU_LEN of %d too high for %d bit intrmask\n", + ICU_LEN, 8 * sizeof *maskptr); + return (NULL); + } + if ((unsigned)irq >= ICU_LEN) { + printf("create_intr: requested irq%d too high, limit is %d\n", + irq, ICU_LEN -1); + return (NULL); + } + + idesc = malloc(sizeof *idesc, M_DEVBUF, M_WAITOK); + if (idesc) { + idesc->next = NULL; + bzero(idesc, sizeof *idesc); + + idesc->devdata = dev_instance; + idesc->handler = handler; + idesc->argument = arg; + idesc->maskptr = maskptr; + idesc->intr = irq; + idesc->flags = flags; + } + return (idesc); +} + +/* + * Return the memory held by the interrupt handler descriptor data structure + * to the system. Make sure, the handler is not actively used anymore, before. + */ + +int +intr_destroy(intrec *rec) +{ + if (intr_disconnect(rec) != 0) + return (-1); + free(rec, M_DEVBUF); + return (0); +} + +/* + * Emulate the register_intr() call previously defined as low level function. + * That function (now icu_setup()) may no longer be directly called, since + * a conflict between an ISA and PCI interrupt might go by unnocticed, else. + */ + +int +register_intr(int intr, int device_id, u_int flags, + inthand2_t handler, u_int *maskptr, int unit) +{ + /* XXX modify to include isa_device instead of device_id */ + intrec *idesc; + + flags |= INTR_EXCL; + idesc = intr_create((void *)(intptr_t)device_id, intr, handler, + (void*)(intptr_t)unit, maskptr, flags); + return (intr_connect(idesc)); +} + +/* + * Emulate the old unregister_intr() low level function. + * Make sure there is just one interrupt, that it was + * registered as non-shared, and that the handlers match. 
+ */ + +int +unregister_intr(int intr, inthand2_t handler) +{ + intrec *p = intreclist_head[intr]; + + if (p != NULL && (p->flags & INTR_EXCL) != 0 && p->handler == handler) + return (intr_destroy(p)); + return (EINVAL); +} + +#endif /* __i386__ */ + +void +register_swi(intr, handler) + int intr; + swihand_t *handler; +{ + struct swilist *slp, *slq; + int s; + + if (intr < NHWI || intr >= NHWI + NSWI) + panic("register_swi: bad intr %d", intr); + if (handler == swi_generic || handler == swi_null) + panic("register_swi: bad handler %p", (void *)handler); + slp = &swilists[intr - NHWI]; + s = splhigh(); + if (ihandlers[intr] == swi_null) + ihandlers[intr] = handler; + else { + if (slp->sl_next == NULL) { + slp->sl_handler = ihandlers[intr]; + ihandlers[intr] = swi_generic; + } + slq = malloc(sizeof(*slq), M_DEVBUF, M_NOWAIT); + if (slq == NULL) + panic("register_swi: malloc failed"); + slq->sl_handler = handler; + slq->sl_next = NULL; + while (slp->sl_next != NULL) + slp = slp->sl_next; + slp->sl_next = slq; + } + splx(s); +} + +void +swi_dispatcher(intr) + int intr; +{ + struct swilist *slp; + + slp = &swilists[intr - NHWI]; + do { + (*slp->sl_handler)(); + slp = slp->sl_next; + } while (slp != NULL); +} + +void +unregister_swi(intr, handler) + int intr; + swihand_t *handler; +{ + struct swilist *slfoundpred, *slp, *slq; + int s; + + if (intr < NHWI || intr >= NHWI + NSWI) + panic("unregister_swi: bad intr %d", intr); + if (handler == swi_generic || handler == swi_null) + panic("unregister_swi: bad handler %p", (void *)handler); + slp = &swilists[intr - NHWI]; + s = splhigh(); + if (ihandlers[intr] == handler) + ihandlers[intr] = swi_null; + else if (slp->sl_next != NULL) { + slfoundpred = NULL; + for (slq = slp->sl_next; slq != NULL; + slp = slq, slq = slp->sl_next) + if (slq->sl_handler == handler) + slfoundpred = slp; + slp = &swilists[intr - NHWI]; + if (slfoundpred != NULL) { + slq = slfoundpred->sl_next; + slfoundpred->sl_next = slq->sl_next; + free(slq, M_DEVBUF); + } else if (slp->sl_handler == handler) { + slq = slp->sl_next; + slp->sl_next = slq->sl_next; + slp->sl_handler = slq->sl_handler; + free(slq, M_DEVBUF); + } + if (slp->sl_next == NULL) + ihandlers[intr] = slp->sl_handler; + } + splx(s); +} + diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c new file mode 100644 index 0000000..7a6d237 --- /dev/null +++ b/sys/kern/kern_ktrace.c @@ -0,0 +1,529 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93 + * $Id: kern_ktrace.c,v 1.24 1998/11/10 09:16:29 peter Exp $ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/fcntl.h> +#include <sys/lock.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/ktrace.h> +#include <sys/malloc.h> +#include <sys/syslog.h> + +static MALLOC_DEFINE(M_KTRACE, "KTRACE", "KTRACE"); + +#ifdef KTRACE +static struct ktr_header *ktrgetheader __P((int type)); +static void ktrwrite __P((struct vnode *, struct ktr_header *)); +static int ktrcanset __P((struct proc *,struct proc *)); +static int ktrsetchildren __P((struct proc *,struct proc *,int,int,struct vnode *)); +static int ktrops __P((struct proc *,struct proc *,int,int,struct vnode *)); + + +static struct ktr_header * +ktrgetheader(type) + int type; +{ + register struct ktr_header *kth; + struct proc *p = curproc; /* XXX */ + + MALLOC(kth, struct ktr_header *, sizeof (struct ktr_header), + M_KTRACE, M_WAITOK); + kth->ktr_type = type; + microtime(&kth->ktr_time); + kth->ktr_pid = p->p_pid; + bcopy(p->p_comm, kth->ktr_comm, MAXCOMLEN); + return (kth); +} + +void +ktrsyscall(vp, code, narg, args) + struct vnode *vp; + int code, narg, args[]; +{ + struct ktr_header *kth; + struct ktr_syscall *ktp; + register int len = sizeof(struct ktr_syscall) + (narg * sizeof(int)); + struct proc *p = curproc; /* XXX */ + int *argp, i; + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_SYSCALL); + MALLOC(ktp, struct ktr_syscall *, len, M_KTRACE, M_WAITOK); + ktp->ktr_code = code; + ktp->ktr_narg = narg; + argp = (int *)((char *)ktp + sizeof(struct ktr_syscall)); + for (i = 0; i < narg; i++) + *argp++ = args[i]; + kth->ktr_buf = (caddr_t)ktp; + kth->ktr_len = len; + ktrwrite(vp, kth); + FREE(ktp, M_KTRACE); + FREE(kth, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrsysret(vp, code, error, retval) + struct vnode *vp; + int code, error, retval; +{ + struct ktr_header *kth; + struct ktr_sysret ktp; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_SYSRET); + ktp.ktr_code = code; + ktp.ktr_error = error; + ktp.ktr_retval = retval; /* what about val2 ? 
*/ + + kth->ktr_buf = (caddr_t)&ktp; + kth->ktr_len = sizeof(struct ktr_sysret); + + ktrwrite(vp, kth); + FREE(kth, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrnamei(vp, path) + struct vnode *vp; + char *path; +{ + struct ktr_header *kth; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_NAMEI); + kth->ktr_len = strlen(path); + kth->ktr_buf = path; + + ktrwrite(vp, kth); + FREE(kth, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrgenio(vp, fd, rw, iov, len, error) + struct vnode *vp; + int fd; + enum uio_rw rw; + register struct iovec *iov; + int len, error; +{ + struct ktr_header *kth; + register struct ktr_genio *ktp; + register caddr_t cp; + register int resid = len, cnt; + struct proc *p = curproc; /* XXX */ + + if (error) + return; + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_GENIO); + MALLOC(ktp, struct ktr_genio *, sizeof(struct ktr_genio) + len, + M_KTRACE, M_WAITOK); + ktp->ktr_fd = fd; + ktp->ktr_rw = rw; + cp = (caddr_t)((char *)ktp + sizeof (struct ktr_genio)); + while (resid > 0) { + if ((cnt = iov->iov_len) > resid) + cnt = resid; + if (copyin(iov->iov_base, cp, (unsigned)cnt)) + goto done; + cp += cnt; + resid -= cnt; + iov++; + } + kth->ktr_buf = (caddr_t)ktp; + kth->ktr_len = sizeof (struct ktr_genio) + len; + + ktrwrite(vp, kth); +done: + FREE(kth, M_KTRACE); + FREE(ktp, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrpsig(vp, sig, action, mask, code) + struct vnode *vp; + int sig; + sig_t action; + int mask, code; +{ + struct ktr_header *kth; + struct ktr_psig kp; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_PSIG); + kp.signo = (char)sig; + kp.action = action; + kp.mask = mask; + kp.code = code; + kth->ktr_buf = (caddr_t)&kp; + kth->ktr_len = sizeof (struct ktr_psig); + + ktrwrite(vp, kth); + FREE(kth, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} + +void +ktrcsw(vp, out, user) + struct vnode *vp; + int out, user; +{ + struct ktr_header *kth; + struct ktr_csw kc; + struct proc *p = curproc; /* XXX */ + + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_CSW); + kc.out = out; + kc.user = user; + kth->ktr_buf = (caddr_t)&kc; + kth->ktr_len = sizeof (struct ktr_csw); + + ktrwrite(vp, kth); + FREE(kth, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; +} +#endif + +/* Interface and common routines */ + +/* + * ktrace system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct ktrace_args { + char *fname; + int ops; + int facs; + int pid; +}; +#endif +/* ARGSUSED */ +int +ktrace(curp, uap) + struct proc *curp; + register struct ktrace_args *uap; +{ +#ifdef KTRACE + register struct vnode *vp = NULL; + register struct proc *p; + struct pgrp *pg; + int facs = uap->facs & ~KTRFAC_ROOT; + int ops = KTROP(uap->ops); + int descend = uap->ops & KTRFLAG_DESCEND; + int ret = 0; + int error = 0; + struct nameidata nd; + + curp->p_traceflag |= KTRFAC_ACTIVE; + if (ops != KTROP_CLEAR) { + /* + * an operation which requires a file argument. 
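Each of the ktr*() routines above produces a record that is a fixed ktr_header followed by an optional variable-length payload; ktrwrite(), later in this file, gathers the two pieces into a single append with a two-element iovec. A userland sketch of the same gather-write using writev(2) and a hypothetical header layout:

    #include <sys/uio.h>
    #include <string.h>
    #include <unistd.h>

    /* Hypothetical trace record header, loosely modelled on struct ktr_header. */
    struct trace_header {
        int type;
        int len;        /* length of the payload that follows */
        int pid;
    };

    /* Write header and payload as one gather-write, like ktrwrite()'s aiov[0]/aiov[1]. */
    static int
    trace_write(int fd, int type, int pid, const void *buf, int len)
    {
        struct trace_header th;
        struct iovec iov[2];
        int iovcnt = 1;

        memset(&th, 0, sizeof(th));
        th.type = type;
        th.len = len;
        th.pid = pid;

        iov[0].iov_base = &th;
        iov[0].iov_len = sizeof(th);
        if (len > 0) {
            iov[1].iov_base = (void *)buf;
            iov[1].iov_len = len;
            iovcnt = 2;
        }
        return (writev(fd, iov, iovcnt) < 0 ? -1 : 0);
    }

    int
    main(void)
    {
        const char msg[] = "hello";

        return (trace_write(STDOUT_FILENO, 1, 42, msg, sizeof(msg) - 1));
    }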
+ */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->fname, curp); + error = vn_open(&nd, FREAD|FWRITE, 0); + if (error) { + curp->p_traceflag &= ~KTRFAC_ACTIVE; + return (error); + } + vp = nd.ni_vp; + VOP_UNLOCK(vp, 0, curp); + if (vp->v_type != VREG) { + (void) vn_close(vp, FREAD|FWRITE, curp->p_ucred, curp); + curp->p_traceflag &= ~KTRFAC_ACTIVE; + return (EACCES); + } + } + /* + * Clear all uses of the tracefile + */ + if (ops == KTROP_CLEARFILE) { + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + if (p->p_tracep == vp) { + if (ktrcanset(curp, p)) { + p->p_tracep = NULL; + p->p_traceflag = 0; + (void) vn_close(vp, FREAD|FWRITE, + p->p_ucred, p); + } else + error = EPERM; + } + } + goto done; + } + /* + * need something to (un)trace (XXX - why is this here?) + */ + if (!facs) { + error = EINVAL; + goto done; + } + /* + * do it + */ + if (uap->pid < 0) { + /* + * by process group + */ + pg = pgfind(-uap->pid); + if (pg == NULL) { + error = ESRCH; + goto done; + } + for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) + if (descend) + ret |= ktrsetchildren(curp, p, ops, facs, vp); + else + ret |= ktrops(curp, p, ops, facs, vp); + + } else { + /* + * by pid + */ + p = pfind(uap->pid); + if (p == NULL) { + error = ESRCH; + goto done; + } + if (descend) + ret |= ktrsetchildren(curp, p, ops, facs, vp); + else + ret |= ktrops(curp, p, ops, facs, vp); + } + if (!ret) + error = EPERM; +done: + if (vp != NULL) + (void) vn_close(vp, FWRITE, curp->p_ucred, curp); + curp->p_traceflag &= ~KTRFAC_ACTIVE; + return (error); +#else + return ENOSYS; +#endif +} + +/* + * utrace system call + */ +/* ARGSUSED */ +int +utrace(curp, uap) + struct proc *curp; + register struct utrace_args *uap; +{ +#ifdef KTRACE + struct ktr_header *kth; + struct proc *p = curproc; /* XXX */ + register caddr_t cp; + + if (!KTRPOINT(p, KTR_USER)) + return (0); + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_USER); + MALLOC(cp, caddr_t, uap->len, M_KTRACE, M_WAITOK); + if (!copyin(uap->addr, cp, uap->len)) { + kth->ktr_buf = cp; + kth->ktr_len = uap->len; + ktrwrite(p->p_tracep, kth); + } + FREE(kth, M_KTRACE); + FREE(cp, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; + + return (0); +#else + return (ENOSYS); +#endif +} + +#ifdef KTRACE +static int +ktrops(curp, p, ops, facs, vp) + struct proc *p, *curp; + int ops, facs; + struct vnode *vp; +{ + + if (!ktrcanset(curp, p)) + return (0); + if (ops == KTROP_SET) { + if (p->p_tracep != vp) { + /* + * if trace file already in use, relinquish + */ + if (p->p_tracep != NULL) + vrele(p->p_tracep); + VREF(vp); + p->p_tracep = vp; + } + p->p_traceflag |= facs; + if (curp->p_ucred->cr_uid == 0) + p->p_traceflag |= KTRFAC_ROOT; + } else { + /* KTROP_CLEAR */ + if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) { + /* no more tracing */ + p->p_traceflag = 0; + if (p->p_tracep != NULL) { + vrele(p->p_tracep); + p->p_tracep = NULL; + } + } + } + + return (1); +} + +static int +ktrsetchildren(curp, top, ops, facs, vp) + struct proc *curp, *top; + int ops, facs; + struct vnode *vp; +{ + register struct proc *p; + register int ret = 0; + + p = top; + for (;;) { + ret |= ktrops(curp, p, ops, facs, vp); + /* + * If this process has children, descend to them next, + * otherwise do any siblings, and if done with this level, + * follow back up the tree (but not past top). 
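The comment above describes a recursion-free, pre-order walk of the process tree: follow the first child when there is one, otherwise the next sibling, otherwise climb back toward top until an unvisited sibling appears. The same traversal, as a self-contained sketch over a hypothetical node type with parent, first-child and next-sibling pointers:

    #include <stdio.h>

    struct node {
        const char  *name;
        struct node *parent;
        struct node *first_child;
        struct node *next_sibling;
    };

    /*
     * Visit every node under (and including) `top' exactly once, the way
     * ktrsetchildren() walks a process and all of its descendants.
     */
    static void
    walk(struct node *top, void (*visit)(struct node *))
    {
        struct node *p = top;

        for (;;) {
            visit(p);
            if (p->first_child != NULL) {       /* descend first */
                p = p->first_child;
                continue;
            }
            for (;;) {                          /* else sibling, else climb */
                if (p == top)
                    return;
                if (p->next_sibling != NULL) {
                    p = p->next_sibling;
                    break;
                }
                p = p->parent;
            }
        }
    }

    static void print_name(struct node *n) { printf("%s\n", n->name); }

    int
    main(void)
    {
        struct node root = { "top" }, a = { "a" }, b = { "b" }, a1 = { "a1" };

        root.first_child = &a;
        a.parent = &root; a.next_sibling = &b; a.first_child = &a1;
        b.parent = &root;
        a1.parent = &a;

        walk(&root, print_name);    /* prints: top, a, a1, b */
        return (0);
    }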
+ */ + if (p->p_children.lh_first) + p = p->p_children.lh_first; + else for (;;) { + if (p == top) + return (ret); + if (p->p_sibling.le_next) { + p = p->p_sibling.le_next; + break; + } + p = p->p_pptr; + } + } + /*NOTREACHED*/ +} + +static void +ktrwrite(vp, kth) + struct vnode *vp; + register struct ktr_header *kth; +{ + struct uio auio; + struct iovec aiov[2]; + register struct proc *p = curproc; /* XXX */ + int error; + + if (vp == NULL) + return; + auio.uio_iov = &aiov[0]; + auio.uio_offset = 0; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_rw = UIO_WRITE; + aiov[0].iov_base = (caddr_t)kth; + aiov[0].iov_len = sizeof(struct ktr_header); + auio.uio_resid = sizeof(struct ktr_header); + auio.uio_iovcnt = 1; + auio.uio_procp = curproc; + if (kth->ktr_len > 0) { + auio.uio_iovcnt++; + aiov[1].iov_base = kth->ktr_buf; + aiov[1].iov_len = kth->ktr_len; + auio.uio_resid += kth->ktr_len; + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + error = VOP_WRITE(vp, &auio, IO_UNIT|IO_APPEND, p->p_ucred); + VOP_UNLOCK(vp, 0, p); + if (!error) + return; + /* + * If error encountered, give up tracing on this vnode. + */ + log(LOG_NOTICE, "ktrace write failed, errno %d, tracing stopped\n", + error); + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + if (p->p_tracep == vp) { + p->p_tracep = NULL; + p->p_traceflag = 0; + vrele(vp); + } + } +} + +/* + * Return true if caller has permission to set the ktracing state + * of target. Essentially, the target can't possess any + * more permissions than the caller. KTRFAC_ROOT signifies that + * root previously set the tracing status on the target process, and + * so, only root may further change it. + * + * TODO: check groups. use caller effective gid. + */ +static int +ktrcanset(callp, targetp) + struct proc *callp, *targetp; +{ + register struct pcred *caller = callp->p_cred; + register struct pcred *target = targetp->p_cred; + + if ((caller->pc_ucred->cr_uid == target->p_ruid && + target->p_ruid == target->p_svuid && + caller->p_rgid == target->p_rgid && /* XXX */ + target->p_rgid == target->p_svgid && + (targetp->p_traceflag & KTRFAC_ROOT) == 0) || + caller->pc_ucred->cr_uid == 0) + return (1); + + return (0); +} + +#endif /* KTRACE */ diff --git a/sys/kern/kern_linker.c b/sys/kern/kern_linker.c new file mode 100644 index 0000000..97def9f --- /dev/null +++ b/sys/kern/kern_linker.c @@ -0,0 +1,1016 @@ +/*- + * Copyright (c) 1997 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_linker.c,v 1.20 1999/01/19 16:26:32 peter Exp $ + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <machine/cpu.h> +#include <machine/bootinfo.h> +#include <sys/module.h> +#include <sys/linker.h> +#include <sys/unistd.h> +#include <sys/fcntl.h> +#include <sys/libkern.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/sysctl.h> + +#ifdef KLD_DEBUG +int kld_debug = 0; +#endif + +MALLOC_DEFINE(M_LINKER, "kld", "kernel linker"); +linker_file_t linker_current_file; +linker_file_t linker_kernel_file; + +static struct lock lock; /* lock for the file list */ +static linker_class_list_t classes; +static linker_file_list_t files; +static int next_file_id = 1; + +static void +linker_init(void* arg) +{ + lockinit(&lock, PVM, "klink", 0, 0); + TAILQ_INIT(&classes); + TAILQ_INIT(&files); +} + +SYSINIT(linker, SI_SUB_KLD, SI_ORDER_FIRST, linker_init, 0); + +int +linker_add_class(const char* desc, void* priv, + struct linker_class_ops* ops) +{ + linker_class_t lc; + + lc = malloc(sizeof(struct linker_class), M_LINKER, M_NOWAIT); + if (!lc) + return ENOMEM; + bzero(lc, sizeof(*lc)); + + lc->desc = desc; + lc->priv = priv; + lc->ops = ops; + TAILQ_INSERT_HEAD(&classes, lc, link); + + return 0; +} + +static void +linker_file_sysinit(linker_file_t lf) +{ + struct linker_set* sysinits; + struct sysinit** sipp; + struct sysinit** xipp; + struct sysinit* save; + moduledata_t *moddata; + + KLD_DPF(FILE, ("linker_file_sysinit: calling SYSINITs for %s\n", + lf->filename)); + + sysinits = (struct linker_set*) + linker_file_lookup_symbol(lf, "sysinit_set", 0); + + KLD_DPF(FILE, ("linker_file_sysinit: SYSINITs %p\n", sysinits)); + if (!sysinits) + return; + + /* HACK ALERT! */ + for (sipp = (struct sysinit **)sysinits->ls_items; *sipp; sipp++) { + if ((*sipp)->func == module_register_init) { + moddata = (*sipp)->udata; + moddata->_file = lf; + } + } + + /* + * Perform a bubble sort of the system initialization objects by + * their subsystem (primary key) and order (secondary key). + * + * Since some things care about execution order, this is the + * operation which ensures continued function. + */ + for (sipp = (struct sysinit **)sysinits->ls_items; *sipp; sipp++) { + for (xipp = sipp + 1; *xipp; xipp++) { + if ((*sipp)->subsystem <= (*xipp)->subsystem || + ((*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order <= (*xipp)->order)) + continue; /* skip*/ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. 
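linker_file_sysinit() below orders a NULL-terminated array of sysinit pointers by subsystem first and by order within a subsystem, using repeated compare-and-swap passes. A userland sketch of that two-key, swap-in-place sort over a hypothetical task record, written with a strict primary-key comparison:

    #include <stdio.h>

    /* Hypothetical task carrying the same two sort keys as struct sysinit. */
    struct task {
        int         subsystem;  /* primary key */
        int         order;      /* secondary key */
        const char *name;
    };

    /* Sort a NULL-terminated array of pointers in place by (subsystem, order). */
    static void
    sort_tasks(struct task **items)
    {
        struct task **sipp, **xipp, *save;

        for (sipp = items; *sipp; sipp++) {
            for (xipp = sipp + 1; *xipp; xipp++) {
                if ((*sipp)->subsystem < (*xipp)->subsystem ||
                    ((*sipp)->subsystem == (*xipp)->subsystem &&
                     (*sipp)->order <= (*xipp)->order))
                    continue;       /* already in order, skip */
                save = *sipp;
                *sipp = *xipp;
                *xipp = save;
            }
        }
    }

    int
    main(void)
    {
        struct task a = { 2, 0, "third" }, b = { 1, 5, "second" },
            c = { 1, 1, "first" };
        struct task *items[] = { &a, &b, &c, NULL };
        struct task **t;

        sort_tasks(items);
        for (t = items; *t; t++)
            printf("%d.%d %s\n", (*t)->subsystem, (*t)->order, (*t)->name);
        return (0);
    }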
+ */ + for (sipp = (struct sysinit **)sysinits->ls_items; *sipp; sipp++) { + if ((*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s)*/ + + switch ((*sipp)->type) { + case SI_TYPE_DEFAULT: + /* no special processing*/ + (*((*sipp)->func))((*sipp)->udata); + break; + + case SI_TYPE_KTHREAD: +#if !defined(SMP) + /* kernel thread*/ + if (fork1(&proc0, RFFDG|RFPROC|RFMEM)) + panic("fork kernel thread"); + cpu_set_fork_handler(pfind(proc0.p_retval[0]), + (*sipp)->func, (*sipp)->udata); + break; +#endif + + case SI_TYPE_KPROCESS: + /* kernel thread*/ + if (fork1(&proc0, RFFDG|RFPROC)) + panic("fork kernel process"); + cpu_set_fork_handler(pfind(proc0.p_retval[0]), + (*sipp)->func, (*sipp)->udata); + break; + + default: + panic ("linker_file_sysinit: unrecognized init type"); + } + } +} + +static void +linker_file_sysuninit(linker_file_t lf) +{ + struct linker_set* sysuninits; + struct sysinit** sipp; + struct sysinit** xipp; + struct sysinit* save; + + KLD_DPF(FILE, ("linker_file_sysuninit: calling SYSUNINITs for %s\n", + lf->filename)); + + sysuninits = (struct linker_set*) + linker_file_lookup_symbol(lf, "sysuninit_set", 0); + + KLD_DPF(FILE, ("linker_file_sysuninit: SYSUNINITs %p\n", sysuninits)); + if (!sysuninits) + return; + + /* + * Perform a reverse bubble sort of the system initialization objects + * by their subsystem (primary key) and order (secondary key). + * + * Since some things care about execution order, this is the + * operation which ensures continued function. + */ + for (sipp = (struct sysinit **)sysuninits->ls_items; *sipp; sipp++) { + for (xipp = sipp + 1; *xipp; xipp++) { + if ((*sipp)->subsystem >= (*xipp)->subsystem || + ((*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order >= (*xipp)->order)) + continue; /* skip*/ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. + */ + for (sipp = (struct sysinit **)sysuninits->ls_items; *sipp; sipp++) { + if ((*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s)*/ + + switch ((*sipp)->type) { + case SI_TYPE_DEFAULT: + /* no special processing*/ + (*((*sipp)->func))((*sipp)->udata); + break; + + default: + panic("linker_file_sysuninit: unrecognized uninit type"); + } + } +} + +int +linker_load_file(const char* filename, linker_file_t* result) +{ + linker_class_t lc; + linker_file_t lf; + int foundfile, error = 0; + char *koname = NULL; + + lf = linker_find_file_by_name(filename); + if (lf) { + KLD_DPF(FILE, ("linker_load_file: file %s is already loaded, incrementing refs\n", filename)); + *result = lf; + lf->refs++; + goto out; + } + + koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK); + if (koname == NULL) { + error = ENOMEM; + goto out; + } + sprintf(koname, "%s.ko", filename); + lf = NULL; + foundfile = 0; + for (lc = TAILQ_FIRST(&classes); lc; lc = TAILQ_NEXT(lc, link)) { + KLD_DPF(FILE, ("linker_load_file: trying to load %s as %s\n", + filename, lc->desc)); + + error = lc->ops->load_file(koname, &lf); /* First with .ko */ + if (lf == NULL && error == ENOENT) + error = lc->ops->load_file(filename, &lf); /* Then try without */ + /* + * If we got something other than ENOENT, then it exists but we cannot + * load it for some other reason. 
+ */ + if (error != ENOENT) + foundfile = 1; + if (lf) { + linker_file_sysinit(lf); + + *result = lf; + error = 0; + goto out; + } + } + /* + * Less than ideal, but tells the user whether it failed to load or + * the module was not found. + */ + if (foundfile) + error = ENOEXEC; /* Format not recognised (or unloadable) */ + else + error = ENOENT; /* Nothing found */ + +out: + if (koname) + free(koname, M_LINKER); + return error; +} + +linker_file_t +linker_find_file_by_name(const char* filename) +{ + linker_file_t lf = 0; + char *koname; + + koname = malloc(strlen(filename) + 4, M_LINKER, M_WAITOK); + if (koname == NULL) + goto out; + sprintf(koname, "%s.ko", filename); + + lockmgr(&lock, LK_SHARED, 0, curproc); + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) { + if (!strcmp(lf->filename, koname)) + break; + if (!strcmp(lf->filename, filename)) + break; + } + lockmgr(&lock, LK_RELEASE, 0, curproc); + +out: + if (koname) + free(koname, M_LINKER); + return lf; +} + +linker_file_t +linker_find_file_by_id(int fileid) +{ + linker_file_t lf = 0; + + lockmgr(&lock, LK_SHARED, 0, curproc); + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) + if (lf->id == fileid) + break; + lockmgr(&lock, LK_RELEASE, 0, curproc); + + return lf; +} + +linker_file_t +linker_make_file(const char* pathname, void* priv, struct linker_file_ops* ops) +{ + linker_file_t lf = 0; + int namelen; + const char *filename; + + filename = rindex(pathname, '/'); + if (filename && filename[1]) + filename++; + else + filename = pathname; + + KLD_DPF(FILE, ("linker_make_file: new file, filename=%s\n", filename)); + lockmgr(&lock, LK_EXCLUSIVE|LK_RETRY, 0, curproc); + namelen = strlen(filename) + 1; + lf = malloc(sizeof(struct linker_file) + namelen, M_LINKER, M_WAITOK); + if (!lf) + goto out; + bzero(lf, sizeof(*lf)); + + lf->refs = 1; + lf->userrefs = 0; + lf->filename = (char*) (lf + 1); + strcpy(lf->filename, filename); + lf->id = next_file_id++; + lf->ndeps = 0; + lf->deps = NULL; + STAILQ_INIT(&lf->common); + TAILQ_INIT(&lf->modules); + + lf->priv = priv; + lf->ops = ops; + TAILQ_INSERT_TAIL(&files, lf, link); + +out: + lockmgr(&lock, LK_RELEASE, 0, curproc); + return lf; +} + +int +linker_file_unload(linker_file_t file) +{ + module_t mod, next; + struct common_symbol* cp; + int error = 0; + int i; + + KLD_DPF(FILE, ("linker_file_unload: lf->refs=%d\n", file->refs)); + lockmgr(&lock, LK_EXCLUSIVE|LK_RETRY, 0, curproc); + if (file->refs == 1) { + KLD_DPF(FILE, ("linker_file_unload: file is unloading, informing modules\n")); + /* + * Inform any modules associated with this file. + */ + for (mod = TAILQ_FIRST(&file->modules); mod; mod = next) { + next = module_getfnext(mod); + + /* + * Give the module a chance to veto the unload. 
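The unload path gives every module attached to the file a chance to refuse: the first nonzero return from the per-module unload hook aborts the whole operation and the file stays loaded. A reduced sketch of that veto loop, with a hypothetical module type and without the reference counting and locking of the real routine:

    #include <stdio.h>

    struct module {
        const char    *name;
        int          (*unload)(struct module *);   /* nonzero return vetoes */
        struct module *next;
    };

    /*
     * Ask each module on a file's list to unload; the first nonzero return
     * aborts the whole operation, loosely mirroring linker_file_unload().
     */
    static int
    unload_all(struct module *head)
    {
        struct module *m;
        int error;

        for (m = head; m != NULL; m = m->next) {
            error = m->unload(m);
            if (error != 0) {
                printf("%s vetoed the unload\n", m->name);
                return (error);     /* leave everything loaded */
            }
        }
        return (0);
    }

    static int ok(struct module *m)   { (void)m; return (0); }
    static int busy(struct module *m) { (void)m; return (16); }  /* EBUSY-style veto */

    int
    main(void)
    {
        struct module b = { "busy.ko", busy, NULL };
        struct module a = { "quiet.ko", ok, &b };

        return (unload_all(&a));    /* prints: busy.ko vetoed the unload */
    }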
+ */ + if (error = module_unload(mod)) { + KLD_DPF(FILE, ("linker_file_unload: module %x vetoes unload\n", + mod)); + lockmgr(&lock, LK_RELEASE, 0, curproc); + goto out; + } + + module_release(mod); + } + } + + file->refs--; + if (file->refs > 0) { + lockmgr(&lock, LK_RELEASE, 0, curproc); + goto out; + } + + linker_file_sysuninit(file); + + TAILQ_REMOVE(&files, file, link); + lockmgr(&lock, LK_RELEASE, 0, curproc); + + for (i = 0; i < file->ndeps; i++) + linker_file_unload(file->deps[i]); + free(file->deps, M_LINKER); + + for (cp = STAILQ_FIRST(&file->common); cp; + cp = STAILQ_FIRST(&file->common)) { + STAILQ_REMOVE(&file->common, cp, common_symbol, link); + free(cp, M_LINKER); + } + + file->ops->unload(file); + free(file, M_LINKER); + +out: + return error; +} + +int +linker_file_add_dependancy(linker_file_t file, linker_file_t dep) +{ + linker_file_t* newdeps; + + newdeps = malloc((file->ndeps + 1) * sizeof(linker_file_t*), + M_LINKER, M_WAITOK); + if (newdeps == NULL) + return ENOMEM; + bzero(newdeps, (file->ndeps + 1) * sizeof(linker_file_t*)); + + if (file->deps) { + bcopy(file->deps, newdeps, file->ndeps * sizeof(linker_file_t*)); + free(file->deps, M_LINKER); + } + file->deps = newdeps; + file->deps[file->ndeps] = dep; + file->ndeps++; + + return 0; +} + +caddr_t +linker_file_lookup_symbol(linker_file_t file, const char* name, int deps) +{ + linker_sym_t sym; + linker_symval_t symval; + linker_file_t lf; + caddr_t address; + size_t common_size = 0; + int i; + + KLD_DPF(SYM, ("linker_file_lookup_symbol: file=%x, name=%s, deps=%d\n", + file, name, deps)); + + if (file->ops->lookup_symbol(file, name, &sym) == 0) { + file->ops->symbol_values(file, sym, &symval); + if (symval.value == 0) + /* + * For commons, first look them up in the dependancies and + * only allocate space if not found there. + */ + common_size = symval.size; + else { + KLD_DPF(SYM, ("linker_file_lookup_symbol: symbol.value=%x\n", symval.value)); + return symval.value; + } + } + + if (deps) { + for (i = 0; i < file->ndeps; i++) { + address = linker_file_lookup_symbol(file->deps[i], name, 0); + if (address) { + KLD_DPF(SYM, ("linker_file_lookup_symbol: deps value=%x\n", address)); + return address; + } + } + + /* If we have not found it in the dependencies, search globally */ + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) { + /* But skip the current file if it's on the list */ + if (lf == file) + continue; + /* And skip the files we searched above */ + for (i = 0; i < file->ndeps; i++) + if (lf == file->deps[i]) + break; + if (i < file->ndeps) + continue; + address = linker_file_lookup_symbol(lf, name, 0); + if (address) { + KLD_DPF(SYM, ("linker_file_lookup_symbol: global value=%x\n", address)); + return address; + } + } + } + + if (common_size > 0) { + /* + * This is a common symbol which was not found in the + * dependancies. We maintain a simple common symbol table in + * the file object. + */ + struct common_symbol* cp; + + for (cp = STAILQ_FIRST(&file->common); cp; + cp = STAILQ_NEXT(cp, link)) + if (!strcmp(cp->name, name)) { + KLD_DPF(SYM, ("linker_file_lookup_symbol: old common value=%x\n", cp->address)); + return cp->address; + } + + /* + * Round the symbol size up to align. 
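+	 * (Illustrative numbers, not from the original source: with
+	 * sizeof(int) == 4, a 10-byte common rounds up to 12, because
+	 * -sizeof(int) is the unsigned equivalent of ~(sizeof(int) - 1).)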
+ */ + common_size = (common_size + sizeof(int) - 1) & -sizeof(int); + cp = malloc(sizeof(struct common_symbol) + + common_size + + strlen(name) + 1, + M_LINKER, M_WAITOK); + if (!cp) { + KLD_DPF(SYM, ("linker_file_lookup_symbol: nomem\n")); + return 0; + } + bzero(cp, sizeof(struct common_symbol) + common_size + strlen(name)+ 1); + + cp->address = (caddr_t) (cp + 1); + cp->name = cp->address + common_size; + strcpy(cp->name, name); + bzero(cp->address, common_size); + STAILQ_INSERT_TAIL(&file->common, cp, link); + + KLD_DPF(SYM, ("linker_file_lookup_symbol: new common value=%x\n", cp->address)); + return cp->address; + } + + KLD_DPF(SYM, ("linker_file_lookup_symbol: fail\n")); + return 0; +} + +#ifdef DDB +/* + * DDB Helpers. DDB has to look across multiple files with their own + * symbol tables and string tables. + * + * Note that we do not obey list locking protocols here. We really don't + * need DDB to hang because somebody's got the lock held. We'll take the + * chance that the files list is inconsistant instead. + */ + +int +linker_ddb_lookup(char *symstr, linker_sym_t *sym) +{ + linker_file_t lf; + + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) { + if (lf->ops->lookup_symbol(lf, symstr, sym) == 0) + return 0; + } + return ENOENT; +} + +int +linker_ddb_search_symbol(caddr_t value, linker_sym_t *sym, long *diffp) +{ + linker_file_t lf; + u_long off = (u_long)value; + u_long diff, bestdiff; + linker_sym_t best; + linker_sym_t es; + + best = 0; + bestdiff = off; + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) { + if (lf->ops->search_symbol(lf, value, &es, &diff) != 0) + continue; + if (es != 0 && diff < bestdiff) { + best = es; + bestdiff = diff; + } + if (bestdiff == 0) + break; + } + if (best) { + *sym = best; + *diffp = bestdiff; + return 0; + } else { + *sym = 0; + *diffp = off; + return ENOENT; + } +} + +int +linker_ddb_symbol_values(linker_sym_t sym, linker_symval_t *symval) +{ + linker_file_t lf; + + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) { + if (lf->ops->symbol_values(lf, sym, symval) == 0) + return 0; + } + return ENOENT; +} + +#endif + +/* + * Syscalls. 
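+ *
+ * kldload() and kldunload() load and unload files; they are restricted to
+ * the superuser and refused once securelevel is raised.  kldfind(),
+ * kldnext(), kldstat(), kldfirstmod() and kldsym() are unprivileged
+ * queries over the file list.
+ *
+ * Editor's sketch, not part of the original file and assuming the usual
+ * userland libc wrappers for these entry points:
+ *
+ *	struct kld_file_stat ks;
+ *	int fileid = kldload("mymod");	/* hypothetical module name */
+ *	ks.version = sizeof(ks);	/* checked by kldstat() below */
+ *	if (fileid != -1 && kldstat(fileid, &ks) == 0)
+ *		printf("%s loaded as id %d\n", ks.name, ks.id);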
+ */ + +int +kldload(struct proc* p, struct kldload_args* uap) +{ + char* filename = NULL, *modulename; + linker_file_t lf; + int error = 0; + + p->p_retval[0] = -1; + + if (securelevel > 0) + return EPERM; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return error; + + filename = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + if (error = copyinstr(SCARG(uap, file), filename, MAXPATHLEN, NULL)) + goto out; + + /* Can't load more than one module with the same name */ + modulename = rindex(filename, '/'); + if (modulename == NULL) + modulename = filename; + if (linker_find_file_by_name(modulename)) { + error = EEXIST; + goto out; + } + + if (error = linker_load_file(filename, &lf)) + goto out; + + lf->userrefs++; + p->p_retval[0] = lf->id; + +out: + if (filename) + free(filename, M_TEMP); + return error; +} + +int +kldunload(struct proc* p, struct kldunload_args* uap) +{ + linker_file_t lf; + int error = 0; + + if (securelevel > 0) + return EPERM; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return error; + + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf) { + KLD_DPF(FILE, ("kldunload: lf->userrefs=%d\n", lf->userrefs)); + if (lf->userrefs == 0) { + printf("linkerunload: attempt to unload file which was not loaded by user\n"); + error = EBUSY; + goto out; + } + error = linker_file_unload(lf); + if (error) + goto out; + lf->userrefs--; + } else + error = ENOENT; + +out: + return error; +} + +int +kldfind(struct proc* p, struct kldfind_args* uap) +{ + char* filename = NULL, *modulename; + linker_file_t lf; + int error = 0; + + p->p_retval[0] = -1; + + filename = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + if (error = copyinstr(SCARG(uap, file), filename, MAXPATHLEN, NULL)) + goto out; + + modulename = rindex(filename, '/'); + if (modulename == NULL) + modulename = filename; + + lf = linker_find_file_by_name(modulename); + if (lf) + p->p_retval[0] = lf->id; + else + error = ENOENT; + +out: + if (filename) + free(filename, M_TEMP); + return error; +} + +int +kldnext(struct proc* p, struct kldnext_args* uap) +{ + linker_file_t lf; + int error = 0; + + if (SCARG(uap, fileid) == 0) { + if (TAILQ_FIRST(&files)) + p->p_retval[0] = TAILQ_FIRST(&files)->id; + else + p->p_retval[0] = 0; + return 0; + } + + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf) { + if (TAILQ_NEXT(lf, link)) + p->p_retval[0] = TAILQ_NEXT(lf, link)->id; + else + p->p_retval[0] = 0; + } else + error = ENOENT; + + return error; +} + +int +kldstat(struct proc* p, struct kldstat_args* uap) +{ + linker_file_t lf; + int error = 0; + int version; + struct kld_file_stat* stat; + int namelen; + + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (!lf) { + error = ENOENT; + goto out; + } + + stat = SCARG(uap, stat); + + /* + * Check the version of the user's structure. 
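+     * The version field is simply sizeof(struct kld_file_stat); anything
+     * else means the caller was built against a different layout and gets
+     * EINVAL instead of a partially filled structure.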
+ */ + if (error = copyin(&stat->version, &version, sizeof(version))) + goto out; + if (version != sizeof(struct kld_file_stat)) { + error = EINVAL; + goto out; + } + + namelen = strlen(lf->filename) + 1; + if (namelen > MAXPATHLEN) + namelen = MAXPATHLEN; + if (error = copyout(lf->filename, &stat->name[0], namelen)) + goto out; + if (error = copyout(&lf->refs, &stat->refs, sizeof(int))) + goto out; + if (error = copyout(&lf->id, &stat->id, sizeof(int))) + goto out; + if (error = copyout(&lf->address, &stat->address, sizeof(caddr_t))) + goto out; + if (error = copyout(&lf->size, &stat->size, sizeof(size_t))) + goto out; + + p->p_retval[0] = 0; + +out: + return error; +} + +int +kldfirstmod(struct proc* p, struct kldfirstmod_args* uap) +{ + linker_file_t lf; + int error = 0; + + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf) { + if (TAILQ_FIRST(&lf->modules)) + p->p_retval[0] = module_getid(TAILQ_FIRST(&lf->modules)); + else + p->p_retval[0] = 0; + } else + error = ENOENT; + + return error; +} + +int +kldsym(struct proc *p, struct kldsym_args *uap) +{ + char *symstr = NULL; + linker_sym_t sym; + linker_symval_t symval; + linker_file_t lf; + struct kld_sym_lookup lookup; + int error = 0; + + if (error = copyin(SCARG(uap, data), &lookup, sizeof(lookup))) + goto out; + if (lookup.version != sizeof(lookup) || SCARG(uap, cmd) != KLDSYM_LOOKUP) { + error = EINVAL; + goto out; + } + + symstr = malloc(MAXPATHLEN, M_TEMP, M_WAITOK); + if (error = copyinstr(lookup.symname, symstr, MAXPATHLEN, NULL)) + goto out; + + if (SCARG(uap, fileid) != 0) { + lf = linker_find_file_by_id(SCARG(uap, fileid)); + if (lf == NULL) { + error = ENOENT; + goto out; + } + if (lf->ops->lookup_symbol(lf, symstr, &sym) == 0 && + lf->ops->symbol_values(lf, sym, &symval) == 0) { + lookup.symvalue = (u_long)symval.value; + lookup.symsize = symval.size; + error = copyout(&lookup, SCARG(uap, data), sizeof(lookup)); + } else + error = ENOENT; + } else { + for (lf = TAILQ_FIRST(&files); lf; lf = TAILQ_NEXT(lf, link)) { + if (lf->ops->lookup_symbol(lf, symstr, &sym) == 0 && + lf->ops->symbol_values(lf, sym, &symval) == 0) { + lookup.symvalue = (u_long)symval.value; + lookup.symsize = symval.size; + error = copyout(&lookup, SCARG(uap, data), sizeof(lookup)); + break; + } + } + if (!lf) + error = ENOENT; + } +out: + if (symstr) + free(symstr, M_TEMP); + return error; +} + +/* + * Preloaded module support + */ + +static void +linker_preload(void* arg) +{ + caddr_t modptr; + char *modname; + char *modtype; + linker_file_t lf; + linker_class_t lc; + int error; + struct linker_set *sysinits; + struct sysinit **sipp; + moduledata_t *moddata; + + modptr = NULL; + while ((modptr = preload_search_next_name(modptr)) != NULL) { + modname = (char *)preload_search_info(modptr, MODINFO_NAME); + modtype = (char *)preload_search_info(modptr, MODINFO_TYPE); + if (modname == NULL) { + printf("Preloaded module at %p does not have a name!\n", modptr); + continue; + } + if (modtype == NULL) { + printf("Preloaded module at %p does not have a type!\n", modptr); + continue; + } + printf("Preloaded %s \"%s\" at %p.\n", modtype, modname, modptr); + lf = linker_find_file_by_name(modname); + if (lf) { + lf->userrefs++; + continue; + } + lf = NULL; + for (lc = TAILQ_FIRST(&classes); lc; lc = TAILQ_NEXT(lc, link)) { + error = lc->ops->load_file(modname, &lf); + if (error) { + lf = NULL; + break; + } + } + if (lf) { + lf->userrefs++; + + sysinits = (struct linker_set*) + linker_file_lookup_symbol(lf, "sysinit_set", 0); + if (sysinits) { + /* HACK 
ALERT! + * This is to set the sysinit moduledata so that the module + * can attach itself to the correct containing file. + * The sysinit could be run at *any* time. + */ + for (sipp = (struct sysinit **)sysinits->ls_items; *sipp; sipp++) { + if ((*sipp)->func == module_register_init) { + moddata = (*sipp)->udata; + moddata->_file = lf; + } + } + sysinit_add((struct sysinit **)sysinits->ls_items); + } + } + } +} + +SYSINIT(preload, SI_SUB_KLD, SI_ORDER_MIDDLE, linker_preload, 0); + +/* + * Search for a not-loaded module by name. + * + * Modules may be found in the following locations: + * + * - preloaded (result is just the module name) + * - on disk (result is full path to module) + * + * If the module name is qualified in any way (contains path, etc.) + * the we simply return a copy of it. + * + * The search path can be manipulated via sysctl. Note that we use the ';' + * character as a separator to be consistent with the bootloader. + */ + +static char linker_path[MAXPATHLEN + 1] = "/;/boot/;/modules/"; + +SYSCTL_STRING(_kern, OID_AUTO, module_path, CTLFLAG_RW, linker_path, + sizeof(linker_path), "module load search path"); + +static char * +linker_strdup(const char *str) +{ + char *result; + + if ((result = malloc((strlen(str) + 1), M_LINKER, M_WAITOK)) != NULL) + strcpy(result, str); + return(result); +} + +char * +linker_search_path(const char *name) +{ + struct nameidata nd; + struct proc *p = curproc; /* XXX */ + char *cp, *ep, *result; + int error; + enum vtype type; + + /* qualified at all? */ + if (index(name, '/')) + return(linker_strdup(name)); + + /* traverse the linker path */ + cp = linker_path; + for (;;) { + + /* find the end of this component */ + for (ep = cp; (*ep != 0) && (*ep != ';'); ep++) + ; + result = malloc((strlen(name) + (ep - cp) + 1), M_LINKER, M_WAITOK); + if (result == NULL) /* actually ENOMEM */ + return(NULL); + + strncpy(result, cp, ep - cp); + strcpy(result + (ep - cp), name); + + /* + * Attempt to open the file, and return the path if we succeed and it's + * a regular file. + */ + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, result, p); + error = vn_open(&nd, FREAD, 0); + if (error == 0) { + type = nd.ni_vp->v_type; + VOP_UNLOCK(nd.ni_vp, 0, p); + vn_close(nd.ni_vp, FREAD, p->p_ucred, p); + if (type == VREG) + return(result); + } + free(result, M_LINKER); + + if (*ep == 0) + break; + cp = ep + 1; + } + return(NULL); +} diff --git a/sys/kern/kern_lkm.c b/sys/kern/kern_lkm.c new file mode 100644 index 0000000..e5ea629 --- /dev/null +++ b/sys/kern/kern_lkm.c @@ -0,0 +1,838 @@ +/*- + * Copyright (c) 1992 Terrence R. Lambert. + * Copyright (c) 1994 Christopher G. Demetriou + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Terrence R. Lambert. + * 4. The name Terrence R. Lambert may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY TERRENCE R. LAMBERT ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_lkm.c,v 1.59 1998/11/10 09:12:40 peter Exp $ + */ + +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/tty.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/sysctl.h> +#include <sys/sysent.h> +#include <sys/exec.h> +#include <sys/lkm.h> +#include <sys/vnode.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + + +#define PAGESIZE 1024 /* kmem_alloc() allocation quantum */ + +#define LKM_ALLOC 0x01 +#define LKM_WANT 0x02 + +#define LKMS_IDLE 0x00 +#define LKMS_RESERVED 0x01 +#define LKMS_LOADING 0x02 +#define LKMS_LOADED 0x04 +#define LKMS_UNLOADING 0x08 + +static int lkm_v = 0; +static int lkm_state = LKMS_IDLE; + +#ifndef MAXLKMS +#define MAXLKMS 20 +#endif + +static struct lkm_table lkmods[MAXLKMS]; /* table of loaded modules */ +static struct lkm_table *curp; /* global for in-progress ops */ + +static int _lkm_dev __P((struct lkm_table *lkmtp, int cmd)); +static int _lkm_exec __P((struct lkm_table *lkmtp, int cmd)); +static int _lkm_vfs __P((struct lkm_table *lkmtp, int cmd)); +static int _lkm_syscall __P((struct lkm_table *lkmtp, int cmd)); +static void lkmunreserve __P((void)); + +static d_open_t lkmcopen; +static d_close_t lkmcclose; +static d_ioctl_t lkmcioctl; + +#define CDEV_MAJOR 32 +static struct cdevsw lkmc_cdevsw = + { lkmcopen, lkmcclose, noread, nowrite, /*32*/ + lkmcioctl, nostop, nullreset, nodevtotty, + seltrue, nommap, NULL, "lkm", NULL, -1 }; + + +/*ARGSUSED*/ +static int +lkmcopen(dev, flag, devtype, p) + dev_t dev; + int flag; + int devtype; + struct proc *p; +{ + int error; + + if (minor(dev) != 0) + return(ENXIO); /* bad minor # */ + + /* + * Use of the loadable kernel module device must be exclusive; we + * may try to remove this restriction later, but it's really no + * hardship. + */ + while (lkm_v & LKM_ALLOC) { + if (flag & FNONBLOCK) /* don't hang */ + return(EBUSY); + lkm_v |= LKM_WANT; + /* + * Sleep pending unlock; we use tsleep() to allow + * an alarm out of the open. + */ + error = tsleep((caddr_t)&lkm_v, TTIPRI|PCATCH, "lkmopn", 0); + if (error) + return(error); /* leave LKM_WANT set -- no problem */ + } + lkm_v |= LKM_ALLOC; + + return(0); /* pseudo-device open */ +} + +/* + * Unreserve the memory associated with the current loaded module; done on + * a coerced close of the lkm device (close on premature exit of modload) + * or explicitly by modload as a result of a link failure. 
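+ * Callers set lkm_state to LKMS_UNLOADING beforehand where necessary,
+ * since this routine returns immediately if the state is already
+ * LKMS_IDLE.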
+ */ +static void +lkmunreserve() +{ + + if (lkm_state == LKMS_IDLE) + return; + + /* + * Actually unreserve the memory + */ + if (curp && curp->area) { + kmem_free(kernel_map, curp->area, curp->size);/**/ + curp->area = 0; + if (curp->private.lkm_any != NULL) + curp->private.lkm_any = NULL; + } + + lkm_state = LKMS_IDLE; +} + +static int +lkmcclose(dev, flag, mode, p) + dev_t dev; + int flag; + int mode; + struct proc *p; +{ + + if (!(lkm_v & LKM_ALLOC)) { +#ifdef DEBUG + printf("LKM: close before open!\n"); +#endif /* DEBUG */ + return(EBADF); + } + + /* do this before waking the herd... */ + if (curp && !curp->used) { + /* + * If we close before setting used, we have aborted + * by way of error or by way of close-on-exit from + * a premature exit of "modload". + */ + lkmunreserve(); /* coerce state to LKM_IDLE */ + } + + lkm_v &= ~LKM_ALLOC; + wakeup((caddr_t)&lkm_v); /* thundering herd "problem" here */ + + return(0); /* pseudo-device closed */ +} + +/*ARGSUSED*/ +static int +lkmcioctl(dev, cmd, data, flag, p) + dev_t dev; + u_long cmd; + caddr_t data; + int flag; + struct proc *p; +{ + int err = 0; + int i; + struct lmc_resrv *resrvp; + struct lmc_loadbuf *loadbufp; + struct lmc_unload *unloadp; + struct lmc_stat *statp; + char istr[MAXLKMNAME]; + + switch(cmd) { + case LMRESERV: /* reserve pages for a module */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + resrvp = (struct lmc_resrv *)data; + + /* + * Find a free slot. + */ + for (i = 0; i < MAXLKMS; i++) + if (!lkmods[i].used) + break; + if (i == MAXLKMS) { + err = ENOMEM; /* no slots available */ + break; + } + curp = &lkmods[i]; + curp->id = i; /* self reference slot offset */ + + resrvp->slot = i; /* return slot */ + + /* + * Get memory for module + */ + curp->size = resrvp->size; + + curp->area = kmem_alloc(kernel_map, curp->size);/**/ + + curp->offset = 0; /* load offset */ + + resrvp->addr = curp->area; /* ret kernel addr */ + +#ifdef DEBUG + printf("LKM: LMRESERV (actual = 0x%08lx)\n", curp->area); + printf("LKM: LMRESERV (adjusted = 0x%08lx)\n", + trunc_page(curp->area)); +#endif /* DEBUG */ + lkm_state = LKMS_RESERVED; + break; + + case LMLOADBUF: /* Copy in; stateful, follows LMRESERV */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + loadbufp = (struct lmc_loadbuf *)data; + i = loadbufp->cnt; + if ((lkm_state != LKMS_RESERVED && lkm_state != LKMS_LOADING) + || i < 0 + || i > MODIOBUF + || i > curp->size - curp->offset) { + err = ENOMEM; + break; + } + + /* copy in buffer full of data */ + err = copyin((caddr_t)loadbufp->data, + (caddr_t)(uintptr_t)(curp->area + curp->offset), i); + if (err) + break; + + if ((curp->offset + i) < curp->size) { + lkm_state = LKMS_LOADING; +#ifdef DEBUG + printf( + "LKM: LMLOADBUF (loading @ %lu of %lu, i = %d)\n", + curp->offset, curp->size, i); +#endif /* DEBUG */ + } else { + lkm_state = LKMS_LOADED; +#ifdef DEBUG + printf("LKM: LMLOADBUF (loaded)\n"); +#endif /* DEBUG */ + } + curp->offset += i; + break; + + case LMUNRESRV: /* discard reserved pages for a module */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + lkmunreserve(); /* coerce state to LKM_IDLE */ +#ifdef DEBUG + printf("LKM: LMUNRESERV\n"); +#endif /* DEBUG */ + break; + + case LMREADY: /* module loaded: call entry */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing or insecure */ + return EPERM; + 
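+		/*
+		 * LMLOADBUF has already copied the module image into the
+		 * reserved area; LKMS_LOADING at this point just means the
+		 * unwritten tail is bss, which the switch below clears.
+		 */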
+ switch (lkm_state) { + case LKMS_LOADED: + break; + case LKMS_LOADING: + /* The remainder must be bss, so we clear it */ + bzero((caddr_t)(uintptr_t)(curp->area + curp->offset), + curp->size - curp->offset); + break; + default: + +#ifdef DEBUG + printf("lkm_state is %02x\n", lkm_state); +#endif /* DEBUG */ + return ENXIO; + } + + /* XXX gack */ + curp->entry = (int (*) __P((struct lkm_table *, int, int))) + (*(uintfptr_t *)data); + + /* call entry(load)... (assigns "private" portion) */ + err = (*(curp->entry))(curp, LKM_E_LOAD, LKM_VERSION); + if (err) { + /* + * Module may refuse loading or may have a + * version mismatch... + */ + lkm_state = LKMS_UNLOADING; /* for lkmunreserve */ + lkmunreserve(); /* free memory */ + curp->used = 0; /* free slot */ + break; + } + /* + * It's possible for a user to load a module that doesn't + * initialize itself correctly. (You can even get away with + * using it for a while.) Unfortunately, we are faced with + * the following problems: + * - we can't tell a good module from a bad one until + * after we've run its entry function (if the private + * section is uninitalized after we return from the + * entry, then something's fishy) + * - now that we've called the entry function, we can't + * forcibly unload the module without risking a crash + * - since we don't know what the module's entry function + * did, we can't easily clean up the mess it may have + * made, so we can't know just how unstable the system + * may be + * So, being stuck between a rock and a hard place, we + * have no choice but to do this... + */ + if (curp->private.lkm_any == NULL) + panic("loadable module initialization failed"); + + curp->used = 1; +#ifdef DEBUG + printf("LKM: LMREADY\n"); +#endif /* DEBUG */ + lkm_state = LKMS_IDLE; + break; + + case LMUNLOAD: /* unload a module */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + unloadp = (struct lmc_unload *)data; + + if ((i = unloadp->id) == -1) { /* unload by name */ + /* + * Copy name and lookup id from all loaded + * modules. May fail. + */ + err =copyinstr(unloadp->name, istr, MAXLKMNAME-1, NULL); + if (err) + break; + + /* + * look up id... + */ + for (i = 0; i < MAXLKMS; i++) { + if (!lkmods[i].used) + continue; + if (!strcmp(istr, + lkmods[i].private.lkm_any->lkm_name)) + break; + } + } + + /* + * Range check the value; on failure, return EINVAL + */ + if (i < 0 || i >= MAXLKMS) { + err = EINVAL; + break; + } + + curp = &lkmods[i]; + + if (!curp->used) { + err = ENOENT; + break; + } + + /* call entry(unload) */ + if ((*(curp->entry))(curp, LKM_E_UNLOAD, LKM_VERSION)) { + err = EBUSY; + break; + } + + lkm_state = LKMS_UNLOADING; /* non-idle for lkmunreserve */ + lkmunreserve(); /* free memory */ + curp->used = 0; /* free slot */ + break; + + case LMSTAT: /* stat a module by id/name */ + /* allow readers and writers to stat */ + + statp = (struct lmc_stat *)data; + + if ((i = statp->id) == -1) { /* stat by name */ + /* + * Copy name and lookup id from all loaded + * modules. + */ + copystr(statp->name, istr, MAXLKMNAME-1, NULL); + /* + * look up id... 
+ */ + for (i = 0; i < MAXLKMS; i++) { + if (!lkmods[i].used) + continue; + if (!strcmp(istr, + lkmods[i].private.lkm_any->lkm_name)) + break; + } + + if (i == MAXLKMS) { /* Not found */ + err = ENOENT; + break; + } + } + + /* + * Range check the value; on failure, return EINVAL + */ + if (i < 0 || i >= MAXLKMS) { + err = EINVAL; + break; + } + + curp = &lkmods[i]; + + if (!curp->used) { /* Not found */ + err = ENOENT; + break; + } + + /* + * Copy out stat information for this module... + */ + statp->id = curp->id; + statp->offset = curp->private.lkm_any->lkm_offset; + statp->type = curp->private.lkm_any->lkm_type; + statp->area = curp->area; + statp->size = curp->size / PAGESIZE; + statp->private = (uintptr_t)curp->private.lkm_any; + statp->ver = curp->private.lkm_any->lkm_ver; + copystr(curp->private.lkm_any->lkm_name, + statp->name, + MAXLKMNAME - 2, + NULL); + + break; + + default: /* bad ioctl()... */ + err = ENOTTY; + break; + } + + return (err); +} + +int +lkmexists(lkmtp) + struct lkm_table *lkmtp; +{ + int i; + + /* + * see if name exists... + */ + for (i = 0; i < MAXLKMS; i++) { + /* + * An unused module and the one we are testing are not + * considered. + */ + if (!lkmods[i].used || &lkmods[i] == lkmtp) + continue; + if (!strcmp(lkmtp->private.lkm_any->lkm_name, + lkmods[i].private.lkm_any->lkm_name)) + return(1); /* already loaded... */ + } + + return(0); /* module not loaded... */ +} + +/* + * For the loadable system call described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_syscall(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_syscall *args = lkmtp->private.lkm_syscall; + int i; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + + if (args->lkm_offset == LKM_ANON) + i = NO_SYSCALL; + else + i = args->lkm_offset; + + err = syscall_register(&i, args->lkm_sysent, + &(args->lkm_oldent)); + if (err) + return(err); + + /* done! */ + args->lkm_offset = i; /* slot in sysent[] */ + + break; + + case LKM_E_UNLOAD: + /* current slot... */ + i = args->lkm_offset; + + err = syscall_deregister(&i, &(args->lkm_oldent)); + if (err) + return(err); + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + + return(err); +} + +/* + * For the loadable virtual file system described by the structure pointed + * to by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_vfs(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_vfs *args = lkmtp->private.lkm_vfs; + struct vfsconf *vfc = args->lkm_vfsconf; + int error, i; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + + for(i = 0; args->lkm_vnodeops->ls_items[i]; i++) + vfs_add_vnodeops((void*)args->lkm_vnodeops->ls_items[i]); + error = vfs_register(vfc); + if (error) + return(error); + + args->lkm_offset = vfc->vfc_typenum; + + /* done! */ + break; + + case LKM_E_UNLOAD: + /* current slot... */ + i = args->lkm_offset; + + error = vfs_unregister(vfc); + if (error) + return(error); + + for(i = 0; args->lkm_vnodeops->ls_items[i]; i++) + vfs_rm_vnodeops((void*)args->lkm_vnodeops->ls_items[i]); + + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + return (0); +} + +/* + * For the loadable device driver described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. 
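+ *
+ * Only character devices (LM_DT_CHAR) are handled: loading installs the
+ * module's cdevsw in the requested major slot (LKM_ANON lets cdevsw_add()
+ * choose one) and records the result in lkm_offset; unloading restores
+ * the saved entry.  Any other device type gets ENODEV.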
+ */ +static int +_lkm_dev(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_dev *args = lkmtp->private.lkm_dev; + int i; + dev_t descrip; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + switch(args->lkm_devtype) { + case LM_DT_CHAR: + if ((i = args->lkm_offset) == LKM_ANON) + descrip = (dev_t) -1; + else + descrip = makedev(args->lkm_offset,0); + if ( err = cdevsw_add(&descrip, args->lkm_dev.cdev, + &(args->lkm_olddev.cdev))) { + break; + } + args->lkm_offset = major(descrip) ; + break; + + default: + err = ENODEV; + break; + } + break; + + case LKM_E_UNLOAD: + /* current slot... */ + i = args->lkm_offset; + descrip = makedev(i,0); + + switch(args->lkm_devtype) { + case LM_DT_CHAR: + /* replace current slot contents with old contents */ + cdevsw_add(&descrip, args->lkm_olddev.cdev,NULL); + break; + + default: + err = ENODEV; + break; + } + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + + return(err); +} + +#ifdef STREAMS +/* + * For the loadable streams module described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_strmod(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_strmod *args = lkmtp->private.lkm_strmod; + int i; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + break; + + case LKM_E_UNLOAD: + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + + return(err); +} +#endif /* STREAMS */ + +/* + * For the loadable execution class described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_exec(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_exec *args = lkmtp->private.lkm_exec; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + if (args->lkm_offset != LKM_ANON) { /* auto */ + err = EINVAL; + break; + } + + err = exec_register(args->lkm_exec); + + /* done! */ + args->lkm_offset = 0; + + break; + + case LKM_E_UNLOAD: + + err = exec_unregister(args->lkm_exec); + + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + return(err); +} + +/* + * This code handles the per-module type "wiring-in" of loadable modules + * into existing kernel tables. For "LM_MISC" modules, wiring and unwiring + * is assumed to be done in their entry routines internal to the module + * itself. 
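+ *
+ * lkmdispatch() simply switches on lkm_type: LM_SYSCALL, LM_VFS, LM_DEV
+ * and LM_EXEC are passed to the matching _lkm_*() helper, LM_MISC only
+ * gets an "already loaded" check, and unknown types return ENXIO.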
+ */ +int +lkmdispatch(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + int err = 0; /* default = success */ + + switch(lkmtp->private.lkm_any->lkm_type) { + case LM_SYSCALL: + err = _lkm_syscall(lkmtp, cmd); + break; + + case LM_VFS: + err = _lkm_vfs(lkmtp, cmd); + break; + + case LM_DEV: + err = _lkm_dev(lkmtp, cmd); + break; + +#ifdef STREAMS + case LM_STRMOD: + { + struct lkm_strmod *args = lkmtp->private.lkm_strmod; + } + break; + +#endif /* STREAMS */ + + case LM_EXEC: + err = _lkm_exec(lkmtp, cmd); + break; + + case LM_MISC: /* ignore content -- no "misc-specific" procedure */ + if (lkmexists(lkmtp)) + err = EEXIST; + break; + + default: + err = ENXIO; /* unknown type */ + break; + } + + return(err); +} + +int +lkm_nullcmd(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + + return (0); +} + +#ifdef DEVFS +static void *lkmc_devfs_token; +#endif + +static int +lkm_modevent(module_t mod, int type, void *data) +{ + dev_t dev; + static struct cdevsw *oldcdevsw; + + switch (type) { + case MOD_LOAD: + dev = makedev(CDEV_MAJOR, 0); + cdevsw_add(&dev, &lkmc_cdevsw, &oldcdevsw); +#ifdef DEVFS + lkmc_devfs_token = devfs_add_devswf(&lkmc_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0644, + "lkm"); +#endif + break; + case MOD_UNLOAD: +#ifdef DEVFS + devfs_remove_dev(lkmc_devfs_token); +#endif + cdevsw_add(&dev, oldcdevsw, NULL); + break; + default: + break; + } + return 0; +} +static moduledata_t lkm_mod = { + "lkm", + lkm_modevent, + NULL +}; +DECLARE_MODULE(lkm, lkm_mod, SI_SUB_DRIVERS, SI_ORDER_MIDDLE+CDEV_MAJOR); diff --git a/sys/kern/kern_lock.c b/sys/kern/kern_lock.c new file mode 100644 index 0000000..e832acf --- /dev/null +++ b/sys/kern/kern_lock.c @@ -0,0 +1,613 @@ +/* + * Copyright (c) 1995 + * The Regents of the University of California. All rights reserved. + * + * Copyright (C) 1997 + * John S. Dyson. All rights reserved. + * + * This code contains ideas from software contributed to Berkeley by + * Avadis Tevanian, Jr., Michael Wayne Young, and the Mach Operating + * System project at Carnegie-Mellon University. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_lock.c 8.18 (Berkeley) 5/21/95 + * $Id: kern_lock.c,v 1.22 1999/01/10 01:58:24 eivind Exp $ + */ + +#include "opt_lint.h" + +#include <sys/param.h> +#include <sys/proc.h> +#include <sys/lock.h> +#include <sys/systm.h> + +/* + * Locking primitives implementation. + * Locks provide shared/exclusive sychronization. + */ + +#ifdef SIMPLELOCK_DEBUG +#define COUNT(p, x) if (p) (p)->p_locks += (x) +#else +#define COUNT(p, x) +#endif + +#define LOCK_WAIT_TIME 100 +#define LOCK_SAMPLE_WAIT 7 + +#if defined(DIAGNOSTIC) +#define LOCK_INLINE +#else +#define LOCK_INLINE __inline +#endif + +#define LK_ALL (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | \ + LK_SHARE_NONZERO | LK_WAIT_NONZERO) + +static int acquire(struct lock *lkp, int extflags, int wanted); +static int apause(struct lock *lkp, int flags); +static int acquiredrain(struct lock *lkp, int extflags) ; + +static LOCK_INLINE void +sharelock(struct lock *lkp, int incr) { + lkp->lk_flags |= LK_SHARE_NONZERO; + lkp->lk_sharecount += incr; +} + +static LOCK_INLINE void +shareunlock(struct lock *lkp, int decr) { + + KASSERT(lkp->lk_sharecount >= decr, ("shareunlock: count < decr")); + + if (lkp->lk_sharecount == decr) { + lkp->lk_flags &= ~LK_SHARE_NONZERO; + if (lkp->lk_flags & (LK_WANT_UPGRADE | LK_WANT_EXCL)) { + wakeup(lkp); + } + lkp->lk_sharecount = 0; + } else { + lkp->lk_sharecount -= decr; + } +} + +/* + * This is the waitloop optimization, and note for this to work + * simple_lock and simple_unlock should be subroutines to avoid + * optimization troubles. + */ +static int +apause(struct lock *lkp, int flags) { + int lock_wait; + lock_wait = LOCK_WAIT_TIME; + for (; lock_wait > 0; lock_wait--) { + int i; + if ((lkp->lk_flags & flags) == 0) + return 0; + simple_unlock(&lkp->lk_interlock); + for (i = LOCK_SAMPLE_WAIT; i > 0; i--) { + if ((lkp->lk_flags & flags) == 0) { + simple_lock(&lkp->lk_interlock); + if ((lkp->lk_flags & flags) == 0) + return 0; + break; + } + } + } + return 1; +} + +static int +acquire(struct lock *lkp, int extflags, int wanted) { + int s, error; + + if ((extflags & LK_NOWAIT) && (lkp->lk_flags & wanted)) { + return EBUSY; + } + + if (((lkp->lk_flags | extflags) & LK_NOPAUSE) == 0) { + error = apause(lkp, wanted); + if (error == 0) + return 0; + } + + s = splhigh(); + while ((lkp->lk_flags & wanted) != 0) { + lkp->lk_flags |= LK_WAIT_NONZERO; + lkp->lk_waitcount++; + simple_unlock(&lkp->lk_interlock); + error = tsleep(lkp, lkp->lk_prio, lkp->lk_wmesg, lkp->lk_timo); + simple_lock(&lkp->lk_interlock); + if (lkp->lk_waitcount == 1) { + lkp->lk_flags &= ~LK_WAIT_NONZERO; + lkp->lk_waitcount = 0; + } else { + lkp->lk_waitcount--; + } + if (error) { + splx(s); + return error; + } + if (extflags & LK_SLEEPFAIL) { + splx(s); + return ENOLCK; + } + } + splx(s); + return 0; +} + +/* + * Set, change, or release a lock. + * + * Shared requests increment the shared count. 
Exclusive requests set the + * LK_WANT_EXCL flag (preventing further shared locks), and wait for already + * accepted shared locks and shared-to-exclusive upgrades to go away. + */ +int +#ifndef DEBUG_LOCKS +lockmgr(lkp, flags, interlkp, p) +#else +debuglockmgr(lkp, flags, interlkp, p, name, file, line) +#endif + struct lock *lkp; + u_int flags; + struct simplelock *interlkp; + struct proc *p; +#ifdef DEBUG_LOCKS + const char *name; /* Name of lock function */ + const char *file; /* Name of file call is from */ + int line; /* Line number in file */ +#endif +{ + int error; + pid_t pid; + int extflags; + + error = 0; + if (p == NULL) + pid = LK_KERNPROC; + else + pid = p->p_pid; + + simple_lock(&lkp->lk_interlock); + if (flags & LK_INTERLOCK) + simple_unlock(interlkp); + + extflags = (flags | lkp->lk_flags) & LK_EXTFLG_MASK; + + switch (flags & LK_TYPE_MASK) { + + case LK_SHARED: + if (lkp->lk_lockholder != pid) { + error = acquire(lkp, extflags, + LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE); + if (error) + break; + sharelock(lkp, 1); + COUNT(p, 1); + break; + } + /* + * We hold an exclusive lock, so downgrade it to shared. + * An alternative would be to fail with EDEADLK. + */ + sharelock(lkp, 1); + COUNT(p, 1); + /* fall into downgrade */ + + case LK_DOWNGRADE: +#if !defined(MAX_PERF) + if (lkp->lk_lockholder != pid || lkp->lk_exclusivecount == 0) + panic("lockmgr: not holding exclusive lock"); +#endif + sharelock(lkp, lkp->lk_exclusivecount); + lkp->lk_exclusivecount = 0; + lkp->lk_flags &= ~LK_HAVE_EXCL; + lkp->lk_lockholder = LK_NOPROC; + if (lkp->lk_waitcount) + wakeup((void *)lkp); + break; + + case LK_EXCLUPGRADE: + /* + * If another process is ahead of us to get an upgrade, + * then we want to fail rather than have an intervening + * exclusive access. + */ + if (lkp->lk_flags & LK_WANT_UPGRADE) { + shareunlock(lkp, 1); + COUNT(p, -1); + error = EBUSY; + break; + } + /* fall into normal upgrade */ + + case LK_UPGRADE: + /* + * Upgrade a shared lock to an exclusive one. If another + * shared lock has already requested an upgrade to an + * exclusive lock, our shared lock is released and an + * exclusive lock is requested (which will be granted + * after the upgrade). If we return an error, the file + * will always be unlocked. + */ +#if !defined(MAX_PERF) + if ((lkp->lk_lockholder == pid) || (lkp->lk_sharecount <= 0)) + panic("lockmgr: upgrade exclusive lock"); +#endif + shareunlock(lkp, 1); + COUNT(p, -1); + /* + * If we are just polling, check to see if we will block. + */ + if ((extflags & LK_NOWAIT) && + ((lkp->lk_flags & LK_WANT_UPGRADE) || + lkp->lk_sharecount > 1)) { + error = EBUSY; + break; + } + if ((lkp->lk_flags & LK_WANT_UPGRADE) == 0) { + /* + * We are first shared lock to request an upgrade, so + * request upgrade and wait for the shared count to + * drop to zero, then take exclusive lock. + */ + lkp->lk_flags |= LK_WANT_UPGRADE; + error = acquire(lkp, extflags, LK_SHARE_NONZERO); + lkp->lk_flags &= ~LK_WANT_UPGRADE; + + if (error) + break; + lkp->lk_flags |= LK_HAVE_EXCL; + lkp->lk_lockholder = pid; +#if !defined(MAX_PERF) + if (lkp->lk_exclusivecount != 0) + panic("lockmgr: non-zero exclusive count"); +#endif + lkp->lk_exclusivecount = 1; +#if defined(DEBUG_LOCKS) + lkp->lk_filename = file; + lkp->lk_lineno = line; + lkp->lk_lockername = name; +#endif + COUNT(p, 1); + break; + } + /* + * Someone else has requested upgrade. Release our shared + * lock, awaken upgrade requestor if we are the last shared + * lock, then request an exclusive lock. 
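+		 * The wakeup below only fires when ours was the last shared
+		 * hold and somebody is actually asleep on the lock, i.e.
+		 * LK_WAIT_NONZERO is set while LK_SHARE_NONZERO is not.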
+ */ + if ( (lkp->lk_flags & (LK_SHARE_NONZERO|LK_WAIT_NONZERO)) == + LK_WAIT_NONZERO) + wakeup((void *)lkp); + /* fall into exclusive request */ + + case LK_EXCLUSIVE: + if (lkp->lk_lockholder == pid && pid != LK_KERNPROC) { + /* + * Recursive lock. + */ +#if !defined(MAX_PERF) + if ((extflags & LK_CANRECURSE) == 0) + panic("lockmgr: locking against myself"); +#endif + lkp->lk_exclusivecount++; + COUNT(p, 1); + break; + } + /* + * If we are just polling, check to see if we will sleep. + */ + if ((extflags & LK_NOWAIT) && + (lkp->lk_flags & (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | LK_SHARE_NONZERO))) { + error = EBUSY; + break; + } + /* + * Try to acquire the want_exclusive flag. + */ + error = acquire(lkp, extflags, (LK_HAVE_EXCL | LK_WANT_EXCL)); + if (error) + break; + lkp->lk_flags |= LK_WANT_EXCL; + /* + * Wait for shared locks and upgrades to finish. + */ + error = acquire(lkp, extflags, LK_WANT_UPGRADE | LK_SHARE_NONZERO); + lkp->lk_flags &= ~LK_WANT_EXCL; + if (error) + break; + lkp->lk_flags |= LK_HAVE_EXCL; + lkp->lk_lockholder = pid; +#if !defined(MAX_PERF) + if (lkp->lk_exclusivecount != 0) + panic("lockmgr: non-zero exclusive count"); +#endif + lkp->lk_exclusivecount = 1; +#if defined(DEBUG_LOCKS) + lkp->lk_filename = file; + lkp->lk_lineno = line; + lkp->lk_lockername = name; +#endif + COUNT(p, 1); + break; + + case LK_RELEASE: + if (lkp->lk_exclusivecount != 0) { +#if !defined(MAX_PERF) + if (pid != lkp->lk_lockholder) + panic("lockmgr: pid %d, not %s %d unlocking", + pid, "exclusive lock holder", + lkp->lk_lockholder); +#endif + COUNT(p, -1); + if (lkp->lk_exclusivecount == 1) { + lkp->lk_flags &= ~LK_HAVE_EXCL; + lkp->lk_lockholder = LK_NOPROC; + lkp->lk_exclusivecount = 0; + } else { + lkp->lk_exclusivecount--; + } + } else if (lkp->lk_flags & LK_SHARE_NONZERO) { + shareunlock(lkp, 1); + COUNT(p, -1); + } + if (lkp->lk_flags & LK_WAIT_NONZERO) + wakeup((void *)lkp); + break; + + case LK_DRAIN: + /* + * Check that we do not already hold the lock, as it can + * never drain if we do. Unfortunately, we have no way to + * check for holding a shared lock, but at least we can + * check for an exclusive one. 
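+	 * A successful drain leaves the caller holding the lock exclusively
+	 * with LK_DRAINING set, just as if LK_EXCLUSIVE had been requested.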
+ */ +#if !defined(MAX_PERF) + if (lkp->lk_lockholder == pid) + panic("lockmgr: draining against myself"); +#endif + + error = acquiredrain(lkp, extflags); + if (error) + break; + lkp->lk_flags |= LK_DRAINING | LK_HAVE_EXCL; + lkp->lk_lockholder = pid; + lkp->lk_exclusivecount = 1; +#if defined(DEBUG_LOCKS) + lkp->lk_filename = file; + lkp->lk_lineno = line; + lkp->lk_lockername = name; +#endif + COUNT(p, 1); + break; + + default: +#if !defined(MAX_PERF) + simple_unlock(&lkp->lk_interlock); + panic("lockmgr: unknown locktype request %d", + flags & LK_TYPE_MASK); +#endif + /* NOTREACHED */ + } + if ((lkp->lk_flags & LK_WAITDRAIN) && + (lkp->lk_flags & (LK_HAVE_EXCL | LK_WANT_EXCL | LK_WANT_UPGRADE | + LK_SHARE_NONZERO | LK_WAIT_NONZERO)) == 0) { + lkp->lk_flags &= ~LK_WAITDRAIN; + wakeup((void *)&lkp->lk_flags); + } + simple_unlock(&lkp->lk_interlock); + return (error); +} + +static int +acquiredrain(struct lock *lkp, int extflags) { + int error; + + if ((extflags & LK_NOWAIT) && (lkp->lk_flags & LK_ALL)) { + return EBUSY; + } + + error = apause(lkp, LK_ALL); + if (error == 0) + return 0; + + while (lkp->lk_flags & LK_ALL) { + lkp->lk_flags |= LK_WAITDRAIN; + simple_unlock(&lkp->lk_interlock); + error = tsleep(&lkp->lk_flags, lkp->lk_prio, + lkp->lk_wmesg, lkp->lk_timo); + simple_lock(&lkp->lk_interlock); + if (error) + return error; + if (extflags & LK_SLEEPFAIL) { + return ENOLCK; + } + } + return 0; +} + +/* + * Initialize a lock; required before use. + */ +void +lockinit(lkp, prio, wmesg, timo, flags) + struct lock *lkp; + int prio; + char *wmesg; + int timo; + int flags; +{ + + simple_lock_init(&lkp->lk_interlock); + lkp->lk_flags = (flags & LK_EXTFLG_MASK); + lkp->lk_sharecount = 0; + lkp->lk_waitcount = 0; + lkp->lk_exclusivecount = 0; + lkp->lk_prio = prio; + lkp->lk_wmesg = wmesg; + lkp->lk_timo = timo; + lkp->lk_lockholder = LK_NOPROC; +} + +/* + * Determine the status of a lock. + */ +int +lockstatus(lkp) + struct lock *lkp; +{ + int lock_type = 0; + + simple_lock(&lkp->lk_interlock); + if (lkp->lk_exclusivecount != 0) + lock_type = LK_EXCLUSIVE; + else if (lkp->lk_sharecount != 0) + lock_type = LK_SHARED; + simple_unlock(&lkp->lk_interlock); + return (lock_type); +} + +/* + * Print out information about state of a lock. Used by VOP_PRINT + * routines to display status about contained locks. + */ +void +lockmgr_printinfo(lkp) + struct lock *lkp; +{ + + if (lkp->lk_sharecount) + printf(" lock type %s: SHARED (count %d)", lkp->lk_wmesg, + lkp->lk_sharecount); + else if (lkp->lk_flags & LK_HAVE_EXCL) + printf(" lock type %s: EXCL (count %d) by pid %d", + lkp->lk_wmesg, lkp->lk_exclusivecount, lkp->lk_lockholder); + if (lkp->lk_waitcount > 0) + printf(" with %d pending", lkp->lk_waitcount); +} + +#if defined(SIMPLELOCK_DEBUG) && (NCPUS == 1 || defined(COMPILING_LINT)) +#include <sys/kernel.h> +#include <sys/sysctl.h> + +static int lockpausetime = 0; +SYSCTL_INT(_debug, OID_AUTO, lockpausetime, CTLFLAG_RW, &lockpausetime, 0, ""); + +static int simplelockrecurse; + +/* + * Simple lock functions so that the debugger can see from whence + * they are being called. 
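+ * These are the uniprocessor (NCPUS == 1) debugging stubs: lock_data is
+ * used as a plain 0/1 flag, and the debug.lockpausetime sysctl picks the
+ * reaction to a violation, namely panic when -1, Debugger() when 1, a
+ * timed sleep when greater than 1, and just the printf otherwise.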
+ */ +void +simple_lock_init(alp) + struct simplelock *alp; +{ + + alp->lock_data = 0; +} + +void +_simple_lock(alp, id, l) + struct simplelock *alp; + const char *id; + int l; +{ + + if (simplelockrecurse) + return; + if (alp->lock_data == 1) { + if (lockpausetime == -1) + panic("%s:%d: simple_lock: lock held", id, l); + printf("%s:%d: simple_lock: lock held\n", id, l); + if (lockpausetime == 1) { + Debugger("simple_lock"); + /*BACKTRACE(curproc); */ + } else if (lockpausetime > 1) { + printf("%s:%d: simple_lock: lock held...", id, l); + tsleep(&lockpausetime, PCATCH | PPAUSE, "slock", + lockpausetime * hz); + printf(" continuing\n"); + } + } + alp->lock_data = 1; + if (curproc) + curproc->p_simple_locks++; +} + +int +_simple_lock_try(alp, id, l) + struct simplelock *alp; + const char *id; + int l; +{ + + if (alp->lock_data) + return (0); + if (simplelockrecurse) + return (1); + alp->lock_data = 1; + if (curproc) + curproc->p_simple_locks++; + return (1); +} + +void +_simple_unlock(alp, id, l) + struct simplelock *alp; + const char *id; + int l; +{ + + if (simplelockrecurse) + return; + if (alp->lock_data == 0) { + if (lockpausetime == -1) + panic("%s:%d: simple_unlock: lock not held", id, l); + printf("%s:%d: simple_unlock: lock not held\n", id, l); + if (lockpausetime == 1) { + Debugger("simple_unlock"); + /* BACKTRACE(curproc); */ + } else if (lockpausetime > 1) { + printf("%s:%d: simple_unlock: lock not held...", id, l); + tsleep(&lockpausetime, PCATCH | PPAUSE, "sunlock", + lockpausetime * hz); + printf(" continuing\n"); + } + } + alp->lock_data = 0; + if (curproc) + curproc->p_simple_locks--; +} +#elif defined(SIMPLELOCK_DEBUG) +#error "SIMPLELOCK_DEBUG is not compatible with SMP!" +#endif /* SIMPLELOCK_DEBUG && NCPUS == 1 */ diff --git a/sys/kern/kern_lockf.c b/sys/kern/kern_lockf.c new file mode 100644 index 0000000..cc1b8a5 --- /dev/null +++ b/sys/kern/kern_lockf.c @@ -0,0 +1,806 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Scooter Morris at Genentech Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94 + * $Id: kern_lockf.c,v 1.19 1998/07/29 17:38:14 bde Exp $ + */ + +#include "opt_debug_lockf.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/proc.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/fcntl.h> + +#include <sys/lockf.h> + +/* + * This variable controls the maximum number of processes that will + * be checked in doing deadlock detection. + */ +static int maxlockdepth = MAXDEPTH; + +#ifdef LOCKF_DEBUG +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> + + +static int lockf_debug = 0; +SYSCTL_INT(_debug, OID_AUTO, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, ""); +#endif + +static MALLOC_DEFINE(M_LOCKF, "lockf", "Byte-range locking structures"); + +#define NOLOCKF (struct lockf *)0 +#define SELF 0x1 +#define OTHERS 0x2 +static int lf_clearlock __P((struct lockf *)); +static int lf_findoverlap __P((struct lockf *, + struct lockf *, int, struct lockf ***, struct lockf **)); +static struct lockf * + lf_getblock __P((struct lockf *)); +static int lf_getlock __P((struct lockf *, struct flock *)); +static int lf_setlock __P((struct lockf *)); +static void lf_split __P((struct lockf *, struct lockf *)); +static void lf_wakelock __P((struct lockf *)); + +/* + * Advisory record locking support + */ +int +lf_advlock(ap, head, size) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; + struct lockf **head; + u_quad_t size; +{ + register struct flock *fl = ap->a_fl; + register struct lockf *lock; + off_t start, end; + int error; + + /* + * Convert the flock structure into a start and end. + */ + switch (fl->l_whence) { + + case SEEK_SET: + case SEEK_CUR: + /* + * Caller is responsible for adding any necessary offset + * when SEEK_CUR is used. + */ + start = fl->l_start; + break; + + case SEEK_END: + start = size + fl->l_start; + break; + + default: + return (EINVAL); + } + if (start < 0) + return (EINVAL); + if (fl->l_len == 0) + end = -1; + else { + end = start + fl->l_len - 1; + if (end < start) + return (EINVAL); + } + /* + * Avoid the common case of unlocking when inode has no locks. + */ + if (*head == (struct lockf *)0) { + if (ap->a_op != F_SETLK) { + fl->l_type = F_UNLCK; + return (0); + } + } + /* + * Create the lockf structure + */ + MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + lock->lf_start = start; + lock->lf_end = end; + lock->lf_id = ap->a_id; +/* lock->lf_inode = ip; */ /* XXX JH */ + lock->lf_type = fl->l_type; + lock->lf_head = head; + lock->lf_next = (struct lockf *)0; + TAILQ_INIT(&lock->lf_blkhd); + lock->lf_flags = ap->a_flags; + /* + * Do the requested operation. 
+ */ + switch(ap->a_op) { + case F_SETLK: + return (lf_setlock(lock)); + + case F_UNLCK: + error = lf_clearlock(lock); + FREE(lock, M_LOCKF); + return (error); + + case F_GETLK: + error = lf_getlock(lock, fl); + FREE(lock, M_LOCKF); + return (error); + + default: + free(lock, M_LOCKF); + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Set a byte-range lock. + */ +static int +lf_setlock(lock) + register struct lockf *lock; +{ + register struct lockf *block; + struct lockf **head = lock->lf_head; + struct lockf **prev, *overlap, *ltmp; + static char lockstr[] = "lockf"; + int ovcase, priority, needtolink, error; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_setlock", lock); +#endif /* LOCKF_DEBUG */ + + /* + * Set the priority + */ + priority = PLOCK; + if (lock->lf_type == F_WRLCK) + priority += 4; + priority |= PCATCH; + /* + * Scan lock list for this file looking for locks that would block us. + */ + while ((block = lf_getblock(lock))) { + /* + * Free the structure and return if nonblocking. + */ + if ((lock->lf_flags & F_WAIT) == 0) { + FREE(lock, M_LOCKF); + return (EAGAIN); + } + /* + * We are blocked. Since flock style locks cover + * the whole file, there is no chance for deadlock. + * For byte-range locks we must check for deadlock. + * + * Deadlock detection is done by looking through the + * wait channels to see if there are any cycles that + * involve us. MAXDEPTH is set just to make sure we + * do not go off into neverland. + */ + if ((lock->lf_flags & F_POSIX) && + (block->lf_flags & F_POSIX)) { + register struct proc *wproc; + register struct lockf *waitblock; + int i = 0; + + /* The block is waiting on something */ + wproc = (struct proc *)block->lf_id; + while (wproc->p_wchan && + (wproc->p_wmesg == lockstr) && + (i++ < maxlockdepth)) { + waitblock = (struct lockf *)wproc->p_wchan; + /* Get the owner of the blocking lock */ + waitblock = waitblock->lf_next; + if ((waitblock->lf_flags & F_POSIX) == 0) + break; + wproc = (struct proc *)waitblock->lf_id; + if (wproc == (struct proc *)lock->lf_id) { + free(lock, M_LOCKF); + return (EDEADLK); + } + } + } + /* + * For flock type locks, we must first remove + * any shared locks that we hold before we sleep + * waiting for an exclusive lock. + */ + if ((lock->lf_flags & F_FLOCK) && + lock->lf_type == F_WRLCK) { + lock->lf_type = F_UNLCK; + (void) lf_clearlock(lock); + lock->lf_type = F_WRLCK; + } + /* + * Add our lock to the blocked list and sleep until we're free. + * Remember who blocked us (for deadlock detection). + */ + lock->lf_next = block; + TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: blocking on", block); + lf_printlist("lf_setlock", block); + } +#endif /* LOCKF_DEBUG */ + if ((error = tsleep((caddr_t)lock, priority, lockstr, 0))) { + /* + * We may have been awakened by a signal (in + * which case we must remove ourselves from the + * blocked list) and/or by another process + * releasing a lock (in which case we have already + * been removed from the blocked list and our + * lf_next field set to NOLOCKF). + */ + if (lock->lf_next) + TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, + lf_block); + free(lock, M_LOCKF); + return (error); + } + } + /* + * No blocks!! Add the lock. Note that we will + * downgrade or upgrade any overlapping locks this + * process already owns. + * + * Skip over locks owned by other processes. + * Handle any locks that overlap and are owned by ourselves. 
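+	 * lf_findoverlap() classifies each of our own overlapping locks
+	 * into one of the six cases enumerated below; cases 3 and 4
+	 * continue the scan, the rest terminate it.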
+ */ + prev = head; + block = *head; + needtolink = 1; + for (;;) { + ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap); + if (ovcase) + block = overlap->lf_next; + /* + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + switch (ovcase) { + case 0: /* no overlap */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + break; + + case 1: /* overlap == lock */ + /* + * If downgrading lock, others may be + * able to acquire it. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) + lf_wakelock(overlap); + overlap->lf_type = lock->lf_type; + FREE(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + + case 2: /* overlap contains lock */ + /* + * Check for common starting point and different types. + */ + if (overlap->lf_type == lock->lf_type) { + free(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + } + if (overlap->lf_start == lock->lf_start) { + *prev = lock; + lock->lf_next = overlap; + overlap->lf_start = lock->lf_end + 1; + } else + lf_split(overlap, lock); + lf_wakelock(overlap); + break; + + case 3: /* lock contains overlap */ + /* + * If downgrading lock, others may be able to + * acquire it, otherwise take the list. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) { + lf_wakelock(overlap); + } else { + while (ltmp = overlap->lf_blkhd.tqh_first) { + TAILQ_REMOVE(&overlap->lf_blkhd, ltmp, + lf_block); + TAILQ_INSERT_TAIL(&lock->lf_blkhd, + ltmp, lf_block); + } + } + /* + * Add the new lock if necessary and delete the overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap->lf_next; + prev = &lock->lf_next; + needtolink = 0; + } else + *prev = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + /* + * Add lock after overlap on the list. + */ + lock->lf_next = overlap->lf_next; + overlap->lf_next = lock; + overlap->lf_end = lock->lf_start - 1; + prev = &lock->lf_next; + lf_wakelock(overlap); + needtolink = 0; + continue; + + case 5: /* overlap ends after lock */ + /* + * Add the new lock before overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + overlap->lf_start = lock->lf_end + 1; + lf_wakelock(overlap); + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: got the lock", lock); + lf_printlist("lf_setlock", lock); + } +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Remove a byte-range lock on an inode. + * + * Generally, find the lock (or an overlap to that lock) + * and remove it (or shrink it), then wakeup anyone we can. + */ +static int +lf_clearlock(unlock) + register struct lockf *unlock; +{ + struct lockf **head = unlock->lf_head; + register struct lockf *lf = *head; + struct lockf *overlap, **prev; + int ovcase; + + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (unlock->lf_type != F_UNLCK) + panic("lf_clearlock: bad type"); + if (lockf_debug & 1) + lf_print("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + prev = head; + while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap))) { + /* + * Wakeup the list of locks to be retried. 
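+ * Any process sleeping in lf_setlock() on the overlapping lock is
+ * awakened here and rescans the list, since the range it was
+ * blocked on may now be free.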
+ */ + lf_wakelock(overlap); + + switch (ovcase) { + + case 1: /* overlap == lock */ + *prev = overlap->lf_next; + FREE(overlap, M_LOCKF); + break; + + case 2: /* overlap contains lock: split it */ + if (overlap->lf_start == unlock->lf_start) { + overlap->lf_start = unlock->lf_end + 1; + break; + } + lf_split(overlap, unlock); + overlap->lf_next = unlock->lf_next; + break; + + case 3: /* lock contains overlap */ + *prev = overlap->lf_next; + lf = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + overlap->lf_end = unlock->lf_start - 1; + prev = &overlap->lf_next; + lf = overlap->lf_next; + continue; + + case 5: /* overlap ends after lock */ + overlap->lf_start = unlock->lf_end + 1; + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_printlist("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Check whether there is a blocking lock, + * and if so return its process identifier. + */ +static int +lf_getlock(lock, fl) + register struct lockf *lock; + register struct flock *fl; +{ + register struct lockf *block; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_getlock", lock); +#endif /* LOCKF_DEBUG */ + + if ((block = lf_getblock(lock))) { + fl->l_type = block->lf_type; + fl->l_whence = SEEK_SET; + fl->l_start = block->lf_start; + if (block->lf_end == -1) + fl->l_len = 0; + else + fl->l_len = block->lf_end - block->lf_start + 1; + if (block->lf_flags & F_POSIX) + fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; + else + fl->l_pid = -1; + } else { + fl->l_type = F_UNLCK; + } + return (0); +} + +/* + * Walk the list of locks for an inode and + * return the first blocking lock. + */ +static struct lockf * +lf_getblock(lock) + register struct lockf *lock; +{ + struct lockf **prev, *overlap, *lf = *(lock->lf_head); + int ovcase; + + prev = lock->lf_head; + while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap))) { + /* + * We've found an overlap, see if it blocks us + */ + if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) + return (overlap); + /* + * Nope, point to the next one on the list and + * see if it blocks us + */ + lf = overlap->lf_next; + } + return (NOLOCKF); +} + +/* + * Walk the list of locks for an inode to + * find an overlapping lock (if any). + * + * NOTE: this returns only the FIRST overlapping lock. There + * may be more than one. 
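+ * Callers that need every overlap (lf_setlock(), lf_clearlock() and
+ * lf_getblock()) simply step past the overlap just returned and
+ * call this routine again.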
+ */ +static int +lf_findoverlap(lf, lock, type, prev, overlap) + register struct lockf *lf; + struct lockf *lock; + int type; + struct lockf ***prev; + struct lockf **overlap; +{ + off_t start, end; + + *overlap = lf; + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_findoverlap: looking for overlap in", lock); +#endif /* LOCKF_DEBUG */ + start = lock->lf_start; + end = lock->lf_end; + while (lf != NOLOCKF) { + if (((type & SELF) && lf->lf_id != lock->lf_id) || + ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("\tchecking", lf); +#endif /* LOCKF_DEBUG */ + /* + * OK, check for overlap + * + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + if ((lf->lf_end != -1 && start > lf->lf_end) || + (end != -1 && lf->lf_start > end)) { + /* Case 0 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("no overlap\n"); +#endif /* LOCKF_DEBUG */ + if ((type & SELF) && end != -1 && lf->lf_start > end) + return (0); + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } + if ((lf->lf_start == start) && (lf->lf_end == end)) { + /* Case 1 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap == lock\n"); +#endif /* LOCKF_DEBUG */ + return (1); + } + if ((lf->lf_start <= start) && + (end != -1) && + ((lf->lf_end >= end) || (lf->lf_end == -1))) { + /* Case 2 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap contains lock\n"); +#endif /* LOCKF_DEBUG */ + return (2); + } + if (start <= lf->lf_start && + (end == -1 || + (lf->lf_end != -1 && end >= lf->lf_end))) { + /* Case 3 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("lock contains overlap\n"); +#endif /* LOCKF_DEBUG */ + return (3); + } + if ((lf->lf_start < start) && + ((lf->lf_end >= start) || (lf->lf_end == -1))) { + /* Case 4 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap starts before lock\n"); +#endif /* LOCKF_DEBUG */ + return (4); + } + if ((lf->lf_start > start) && + (end != -1) && + ((lf->lf_end > end) || (lf->lf_end == -1))) { + /* Case 5 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap ends after lock\n"); +#endif /* LOCKF_DEBUG */ + return (5); + } + panic("lf_findoverlap: default"); + } + return (0); +} + +/* + * Split a lock and a contained region into + * two or three locks as necessary. + */ +static void +lf_split(lock1, lock2) + register struct lockf *lock1; + register struct lockf *lock2; +{ + register struct lockf *splitlock; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) { + lf_print("lf_split", lock1); + lf_print("splitting from", lock2); + } +#endif /* LOCKF_DEBUG */ + /* + * Check to see if spliting into only two pieces. 
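+ * For example, carving [40..59] out of a lock covering [0..99]
+ * needs three pieces: [0..39], the new range, and [60..99].  If the
+ * two ranges share a start or an end, the original lock is merely
+ * trimmed and no extra piece is allocated.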
+ */ + if (lock1->lf_start == lock2->lf_start) { + lock1->lf_start = lock2->lf_end + 1; + lock2->lf_next = lock1; + return; + } + if (lock1->lf_end == lock2->lf_end) { + lock1->lf_end = lock2->lf_start - 1; + lock2->lf_next = lock1->lf_next; + lock1->lf_next = lock2; + return; + } + /* + * Make a new lock consisting of the last part of + * the encompassing lock + */ + MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK); + bcopy((caddr_t)lock1, (caddr_t)splitlock, sizeof *splitlock); + splitlock->lf_start = lock2->lf_end + 1; + TAILQ_INIT(&splitlock->lf_blkhd); + lock1->lf_end = lock2->lf_start - 1; + /* + * OK, now link it in + */ + splitlock->lf_next = lock1->lf_next; + lock2->lf_next = splitlock; + lock1->lf_next = lock2; +} + +/* + * Wakeup a blocklist + */ +static void +lf_wakelock(listhead) + struct lockf *listhead; +{ + register struct lockf *wakelock; + + while (wakelock = listhead->lf_blkhd.tqh_first) { + TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); + wakelock->lf_next = NOLOCKF; +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_wakelock: awakening", wakelock); +#endif /* LOCKF_DEBUG */ + wakeup((caddr_t)wakelock); + } +} + +#ifdef LOCKF_DEBUG +/* + * Print out a lock. + */ +void +lf_print(tag, lock) + char *tag; + register struct lockf *lock; +{ + + printf("%s: lock %p for ", tag, (void *)lock); + if (lock->lf_flags & F_POSIX) + printf("proc %ld", (long)((struct proc *)lock->lf_id)->p_pid); + else + printf("id %p", (void *)lock->lf_id); + /* XXX no %qd in kernel. Truncate. */ + printf(" in ino %lu on dev <%d, %d>, %s, start %ld, end %ld", + (u_long)lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev), + lock->lf_type == F_RDLCK ? "shared" : + lock->lf_type == F_WRLCK ? "exclusive" : + lock->lf_type == F_UNLCK ? "unlock" : + "unknown", (long)lock->lf_start, (long)lock->lf_end); + if (lock->lf_blkhd.tqh_first) + printf(" block %p\n", (void *)lock->lf_blkhd.tqh_first); + else + printf("\n"); +} + +void +lf_printlist(tag, lock) + char *tag; + struct lockf *lock; +{ + register struct lockf *lf, *blk; + + printf("%s: Lock list for ino %lu on dev <%d, %d>:\n", + tag, (u_long)lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev)); + for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { + printf("\tlock %p for ",(void *)lf); + if (lf->lf_flags & F_POSIX) + printf("proc %ld", + (long)((struct proc *)lf->lf_id)->p_pid); + else + printf("id %p", (void *)lf->lf_id); + /* XXX no %qd in kernel. Truncate. */ + printf(", %s, start %ld, end %ld", + lf->lf_type == F_RDLCK ? "shared" : + lf->lf_type == F_WRLCK ? "exclusive" : + lf->lf_type == F_UNLCK ? "unlock" : + "unknown", (long)lf->lf_start, (long)lf->lf_end); + for (blk = lf->lf_blkhd.tqh_first; blk; + blk = blk->lf_block.tqe_next) { + printf("\n\t\tlock request %p for ", (void *)blk); + if (blk->lf_flags & F_POSIX) + printf("proc %ld", + (long)((struct proc *)blk->lf_id)->p_pid); + else + printf("id %p", (void *)blk->lf_id); + /* XXX no %qd in kernel. Truncate. */ + printf(", %s, start %ld, end %ld", + blk->lf_type == F_RDLCK ? "shared" : + blk->lf_type == F_WRLCK ? "exclusive" : + blk->lf_type == F_UNLCK ? 
"unlock" : + "unknown", (long)blk->lf_start, + (long)blk->lf_end); + if (blk->lf_blkhd.tqh_first) + panic("lf_printlist: bad list"); + } + printf("\n"); + } +} +#endif /* LOCKF_DEBUG */ diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c new file mode 100644 index 0000000..be9f9d3 --- /dev/null +++ b/sys/kern/kern_malloc.c @@ -0,0 +1,501 @@ +/* + * Copyright (c) 1987, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94 + * $Id: kern_malloc.c,v 1.51 1999/01/10 01:58:24 eivind Exp $ + */ + +#include "opt_vm.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#define MALLOC_INSTANTIATE +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/vmmeter.h> +#include <sys/lock.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +static void kmeminit __P((void *)); +SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL) + +static MALLOC_DEFINE(M_FREE, "free", "should be on free list"); + +static struct malloc_type *kmemstatistics; +static struct kmembuckets bucket[MINBUCKET + 16]; +static struct kmemusage *kmemusage; +static char *kmembase; +static char *kmemlimit; +static int vm_kmem_size; + +#ifdef INVARIANTS +/* + * This structure provides a set of masks to catch unaligned frees. + */ +static long addrmask[] = { 0, + 0x00000001, 0x00000003, 0x00000007, 0x0000000f, + 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff, + 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, + 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, +}; + +/* + * The WEIRD_ADDR is used as known text to copy into free objects so + * that modifications after frees can be detected. 
+ */ +#define WEIRD_ADDR 0xdeadc0de +#define MAX_COPY 64 + +/* + * Normally the first word of the structure is used to hold the list + * pointer for free objects. However, when running with diagnostics, + * we use the third and fourth fields, so as to catch modifications + * in the most commonly trashed first two words. + */ +struct freelist { + long spare0; + struct malloc_type *type; + long spare1; + caddr_t next; +}; +#else /* !INVARIANTS */ +struct freelist { + caddr_t next; +}; +#endif /* INVARIANTS */ + +/* + * malloc: + * + * Allocate a block of memory. + * + * If M_NOWAIT is set, this routine will not block and return NULL if + * the allocation fails. + * + * If M_ASLEEP is set (M_NOWAIT must also be set), this routine + * will have the side effect of calling asleep() if it returns NULL, + * allowing the parent to await() at some future time. + */ +void * +malloc(size, type, flags) + unsigned long size; + struct malloc_type *type; + int flags; +{ + register struct kmembuckets *kbp; + register struct kmemusage *kup; + register struct freelist *freep; + long indx, npg, allocsize; + int s; + caddr_t va, cp, savedlist; +#ifdef INVARIANTS + long *end, *lp; + int copysize; + char *savedtype; +#endif + register struct malloc_type *ksp = type; + + /* + * Must be at splmem() prior to initializing segment to handle + * potential initialization race. + */ + + s = splmem(); + + if (!type->ks_next) { + malloc_init(type); + } + + indx = BUCKETINDX(size); + kbp = &bucket[indx]; + + while (ksp->ks_memuse >= ksp->ks_limit) { + if (flags & M_ASLEEP) { + if (ksp->ks_limblocks < 65535) + ksp->ks_limblocks++; + asleep((caddr_t)ksp, PSWP+2, type->ks_shortdesc, 0); + } + if (flags & M_NOWAIT) { + splx(s); + return ((void *) NULL); + } + if (ksp->ks_limblocks < 65535) + ksp->ks_limblocks++; + tsleep((caddr_t)ksp, PSWP+2, type->ks_shortdesc, 0); + } + ksp->ks_size |= 1 << indx; +#ifdef INVARIANTS + copysize = 1 << indx < MAX_COPY ? 1 << indx : MAX_COPY; +#endif + if (kbp->kb_next == NULL) { + kbp->kb_last = NULL; + if (size > MAXALLOCSAVE) + allocsize = roundup(size, PAGE_SIZE); + else + allocsize = 1 << indx; + npg = btoc(allocsize); + va = (caddr_t) kmem_malloc(kmem_map, (vm_size_t)ctob(npg), flags); + if (va == NULL) { + splx(s); + return ((void *) NULL); + } + kbp->kb_total += kbp->kb_elmpercl; + kup = btokup(va); + kup->ku_indx = indx; + if (allocsize > MAXALLOCSAVE) { + if (npg > 65535) + panic("malloc: allocation too large"); + kup->ku_pagecnt = npg; + ksp->ks_memuse += allocsize; + goto out; + } + kup->ku_freecnt = kbp->kb_elmpercl; + kbp->kb_totalfree += kbp->kb_elmpercl; + /* + * Just in case we blocked while allocating memory, + * and someone else also allocated memory for this + * bucket, don't assume the list is still empty. + */ + savedlist = kbp->kb_next; + kbp->kb_next = cp = va + (npg * PAGE_SIZE) - allocsize; + for (;;) { + freep = (struct freelist *)cp; +#ifdef INVARIANTS + /* + * Copy in known text to detect modification + * after freeing. 
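+ * (Only the first `copysize' bytes -- at most MAX_COPY -- of each
+ * element carved from the new page run are filled with WEIRD_ADDR;
+ * when the element is later handed out, malloc() checks those words
+ * and reports any change as "Data modified on freelist".)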
+ */ + end = (long *)&cp[copysize]; + for (lp = (long *)cp; lp < end; lp++) + *lp = WEIRD_ADDR; + freep->type = M_FREE; +#endif /* INVARIANTS */ + if (cp <= va) + break; + cp -= allocsize; + freep->next = cp; + } + freep->next = savedlist; + if (kbp->kb_last == NULL) + kbp->kb_last = (caddr_t)freep; + } + va = kbp->kb_next; + kbp->kb_next = ((struct freelist *)va)->next; +#ifdef INVARIANTS + freep = (struct freelist *)va; + savedtype = (char *) type->ks_shortdesc; +#if BYTE_ORDER == BIG_ENDIAN + freep->type = (struct malloc_type *)WEIRD_ADDR >> 16; +#endif +#if BYTE_ORDER == LITTLE_ENDIAN + freep->type = (struct malloc_type *)WEIRD_ADDR; +#endif + if ((intptr_t)(void *)&freep->next & 0x2) + freep->next = (caddr_t)((WEIRD_ADDR >> 16)|(WEIRD_ADDR << 16)); + else + freep->next = (caddr_t)WEIRD_ADDR; + end = (long *)&va[copysize]; + for (lp = (long *)va; lp < end; lp++) { + if (*lp == WEIRD_ADDR) + continue; + printf("%s %ld of object %p size %lu %s %s (0x%lx != 0x%lx)\n", + "Data modified on freelist: word", + (long)(lp - (long *)va), (void *)va, size, + "previous type", savedtype, *lp, (u_long)WEIRD_ADDR); + break; + } + freep->spare0 = 0; +#endif /* INVARIANTS */ + kup = btokup(va); + if (kup->ku_indx != indx) + panic("malloc: wrong bucket"); + if (kup->ku_freecnt == 0) + panic("malloc: lost data"); + kup->ku_freecnt--; + kbp->kb_totalfree--; + ksp->ks_memuse += 1 << indx; +out: + kbp->kb_calls++; + ksp->ks_inuse++; + ksp->ks_calls++; + if (ksp->ks_memuse > ksp->ks_maxused) + ksp->ks_maxused = ksp->ks_memuse; + splx(s); + return ((void *) va); +} + +/* + * free: + * + * Free a block of memory allocated by malloc. + * + * This routine may not block. + */ +void +free(addr, type) + void *addr; + struct malloc_type *type; +{ + register struct kmembuckets *kbp; + register struct kmemusage *kup; + register struct freelist *freep; + long size; + int s; +#ifdef INVARIANTS + struct freelist *fp; + long *end, *lp, alloc, copysize; +#endif + register struct malloc_type *ksp = type; + + if (!type->ks_next) + panic("freeing with unknown type (%s)", type->ks_shortdesc); + + KASSERT(kmembase <= (char *)addr && (char *)addr < kmemlimit, + ("free: address %p out of range", (void *)addr)); + kup = btokup(addr); + size = 1 << kup->ku_indx; + kbp = &bucket[kup->ku_indx]; + s = splmem(); +#ifdef INVARIANTS + /* + * Check for returns of data that do not point to the + * beginning of the allocation. + */ + if (size > PAGE_SIZE) + alloc = addrmask[BUCKETINDX(PAGE_SIZE)]; + else + alloc = addrmask[kup->ku_indx]; + if (((uintptr_t)(void *)addr & alloc) != 0) + panic("free: unaligned addr %p, size %ld, type %s, mask %ld", + (void *)addr, size, type->ks_shortdesc, alloc); +#endif /* INVARIANTS */ + if (size > MAXALLOCSAVE) { + kmem_free(kmem_map, (vm_offset_t)addr, ctob(kup->ku_pagecnt)); + size = kup->ku_pagecnt << PAGE_SHIFT; + ksp->ks_memuse -= size; + kup->ku_indx = 0; + kup->ku_pagecnt = 0; + if (ksp->ks_memuse + size >= ksp->ks_limit && + ksp->ks_memuse < ksp->ks_limit) + wakeup((caddr_t)ksp); + ksp->ks_inuse--; + kbp->kb_total -= 1; + splx(s); + return; + } + freep = (struct freelist *)addr; +#ifdef INVARIANTS + /* + * Check for multiple frees. Use a quick check to see if + * it looks free before laboriously searching the freelist. 
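+ * If the first word already holds WEIRD_ADDR the object looks free,
+ * so the bucket's free list is searched: finding the same address
+ * there means free() was called twice on it, which is a panic.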
+ */ + if (freep->spare0 == WEIRD_ADDR) { + fp = (struct freelist *)kbp->kb_next; + while (fp) { + if (fp->spare0 != WEIRD_ADDR) + panic("free: free item %p modified", fp); + else if (addr == (caddr_t)fp) + panic("free: multiple freed item %p", addr); + fp = (struct freelist *)fp->next; + } + } + /* + * Copy in known text to detect modification after freeing + * and to make it look free. Also, save the type being freed + * so we can list likely culprit if modification is detected + * when the object is reallocated. + */ + copysize = size < MAX_COPY ? size : MAX_COPY; + end = (long *)&((caddr_t)addr)[copysize]; + for (lp = (long *)addr; lp < end; lp++) + *lp = WEIRD_ADDR; + freep->type = type; +#endif /* INVARIANTS */ + kup->ku_freecnt++; + if (kup->ku_freecnt >= kbp->kb_elmpercl) + if (kup->ku_freecnt > kbp->kb_elmpercl) + panic("free: multiple frees"); + else if (kbp->kb_totalfree > kbp->kb_highwat) + kbp->kb_couldfree++; + kbp->kb_totalfree++; + ksp->ks_memuse -= size; + if (ksp->ks_memuse + size >= ksp->ks_limit && + ksp->ks_memuse < ksp->ks_limit) + wakeup((caddr_t)ksp); + ksp->ks_inuse--; +#ifdef OLD_MALLOC_MEMORY_POLICY + if (kbp->kb_next == NULL) + kbp->kb_next = addr; + else + ((struct freelist *)kbp->kb_last)->next = addr; + freep->next = NULL; + kbp->kb_last = addr; +#else + /* + * Return memory to the head of the queue for quick reuse. This + * can improve performance by improving the probability of the + * item being in the cache when it is reused. + */ + if (kbp->kb_next == NULL) { + kbp->kb_next = addr; + kbp->kb_last = addr; + freep->next = NULL; + } else { + freep->next = kbp->kb_next; + kbp->kb_next = addr; + } +#endif + splx(s); +} + +/* + * Initialize the kernel memory allocator + */ +/* ARGSUSED*/ +static void +kmeminit(dummy) + void *dummy; +{ + register long indx; + int npg; + int mem_size; + +#if ((MAXALLOCSAVE & (MAXALLOCSAVE - 1)) != 0) +#error "kmeminit: MAXALLOCSAVE not power of 2" +#endif +#if (MAXALLOCSAVE > MINALLOCSIZE * 32768) +#error "kmeminit: MAXALLOCSAVE too big" +#endif +#if (MAXALLOCSAVE < PAGE_SIZE) +#error "kmeminit: MAXALLOCSAVE too small" +#endif + + /* + * Try to auto-tune the kernel memory size, so that it is + * more applicable for a wider range of machine sizes. + * On an X86, a VM_KMEM_SIZE_SCALE value of 4 is good, while + * a VM_KMEM_SIZE of 12MB is a fair compromise. The + * VM_KMEM_SIZE_MAX is dependent on the maximum KVA space + * available, and on an X86 with a total KVA space of 256MB, + * try to keep VM_KMEM_SIZE_MAX at 80MB or below. + * + * Note that the kmem_map is also used by the zone allocator, + * so make sure that there is enough space. 
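+ * For example, with 64MB of physical memory and the x86 scale of 4
+ * suggested above, vm_kmem_size is raised from the compile-time
+ * VM_KMEM_SIZE to 16MB; the VM_KMEM_SIZE_MAX and twice-physical-
+ * memory caps below still apply.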
+ */ + vm_kmem_size = VM_KMEM_SIZE; + mem_size = cnt.v_page_count * PAGE_SIZE; + +#if defined(VM_KMEM_SIZE_SCALE) + if ((mem_size / VM_KMEM_SIZE_SCALE) > vm_kmem_size) + vm_kmem_size = mem_size / VM_KMEM_SIZE_SCALE; +#endif + +#if defined(VM_KMEM_SIZE_MAX) + if (vm_kmem_size >= VM_KMEM_SIZE_MAX) + vm_kmem_size = VM_KMEM_SIZE_MAX; +#endif + + if (vm_kmem_size > 2 * (cnt.v_page_count * PAGE_SIZE)) + vm_kmem_size = 2 * (cnt.v_page_count * PAGE_SIZE); + + npg = (nmbufs * MSIZE + nmbclusters * MCLBYTES + vm_kmem_size) + / PAGE_SIZE; + + kmemusage = (struct kmemusage *) kmem_alloc(kernel_map, + (vm_size_t)(npg * sizeof(struct kmemusage))); + kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase, + (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE)); + kmem_map->system_map = 1; + for (indx = 0; indx < MINBUCKET + 16; indx++) { + if (1 << indx >= PAGE_SIZE) + bucket[indx].kb_elmpercl = 1; + else + bucket[indx].kb_elmpercl = PAGE_SIZE / (1 << indx); + bucket[indx].kb_highwat = 5 * bucket[indx].kb_elmpercl; + } +} + +void +malloc_init(data) + void *data; +{ + struct malloc_type *type = (struct malloc_type *)data; + + if (type->ks_magic != M_MAGIC) + panic("malloc type lacks magic"); + + if (type->ks_next) + return; + + if (cnt.v_page_count == 0) + panic("malloc_init not allowed before vm init"); + + /* + * The default limits for each malloc region is 1/2 of the + * malloc portion of the kmem map size. + */ + type->ks_limit = vm_kmem_size / 2; + type->ks_next = kmemstatistics; + kmemstatistics = type; +} + +void +malloc_uninit(data) + void *data; +{ + struct malloc_type *type = (struct malloc_type *)data; + struct malloc_type *t; + + if (type->ks_magic != M_MAGIC) + panic("malloc type lacks magic"); + + if (cnt.v_page_count == 0) + panic("malloc_uninit not allowed before vm init"); + + if (type == kmemstatistics) + kmemstatistics = type->ks_next; + else { + for (t = kmemstatistics; t->ks_next != NULL; t = t->ks_next) { + if (t->ks_next == type) { + t->ks_next = type->ks_next; + break; + } + } + } +} diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c new file mode 100644 index 0000000..22fcd33 --- /dev/null +++ b/sys/kern/kern_mib.c @@ -0,0 +1,182 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Karels at Berkeley Software Design, Inc. + * + * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD + * project, to make these variables more userfriendly. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 + * $Id: kern_mib.c,v 1.15 1998/03/28 11:49:52 dufault Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/unistd.h> + +#if defined(SMP) +#include <machine/smp.h> +#endif + +SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, + "Sysctl internal magic"); +SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0, + "High kernel, proc, limits &c"); +SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0, + "Virtual memory"); +SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0, + "File system"); +SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0, + "Network, (see socket.h)"); +SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0, + "Debugging"); +SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0, + "hardware"); +SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0, + "machine dependent"); +SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0, + "user-level"); + +SYSCTL_NODE(, CTL_P1003_1B, p1003_1b, CTLFLAG_RW, 0, + "p1003_1b, (see p1003_1b.h)"); + +SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD, osrelease, 0, ""); + +SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD, 0, BSD, ""); + +SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD, version, 0, ""); + +SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD, ostype, 0, ""); + +extern int osreldate; +SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD, &osreldate, 0, ""); + +SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RD, &maxproc, 0, ""); + +SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, + CTLFLAG_RW, &maxprocperuid, 0, ""); + +SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD, 0, ARG_MAX, ""); + +SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD, 0, _KPOSIX_VERSION, ""); + +SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD, 0, NGROUPS_MAX, ""); + +SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD, 0, 1, ""); + +#ifdef _POSIX_SAVED_IDS +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 1, ""); +#else +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 0, ""); +#endif + +char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */ + +SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, + CTLFLAG_RW, kernelname, sizeof kernelname, ""); + +#ifdef SMP +SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, &mp_ncpus, 0, ""); +#else +SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, 0, 1, ""); +#endif + +SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD, 0, BYTE_ORDER, ""); + +SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD, 0, PAGE_SIZE, ""); + +static char machine_arch[] = MACHINE_ARCH; +SYSCTL_STRING(_hw, HW_MACHINE_ARCH, machine_arch, CTLFLAG_RD, + machine_arch, 0, ""); + +char hostname[MAXHOSTNAMELEN]; + +SYSCTL_STRING(_kern, KERN_HOSTNAME, hostname, CTLFLAG_RW, + 
hostname, sizeof(hostname), ""); + +int securelevel = -1; + +static int +sysctl_kern_securelvl SYSCTL_HANDLER_ARGS +{ + int error, level; + + level = securelevel; + error = sysctl_handle_int(oidp, &level, 0, req); + if (error || !req->newptr) + return (error); + if (level < securelevel) + return (EPERM); + securelevel = level; + return (error); +} + +SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, CTLTYPE_INT|CTLFLAG_RW, + 0, 0, sysctl_kern_securelvl, "I", ""); + +char domainname[MAXHOSTNAMELEN]; +SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW, + &domainname, sizeof(domainname), ""); + +long hostid; +/* Some trouble here, if sizeof (int) != sizeof (long) */ +SYSCTL_INT(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, ""); + +/* + * This is really cheating. These actually live in the libc, something + * which I'm not quite sure is a good idea anyway, but in order for + * getnext and friends to actually work, we define dummies here. + */ +SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, "", 0, ""); +SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, 0, 0, ""); diff --git a/sys/kern/kern_module.c b/sys/kern/kern_module.c new file mode 100644 index 0000000..afe9f2e --- /dev/null +++ b/sys/kern/kern_module.c @@ -0,0 +1,330 @@ +/*- + * Copyright (c) 1997 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_module.c,v 1.13 1999/01/09 14:59:50 dfr Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> +#include <sys/module.h> +#include <sys/linker.h> +#include <sys/proc.h> + +#define M_MODULE M_TEMP /* XXX */ + +typedef TAILQ_HEAD(, module) modulelist_t; +struct module { + TAILQ_ENTRY(module) link; /* chain together all modules */ + TAILQ_ENTRY(module) flink; /* all modules in a file */ + struct linker_file* file; /* file which contains this module */ + int refs; /* reference count */ + int id; /* unique id number */ + char *name; /* module name */ + modeventhand_t handler; /* event handler */ + void *arg; /* argument for handler */ + modspecific_t data; /* module specific data */ +}; + +#define MOD_EVENT(mod, type) (mod)->handler((mod), (type), (mod)->arg) + +static modulelist_t modules; +static int nextid = 1; + +static void module_shutdown(int, void*); + +static void +module_init(void* arg) +{ + TAILQ_INIT(&modules); + at_shutdown(module_shutdown, 0, SHUTDOWN_POST_SYNC); +} + +SYSINIT(module, SI_SUB_KLD, SI_ORDER_FIRST, module_init, 0); + +static void +module_shutdown(int arg1, void* arg2) +{ + module_t mod; + + for (mod = TAILQ_FIRST(&modules); mod; mod = TAILQ_NEXT(mod, link)) + MOD_EVENT(mod, MOD_SHUTDOWN); +} + +void +module_register_init(void *arg) +{ + moduledata_t* data = (moduledata_t*) arg; + int error; + + error = module_register(data->name, data->evhand, data->priv, data->_file); + if (error) + printf("module_register_init: module_register(%s, %lx, %p) error %d\n", + data->name, (u_long)(uintfptr_t)data->evhand, data->priv, error); +} + +int +module_register(const char* name, modeventhand_t handler, void* arg, void *file) +{ + size_t namelen; + module_t newmod; + int error; + linker_file_t container = file; + + namelen = strlen(name) + 1; + newmod = (module_t) malloc(sizeof(struct module) + namelen, + M_MODULE, M_WAITOK); + if (newmod == 0) + return ENOMEM; + + newmod->refs = 1; + newmod->id = nextid++; + newmod->name = (char *) (newmod + 1); + strcpy(newmod->name, name); + newmod->handler = handler; + newmod->arg = arg; + bzero(&newmod->data, sizeof(newmod->data)); + TAILQ_INSERT_TAIL(&modules, newmod, link); + + if (container == NULL) + container = linker_current_file; + if (container) { + TAILQ_INSERT_TAIL(&container->modules, newmod, flink); + newmod->file = container; + } else + newmod->file = 0; + + if (error = MOD_EVENT(newmod, MOD_LOAD)) { + MOD_EVENT(newmod, MOD_UNLOAD); + module_release(newmod); + return error; + } + + return 0; +} + +void +module_reference(module_t mod) +{ + MOD_DPF(REFS, ("module_reference: before, refs=%d\n", mod->refs)); + + mod->refs++; +} + +void +module_release(module_t mod) +{ + if (mod->refs <= 0) + panic("module_release: bad reference count"); + + MOD_DPF(REFS, ("module_release: before, refs=%d\n", mod->refs)); + + mod->refs--; + if (mod->refs == 0) { + TAILQ_REMOVE(&modules, mod, link); + if 
(mod->file) { + TAILQ_REMOVE(&mod->file->modules, mod, flink); + } + free(mod, M_MODULE); + } +} + +module_t +module_lookupbyname(const char* name) +{ + module_t mod; + + for (mod = TAILQ_FIRST(&modules); mod; mod = TAILQ_NEXT(mod, link)) { + if (!strcmp(mod->name, name)) + return mod; + } + + return 0; +} + +module_t +module_lookupbyid(int modid) +{ + module_t mod; + + for (mod = TAILQ_FIRST(&modules); mod; mod = TAILQ_NEXT(mod, link)) { + if (mod->id == modid) + return mod; + } + + return 0; +} + +int +module_unload(module_t mod) +{ + return MOD_EVENT(mod, MOD_UNLOAD); +} + +int +module_getid(module_t mod) +{ + return mod->id; +} + +module_t +module_getfnext(module_t mod) +{ + return TAILQ_NEXT(mod, flink); +} + +void +module_setspecific(module_t mod, modspecific_t *datap) +{ + mod->data = *datap; +} + +/* + * Syscalls. + */ +int +modnext(struct proc* p, struct modnext_args* uap) +{ + module_t mod; + + p->p_retval[0] = -1; + if (SCARG(uap, modid) == 0) { + mod = TAILQ_FIRST(&modules); + if (mod) { + p->p_retval[0] = mod->id; + return 0; + } else + return ENOENT; + } + + mod = module_lookupbyid(SCARG(uap, modid)); + if (!mod) + return ENOENT; + + if (TAILQ_NEXT(mod, link)) + p->p_retval[0] = TAILQ_NEXT(mod, link)->id; + else + p->p_retval[0] = 0; + return 0; +} + +int +modfnext(struct proc* p, struct modfnext_args* uap) +{ + module_t mod; + + p->p_retval[0] = -1; + + mod = module_lookupbyid(SCARG(uap, modid)); + if (!mod) + return ENOENT; + + if (TAILQ_NEXT(mod, flink)) + p->p_retval[0] = TAILQ_NEXT(mod, flink)->id; + else + p->p_retval[0] = 0; + return 0; +} + +struct module_stat_v1 { + int version; /* set to sizeof(struct module_stat) */ + char name[MAXMODNAME]; + int refs; + int id; +}; + +int +modstat(struct proc* p, struct modstat_args* uap) +{ + module_t mod; + int error = 0; + int namelen; + int version; + struct module_stat* stat; + + mod = module_lookupbyid(SCARG(uap, modid)); + if (!mod) + return ENOENT; + + stat = SCARG(uap, stat); + + /* + * Check the version of the user's structure. + */ + if (error = copyin(&stat->version, &version, sizeof(version))) + goto out; + if (version != sizeof(struct module_stat_v1) + && version != sizeof(struct module_stat)) { + error = EINVAL; + goto out; + } + + namelen = strlen(mod->name) + 1; + if (namelen > MAXMODNAME) + namelen = MAXMODNAME; + if (error = copyout(mod->name, &stat->name[0], namelen)) + goto out; + + if (error = copyout(&mod->refs, &stat->refs, sizeof(int))) + goto out; + if (error = copyout(&mod->id, &stat->id, sizeof(int))) + goto out; + + /* + * >v1 stat includes module data. + */ + if (version == sizeof(struct module_stat)) { + if (error = copyout(&mod->data, &stat->data, sizeof(mod->data))) + goto out; + } + + p->p_retval[0] = 0; + +out: + return error; +} + +int +modfind(struct proc* p, struct modfind_args* uap) +{ + int error = 0; + char name[MAXMODNAME]; + module_t mod; + + if (error = copyinstr(SCARG(uap, name), name, sizeof name, 0)) + goto out; + + mod = module_lookupbyname(name); + if (!mod) + error = ENOENT; + else + p->p_retval[0] = mod->id; + +out: + return error; +} diff --git a/sys/kern/kern_ntptime.c b/sys/kern/kern_ntptime.c new file mode 100644 index 0000000..2f4114d --- /dev/null +++ b/sys/kern/kern_ntptime.c @@ -0,0 +1,856 @@ +/****************************************************************************** + * * + * Copyright (c) David L. 
Mills 1993, 1994 * + * * + * Permission to use, copy, modify, and distribute this software and its * + * documentation for any purpose and without fee is hereby granted, provided * + * that the above copyright notice appears in all copies and that both the * + * copyright notice and this permission notice appear in supporting * + * documentation, and that the name University of Delaware not be used in * + * advertising or publicity pertaining to distribution of the software * + * without specific, written prior permission. The University of Delaware * + * makes no representations about the suitability this software for any * + * purpose. It is provided "as is" without express or implied warranty. * + * * + ******************************************************************************/ + +/* + * Modification history kern_ntptime.c + * + * 24 Sep 94 David L. Mills + * Tightened code at exits. + * + * 24 Mar 94 David L. Mills + * Revised syscall interface to include new variables for PPS + * time discipline. + * + * 14 Feb 94 David L. Mills + * Added code for external clock + * + * 28 Nov 93 David L. Mills + * Revised frequency scaling to conform with adjusted parameters + * + * 17 Sep 93 David L. Mills + * Created file + */ +/* + * ntp_gettime(), ntp_adjtime() - precision time interface for SunOS + * V4.1.1 and V4.1.3 + * + * These routines consitute the Network Time Protocol (NTP) interfaces + * for user and daemon application programs. The ntp_gettime() routine + * provides the time, maximum error (synch distance) and estimated error + * (dispersion) to client user application programs. The ntp_adjtime() + * routine is used by the NTP daemon to adjust the system clock to an + * externally derived time. The time offset and related variables set by + * this routine are used by hardclock() to adjust the phase and + * frequency of the phase-lock loop which controls the system clock. + */ + +#include "opt_ntp.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/timex.h> +#include <sys/timepps.h> +#include <sys/sysctl.h> + +/* + * Phase/frequency-lock loop (PLL/FLL) definitions + * + * The following variables are read and set by the ntp_adjtime() system + * call. + * + * time_state shows the state of the system clock, with values defined + * in the timex.h header file. + * + * time_status shows the status of the system clock, with bits defined + * in the timex.h header file. + * + * time_offset is used by the PLL/FLL to adjust the system time in small + * increments. + * + * time_constant determines the bandwidth or "stiffness" of the PLL. + * + * time_tolerance determines maximum frequency error or tolerance of the + * CPU clock oscillator and is a property of the architecture; however, + * in principle it could change as result of the presence of external + * discipline signals, for instance. + * + * time_precision is usually equal to the kernel tick variable; however, + * in cases where a precision clock counter or external clock is + * available, the resolution can be much less than this and depend on + * whether the external clock is working or not. + * + * time_maxerror is initialized by a ntp_adjtime() call and increased by + * the kernel once each second to reflect the maximum error + * bound growth. + * + * time_esterror is set and read by the ntp_adjtime() call, but + * otherwise not used by the kernel. 
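+ * Internally time_offset is kept left-shifted by SHIFT_UPDATE and
+ * time_freq in scaled-ppm units (SHIFT_USEC); hardupdate() and
+ * ntp_update_second() below do the conversions.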
+ */ +static int time_status = STA_UNSYNC; /* clock status bits */ +static int time_state = TIME_OK; /* clock state */ +static long time_offset = 0; /* time offset (us) */ +static long time_constant = 0; /* pll time constant */ +static long time_tolerance = MAXFREQ; /* frequency tolerance (scaled ppm) */ +static long time_precision = 1; /* clock precision (us) */ +static long time_maxerror = MAXPHASE; /* maximum error (us) */ +static long time_esterror = MAXPHASE; /* estimated error (us) */ +static int time_daemon = 0; /* No timedaemon active */ + +/* + * The following variables establish the state of the PLL/FLL and the + * residual time and frequency offset of the local clock. The scale + * factors are defined in the timex.h header file. + * + * time_phase and time_freq are the phase increment and the frequency + * increment, respectively, of the kernel time variable at each tick of + * the clock. + * + * time_freq is set via ntp_adjtime() from a value stored in a file when + * the synchronization daemon is first started. Its value is retrieved + * via ntp_adjtime() and written to the file about once per hour by the + * daemon. + * + * time_adj is the adjustment added to the value of tick at each timer + * interrupt and is recomputed from time_phase and time_freq at each + * seconds rollover. + * + * time_reftime is the second's portion of the system time on the last + * call to ntp_adjtime(). It is used to adjust the time_freq variable + * and to increase the time_maxerror as the time since last update + * increases. + */ +long time_phase = 0; /* phase offset (scaled us) */ +static long time_freq = 0; /* frequency offset (scaled ppm) */ +long time_adj = 0; /* tick adjust (scaled 1 / hz) */ +static long time_reftime = 0; /* time at last adjustment (s) */ + +#ifdef PPS_SYNC +/* + * The following variables are used only if the kernel PPS discipline + * code is configured (PPS_SYNC). The scale factors are defined in the + * timex.h header file. + * + * pps_time contains the time at each calibration interval, as read by + * microtime(). pps_count counts the seconds of the calibration + * interval, the duration of which is nominally pps_shift in powers of + * two. + * + * pps_offset is the time offset produced by the time median filter + * pps_tf[], while pps_jitter is the dispersion (jitter) measured by + * this filter. + * + * pps_freq is the frequency offset produced by the frequency median + * filter pps_ff[], while pps_stabil is the dispersion (wander) measured + * by this filter. + * + * pps_usec is latched from a high resolution counter or external clock + * at pps_time. Here we want the hardware counter contents only, not the + * contents plus the time_tv.usec as usual. + * + * pps_valid counts the number of seconds since the last PPS update. It + * is used as a watchdog timer to disable the PPS discipline should the + * PPS signal be lost. + * + * pps_glitch counts the number of seconds since the beginning of an + * offset burst more than tick/2 from current nominal offset. It is used + * mainly to suppress error bursts due to priority conflicts between the + * PPS interrupt and timer interrupt. + * + * pps_intcnt counts the calibration intervals for use in the interval- + * adaptation algorithm. It's just too complicated for words. 
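+ * Both pps_tf[] and pps_ff[] are three-sample median filters; the
+ * MEDIAN3() macro and hardpps() below show how the median and the
+ * dispersion of each triple are extracted.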
+ */ +static struct timeval pps_time; /* kernel time at last interval */ +static long pps_offset = 0; /* pps time offset (us) */ +static long pps_jitter = MAXTIME; /* pps time dispersion (jitter) (us) */ +static long pps_tf[] = {0, 0, 0}; /* pps time offset median filter (us) */ +static long pps_freq = 0; /* frequency offset (scaled ppm) */ +static long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */ +static long pps_ff[] = {0, 0, 0}; /* frequency offset median filter */ +static long pps_usec = 0; /* microsec counter at last interval */ +static long pps_valid = PPS_VALID; /* pps signal watchdog counter */ +static int pps_glitch = 0; /* pps signal glitch counter */ +static int pps_count = 0; /* calibration interval counter (s) */ +static int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */ +static int pps_intcnt = 0; /* intervals at current duration */ + +/* + * PPS signal quality monitors + * + * pps_jitcnt counts the seconds that have been discarded because the + * jitter measured by the time median filter exceeds the limit MAXTIME + * (100 us). + * + * pps_calcnt counts the frequency calibration intervals, which are + * variable from 4 s to 256 s. + * + * pps_errcnt counts the calibration intervals which have been discarded + * because the wander exceeds the limit MAXFREQ (100 ppm) or where the + * calibration interval jitter exceeds two ticks. + * + * pps_stbcnt counts the calibration intervals that have been discarded + * because the frequency wander exceeds the limit MAXFREQ / 4 (25 us). + */ +static long pps_jitcnt = 0; /* jitter limit exceeded */ +static long pps_calcnt = 0; /* calibration intervals */ +static long pps_errcnt = 0; /* calibration errors */ +static long pps_stbcnt = 0; /* stability limit exceeded */ +#endif /* PPS_SYNC */ + +static void hardupdate __P((int64_t offset, int prescaled)); + +/* + * hardupdate() - local clock update + * + * This routine is called by ntp_adjtime() to update the local clock + * phase and frequency. The implementation is of an adaptive-parameter, + * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new + * time and frequency offset estimates for each call. If the kernel PPS + * discipline code is configured (PPS_SYNC), the PPS signal itself + * determines the new time offset, instead of the calling argument. + * Presumably, calls to ntp_adjtime() occur only when the caller + * believes the local clock is valid within some bound (+-128 ms with + * NTP). If the caller's time is far different than the PPS time, an + * argument will ensue, and it's not clear who will lose. + * + * For uncompensated quartz crystal oscillatores and nominal update + * intervals less than 1024 s, operation should be in phase-lock mode + * (STA_FLL = 0), where the loop is disciplined to phase. For update + * intervals greater than thiss, operation should be in frequency-lock + * mode (STA_FLL = 1), where the loop is disciplined to frequency. + * + * Note: splclock() is in effect. + */ +static void +hardupdate(offset, prescaled) + int64_t offset; + int prescaled; +{ + long mtemp; + int64_t ltemp; + + if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME)) + return; + if (prescaled) + ltemp = offset; + else + ltemp = offset << SHIFT_UPDATE; +#ifdef PPS_SYNC + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + ltemp = pps_offset << SHIFT_UPDATE; +#endif /* PPS_SYNC */ + + /* + * Scale the phase adjustment and clamp to the operating range. 
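+ * MAXPHASE corresponds to the +-512 ms bound mentioned below, so
+ * larger offsets are simply pinned to that limit.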
+ */ + if (ltemp > (MAXPHASE << SHIFT_UPDATE)) + time_offset = MAXPHASE << SHIFT_UPDATE; + else if (ltemp < -(MAXPHASE << SHIFT_UPDATE)) + time_offset = -(MAXPHASE << SHIFT_UPDATE); + else + time_offset = ltemp; + + /* + * Select whether the frequency is to be controlled and in which + * mode (PLL or FLL). Clamp to the operating range. Ugly + * multiply/divide should be replaced someday. + */ + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = time_second; + mtemp = time_second - time_reftime; + time_reftime = time_second; + if (time_status & STA_FLL) { + if (mtemp >= MINSEC) { + ltemp = ((time_offset / mtemp) << (SHIFT_USEC - + SHIFT_UPDATE)); + if (ltemp < 0) + time_freq -= -ltemp >> SHIFT_KH; + else + time_freq += ltemp >> SHIFT_KH; + } + } else { + if (mtemp < MAXSEC) { + ltemp = time_offset * mtemp; + if (ltemp < 0) + time_freq -= -ltemp >> ((int64_t)time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC + SHIFT_UPDATE); + else + time_freq += ltemp >> ((int64_t)time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC + SHIFT_UPDATE); + } + } + if (time_freq > time_tolerance) + time_freq = time_tolerance; + else if (time_freq < -time_tolerance) + time_freq = -time_tolerance; +} + +/* + * On rollover of the second the phase adjustment to be used for + * the next second is calculated. Also, the maximum error is + * increased by the tolerance. If the PPS frequency discipline + * code is present, the phase is increased to compensate for the + * CPU clock oscillator frequency error. + * + * On a 32-bit machine and given parameters in the timex.h + * header file, the maximum phase adjustment is +-512 ms and + * maximum frequency offset is a tad less than) +-512 ppm. On a + * 64-bit machine, you shouldn't need to ask. + */ +void +ntp_update_second(struct timecounter *tc) +{ + u_int32_t *newsec; + long ltemp; + + if (!time_daemon) + return; + + newsec = &tc->tc_offset_sec; + time_maxerror += time_tolerance >> SHIFT_USEC; + + /* + * Compute the phase adjustment for the next second. In + * PLL mode, the offset is reduced by a fixed factor + * times the time constant. In FLL mode the offset is + * used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread + * the adjustment over not more than the number of + * seconds between updates. + */ + if (time_offset < 0) { + ltemp = -time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset += ltemp; + time_adj = -ltemp << (SHIFT_SCALE - SHIFT_UPDATE); + } else { + ltemp = time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << SHIFT_UPDATE; + time_offset -= ltemp; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_UPDATE); + } + + /* + * Compute the frequency estimate and additional phase + * adjustment due to frequency error for the next + * second. When the PPS signal is engaged, gnaw on the + * watchdog counter and update the frequency computed by + * the pll and the PPS signal. 
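+ * The watchdog is pps_valid: it is bumped once per second here and
+ * cleared by hardpps(); if it reaches PPS_VALID the PPS status bits
+ * are cleared and the signal is treated as lost.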
+ */ +#ifdef PPS_SYNC + pps_valid++; + if (pps_valid == PPS_VALID) { + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; +#else + ltemp = time_freq; +#endif /* PPS_SYNC */ + if (ltemp < 0) + time_adj -= -ltemp << (SHIFT_SCALE - SHIFT_USEC); + else + time_adj += ltemp << (SHIFT_SCALE - SHIFT_USEC); + + tc->tc_adjustment = time_adj; + + /* XXX - this is really bogus, but can't be fixed until + xntpd's idea of the system clock is fixed to know how + the user wants leap seconds handled; in the mean time, + we assume that users of NTP are running without proper + leap second support (this is now the default anyway) */ + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. The microtime() routine or + * external clock driver will insure that reported time + * is always monotonic. The ugly divides should be + * replaced. + */ + switch (time_state) { + + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + + case TIME_INS: + if ((*newsec) % 86400 == 0) { + (*newsec)--; + time_state = TIME_OOP; + } + break; + + case TIME_DEL: + if (((*newsec) + 1) % 86400 == 0) { + (*newsec)++; + time_state = TIME_WAIT; + } + break; + + case TIME_OOP: + time_state = TIME_WAIT; + break; + + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + break; + } +} + +static int +ntp_sysctl SYSCTL_HANDLER_ARGS +{ + struct timeval atv; + struct ntptimeval ntv; + int s; + + s = splclock(); + microtime(&atv); + ntv.time = atv; + ntv.maxerror = time_maxerror; + ntv.esterror = time_esterror; + splx(s); + + ntv.time_state = time_state; + + /* + * Status word error decode. If any of these conditions + * occur, an error is returned, instead of the status + * word. Most applications will care only about the fact + * the system clock may not be trusted, not about the + * details. 
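+ * Each check below maps one such condition to TIME_ERROR: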
+ * + * Hardware or software error + */ + if (time_status & (STA_UNSYNC | STA_CLOCKERR)) { + ntv.time_state = TIME_ERROR; + } + + /* + * PPS signal lost when either time or frequency + * synchronization requested + */ + if (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) { + ntv.time_state = TIME_ERROR; + } + + /* + * PPS jitter exceeded when time synchronization + * requested + */ + if (time_status & STA_PPSTIME && + time_status & STA_PPSJITTER) { + ntv.time_state = TIME_ERROR; + } + + /* + * PPS wander exceeded or calibration error when + * frequency synchronization requested + */ + if (time_status & STA_PPSFREQ && + time_status & (STA_PPSWANDER | STA_PPSERROR)) { + ntv.time_state = TIME_ERROR; + } + return (sysctl_handle_opaque(oidp, &ntv, sizeof ntv, req)); +} + +SYSCTL_NODE(_kern, KERN_NTP_PLL, ntp_pll, CTLFLAG_RW, 0, + "NTP kernel PLL related stuff"); +SYSCTL_PROC(_kern_ntp_pll, NTP_PLL_GETTIME, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, sizeof(struct ntptimeval) , ntp_sysctl, "S,ntptimeval", ""); + +/* + * ntp_adjtime() - NTP daemon application interface + */ +#ifndef _SYS_SYSPROTO_H_ +struct ntp_adjtime_args { + struct timex *tp; +}; +#endif + +int +ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap) +{ + struct timex ntv; + int modes; + int s; + int error; + + time_daemon = 1; + + error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv)); + if (error) + return error; + + /* + * Update selected clock variables - only the superuser can + * change anything. Note that there is no error checking here on + * the assumption the superuser should know what it is doing. + */ + modes = ntv.modes; + if ((modes != 0) + && (error = suser(p->p_cred->pc_ucred, &p->p_acflag))) + return error; + + s = splclock(); + if (modes & MOD_FREQUENCY) +#ifdef PPS_SYNC + time_freq = ntv.freq - pps_freq; +#else /* PPS_SYNC */ + time_freq = ntv.freq; +#endif /* PPS_SYNC */ + if (modes & MOD_MAXERROR) + time_maxerror = ntv.maxerror; + if (modes & MOD_ESTERROR) + time_esterror = ntv.esterror; + if (modes & MOD_STATUS) { + time_status &= STA_RONLY; + time_status |= ntv.status & ~STA_RONLY; + } + if (modes & MOD_TIMECONST) + time_constant = ntv.constant; + if (modes & MOD_OFFSET) + hardupdate(ntv.offset, modes & MOD_DOSCALE); + + ntv.modes |= MOD_CANSCALE; + /* + * Retrieve all clock variables + */ + if (modes & MOD_DOSCALE) + ntv.offset = time_offset; + else if (time_offset < 0) + ntv.offset = -(-time_offset >> SHIFT_UPDATE); + else + ntv.offset = time_offset >> SHIFT_UPDATE; +#ifdef PPS_SYNC + ntv.freq = time_freq + pps_freq; +#else /* PPS_SYNC */ + ntv.freq = time_freq; +#endif /* PPS_SYNC */ + ntv.maxerror = time_maxerror; + ntv.esterror = time_esterror; + ntv.status = time_status; + ntv.constant = time_constant; + ntv.precision = time_precision; + ntv.tolerance = time_tolerance; +#ifdef PPS_SYNC + ntv.shift = pps_shift; + ntv.ppsfreq = pps_freq; + ntv.jitter = pps_jitter >> PPS_AVG; + ntv.stabil = pps_stabil; + ntv.calcnt = pps_calcnt; + ntv.errcnt = pps_errcnt; + ntv.jitcnt = pps_jitcnt; + ntv.stbcnt = pps_stbcnt; +#endif /* PPS_SYNC */ + (void)splx(s); + + error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv)); + if (!error) { + /* + * Status word error decode. See comments in + * ntp_gettime() routine. 
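+		 * (Editorial note: the same four conditions checked there --
+		 * unsynchronized clock, loss of the PPS signal while PPS
+		 * time/frequency sync is requested, excessive PPS jitter,
+		 * and PPS wander or calibration error -- map the return
+		 * value to TIME_ERROR here as well.)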
+ */ + p->p_retval[0] = time_state; + if (time_status & (STA_UNSYNC | STA_CLOCKERR)) + p->p_retval[0] = TIME_ERROR; + if (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) + p->p_retval[0] = TIME_ERROR; + if (time_status & STA_PPSTIME && + time_status & STA_PPSJITTER) + p->p_retval[0] = TIME_ERROR; + if (time_status & STA_PPSFREQ && + time_status & (STA_PPSWANDER | STA_PPSERROR)) + p->p_retval[0] = TIME_ERROR; + } + return error; +} + +#ifdef PPS_SYNC + +/* We need this ugly monster twice, so let's macroize it. */ + +#define MEDIAN3X(a, m, s, i1, i2, i3) \ + do { \ + m = a[i2]; \ + s = a[i1] - a[i3]; \ + } while (0) + +#define MEDIAN3(a, m, s) \ + do { \ + if (a[0] > a[1]) { \ + if (a[1] > a[2]) \ + MEDIAN3X(a, m, s, 0, 1, 2); \ + else if (a[2] > a[0]) \ + MEDIAN3X(a, m, s, 2, 0, 1); \ + else \ + MEDIAN3X(a, m, s, 0, 2, 1); \ + } else { \ + if (a[2] > a[1]) \ + MEDIAN3X(a, m, s, 2, 1, 0); \ + else if (a[0] > a[2]) \ + MEDIAN3X(a, m, s, 1, 0, 2); \ + else \ + MEDIAN3X(a, m, s, 1, 2, 0); \ + } \ + } while (0) + +/* + * hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS interrupt in order to discipline + * the CPU clock oscillator to the PPS signal. It measures the PPS phase + * and leaves it in a handy spot for the hardclock() routine. It + * integrates successive PPS phase differences and calculates the + * frequency offset. This is used in hardclock() to discipline the CPU + * clock oscillator so that intrinsic frequency error is cancelled out. + * The code requires the caller to capture the time and hardware counter + * value at the on-time PPS signal transition. + * + * Note that, on some Unix systems, this routine runs at an interrupt + * priority level higher than the timer interrupt routine hardclock(). + * Therefore, the variables used are distinct from the hardclock() + * variables, except for certain exceptions: The PPS frequency pps_freq + * and phase pps_offset variables are determined by this routine and + * updated atomically. The time_tolerance variable can be considered a + * constant, since it is infrequently changed, and then only when the + * PPS signal is disabled. The watchdog counter pps_valid is updated + * once per second by hardclock() and is atomically cleared in this + * routine. + */ +void +hardpps(tvp, p_usec) + struct timeval *tvp; /* time at PPS */ + long p_usec; /* hardware counter at PPS */ +{ + long u_usec, v_usec, bigtick; + long cal_sec, cal_usec; + + /* + * An occasional glitch can be produced when the PPS interrupt + * occurs in the hardclock() routine before the time variable is + * updated. Here the offset is discarded when the difference + * between it and the last one is greater than tick/2, but not + * if the interval since the first discard exceeds 30 s. + */ + time_status |= STA_PPSSIGNAL; + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + pps_valid = 0; + u_usec = -tvp->tv_usec; + if (u_usec < -500000) + u_usec += 1000000; + v_usec = pps_offset - u_usec; + if (v_usec < 0) + v_usec = -v_usec; + if (v_usec > (tick >> 1)) { + if (pps_glitch > MAXGLITCH) { + pps_glitch = 0; + pps_tf[2] = u_usec; + pps_tf[1] = u_usec; + } else { + pps_glitch++; + u_usec = pps_offset; + } + } else + pps_glitch = 0; + + /* + * A three-stage median filter is used to help deglitch the pps + * time. The median sample becomes the time offset estimate; the + * difference between the other two samples becomes the time + * dispersion (jitter) estimate. 
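+	 * For illustration (hypothetical numbers, editorial note): if the
+	 * three samples in pps_tf[] were { 400, -150, 120 }, MEDIAN3()
+	 * below would pick 120 as the median (the new pps_offset) and
+	 * 400 - (-150) = 550 as the spread that feeds the jitter estimate.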
+ */ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = u_usec; + MEDIAN3(pps_tf, pps_offset, v_usec); + if (v_usec > MAXTIME) + pps_jitcnt++; + v_usec = (v_usec << PPS_AVG) - pps_jitter; + if (v_usec < 0) + pps_jitter -= -v_usec >> PPS_AVG; + else + pps_jitter += v_usec >> PPS_AVG; + if (pps_jitter > (MAXTIME >> 1)) + time_status |= STA_PPSJITTER; + + /* + * During the calibration interval adjust the starting time when + * the tick overflows. At the end of the interval compute the + * duration of the interval and the difference of the hardware + * counters at the beginning and end of the interval. This code + * is deliciously complicated by the fact valid differences may + * exceed the value of tick when using long calibration + * intervals and small ticks. Note that the counter can be + * greater than tick if caught at just the wrong instant, but + * the values returned and used here are correct. + */ + bigtick = (long)tick << SHIFT_USEC; + pps_usec -= pps_freq; + if (pps_usec >= bigtick) + pps_usec -= bigtick; + if (pps_usec < 0) + pps_usec += bigtick; + pps_time.tv_sec++; + pps_count++; + if (pps_count < (1 << pps_shift)) + return; + pps_count = 0; + pps_calcnt++; + u_usec = p_usec << SHIFT_USEC; + v_usec = pps_usec - u_usec; + if (v_usec >= bigtick >> 1) + v_usec -= bigtick; + if (v_usec < -(bigtick >> 1)) + v_usec += bigtick; + if (v_usec < 0) + v_usec = -(-v_usec >> pps_shift); + else + v_usec = v_usec >> pps_shift; + pps_usec = u_usec; + cal_sec = tvp->tv_sec; + cal_usec = tvp->tv_usec; + cal_sec -= pps_time.tv_sec; + cal_usec -= pps_time.tv_usec; + if (cal_usec < 0) { + cal_usec += 1000000; + cal_sec--; + } + pps_time = *tvp; + + /* + * Check for lost interrupts, noise, excessive jitter and + * excessive frequency error. The number of timer ticks during + * the interval may vary +-1 tick. Add to this a margin of one + * tick for the PPS signal jitter and maximum frequency + * deviation. If the limits are exceeded, the calibration + * interval is reset to the minimum and we start over. + */ + u_usec = (long)tick << 1; + if (!((cal_sec == -1 && cal_usec > (1000000 - u_usec)) + || (cal_sec == 0 && cal_usec < u_usec)) + || v_usec > time_tolerance || v_usec < -time_tolerance) { + pps_errcnt++; + pps_shift = PPS_SHIFT; + pps_intcnt = 0; + time_status |= STA_PPSERROR; + return; + } + + /* + * A three-stage median filter is used to help deglitch the pps + * frequency. The median sample becomes the frequency offset + * estimate; the difference between the other two samples + * becomes the frequency dispersion (stability) estimate. + */ + pps_ff[2] = pps_ff[1]; + pps_ff[1] = pps_ff[0]; + pps_ff[0] = v_usec; + MEDIAN3(pps_ff, u_usec, v_usec); + + /* + * Here the frequency dispersion (stability) is updated. If it + * is less than one-fourth the maximum (MAXFREQ), the frequency + * offset is updated as well, but clamped to the tolerance. It + * will be processed later by the hardclock() routine. + */ + v_usec = (v_usec >> 1) - pps_stabil; + if (v_usec < 0) + pps_stabil -= -v_usec >> PPS_AVG; + else + pps_stabil += v_usec >> PPS_AVG; + if (pps_stabil > MAXFREQ >> 2) { + pps_stbcnt++; + time_status |= STA_PPSWANDER; + return; + } + if (time_status & STA_PPSFREQ) { + if (u_usec < 0) { + pps_freq -= -u_usec >> PPS_AVG; + if (pps_freq < -time_tolerance) + pps_freq = -time_tolerance; + u_usec = -u_usec; + } else { + pps_freq += u_usec >> PPS_AVG; + if (pps_freq > time_tolerance) + pps_freq = time_tolerance; + } + } + + /* + * Here the calibration interval is adjusted. 
If the maximum + * time difference is greater than tick / 4, reduce the interval + * by half. If this is not the case for four consecutive + * intervals, double the interval. + */ + if (u_usec << pps_shift > bigtick >> 2) { + pps_intcnt = 0; + if (pps_shift > PPS_SHIFT) + pps_shift--; + } else if (pps_intcnt >= 4) { + pps_intcnt = 0; + if (pps_shift < PPS_SHIFTMAX) + pps_shift++; + } else + pps_intcnt++; +} + +#endif /* PPS_SYNC */ + +int +std_pps_ioctl(u_long cmd, caddr_t data, pps_params_t *pp, pps_info_t *pi, int ppscap) +{ + pps_params_t *app; + pps_info_t *api; + + switch (cmd) { + case PPS_IOC_CREATE: + return (0); + case PPS_IOC_DESTROY: + return (0); + case PPS_IOC_SETPARAMS: + app = (pps_params_t *)data; + if (app->mode & ~ppscap) + return (EINVAL); + *pp = *app; + return (0); + case PPS_IOC_GETPARAMS: + app = (pps_params_t *)data; + *app = *pp; + return (0); + case PPS_IOC_GETCAP: + *(int*)data = ppscap; + return (0); + case PPS_IOC_FETCH: + api = (pps_info_t *)data; + *api = *pi; + pi->current_mode = pp->mode; + return (0); + case PPS_IOC_WAIT: + return (EOPNOTSUPP); + default: + return (ENODEV); + } +} diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c new file mode 100644 index 0000000..ad63a98 --- /dev/null +++ b/sys/kern/kern_physio.c @@ -0,0 +1,215 @@ +/* + * Copyright (c) 1994 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. 
+ * + * $Id: kern_physio.c,v 1.29 1998/10/25 17:44:51 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/proc.h> +#include <sys/uio.h> + +#include <vm/vm.h> +#include <vm/vm_extern.h> + +static void physwakeup __P((struct buf *bp)); +static struct buf * phygetvpbuf(dev_t dev, int resid); + +int +physio(strategy, bp, dev, rw, minp, uio) + d_strategy_t *strategy; + struct buf *bp; + dev_t dev; + int rw; + u_int (*minp) __P((struct buf *bp)); + struct uio *uio; +{ + int i; + int bufflags = rw?B_READ:0; + int error; + int spl; + caddr_t sa; + int bp_alloc = (bp == 0); + struct buf *bpa; + +/* + * keep the process from being swapped + */ + curproc->p_flag |= P_PHYSIO; + + /* create and build a buffer header for a transfer */ + bpa = (struct buf *)phygetvpbuf(dev, uio->uio_resid); + if (!bp_alloc) { + spl = splbio(); + while (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep((caddr_t)bp, PRIBIO, "physbw", 0); + } + bp->b_flags |= B_BUSY; + splx(spl); + } else { + bp = bpa; + } + + /* + * get a copy of the kva from the physical buffer + */ + sa = bpa->b_data; + bp->b_proc = curproc; + error = bp->b_error = 0; + + for(i=0;i<uio->uio_iovcnt;i++) { + while( uio->uio_iov[i].iov_len) { + + bp->b_dev = dev; + bp->b_bcount = uio->uio_iov[i].iov_len; + bp->b_flags = B_BUSY | B_PHYS | B_CALL | bufflags; + bp->b_iodone = physwakeup; + bp->b_data = uio->uio_iov[i].iov_base; + bp->b_bcount = minp( bp); + if( minp != minphys) + bp->b_bcount = minphys( bp); + bp->b_bufsize = bp->b_bcount; + /* + * pass in the kva from the physical buffer + * for the temporary kernel mapping. + */ + bp->b_saveaddr = sa; + bp->b_blkno = btodb(uio->uio_offset); + bp->b_offset = uio->uio_offset; + + if (uio->uio_segflg == UIO_USERSPACE) { + if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) { + error = EFAULT; + goto doerror; + } + if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) { + error = EFAULT; + goto doerror; + } + + /* bring buffer into kernel space */ + vmapbuf(bp); + } + + /* perform transfer */ + (*strategy)(bp); + + spl = splbio(); + while ((bp->b_flags & B_DONE) == 0) + tsleep((caddr_t)bp, PRIBIO, "physstr", 0); + splx(spl); + + /* release mapping into kernel space */ + if (uio->uio_segflg == UIO_USERSPACE) + vunmapbuf(bp); + + /* + * update the uio data + */ + { + int iolen = bp->b_bcount - bp->b_resid; + + if (iolen == 0 && !(bp->b_flags & B_ERROR)) + goto doerror; /* EOF */ + uio->uio_iov[i].iov_len -= iolen; + uio->uio_iov[i].iov_base += iolen; + uio->uio_resid -= iolen; + uio->uio_offset += iolen; + } + + /* + * check for an error + */ + if( bp->b_flags & B_ERROR) { + error = bp->b_error; + goto doerror; + } + } + } + + +doerror: + relpbuf(bpa, NULL); + if (!bp_alloc) { + bp->b_flags &= ~(B_BUSY|B_PHYS); + if( bp->b_flags & B_WANTED) { + bp->b_flags &= ~B_WANTED; + wakeup((caddr_t)bp); + } + } +/* + * allow the process to be swapped + */ + curproc->p_flag &= ~P_PHYSIO; + + return (error); +} + +u_int +minphys(bp) + struct buf *bp; +{ + u_int maxphys = DFLTPHYS; + struct cdevsw *bdsw; + + bdsw = cdevsw[major(bp->b_dev)]; + + if (bdsw && bdsw->d_maxio) { + maxphys = bdsw->d_maxio; + } + if (bp->b_kvasize && (bp->b_kvasize < maxphys)) + maxphys = bp->b_kvasize; + + if(((vm_offset_t) bp->b_data) & PAGE_MASK) { + maxphys -= PAGE_SIZE; + } + + if( bp->b_bcount > maxphys) { + bp->b_bcount = maxphys; + } + + return bp->b_bcount; +} + +struct buf * +phygetvpbuf(dev_t dev, int resid) +{ + struct cdevsw *bdsw; + int maxio; + + 
bdsw = cdevsw[major(dev)]; + if ((bdsw == NULL) || (bdsw->d_bmaj == -1)) + return getpbuf(NULL); + + maxio = bdsw->d_maxio; + if (resid > maxio) + resid = maxio; + + return getpbuf(NULL); +} + +static void +physwakeup(bp) + struct buf *bp; +{ + wakeup((caddr_t) bp); + bp->b_flags &= ~B_CALL; +} diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c new file mode 100644 index 0000000..0c6feac --- /dev/null +++ b/sys/kern/kern_proc.c @@ -0,0 +1,608 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 + * $Id: kern_proc.c,v 1.42 1999/01/10 01:58:24 eivind Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/filedesc.h> +#include <sys/tty.h> +#include <sys/signalvar.h> +#include <vm/vm.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/user.h> +#include <vm/vm_zone.h> + +static MALLOC_DEFINE(M_PGRP, "pgrp", "process group header"); +MALLOC_DEFINE(M_SESSION, "session", "session header"); +static MALLOC_DEFINE(M_PROC, "proc", "Proc structures"); +MALLOC_DEFINE(M_SUBPROC, "subproc", "Proc sub-structures"); + +struct prochd qs[NQS]; /* as good a place as any... */ +struct prochd rtqs[NQS]; /* Space for REALTIME queues too */ +struct prochd idqs[NQS]; /* Space for IDLE queues too */ + +static void pgdelete __P((struct pgrp *)); + +/* + * Structure associated with user cacheing. 
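+ * (Editorial note: each uidinfo record counts the processes currently
+ * charged to one uid; see chgproccnt() below.)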
+ */ +struct uidinfo { + LIST_ENTRY(uidinfo) ui_hash; + uid_t ui_uid; + long ui_proccnt; +}; +#define UIHASH(uid) (&uihashtbl[(uid) & uihash]) +static LIST_HEAD(uihashhead, uidinfo) *uihashtbl; +static u_long uihash; /* size of hash table - 1 */ + +static void orphanpg __P((struct pgrp *pg)); + +/* + * Other process lists + */ +struct pidhashhead *pidhashtbl; +u_long pidhash; +struct pgrphashhead *pgrphashtbl; +u_long pgrphash; +struct proclist allproc; +struct proclist zombproc; +vm_zone_t proc_zone; + +/* + * Initialize global process hashing structures. + */ +void +procinit() +{ + + LIST_INIT(&allproc); + LIST_INIT(&zombproc); + pidhashtbl = hashinit(maxproc / 4, M_PROC, &pidhash); + pgrphashtbl = hashinit(maxproc / 4, M_PROC, &pgrphash); + uihashtbl = hashinit(maxproc / 16, M_PROC, &uihash); + proc_zone = zinit("PROC", sizeof (struct proc), 0, 0, 5); +} + +/* + * Change the count associated with number of processes + * a given user is using. + */ +int +chgproccnt(uid, diff) + uid_t uid; + int diff; +{ + register struct uidinfo *uip; + register struct uihashhead *uipp; + + uipp = UIHASH(uid); + for (uip = uipp->lh_first; uip != 0; uip = uip->ui_hash.le_next) + if (uip->ui_uid == uid) + break; + if (uip) { + uip->ui_proccnt += diff; + if (uip->ui_proccnt > 0) + return (uip->ui_proccnt); + if (uip->ui_proccnt < 0) + panic("chgproccnt: procs < 0"); + LIST_REMOVE(uip, ui_hash); + FREE(uip, M_PROC); + return (0); + } + if (diff <= 0) { + if (diff == 0) + return(0); + panic("chgproccnt: lost user"); + } + MALLOC(uip, struct uidinfo *, sizeof(*uip), M_PROC, M_WAITOK); + LIST_INSERT_HEAD(uipp, uip, ui_hash); + uip->ui_uid = uid; + uip->ui_proccnt = diff; + return (diff); +} + +/* + * Is p an inferior of the current process? + */ +int +inferior(p) + register struct proc *p; +{ + + for (; p != curproc; p = p->p_pptr) + if (p->p_pid == 0) + return (0); + return (1); +} + +/* + * Locate a process by number + */ +struct proc * +pfind(pid) + register pid_t pid; +{ + register struct proc *p; + + for (p = PIDHASH(pid)->lh_first; p != 0; p = p->p_hash.le_next) + if (p->p_pid == pid) + return (p); + return (NULL); +} + +/* + * Locate a process group by number + */ +struct pgrp * +pgfind(pgid) + register pid_t pgid; +{ + register struct pgrp *pgrp; + + for (pgrp = PGRPHASH(pgid)->lh_first; pgrp != 0; + pgrp = pgrp->pg_hash.le_next) + if (pgrp->pg_id == pgid) + return (pgrp); + return (NULL); +} + +/* + * Move p to a new or existing process group (and session) + */ +int +enterpgrp(p, pgid, mksess) + register struct proc *p; + pid_t pgid; + int mksess; +{ + register struct pgrp *pgrp = pgfind(pgid); + + KASSERT(pgrp == NULL || !mksess, + ("enterpgrp: setsid into non-empty pgrp")); + KASSERT(!SESS_LEADER(p), + ("enterpgrp: session leader attempted setpgrp")); + + if (pgrp == NULL) { + pid_t savepid = p->p_pid; + struct proc *np; + /* + * new process group + */ + KASSERT(p->p_pid == pgid, + ("enterpgrp: new pgrp and pid != pgid")); + MALLOC(pgrp, struct pgrp *, sizeof(struct pgrp), M_PGRP, + M_WAITOK); + if ((np = pfind(savepid)) == NULL || np != p) + return (ESRCH); + if (mksess) { + register struct session *sess; + + /* + * new session + */ + MALLOC(sess, struct session *, sizeof(struct session), + M_SESSION, M_WAITOK); + sess->s_leader = p; + sess->s_sid = p->p_pid; + sess->s_count = 1; + sess->s_ttyvp = NULL; + sess->s_ttyp = NULL; + bcopy(p->p_session->s_login, sess->s_login, + sizeof(sess->s_login)); + p->p_flag &= ~P_CONTROLT; + pgrp->pg_session = sess; + KASSERT(p == curproc, + ("enterpgrp: mksession 
and p != curproc")); + } else { + pgrp->pg_session = p->p_session; + pgrp->pg_session->s_count++; + } + pgrp->pg_id = pgid; + LIST_INIT(&pgrp->pg_members); + LIST_INSERT_HEAD(PGRPHASH(pgid), pgrp, pg_hash); + pgrp->pg_jobc = 0; + SLIST_INIT(&pgrp->pg_sigiolst); + } else if (pgrp == p->p_pgrp) + return (0); + + /* + * Adjust eligibility of affected pgrps to participate in job control. + * Increment eligibility counts before decrementing, otherwise we + * could reach 0 spuriously during the first call. + */ + fixjobc(p, pgrp, 1); + fixjobc(p, p->p_pgrp, 0); + + LIST_REMOVE(p, p_pglist); + if (p->p_pgrp->pg_members.lh_first == 0) + pgdelete(p->p_pgrp); + p->p_pgrp = pgrp; + LIST_INSERT_HEAD(&pgrp->pg_members, p, p_pglist); + return (0); +} + +/* + * remove process from process group + */ +int +leavepgrp(p) + register struct proc *p; +{ + + LIST_REMOVE(p, p_pglist); + if (p->p_pgrp->pg_members.lh_first == 0) + pgdelete(p->p_pgrp); + p->p_pgrp = 0; + return (0); +} + +/* + * delete a process group + */ +static void +pgdelete(pgrp) + register struct pgrp *pgrp; +{ + + /* + * Reset any sigio structures pointing to us as a result of + * F_SETOWN with our pgid. + */ + funsetownlst(&pgrp->pg_sigiolst); + + if (pgrp->pg_session->s_ttyp != NULL && + pgrp->pg_session->s_ttyp->t_pgrp == pgrp) + pgrp->pg_session->s_ttyp->t_pgrp = NULL; + LIST_REMOVE(pgrp, pg_hash); + if (--pgrp->pg_session->s_count == 0) + FREE(pgrp->pg_session, M_SESSION); + FREE(pgrp, M_PGRP); +} + +/* + * Adjust pgrp jobc counters when specified process changes process group. + * We count the number of processes in each process group that "qualify" + * the group for terminal job control (those with a parent in a different + * process group of the same session). If that count reaches zero, the + * process group becomes orphaned. Check both the specified process' + * process group and that of its children. + * entering == 0 => p is leaving specified group. + * entering == 1 => p is entering specified group. + */ +void +fixjobc(p, pgrp, entering) + register struct proc *p; + register struct pgrp *pgrp; + int entering; +{ + register struct pgrp *hispgrp; + register struct session *mysession = pgrp->pg_session; + + /* + * Check p's parent to see whether p qualifies its own process + * group; if so, adjust count for p's process group. + */ + if ((hispgrp = p->p_pptr->p_pgrp) != pgrp && + hispgrp->pg_session == mysession) + if (entering) + pgrp->pg_jobc++; + else if (--pgrp->pg_jobc == 0) + orphanpg(pgrp); + + /* + * Check this process' children to see whether they qualify + * their process groups; if so, adjust counts for children's + * process groups. + */ + for (p = p->p_children.lh_first; p != 0; p = p->p_sibling.le_next) + if ((hispgrp = p->p_pgrp) != pgrp && + hispgrp->pg_session == mysession && + p->p_stat != SZOMB) + if (entering) + hispgrp->pg_jobc++; + else if (--hispgrp->pg_jobc == 0) + orphanpg(hispgrp); +} + +/* + * A process group has become orphaned; + * if there are any stopped processes in the group, + * hang-up all process in that group. 
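+ * (Editorial note: each member then receives SIGHUP followed by SIGCONT,
+ * matching the POSIX handling of orphaned process groups that contain
+ * stopped members.)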
+ */ +static void +orphanpg(pg) + struct pgrp *pg; +{ + register struct proc *p; + + for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) { + if (p->p_stat == SSTOP) { + for (p = pg->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) { + psignal(p, SIGHUP); + psignal(p, SIGCONT); + } + return; + } + } +} + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(pgrpdump, pgrpdump) +{ + register struct pgrp *pgrp; + register struct proc *p; + register int i; + + for (i = 0; i <= pgrphash; i++) { + if (pgrp = pgrphashtbl[i].lh_first) { + printf("\tindx %d\n", i); + for (; pgrp != 0; pgrp = pgrp->pg_hash.le_next) { + printf( + "\tpgrp %p, pgid %ld, sess %p, sesscnt %d, mem %p\n", + (void *)pgrp, (long)pgrp->pg_id, + (void *)pgrp->pg_session, + pgrp->pg_session->s_count, + (void *)pgrp->pg_members.lh_first); + for (p = pgrp->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) { + printf("\t\tpid %ld addr %p pgrp %p\n", + (long)p->p_pid, (void *)p, + (void *)p->p_pgrp); + } + } + } + } +} +#endif /* DDB */ + +/* + * Fill in an eproc structure for the specified process. + */ +void +fill_eproc(p, ep) + register struct proc *p; + register struct eproc *ep; +{ + register struct tty *tp; + + bzero(ep, sizeof(*ep)); + + ep->e_paddr = p; + if (p->p_cred) { + ep->e_pcred = *p->p_cred; + if (p->p_ucred) + ep->e_ucred = *p->p_ucred; + } +#ifdef COMPAT_LINUX_THREADS + if (p->p_procsig){ + ep->e_procsig = *p->p_procsig; + } +#endif + if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) { + register struct vmspace *vm = p->p_vmspace; + +#ifdef pmap_resident_count + ep->e_vm.vm_rssize = pmap_resident_count(&vm->vm_pmap); /*XXX*/ +#else + ep->e_vm.vm_rssize = vm->vm_rssize; +#endif + ep->e_vm.vm_tsize = vm->vm_tsize; + ep->e_vm.vm_dsize = vm->vm_dsize; + ep->e_vm.vm_ssize = vm->vm_ssize; + ep->e_vm.vm_taddr = vm->vm_taddr; + ep->e_vm.vm_daddr = vm->vm_daddr; + ep->e_vm.vm_minsaddr = vm->vm_minsaddr; + ep->e_vm.vm_maxsaddr = vm->vm_maxsaddr; + ep->e_vm.vm_map = vm->vm_map; +#ifndef sparc + ep->e_vm.vm_pmap = vm->vm_pmap; +#endif + } + if (p->p_pptr) + ep->e_ppid = p->p_pptr->p_pid; + if (p->p_pgrp) { + ep->e_pgid = p->p_pgrp->pg_id; + ep->e_jobc = p->p_pgrp->pg_jobc; + ep->e_sess = p->p_pgrp->pg_session; + + if (ep->e_sess) { + bcopy(ep->e_sess->s_login, ep->e_login, sizeof(ep->e_login)); + if (ep->e_sess->s_ttyvp) + ep->e_flag = EPROC_CTTY; + if (p->p_session && SESS_LEADER(p)) + ep->e_flag |= EPROC_SLEADER; + } + } + if ((p->p_flag & P_CONTROLT) && + (ep->e_sess != NULL) && + ((tp = ep->e_sess->s_ttyp) != NULL)) { + ep->e_tdev = tp->t_dev; + ep->e_tpgid = tp->t_pgrp ? 
tp->t_pgrp->pg_id : NO_PID; + ep->e_tsess = tp->t_session; + } else + ep->e_tdev = NODEV; + if (p->p_wmesg) { + strncpy(ep->e_wmesg, p->p_wmesg, WMESGLEN); + ep->e_wmesg[WMESGLEN] = 0; + } +} + +static struct proc * +zpfind(pid_t pid) +{ + struct proc *p; + + for (p = zombproc.lh_first; p != 0; p = p->p_list.le_next) + if (p->p_pid == pid) + return (p); + return (NULL); +} + + +static int +sysctl_out_proc(struct proc *p, struct sysctl_req *req, int doingzomb) +{ + struct eproc eproc; + int error; + pid_t pid = p->p_pid; + + fill_eproc(p, &eproc); + error = SYSCTL_OUT(req,(caddr_t)p, sizeof(struct proc)); + if (error) + return (error); + error = SYSCTL_OUT(req,(caddr_t)&eproc, sizeof(eproc)); + if (error) + return (error); + if (!doingzomb && pid && (pfind(pid) != p)) + return EAGAIN; + if (doingzomb && zpfind(pid) != p) + return EAGAIN; + return (0); +} + +static int +sysctl_kern_proc SYSCTL_HANDLER_ARGS +{ + int *name = (int*) arg1; + u_int namelen = arg2; + struct proc *p; + int doingzomb; + int error = 0; + + if (oidp->oid_number == KERN_PROC_PID) { + if (namelen != 1) + return (EINVAL); + p = pfind((pid_t)name[0]); + if (!p) + return (0); + error = sysctl_out_proc(p, req, 0); + return (error); + } + if (oidp->oid_number == KERN_PROC_ALL && !namelen) + ; + else if (oidp->oid_number != KERN_PROC_ALL && namelen == 1) + ; + else + return (EINVAL); + + if (!req->oldptr) { + /* overestimate by 5 procs */ + error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); + if (error) + return (error); + } + for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { + if (!doingzomb) + p = allproc.lh_first; + else + p = zombproc.lh_first; + for (; p != 0; p = p->p_list.le_next) { + /* + * Skip embryonic processes. + */ + if (p->p_stat == SIDL) + continue; + /* + * TODO - make more efficient (see notes below). + * do by session. + */ + switch (oidp->oid_number) { + + case KERN_PROC_PGRP: + /* could do this by traversing pgrp */ + if (p->p_pgrp == NULL || + p->p_pgrp->pg_id != (pid_t)name[0]) + continue; + break; + + case KERN_PROC_TTY: + if ((p->p_flag & P_CONTROLT) == 0 || + p->p_session == NULL || + p->p_session->s_ttyp == NULL || + p->p_session->s_ttyp->t_dev != (dev_t)name[0]) + continue; + break; + + case KERN_PROC_UID: + if (p->p_ucred == NULL || + p->p_ucred->cr_uid != (uid_t)name[0]) + continue; + break; + + case KERN_PROC_RUID: + if (p->p_ucred == NULL || + p->p_cred->p_ruid != (uid_t)name[0]) + continue; + break; + } + + error = sysctl_out_proc(p, req, doingzomb); + if (error) + return (error); + } + } + return (0); +} + + +SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); + +SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT, + 0, 0, sysctl_kern_proc, "S,proc", ""); + +SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c new file mode 100644 index 0000000..e5e1a3e --- /dev/null +++ b/sys/kern/kern_prot.c @@ -0,0 +1,898 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. 
+ * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ *
+ *	@(#)kern_prot.c	8.6 (Berkeley) 1/21/94
+ * $Id: kern_prot.c,v 1.42 1998/11/10 09:16:29 peter Exp $
+ */
+
+/*
+ * System calls related to processes and protection
+ */
+
+#include "opt_compat.h"
+
+#include <sys/param.h>
+#include <sys/acct.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/unistd.h>
+#include <sys/pioctl.h>
+
+static MALLOC_DEFINE(M_CRED, "cred", "credentials");
+
+#ifndef _SYS_SYSPROTO_H_
+struct getpid_args {
+	int	dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+getpid(p, uap)
+	struct proc *p;
+	struct getpid_args *uap;
+{
+
+	p->p_retval[0] = p->p_pid;
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+	p->p_retval[1] = p->p_pptr->p_pid;
+#endif
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getppid_args {
+	int	dummy;
+};
+#endif
+/* ARGSUSED */
+int
+getppid(p, uap)
+	struct proc *p;
+	struct getppid_args *uap;
+{
+
+	p->p_retval[0] = p->p_pptr->p_pid;
+	return (0);
+}
+
+/* Get process group ID; note that POSIX getpgrp takes no parameter */
+#ifndef _SYS_SYSPROTO_H_
+struct getpgrp_args {
+	int	dummy;
+};
+#endif
+
+int
+getpgrp(p, uap)
+	struct proc *p;
+	struct getpgrp_args *uap;
+{
+
+	p->p_retval[0] = p->p_pgrp->pg_id;
+	return (0);
+}
+
+/* Get an arbitrary pid's process group id */
+#ifndef _SYS_SYSPROTO_H_
+struct getpgid_args {
+	pid_t	pid;
+};
+#endif
+
+int
+getpgid(p, uap)
+	struct proc *p;
+	struct getpgid_args *uap;
+{
+	struct proc *pt;
+
+	pt = p;
+	if (uap->pid == 0)
+		goto found;
+
+	if ((pt = pfind(uap->pid)) == 0)
+		return ESRCH;
+found:
+	p->p_retval[0] = pt->p_pgrp->pg_id;
+	return 0;
+}
+
+/*
+ * Get an arbitrary pid's session id.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getsid_args {
+	pid_t	pid;
+};
+#endif
+
+int
+getsid(p, uap)
+	struct proc *p;
+	struct getsid_args *uap;
+{
+	struct proc *pt;
+
+	pt = p;
+	if (uap->pid == 0)
+		goto found;
+
+	if ((pt = pfind(uap->pid)) == 0)
+		return ESRCH;
+found:
+	p->p_retval[0] = pt->p_session->s_sid;
+	return 0;
+}
+
+
+#ifndef _SYS_SYSPROTO_H_
+struct getuid_args {
+	int	dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+getuid(p, uap)
+	struct proc *p;
+	struct getuid_args *uap;
+{
+
+	p->p_retval[0] = p->p_cred->p_ruid;
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+	p->p_retval[1] = p->p_ucred->cr_uid;
+#endif
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct geteuid_args {
+	int	dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+geteuid(p, uap)
+	struct proc *p;
+	struct geteuid_args *uap;
+{
+
+	p->p_retval[0] = p->p_ucred->cr_uid;
+	return (0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getgid_args {
+	int	dummy;
+};
+#endif
+
+/* ARGSUSED */
+int
+getgid(p, uap)
+	struct proc *p;
+	struct getgid_args *uap;
+{
+
+	p->p_retval[0] = p->p_cred->p_rgid;
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+	p->p_retval[1] = p->p_ucred->cr_groups[0];
+#endif
+	return (0);
+}
+
+/*
+ * Get effective group ID.  The "egid" is groups[0], and could be obtained
+ * via getgroups.  This syscall exists because it is somewhat painful to do
+ * correctly in a library function.
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct getegid_args { + int dummy; +}; +#endif + +/* ARGSUSED */ +int +getegid(p, uap) + struct proc *p; + struct getegid_args *uap; +{ + + p->p_retval[0] = p->p_ucred->cr_groups[0]; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct getgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +#endif +int +getgroups(p, uap) + struct proc *p; + register struct getgroups_args *uap; +{ + register struct pcred *pc = p->p_cred; + register u_int ngrp; + int error; + + if ((ngrp = uap->gidsetsize) == 0) { + p->p_retval[0] = pc->pc_ucred->cr_ngroups; + return (0); + } + if (ngrp < pc->pc_ucred->cr_ngroups) + return (EINVAL); + ngrp = pc->pc_ucred->cr_ngroups; + if ((error = copyout((caddr_t)pc->pc_ucred->cr_groups, + (caddr_t)uap->gidset, ngrp * sizeof(gid_t)))) + return (error); + p->p_retval[0] = ngrp; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setsid_args { + int dummy; +}; +#endif + +/* ARGSUSED */ +int +setsid(p, uap) + register struct proc *p; + struct setsid_args *uap; +{ + + if (p->p_pgid == p->p_pid || pgfind(p->p_pid)) { + return (EPERM); + } else { + (void)enterpgrp(p, p->p_pid, 1); + p->p_retval[0] = p->p_pid; + return (0); + } +} + +/* + * set process group (setpgid/old setpgrp) + * + * caller does setpgid(targpid, targpgid) + * + * pid must be caller or child of caller (ESRCH) + * if a child + * pid must be in same session (EPERM) + * pid can't have done an exec (EACCES) + * if pgid != pid + * there must exist some pid in same session having pgid (EPERM) + * pid must not be session leader (EPERM) + */ +#ifndef _SYS_SYSPROTO_H_ +struct setpgid_args { + int pid; /* target process id */ + int pgid; /* target pgrp id */ +}; +#endif +/* ARGSUSED */ +int +setpgid(curp, uap) + struct proc *curp; + register struct setpgid_args *uap; +{ + register struct proc *targp; /* target process */ + register struct pgrp *pgrp; /* target pgrp */ + + if (uap->pgid < 0) + return (EINVAL); + if (uap->pid != 0 && uap->pid != curp->p_pid) { + if ((targp = pfind(uap->pid)) == 0 || !inferior(targp)) + return (ESRCH); + if (targp->p_pgrp == NULL || targp->p_session != curp->p_session) + return (EPERM); + if (targp->p_flag & P_EXEC) + return (EACCES); + } else + targp = curp; + if (SESS_LEADER(targp)) + return (EPERM); + if (uap->pgid == 0) + uap->pgid = targp->p_pid; + else if (uap->pgid != targp->p_pid) + if ((pgrp = pgfind(uap->pgid)) == 0 || + pgrp->pg_session != curp->p_session) + return (EPERM); + return (enterpgrp(targp, uap->pgid, 0)); +} + +/* + * Use the clause in B.4.2.2 that allows setuid/setgid to be 4.2/4.3BSD + * compatable. It says that setting the uid/gid to euid/egid is a special + * case of "appropriate privilege". Once the rules are expanded out, this + * basically means that setuid(nnn) sets all three id's, in all permitted + * cases unless _POSIX_SAVED_IDS is enabled. In that case, setuid(getuid()) + * does not set the saved id - this is dangerous for traditional BSD + * programs. For this reason, we *really* do not want to set + * _POSIX_SAVED_IDS and do not want to clear POSIX_APPENDIX_B_4_2_2. + */ +#define POSIX_APPENDIX_B_4_2_2 + +#ifndef _SYS_SYSPROTO_H_ +struct setuid_args { + uid_t uid; +}; +#endif +/* ARGSUSED */ +int +setuid(p, uap) + struct proc *p; + struct setuid_args *uap; +{ + register struct pcred *pc = p->p_cred; + register uid_t uid; + int error; + + /* + * See if we have "permission" by POSIX 1003.1 rules. + * + * Note that setuid(geteuid()) is a special case of + * "appropriate privileges" in appendix B.4.2.2. 
We need + * to use this clause to be compatable with traditional BSD + * semantics. Basically, it means that "setuid(xx)" sets all + * three id's (assuming you have privs). + * + * Notes on the logic. We do things in three steps. + * 1: We determine if the euid is going to change, and do EPERM + * right away. We unconditionally change the euid later if this + * test is satisfied, simplifying that part of the logic. + * 2: We determine if the real and/or saved uid's are going to + * change. Determined by compile options. + * 3: Change euid last. (after tests in #2 for "appropriate privs") + */ + uid = uap->uid; + if (uid != pc->p_ruid && /* allow setuid(getuid()) */ +#ifdef _POSIX_SAVED_IDS + uid != pc->p_svuid && /* allow setuid(saved gid) */ +#endif +#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ + uid != pc->pc_ucred->cr_uid && /* allow setuid(geteuid()) */ +#endif + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + +#ifdef _POSIX_SAVED_IDS + /* + * Do we have "appropriate privileges" (are we root or uid == euid) + * If so, we are changing the real uid and/or saved uid. + */ + if ( +#ifdef POSIX_APPENDIX_B_4_2_2 /* Use the clause from B.4.2.2 */ + uid == pc->pc_ucred->cr_uid || +#endif + suser(pc->pc_ucred, &p->p_acflag) == 0) /* we are using privs */ +#endif + { + /* + * Transfer proc count to new user. + */ + if (uid != pc->p_ruid) { + (void)chgproccnt(pc->p_ruid, -1); + (void)chgproccnt(uid, 1); + } + /* + * Set real uid + */ + if (uid != pc->p_ruid) { + pc->p_ruid = uid; + setsugid(p); + } + /* + * Set saved uid + * + * XXX always set saved uid even if not _POSIX_SAVED_IDS, as + * the security of seteuid() depends on it. B.4.2.2 says it + * is important that we should do this. + */ + if (pc->p_svuid != uid) { + pc->p_svuid = uid; + setsugid(p); + } + } + + /* + * In all permitted cases, we are changing the euid. + * Copy credentials so other references do not see our changes. + */ + if (pc->pc_ucred->cr_uid != uid) { + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_uid = uid; + setsugid(p); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct seteuid_args { + uid_t euid; +}; +#endif +/* ARGSUSED */ +int +seteuid(p, uap) + struct proc *p; + struct seteuid_args *uap; +{ + register struct pcred *pc = p->p_cred; + register uid_t euid; + int error; + + euid = uap->euid; + if (euid != pc->p_ruid && /* allow seteuid(getuid()) */ + euid != pc->p_svuid && /* allow seteuid(saved uid) */ + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + /* + * Everything's okay, do it. Copy credentials so other references do + * not see our changes. + */ + if (pc->pc_ucred->cr_uid != euid) { + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_uid = euid; + setsugid(p); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setgid_args { + gid_t gid; +}; +#endif +/* ARGSUSED */ +int +setgid(p, uap) + struct proc *p; + struct setgid_args *uap; +{ + register struct pcred *pc = p->p_cred; + register gid_t gid; + int error; + + /* + * See if we have "permission" by POSIX 1003.1 rules. + * + * Note that setgid(getegid()) is a special case of + * "appropriate privileges" in appendix B.4.2.2. We need + * to use this clause to be compatable with traditional BSD + * semantics. Basically, it means that "setgid(xx)" sets all + * three id's (assuming you have privs). + * + * For notes on the logic here, see setuid() above. 
+ */ + gid = uap->gid; + if (gid != pc->p_rgid && /* allow setgid(getgid()) */ +#ifdef _POSIX_SAVED_IDS + gid != pc->p_svgid && /* allow setgid(saved gid) */ +#endif +#ifdef POSIX_APPENDIX_B_4_2_2 /* Use BSD-compat clause from B.4.2.2 */ + gid != pc->pc_ucred->cr_groups[0] && /* allow setgid(getegid()) */ +#endif + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + +#ifdef _POSIX_SAVED_IDS + /* + * Do we have "appropriate privileges" (are we root or gid == egid) + * If so, we are changing the real uid and saved gid. + */ + if ( +#ifdef POSIX_APPENDIX_B_4_2_2 /* use the clause from B.4.2.2 */ + gid == pc->pc_ucred->cr_groups[0] || +#endif + suser(pc->pc_ucred, &p->p_acflag) == 0) /* we are using privs */ +#endif + { + /* + * Set real gid + */ + if (pc->p_rgid != gid) { + pc->p_rgid = gid; + setsugid(p); + } + /* + * Set saved gid + * + * XXX always set saved gid even if not _POSIX_SAVED_IDS, as + * the security of setegid() depends on it. B.4.2.2 says it + * is important that we should do this. + */ + if (pc->p_svgid != gid) { + pc->p_svgid = gid; + setsugid(p); + } + } + /* + * In all cases permitted cases, we are changing the egid. + * Copy credentials so other references do not see our changes. + */ + if (pc->pc_ucred->cr_groups[0] != gid) { + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_groups[0] = gid; + setsugid(p); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setegid_args { + gid_t egid; +}; +#endif +/* ARGSUSED */ +int +setegid(p, uap) + struct proc *p; + struct setegid_args *uap; +{ + register struct pcred *pc = p->p_cred; + register gid_t egid; + int error; + + egid = uap->egid; + if (egid != pc->p_rgid && /* allow setegid(getgid()) */ + egid != pc->p_svgid && /* allow setegid(saved gid) */ + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + if (pc->pc_ucred->cr_groups[0] != egid) { + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_groups[0] = egid; + setsugid(p); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +#endif +/* ARGSUSED */ +int +setgroups(p, uap) + struct proc *p; + struct setgroups_args *uap; +{ + register struct pcred *pc = p->p_cred; + register u_int ngrp; + int error; + + if ((error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + ngrp = uap->gidsetsize; + if (ngrp > NGROUPS) + return (EINVAL); + /* + * XXX A little bit lazy here. We could test if anything has + * changed before crcopy() and setting P_SUGID. + */ + pc->pc_ucred = crcopy(pc->pc_ucred); + if (ngrp < 1) { + /* + * setgroups(0, NULL) is a legitimate way of clearing the + * groups vector on non-BSD systems (which generally do not + * have the egid in the groups[0]). We risk security holes + * when running non-BSD software if we do not do the same. 
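+		 * (Editorial note: keeping cr_ngroups at 1 below therefore
+		 * preserves the effective gid in cr_groups[0] while
+		 * dropping the rest of the supplementary list.)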
+ */ + pc->pc_ucred->cr_ngroups = 1; + } else { + if ((error = copyin((caddr_t)uap->gidset, + (caddr_t)pc->pc_ucred->cr_groups, ngrp * sizeof(gid_t)))) + return (error); + pc->pc_ucred->cr_ngroups = ngrp; + } + setsugid(p); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setreuid_args { + uid_t ruid; + uid_t euid; +}; +#endif +/* ARGSUSED */ +int +setreuid(p, uap) + register struct proc *p; + struct setreuid_args *uap; +{ + register struct pcred *pc = p->p_cred; + register uid_t ruid, euid; + int error; + + ruid = uap->ruid; + euid = uap->euid; + if ((ruid != (uid_t)-1 && ruid != pc->p_ruid && ruid != pc->p_svuid || + euid != (uid_t)-1 && euid != pc->pc_ucred->cr_uid && + euid != pc->p_ruid && euid != pc->p_svuid) && + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + + if (euid != (uid_t)-1 && pc->pc_ucred->cr_uid != euid) { + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_uid = euid; + setsugid(p); + } + if (ruid != (uid_t)-1 && pc->p_ruid != ruid) { + (void)chgproccnt(pc->p_ruid, -1); + (void)chgproccnt(ruid, 1); + pc->p_ruid = ruid; + setsugid(p); + } + if ((ruid != (uid_t)-1 || pc->pc_ucred->cr_uid != pc->p_ruid) && + pc->p_svuid != pc->pc_ucred->cr_uid) { + pc->p_svuid = pc->pc_ucred->cr_uid; + setsugid(p); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setregid_args { + gid_t rgid; + gid_t egid; +}; +#endif +/* ARGSUSED */ +int +setregid(p, uap) + register struct proc *p; + struct setregid_args *uap; +{ + register struct pcred *pc = p->p_cred; + register gid_t rgid, egid; + int error; + + rgid = uap->rgid; + egid = uap->egid; + if ((rgid != (gid_t)-1 && rgid != pc->p_rgid && rgid != pc->p_svgid || + egid != (gid_t)-1 && egid != pc->pc_ucred->cr_groups[0] && + egid != pc->p_rgid && egid != pc->p_svgid) && + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + + if (egid != (gid_t)-1 && pc->pc_ucred->cr_groups[0] != egid) { + pc->pc_ucred = crcopy(pc->pc_ucred); + pc->pc_ucred->cr_groups[0] = egid; + setsugid(p); + } + if (rgid != (gid_t)-1 && pc->p_rgid != rgid) { + pc->p_rgid = rgid; + setsugid(p); + } + if ((rgid != (gid_t)-1 || pc->pc_ucred->cr_groups[0] != pc->p_rgid) && + pc->p_svgid != pc->pc_ucred->cr_groups[0]) { + pc->p_svgid = pc->pc_ucred->cr_groups[0]; + setsugid(p); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct issetugid_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +issetugid(p, uap) + register struct proc *p; + struct issetugid_args *uap; +{ + /* + * Note: OpenBSD sets a P_SUGIDEXEC flag set at execve() time, + * we use P_SUGID because we consider changing the owners as + * "tainting" as well. + * This is significant for procs that start as root and "become" + * a user without an exec - programs cannot know *everything* + * that libc *might* have put in their data segment. + */ + if (p->p_flag & P_SUGID) + return (1); + return (0); +} + +/* + * Check if gid is a member of the group set. + */ +int +groupmember(gid, cred) + gid_t gid; + register struct ucred *cred; +{ + register gid_t *gp; + gid_t *egp; + + egp = &(cred->cr_groups[cred->cr_ngroups]); + for (gp = cred->cr_groups; gp < egp; gp++) + if (*gp == gid) + return (1); + return (0); +} + +/* + * Test whether the specified credentials imply "super-user" + * privilege; if so, and we have accounting info, set the flag + * indicating use of super-powers. + * Returns 0 or error. 
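+ *
+ * Typical call pattern (illustrative only, mirroring the callers above):
+ *
+ *	if ((error = suser(pc->pc_ucred, &p->p_acflag)))
+ *		return (error);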
+ */ +int +suser(cred, acflag) + struct ucred *cred; + u_short *acflag; +{ + if (cred->cr_uid == 0) { + if (acflag) + *acflag |= ASU; + return (0); + } + return (EPERM); +} + +/* + * Allocate a zeroed cred structure. + */ +struct ucred * +crget() +{ + register struct ucred *cr; + + MALLOC(cr, struct ucred *, sizeof(*cr), M_CRED, M_WAITOK); + bzero((caddr_t)cr, sizeof(*cr)); + cr->cr_ref = 1; + return (cr); +} + +/* + * Free a cred structure. + * Throws away space when ref count gets to 0. + */ +void +crfree(cr) + struct ucred *cr; +{ + if (--cr->cr_ref == 0) + FREE((caddr_t)cr, M_CRED); +} + +/* + * Copy cred structure to a new one and free the old one. + */ +struct ucred * +crcopy(cr) + struct ucred *cr; +{ + struct ucred *newcr; + + if (cr->cr_ref == 1) + return (cr); + newcr = crget(); + *newcr = *cr; + crfree(cr); + newcr->cr_ref = 1; + return (newcr); +} + +/* + * Dup cred struct to a new held one. + */ +struct ucred * +crdup(cr) + struct ucred *cr; +{ + struct ucred *newcr; + + newcr = crget(); + *newcr = *cr; + newcr->cr_ref = 1; + return (newcr); +} + +/* + * Get login name, if available. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getlogin_args { + char *namebuf; + u_int namelen; +}; +#endif +/* ARGSUSED */ +int +getlogin(p, uap) + struct proc *p; + struct getlogin_args *uap; +{ + + if (uap->namelen > MAXLOGNAME) + uap->namelen = MAXLOGNAME; + return (copyout((caddr_t) p->p_pgrp->pg_session->s_login, + (caddr_t) uap->namebuf, uap->namelen)); +} + +/* + * Set login name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct setlogin_args { + char *namebuf; +}; +#endif +/* ARGSUSED */ +int +setlogin(p, uap) + struct proc *p; + struct setlogin_args *uap; +{ + int error; + char logintmp[MAXLOGNAME]; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + error = copyinstr((caddr_t) uap->namebuf, (caddr_t) logintmp, + sizeof(logintmp), (size_t *)0); + if (error == ENAMETOOLONG) + error = EINVAL; + else if (!error) + (void) memcpy(p->p_pgrp->pg_session->s_login, logintmp, + sizeof(logintmp)); + return (error); +} + +void +setsugid(p) + struct proc *p; +{ + p->p_flag |= P_SUGID; + if (!(p->p_pfsflags & PF_ISUGID)) + p->p_stops = 0; +} diff --git a/sys/kern/kern_random.c b/sys/kern/kern_random.c new file mode 100644 index 0000000..d635668 --- /dev/null +++ b/sys/kern/kern_random.c @@ -0,0 +1,379 @@ +/* + * random_machdep.c -- A strong random number generator + * + * $Id: random_machdep.c,v 1.28 1998/06/18 15:32:07 bde Exp $ + * + * Version 0.95, last modified 18-Oct-95 + * + * Copyright Theodore Ts'o, 1994, 1995. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. + * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. 
(This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/select.h> +#include <sys/poll.h> +#include <sys/md5.h> + +#include <machine/random.h> + +#include <i386/isa/icu.h> +#include <i386/isa/intr_machdep.h> + +#define MAX_BLKDEV 4 + +/* + * The pool is stirred with a primitive polynomial of degree 128 + * over GF(2), namely x^128 + x^99 + x^59 + x^31 + x^9 + x^7 + 1. + * For a pool of size 64, try x^64+x^62+x^38+x^10+x^6+x+1. + */ +#define POOLWORDS 128 /* Power of 2 - note that this is 32-bit words */ +#define POOLBITS (POOLWORDS*32) + +#if POOLWORDS == 128 +#define TAP1 99 /* The polynomial taps */ +#define TAP2 59 +#define TAP3 31 +#define TAP4 9 +#define TAP5 7 +#elif POOLWORDS == 64 +#define TAP1 62 /* The polynomial taps */ +#define TAP2 38 +#define TAP3 10 +#define TAP4 6 +#define TAP5 1 +#else +#error No primitive polynomial available for chosen POOLWORDS +#endif + +#define WRITEBUFFER 512 /* size in bytes */ + +/* There is actually only one of these, globally. */ +struct random_bucket { + u_int add_ptr; + u_int entropy_count; + int input_rotate; + u_int32_t *pool; + struct selinfo rsel; +}; + +/* There is one of these per entropy source */ +struct timer_rand_state { + u_long last_time; + int last_delta; + int nbits; +}; + +static struct random_bucket random_state; +static u_int32_t random_pool[POOLWORDS]; +static struct timer_rand_state keyboard_timer_state; +static struct timer_rand_state extract_timer_state; +static struct timer_rand_state irq_timer_state[ICU_LEN]; +#ifdef notyet +static struct timer_rand_state blkdev_timer_state[MAX_BLKDEV]; +#endif +static struct wait_queue *random_wait; + +#ifndef MIN +#define MIN(a,b) (((a) < (b)) ? (a) : (b)) +#endif + +void +rand_initialize(void) +{ + random_state.add_ptr = 0; + random_state.entropy_count = 0; + random_state.pool = random_pool; + random_wait = NULL; + random_state.rsel.si_flags = 0; + random_state.rsel.si_pid = 0; +} + +/* + * This function adds an int into the entropy "pool". It does not + * update the entropy estimate. The caller must do this if appropriate. + * + * The pool is stirred with a primitive polynomial of degree 128 + * over GF(2), namely x^128 + x^99 + x^59 + x^31 + x^9 + x^7 + 1. + * For a pool of size 64, try x^64+x^62+x^38+x^10+x^6+x+1. + * + * We rotate the input word by a changing number of bits, to help + * assure that all bits in the entropy get toggled. Otherwise, if we + * consistently feed the entropy pool small numbers (like ticks and + * scancodes, for example), the upper bits of the entropy pool don't + * get affected. 
--- TYT, 10/11/95 + */ +static __inline void +add_entropy_word(struct random_bucket *r, const u_int32_t input) +{ + u_int i; + u_int32_t w; + + w = (input << r->input_rotate) | (input >> (32 - r->input_rotate)); + i = r->add_ptr = (r->add_ptr - 1) & (POOLWORDS-1); + if (i) + r->input_rotate = (r->input_rotate + 7) & 31; + else + /* + * At the beginning of the pool, add an extra 7 bits + * rotation, so that successive passes spread the + * input bits across the pool evenly. + */ + r->input_rotate = (r->input_rotate + 14) & 31; + + /* XOR in the various taps */ + w ^= r->pool[(i+TAP1)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP2)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP3)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP4)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP5)&(POOLWORDS-1)]; + w ^= r->pool[i]; + /* Rotate w left 1 bit (stolen from SHA) and store */ + r->pool[i] = (w << 1) | (w >> 31); +} + +/* + * This function adds entropy to the entropy "pool" by using timing + * delays. It uses the timer_rand_state structure to make an estimate + * of how any bits of entropy this call has added to the pool. + * + * The number "num" is also added to the pool - it should somehow describe + * the type of event which just happened. This is currently 0-255 for + * keyboard scan codes, and 256 upwards for interrupts. + * On the i386, this is assumed to be at most 16 bits, and the high bits + * are used for a high-resolution timer. + */ +static void +add_timer_randomness(struct random_bucket *r, struct timer_rand_state *state, + u_int num) +{ + int delta, delta2; + u_int nbits; + u_int32_t time; + + num ^= timecounter->tc_get_timecount(timecounter) << 16; + r->entropy_count += 2; + + time = ticks; + + add_entropy_word(r, (u_int32_t) num); + add_entropy_word(r, time); + + /* + * Calculate number of bits of randomness we probably + * added. We take into account the first and second order + * deltas in order to make our estimate. + */ + delta = time - state->last_time; + state->last_time = time; + + delta2 = delta - state->last_delta; + state->last_delta = delta; + + if (delta < 0) delta = -delta; + if (delta2 < 0) delta2 = -delta2; + delta = MIN(delta, delta2) >> 1; + for (nbits = 0; delta; nbits++) + delta >>= 1; + + r->entropy_count += nbits; + + /* Prevent overflow */ + if (r->entropy_count > POOLBITS) + r->entropy_count = POOLBITS; + + if (r->entropy_count >= 8) + selwakeup(&random_state.rsel); +} + +void +add_keyboard_randomness(u_char scancode) +{ + add_timer_randomness(&random_state, &keyboard_timer_state, scancode); +} + +void +add_interrupt_randomness(void *vsc) +{ + int intr; + struct random_softc *sc = vsc; + + (sc->sc_handler)(sc->sc_arg); + intr = sc->sc_intr; + add_timer_randomness(&random_state, &irq_timer_state[intr], intr); +} + +#ifdef notused +void +add_blkdev_randomness(int major) +{ + if (major >= MAX_BLKDEV) + return; + + add_timer_randomness(&random_state, &blkdev_timer_state[major], + 0x200+major); +} +#endif /* notused */ + +#if POOLWORDS % 16 +#error extract_entropy() assumes that POOLWORDS is a multiple of 16 words. +#endif +/* + * This function extracts randomness from the "entropy pool", and + * returns it in a buffer. This function computes how many remaining + * bits of entropy are left in the pool, but it does not restrict the + * number of bytes that are actually obtained. + */ +static __inline int +extract_entropy(struct random_bucket *r, char *buf, int nbytes) +{ + int ret, i; + u_int32_t tmp[4]; + + add_timer_randomness(r, &extract_timer_state, nbytes); + + /* Redundant, but just in case... 
*/ + if (r->entropy_count > POOLBITS) + r->entropy_count = POOLBITS; + /* Why is this here? Left in from Ted Ts'o. Perhaps to limit time. */ + if (nbytes > 32768) + nbytes = 32768; + + ret = nbytes; + if (r->entropy_count / 8 >= nbytes) + r->entropy_count -= nbytes*8; + else + r->entropy_count = 0; + + while (nbytes) { + /* Hash the pool to get the output */ + tmp[0] = 0x67452301; + tmp[1] = 0xefcdab89; + tmp[2] = 0x98badcfe; + tmp[3] = 0x10325476; + for (i = 0; i < POOLWORDS; i += 16) + MD5Transform(tmp, (char *)(r->pool+i)); + /* Modify pool so next hash will produce different results */ + add_entropy_word(r, tmp[0]); + add_entropy_word(r, tmp[1]); + add_entropy_word(r, tmp[2]); + add_entropy_word(r, tmp[3]); + /* + * Run the MD5 Transform one more time, since we want + * to add at least minimal obscuring of the inputs to + * add_entropy_word(). --- TYT + */ + MD5Transform(tmp, (char *)(r->pool)); + + /* Copy data to destination buffer */ + i = MIN(nbytes, 16); + bcopy(tmp, buf, i); + nbytes -= i; + buf += i; + } + + /* Wipe data from memory */ + bzero(tmp, sizeof(tmp)); + + return ret; +} + +#ifdef notused /* XXX NOT the exported kernel interface */ +/* + * This function is the exported kernel interface. It returns some + * number of good random numbers, suitable for seeding TCP sequence + * numbers, etc. + */ +void +get_random_bytes(void *buf, u_int nbytes) +{ + extract_entropy(&random_state, (char *) buf, nbytes); +} +#endif /* notused */ + +u_int +read_random(void *buf, u_int nbytes) +{ + if ((nbytes * 8) > random_state.entropy_count) + nbytes = random_state.entropy_count / 8; + + return extract_entropy(&random_state, (char *)buf, nbytes); +} + +u_int +read_random_unlimited(void *buf, u_int nbytes) +{ + return extract_entropy(&random_state, (char *)buf, nbytes); +} + +#ifdef notused +u_int +write_random(const char *buf, u_int nbytes) +{ + u_int i; + u_int32_t word, *p; + + for (i = nbytes, p = (u_int32_t *)buf; + i >= sizeof(u_int32_t); + i-= sizeof(u_int32_t), p++) + add_entropy_word(&random_state, *p); + if (i) { + word = 0; + bcopy(p, &word, i); + add_entropy_word(&random_state, word); + } + return nbytes; +} +#endif /* notused */ + +int +random_poll(dev_t dev, int events, struct proc *p) +{ + int s; + int revents = 0; + + s = splhigh(); + if (events & (POLLIN | POLLRDNORM)) + if (random_state.entropy_count >= 8) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(p, &random_state.rsel); + + splx(s); + if (events & (POLLOUT | POLLWRNORM)) + revents |= events & (POLLOUT | POLLWRNORM); /* heh */ + + return (revents); +} + diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c new file mode 100644 index 0000000..1bad1d2 --- /dev/null +++ b/sys/kern/kern_resource.c @@ -0,0 +1,623 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_resource.c,v 1.37 1998/05/28 09:30:18 phk Exp $ + */ + +#include "opt_compat.h" +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/file.h> +#include <sys/resourcevar.h> +#include <sys/malloc.h> +#include <sys/proc.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +static int donice __P((struct proc *curp, struct proc *chgp, int n)); +static int dosetrlimit __P((struct proc *p, u_int which, struct rlimit *limp)); + +/* + * Resource controls and accounting. 
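+ * This file implements the getpriority()/setpriority() and rtprio()
+ * system calls, the [o]getrlimit()/[o]setrlimit() family, getrusage(),
+ * ruadd() and limcopy(), plus calcru(), which apportions a process's
+ * accumulated run time into user, system and interrupt time.  Other
+ * kernel code typically consults the current limits through p_rlimit,
+ * e.g. p->p_rlimit[RLIMIT_NOFILE].rlim_cur.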
+ */ + +#ifndef _SYS_SYSPROTO_H_ +struct getpriority_args { + int which; + int who; +}; +#endif +int +getpriority(curp, uap) + struct proc *curp; + register struct getpriority_args *uap; +{ + register struct proc *p; + register int low = PRIO_MAX + 1; + + switch (uap->which) { + + case PRIO_PROCESS: + if (uap->who == 0) + p = curp; + else + p = pfind(uap->who); + if (p == 0) + break; + low = p->p_nice; + break; + + case PRIO_PGRP: { + register struct pgrp *pg; + + if (uap->who == 0) + pg = curp->p_pgrp; + else if ((pg = pgfind(uap->who)) == NULL) + break; + for (p = pg->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) { + if (p->p_nice < low) + low = p->p_nice; + } + break; + } + + case PRIO_USER: + if (uap->who == 0) + uap->who = curp->p_ucred->cr_uid; + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) + if (p->p_ucred->cr_uid == uap->who && + p->p_nice < low) + low = p->p_nice; + break; + + default: + return (EINVAL); + } + if (low == PRIO_MAX + 1) + return (ESRCH); + curp->p_retval[0] = low; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setpriority_args { + int which; + int who; + int prio; +}; +#endif +/* ARGSUSED */ +int +setpriority(curp, uap) + struct proc *curp; + register struct setpriority_args *uap; +{ + register struct proc *p; + int found = 0, error = 0; + + switch (uap->which) { + + case PRIO_PROCESS: + if (uap->who == 0) + p = curp; + else + p = pfind(uap->who); + if (p == 0) + break; + error = donice(curp, p, uap->prio); + found++; + break; + + case PRIO_PGRP: { + register struct pgrp *pg; + + if (uap->who == 0) + pg = curp->p_pgrp; + else if ((pg = pgfind(uap->who)) == NULL) + break; + for (p = pg->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) { + error = donice(curp, p, uap->prio); + found++; + } + break; + } + + case PRIO_USER: + if (uap->who == 0) + uap->who = curp->p_ucred->cr_uid; + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) + if (p->p_ucred->cr_uid == uap->who) { + error = donice(curp, p, uap->prio); + found++; + } + break; + + default: + return (EINVAL); + } + if (found == 0) + return (ESRCH); + return (error); +} + +static int +donice(curp, chgp, n) + register struct proc *curp, *chgp; + register int n; +{ + register struct pcred *pcred = curp->p_cred; + + if (pcred->pc_ucred->cr_uid && pcred->p_ruid && + pcred->pc_ucred->cr_uid != chgp->p_ucred->cr_uid && + pcred->p_ruid != chgp->p_ucred->cr_uid) + return (EPERM); + if (n > PRIO_MAX) + n = PRIO_MAX; + if (n < PRIO_MIN) + n = PRIO_MIN; + if (n < chgp->p_nice && suser(pcred->pc_ucred, &curp->p_acflag)) + return (EACCES); + chgp->p_nice = n; + (void)resetpriority(chgp); + return (0); +} + +/* rtprio system call */ +#ifndef _SYS_SYSPROTO_H_ +struct rtprio_args { + int function; + pid_t pid; + struct rtprio *rtp; +}; +#endif + +/* + * Set realtime priority + */ + +/* ARGSUSED */ +int +rtprio(curp, uap) + struct proc *curp; + register struct rtprio_args *uap; +{ + register struct proc *p; + register struct pcred *pcred = curp->p_cred; + struct rtprio rtp; + int error; + + error = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); + if (error) + return (error); + + if (uap->pid == 0) + p = curp; + else + p = pfind(uap->pid); + + if (p == 0) + return (ESRCH); + + switch (uap->function) { + case RTP_LOOKUP: + return (copyout(&p->p_rtprio, uap->rtp, sizeof(struct rtprio))); + case RTP_SET: + if (pcred->pc_ucred->cr_uid && pcred->p_ruid && + pcred->pc_ucred->cr_uid != p->p_ucred->cr_uid && + pcred->p_ruid != p->p_ucred->cr_uid) + return (EPERM); + /* disallow setting rtprio in most 
cases if not superuser */ + if (suser(pcred->pc_ucred, &curp->p_acflag)) { + /* can't set someone else's */ + if (uap->pid) + return (EPERM); + /* can't set realtime priority */ +/* + * Realtime priority has to be restricted for reasons which should be + * obvious. However, for idle priority, there is a potential for + * system deadlock if an idleprio process gains a lock on a resource + * that other processes need (and the idleprio process can't run + * due to a CPU-bound normal process). Fix me! XXX + */ +#if 0 + if (RTP_PRIO_IS_REALTIME(rtp.type)) +#endif + if (rtp.type != RTP_PRIO_NORMAL) + return (EPERM); + } + switch (rtp.type) { +#ifdef RTP_PRIO_FIFO + case RTP_PRIO_FIFO: +#endif + case RTP_PRIO_REALTIME: + case RTP_PRIO_NORMAL: + case RTP_PRIO_IDLE: + if (rtp.prio > RTP_PRIO_MAX) + return (EINVAL); + p->p_rtprio = rtp; + return (0); + default: + return (EINVAL); + } + + default: + return (EINVAL); + } +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct osetrlimit_args { + u_int which; + struct orlimit *rlp; +}; +#endif +/* ARGSUSED */ +int +osetrlimit(p, uap) + struct proc *p; + register struct osetrlimit_args *uap; +{ + struct orlimit olim; + struct rlimit lim; + int error; + + if ((error = + copyin((caddr_t)uap->rlp, (caddr_t)&olim, sizeof(struct orlimit)))) + return (error); + lim.rlim_cur = olim.rlim_cur; + lim.rlim_max = olim.rlim_max; + return (dosetrlimit(p, uap->which, &lim)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct ogetrlimit_args { + u_int which; + struct orlimit *rlp; +}; +#endif +/* ARGSUSED */ +int +ogetrlimit(p, uap) + struct proc *p; + register struct ogetrlimit_args *uap; +{ + struct orlimit olim; + + if (uap->which >= RLIM_NLIMITS) + return (EINVAL); + olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur; + if (olim.rlim_cur == -1) + olim.rlim_cur = 0x7fffffff; + olim.rlim_max = p->p_rlimit[uap->which].rlim_max; + if (olim.rlim_max == -1) + olim.rlim_max = 0x7fffffff; + return (copyout((caddr_t)&olim, (caddr_t)uap->rlp, sizeof(olim))); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifndef _SYS_SYSPROTO_H_ +struct __setrlimit_args { + u_int which; + struct rlimit *rlp; +}; +#endif +/* ARGSUSED */ +int +setrlimit(p, uap) + struct proc *p; + register struct __setrlimit_args *uap; +{ + struct rlimit alim; + int error; + + if ((error = + copyin((caddr_t)uap->rlp, (caddr_t)&alim, sizeof (struct rlimit)))) + return (error); + return (dosetrlimit(p, uap->which, &alim)); +} + +static int +dosetrlimit(p, which, limp) + struct proc *p; + u_int which; + struct rlimit *limp; +{ + register struct rlimit *alimp; + int error; + + if (which >= RLIM_NLIMITS) + return (EINVAL); + alimp = &p->p_rlimit[which]; + + /* + * Preserve historical bugs by treating negative limits as unsigned. 
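+	 * For instance, a (signed) rlim_cur of -1 from an old binary is
+	 * mapped to RLIM_INFINITY rather than rejected.  Below, raising
+	 * either value above the current hard limit requires superuser
+	 * privilege, and the soft limit is silently clamped to the hard
+	 * limit.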
+ */ + if (limp->rlim_cur < 0) + limp->rlim_cur = RLIM_INFINITY; + if (limp->rlim_max < 0) + limp->rlim_max = RLIM_INFINITY; + + if (limp->rlim_cur > alimp->rlim_max || + limp->rlim_max > alimp->rlim_max) + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + if (limp->rlim_cur > limp->rlim_max) + limp->rlim_cur = limp->rlim_max; + if (p->p_limit->p_refcnt > 1 && + (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { + p->p_limit->p_refcnt--; + p->p_limit = limcopy(p->p_limit); + alimp = &p->p_rlimit[which]; + } + + switch (which) { + + case RLIMIT_CPU: + if (limp->rlim_cur > RLIM_INFINITY / (rlim_t)1000000) + p->p_limit->p_cpulimit = RLIM_INFINITY; + else + p->p_limit->p_cpulimit = + (rlim_t)1000000 * limp->rlim_cur; + break; + case RLIMIT_DATA: + if (limp->rlim_cur > MAXDSIZ) + limp->rlim_cur = MAXDSIZ; + if (limp->rlim_max > MAXDSIZ) + limp->rlim_max = MAXDSIZ; + break; + + case RLIMIT_STACK: + if (limp->rlim_cur > MAXSSIZ) + limp->rlim_cur = MAXSSIZ; + if (limp->rlim_max > MAXSSIZ) + limp->rlim_max = MAXSSIZ; + /* + * Stack is allocated to the max at exec time with only + * "rlim_cur" bytes accessible. If stack limit is going + * up make more accessible, if going down make inaccessible. + */ + if (limp->rlim_cur != alimp->rlim_cur) { + vm_offset_t addr; + vm_size_t size; + vm_prot_t prot; + + if (limp->rlim_cur > alimp->rlim_cur) { + prot = VM_PROT_ALL; + size = limp->rlim_cur - alimp->rlim_cur; + addr = USRSTACK - limp->rlim_cur; + } else { + prot = VM_PROT_NONE; + size = alimp->rlim_cur - limp->rlim_cur; + addr = USRSTACK - alimp->rlim_cur; + } + addr = trunc_page(addr); + size = round_page(size); + (void) vm_map_protect(&p->p_vmspace->vm_map, + addr, addr+size, prot, FALSE); + } + break; + + case RLIMIT_NOFILE: + if (limp->rlim_cur > maxfilesperproc) + limp->rlim_cur = maxfilesperproc; + if (limp->rlim_max > maxfilesperproc) + limp->rlim_max = maxfilesperproc; + break; + + case RLIMIT_NPROC: + if (limp->rlim_cur > maxprocperuid) + limp->rlim_cur = maxprocperuid; + if (limp->rlim_max > maxprocperuid) + limp->rlim_max = maxprocperuid; + break; + } + *alimp = *limp; + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct __getrlimit_args { + u_int which; + struct rlimit *rlp; +}; +#endif +/* ARGSUSED */ +int +getrlimit(p, uap) + struct proc *p; + register struct __getrlimit_args *uap; +{ + + if (uap->which >= RLIM_NLIMITS) + return (EINVAL); + return (copyout((caddr_t)&p->p_rlimit[uap->which], (caddr_t)uap->rlp, + sizeof (struct rlimit))); +} + +/* + * Transform the running time and tick information in proc p into user, + * system, and interrupt time usage. + */ +void +calcru(p, up, sp, ip) + struct proc *p; + struct timeval *up; + struct timeval *sp; + struct timeval *ip; +{ + int64_t totusec; + u_int64_t u, st, ut, it, tot; + int s; + struct timeval tv; + + /* XXX: why spl-protect ? worst case is an off-by-one report */ + s = splstatclock(); + st = p->p_sticks; + ut = p->p_uticks; + it = p->p_iticks; + splx(s); + + tot = st + ut + it; + if (tot == 0) { + st = 1; + tot = 1; + } + + totusec = p->p_runtime; +#ifdef SMP + if (p->p_oncpu != (char)0xff) { +#else + if (p == curproc) { +#endif + /* + * Adjust for the current time slice. This is actually fairly + * important since the error here is on the order of a time + * quantum, which is much greater than the sampling error. 
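+		 * Concretely, totusec started from p->p_runtime, which is
+		 * only accumulated at context switch time, so it is advanced
+		 * here by (now - p_switchtime) to cover the slice we are
+		 * currently running.  The total is then split in proportion
+		 * to the tick counts; e.g. with ut=6, st=2, it=0 and
+		 * 1000000 usec of run time, up works out to 750 ms and sp
+		 * to 250 ms.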
+ */ + microuptime(&tv); + totusec += (tv.tv_usec - p->p_switchtime.tv_usec) + + (tv.tv_sec - p->p_switchtime.tv_sec) * (int64_t)1000000; + + /* + * Copy the time that was just read to `switchtime' in case + * we are being called from exit1(). Exits don't go through + * mi_switch(), so `switchtime' doesn't get set in the normal + * way. We set it here instead of more cleanly in exit1() + * to avoid losing track of the time between the calls to + * microuptime(). + */ + switchtime = tv; + } + if (totusec < 0) { + /* XXX no %qd in kernel. Truncate. */ + printf("calcru: negative time of %ld usec for pid %d (%s)\n", + (long)totusec, p->p_pid, p->p_comm); + totusec = 0; + } + u = totusec; + st = (u * st) / tot; + sp->tv_sec = st / 1000000; + sp->tv_usec = st % 1000000; + ut = (u * ut) / tot; + up->tv_sec = ut / 1000000; + up->tv_usec = ut % 1000000; + if (ip != NULL) { + it = (u * it) / tot; + ip->tv_sec = it / 1000000; + ip->tv_usec = it % 1000000; + } +} + +#ifndef _SYS_SYSPROTO_H_ +struct getrusage_args { + int who; + struct rusage *rusage; +}; +#endif +/* ARGSUSED */ +int +getrusage(p, uap) + register struct proc *p; + register struct getrusage_args *uap; +{ + register struct rusage *rup; + + switch (uap->who) { + + case RUSAGE_SELF: + rup = &p->p_stats->p_ru; + calcru(p, &rup->ru_utime, &rup->ru_stime, NULL); + break; + + case RUSAGE_CHILDREN: + rup = &p->p_stats->p_cru; + break; + + default: + return (EINVAL); + } + return (copyout((caddr_t)rup, (caddr_t)uap->rusage, + sizeof (struct rusage))); +} + +void +ruadd(ru, ru2) + register struct rusage *ru, *ru2; +{ + register long *ip, *ip2; + register int i; + + timevaladd(&ru->ru_utime, &ru2->ru_utime); + timevaladd(&ru->ru_stime, &ru2->ru_stime); + if (ru->ru_maxrss < ru2->ru_maxrss) + ru->ru_maxrss = ru2->ru_maxrss; + ip = &ru->ru_first; ip2 = &ru2->ru_first; + for (i = &ru->ru_last - &ru->ru_first; i >= 0; i--) + *ip++ += *ip2++; +} + +/* + * Make a copy of the plimit structure. + * We share these structures copy-on-write after fork, + * and copy when a limit is changed. + */ +struct plimit * +limcopy(lim) + struct plimit *lim; +{ + register struct plimit *copy; + + MALLOC(copy, struct plimit *, sizeof(struct plimit), + M_SUBPROC, M_WAITOK); + bcopy(lim->pl_rlimit, copy->pl_rlimit, sizeof(struct plimit)); + copy->p_lflags = 0; + copy->p_refcnt = 1; + return (copy); +} diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c new file mode 100644 index 0000000..4d6db41 --- /dev/null +++ b/sys/kern/kern_shutdown.c @@ -0,0 +1,530 @@ +/*- + * Copyright (c) 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 + * $Id: kern_shutdown.c,v 1.43 1998/12/04 22:54:51 archie Exp $ + */ + +#include "opt_ddb.h" +#include "opt_hw_wdog.h" +#include "opt_panic.h" +#include "opt_show_busybufs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/reboot.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/queue.h> +#include <sys/sysctl.h> +#include <sys/conf.h> +#include <sys/sysproto.h> + +#include <machine/pcb.h> +#include <machine/clock.h> +#include <machine/cons.h> +#include <machine/md_var.h> +#ifdef SMP +#include <machine/smp.h> /* smp_active, cpuid */ +#endif + +#include <sys/signalvar.h> + +#ifndef PANIC_REBOOT_WAIT_TIME +#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ +#endif + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. + */ +#include <machine/stdarg.h> + +#ifdef DDB +#ifdef DDB_UNATTENDED +int debugger_on_panic = 0; +#else +int debugger_on_panic = 1; +#endif +SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW, + &debugger_on_panic, 0, ""); +#endif + +#ifdef HW_WDOG +/* + * If there is a hardware watchdog, point this at the function needed to + * hold it off. + * It's needed when the kernel needs to do some lengthy operations. + * e.g. in wd.c when dumping core.. It's most annoying to have + * your precious core-dump only half written because the wdog kicked in. + */ +watchdog_tickle_fn wdog_tickler = NULL; +#endif /* HW_WDOG */ + +/* + * Variable panicstr contains argument to first call to panic; used as flag + * to indicate that the kernel has already called panic. + */ +const char *panicstr; + +/* + * callout list for things to do a shutdown + */ +typedef struct shutdown_list_element { + LIST_ENTRY(shutdown_list_element) links; + bootlist_fn function; + void *arg; + int priority; +} *sle_p; + +/* + * There are three shutdown lists. Some things need to be shut down + * earlier than others. 
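+ * They are run in order: SHUTDOWN_PRE_SYNC callouts before the
+ * filesystems are synced, SHUTDOWN_POST_SYNC callouts once filesystem
+ * activity has ceased, and SHUTDOWN_FINAL callouts just before the
+ * machine halts or resets.
+ *
+ * A minimal sketch of a driver registering a callout through the
+ * at_shutdown() interface below, assuming the bootlist_fn callback
+ * receives the howto flags and the registered argument; the names
+ * mydrv_shutdown, mydrv_softc, mydrv_stop_dma and sc are hypothetical
+ * and only illustrate the usage:
+ *
+ *	static void
+ *	mydrv_shutdown(int howto, void *arg)
+ *	{
+ *		struct mydrv_softc *sc = arg;
+ *
+ *		mydrv_stop_dma(sc);
+ *	}
+ *
+ *	at_shutdown(mydrv_shutdown, sc, SHUTDOWN_POST_SYNC);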
+ */ +LIST_HEAD(shutdown_list, shutdown_list_element); + +static struct shutdown_list shutdown_lists[SHUTDOWN_FINAL + 1]; + +static void boot __P((int)) __dead2; +static void dumpsys __P((void)); + +#ifndef _SYS_SYSPROTO_H_ +struct reboot_args { + int opt; +}; +#endif +/* ARGSUSED */ + +/* + * The system call that results in a reboot + */ +int +reboot(p, uap) + struct proc *p; + struct reboot_args *uap; +{ + int error; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + + boot(uap->opt); + return (0); +} + +/* + * Called by events that want to shut down.. e.g <CTL><ALT><DEL> on a PC + */ +void +shutdown_nice() +{ + /* Send a signal to init(8) and have it shutdown the world */ + if (initproc != NULL) { + psignal(initproc, SIGINT); + } else { + /* No init(8) running, so simply reboot */ + boot(RB_NOSYNC); + } + return; +} +static int waittime = -1; +static struct pcb dumppcb; + +/* + * Go through the rigmarole of shutting down.. + * this used to be in machdep.c but I'll be dammned if I could see + * anything machine dependant in it. + */ +static void +boot(howto) + int howto; +{ + sle_p ep; + +#ifdef SMP + if (smp_active) { + printf("boot() called on cpu#%d\n", cpuid); + } +#endif + /* + * Do any callouts that should be done BEFORE syncing the filesystems. + */ + LIST_FOREACH(ep, &shutdown_lists[SHUTDOWN_PRE_SYNC], links) + (*ep->function)(howto, ep->arg); + + /* + * Now sync filesystems + */ + if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) { + register struct buf *bp; + int iter, nbusy; + + waittime = 0; + printf("\nsyncing disks... "); + + sync(&proc0, NULL); + + /* + * With soft updates, some buffers that are + * written will be remarked as dirty until other + * buffers are written. + */ + for (iter = 0; iter < 20; iter++) { + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if ((bp->b_flags & (B_BUSY | B_INVAL)) + == B_BUSY) { + nbusy++; + } else if ((bp->b_flags & (B_DELWRI | B_INVAL)) + == B_DELWRI) { + /* bawrite(bp);*/ + nbusy++; + } + } + if (nbusy == 0) + break; + printf("%d ", nbusy); + sync(&proc0, NULL); + DELAY(50000 * iter); + } + /* + * Count only busy local buffers to prevent forcing + * a fsck if we're just a client of a wedged NFS server + */ + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if (((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) + ||((bp->b_flags & (B_DELWRI | B_INVAL))== B_DELWRI)) + if(bp->b_dev == NODEV) + CIRCLEQ_REMOVE(&mountlist, bp->b_vp->v_mount, mnt_list); + else + nbusy++; + + + } + if (nbusy) { + /* + * Failed to sync all blocks. Indicate this and don't + * unmount filesystems (thus forcing an fsck on reboot). + */ + printf("giving up\n"); +#ifdef SHOW_BUSYBUFS + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if ((bp->b_flags & (B_BUSY | B_INVAL)) + == B_BUSY) { + nbusy++; + printf( + "%d: dev:%08lx, flags:%08lx, blkno:%ld, lblkno:%ld\n", + nbusy, (u_long)bp->b_dev, + bp->b_flags, (long)bp->b_blkno, + (long)bp->b_lblkno); + } + } + DELAY(5000000); /* 5 seconds */ +#endif + } else { + printf("done\n"); + /* + * Unmount filesystems + */ + if (panicstr == 0) + vfs_unmountall(); + } + DELAY(100000); /* wait for console output to finish */ + } + + /* + * Ok, now do things that assume all filesystem activity has + * been completed. 
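+	 * The remaining order is: run the SHUTDOWN_POST_SYNC callouts,
+	 * take a crash dump if RB_DUMP (but not RB_HALT) was requested and
+	 * we are not cold, run the SHUTDOWN_FINAL callouts, and then either
+	 * halt at the console or cpu_reset() according to `howto'.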
+ */ + LIST_FOREACH(ep, &shutdown_lists[SHUTDOWN_POST_SYNC], links) + (*ep->function)(howto, ep->arg); + splhigh(); + if ((howto & (RB_HALT|RB_DUMP)) == RB_DUMP && !cold) { + savectx(&dumppcb); +#ifdef __i386__ + dumppcb.pcb_cr3 = rcr3(); +#endif + dumpsys(); + } + + /* Now that we're going to really halt the system... */ + LIST_FOREACH(ep, &shutdown_lists[SHUTDOWN_FINAL], links) + (*ep->function)(howto, ep->arg); + + if (howto & RB_HALT) { + printf("\n"); + printf("The operating system has halted.\n"); + printf("Please press any key to reboot.\n\n"); + switch (cngetc()) { + case -1: /* No console, just die */ + cpu_halt(); + /* NOTREACHED */ + default: + howto &= ~RB_HALT; + break; + } + } else if (howto & RB_DUMP) { + /* System Paniced */ + + if (PANIC_REBOOT_WAIT_TIME != 0) { + if (PANIC_REBOOT_WAIT_TIME != -1) { + int loop; + printf("Automatic reboot in %d seconds - " + "press a key on the console to abort\n", + PANIC_REBOOT_WAIT_TIME); + for (loop = PANIC_REBOOT_WAIT_TIME * 10; + loop > 0; --loop) { + DELAY(1000 * 100); /* 1/10th second */ + /* Did user type a key? */ + if (cncheckc() != -1) + break; + } + if (!loop) + goto die; + } + } else { /* zero time specified - reboot NOW */ + goto die; + } + printf("--> Press a key on the console to reboot <--\n"); + cngetc(); + } +die: + printf("Rebooting...\n"); + DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ + /* cpu_boot(howto); */ /* doesn't do anything at the moment */ + cpu_reset(); + for(;;) ; + /* NOTREACHED */ +} + +/* + * Magic number for savecore + * + * exported (symorder) and used at least by savecore(8) + * + */ +static u_long const dumpmag = 0x8fca0101UL; + +static int dumpsize = 0; /* also for savecore */ + +static int dodump = 1; +SYSCTL_INT(_machdep, OID_AUTO, do_dump, CTLFLAG_RW, &dodump, 0, ""); + +/* ARGSUSED */ +static void dump_conf __P((void *dummy)); +static void +dump_conf(dummy) + void *dummy; +{ + cpu_dumpconf(); +} +SYSINIT(dump_conf, SI_SUB_DUMP_CONF, SI_ORDER_FIRST, dump_conf, NULL) + +/* + * Doadump comes here after turning off memory management and + * getting on the dump stack, either when called above, or by + * the auto-restart code. + */ +static void +dumpsys(void) +{ + + if (!dodump) + return; + if (dumpdev == NODEV) + return; + if (!(bdevsw[major(dumpdev)])) + return; + if (!(bdevsw[major(dumpdev)]->d_dump)) + return; + dumpsize = Maxmem; + printf("\ndumping to dev %lx, offset %ld\n", (u_long)dumpdev, dumplo); + printf("dump "); + switch ((*bdevsw[major(dumpdev)]->d_dump)(dumpdev)) { + + case ENXIO: + printf("device bad\n"); + break; + + case EFAULT: + printf("device not ready\n"); + break; + + case EINVAL: + printf("area improper\n"); + break; + + case EIO: + printf("i/o error\n"); + break; + + case EINTR: + printf("aborted from console\n"); + break; + + default: + printf("succeeded\n"); + break; + } +} + +/* + * Panic is called on unresolvable fatal errors. It prints "panic: mesg", + * and then reboots. If we are called twice, then we avoid trying to sync + * the disks as this often leads to recursive panics. + */ +void +panic(const char *fmt, ...) 
+{ + int bootopt; + va_list ap; + static char buf[256]; + + bootopt = RB_AUTOBOOT | RB_DUMP; + if (panicstr) + bootopt |= RB_NOSYNC; + else + panicstr = fmt; + + va_start(ap, fmt); + (void)vsnprintf(buf, sizeof(buf), fmt, ap); + if (panicstr == fmt) + panicstr = buf; + va_end(ap); + printf("panic: %s\n", buf); +#ifdef SMP + /* three seperate prints in case of an unmapped page and trap */ + printf("mp_lock = %08x; ", mp_lock); + printf("cpuid = %d; ", cpuid); + printf("lapic.id = %08x\n", lapic.id); +#endif + +#if defined(DDB) + if (debugger_on_panic) + Debugger ("panic"); +#endif + boot(bootopt); +} + +/* + * Three routines to handle adding/deleting items on the + * shutdown callout lists + * + * at_shutdown(): + * Take the arguments given and put them onto the shutdown callout list. + * However first make sure that it's not already there. + * returns 0 on success. + */ +int +at_shutdown(bootlist_fn function, void *arg, int queue) +{ + return(at_shutdown_pri(function, arg, queue, SHUTDOWN_PRI_DEFAULT)); +} + +/* + * at_shutdown_pri(): + * Take the arguments given and put them onto the shutdown callout list + * with the given execution priority. + * returns 0 on success. + */ +int +at_shutdown_pri(bootlist_fn function, void *arg, int queue, int pri) +{ + sle_p ep, ip; + + if (queue < SHUTDOWN_PRE_SYNC + || queue > SHUTDOWN_FINAL) { + printf("at_shutdown: bad exit callout queue %d specified\n", + queue); + return (EINVAL); + } + if (rm_at_shutdown(function, arg)) + printf("at_shutdown: exit callout entry was already present\n"); + ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->function = function; + ep->arg = arg; + ep->priority = pri; + + /* Sort into list of items on this queue */ + ip = LIST_FIRST(&shutdown_lists[queue]); + if (ip == NULL) { + LIST_INSERT_HEAD(&shutdown_lists[queue], ep, links); + } else { + for (; LIST_NEXT(ip, links) != NULL; ip = LIST_NEXT(ip, links)) { + if (ep->priority < ip->priority) { + LIST_INSERT_BEFORE(ip, ep, links); + ep = NULL; + break; + } + } + if (ep != NULL) + LIST_INSERT_AFTER(ip, ep, links); + } + return (0); +} + +/* + * Scan the exit callout lists for the given items and remove them. + * Returns the number of items removed. + */ +int +rm_at_shutdown(bootlist_fn function, void *arg) +{ + sle_p ep; + int count; + int queue; + + count = 0; + for (queue = SHUTDOWN_PRE_SYNC; queue < SHUTDOWN_FINAL; queue++) { + LIST_FOREACH(ep, &shutdown_lists[queue], links) { + if ((ep->function == function) && (ep->arg == arg)) { + LIST_REMOVE(ep, links); + free(ep, M_TEMP); + count++; + } + } + } + return (count); +} diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c new file mode 100644 index 0000000..bf89d8a --- /dev/null +++ b/sys/kern/kern_sig.c @@ -0,0 +1,1455 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 + * $Id: kern_sig.c,v 1.52 1999/01/08 17:31:10 eivind Exp $ + */ + +#include "opt_compat.h" +#include "opt_ktrace.h" + +#define SIGPROP /* include signal properties table */ +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/sysproto.h> +#include <sys/signalvar.h> +#include <sys/resourcevar.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/systm.h> +#include <sys/acct.h> +#include <sys/fcntl.h> +#include <sys/wait.h> +#include <sys/ktrace.h> +#include <sys/syslog.h> +#include <sys/stat.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> + +#include <machine/cpu.h> +#ifdef SMP +#include <machine/smp.h> +#endif + +static int killpg1 __P((struct proc *cp, int signum, int pgid, int all)); +static void setsigvec __P((struct proc *p, int signum, struct sigaction *sa)); +static void stop __P((struct proc *)); + +static int kern_logsigexit = 1; +SYSCTL_INT(_kern, KERN_LOGSIGEXIT, logsigexit, CTLFLAG_RW, &kern_logsigexit, 0, ""); + +/* + * Can process p, with pcred pc, send the signal signum to process q? + */ +#define CANSIGNAL(p, pc, q, signum) \ + ((pc)->pc_ucred->cr_uid == 0 || \ + (pc)->p_ruid == (q)->p_cred->p_ruid || \ + (pc)->pc_ucred->cr_uid == (q)->p_cred->p_ruid || \ + (pc)->p_ruid == (q)->p_ucred->cr_uid || \ + (pc)->pc_ucred->cr_uid == (q)->p_ucred->cr_uid || \ + ((signum) == SIGCONT && (q)->p_session == (p)->p_session)) + +/* + * Policy -- Can real uid ruid with ucred uc send a signal to process q? 
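+ * Delivery is allowed when the sender's effective uid is 0 (root), or
+ * when either of the sender's uids (the real uid `ruid' or the effective
+ * uid uc->cr_uid) matches either of the target's uids (its real p_ruid
+ * or its effective cr_uid).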
+ */ +#define CANSIGIO(ruid, uc, q) \ + ((uc)->cr_uid == 0 || \ + (ruid) == (q)->p_cred->p_ruid || \ + (uc)->cr_uid == (q)->p_cred->p_ruid || \ + (ruid) == (q)->p_ucred->cr_uid || \ + (uc)->cr_uid == (q)->p_ucred->cr_uid) + +int sugid_coredump; +SYSCTL_INT(_kern, OID_AUTO, sugid_coredump, CTLFLAG_RW, &sugid_coredump, 0, ""); + +#ifndef _SYS_SYSPROTO_H_ +struct sigaction_args { + int signum; + struct sigaction *nsa; + struct sigaction *osa; +}; +#endif +/* ARGSUSED */ +int +sigaction(p, uap) + struct proc *p; + register struct sigaction_args *uap; +{ + struct sigaction vec; + register struct sigaction *sa; + register struct sigacts *ps = p->p_sigacts; + register int signum; + int bit, error; + + signum = uap->signum; + if (signum <= 0 || signum >= NSIG) + return (EINVAL); + sa = &vec; + if (uap->osa) { + sa->sa_handler = ps->ps_sigact[signum]; + sa->sa_mask = ps->ps_catchmask[signum]; + bit = sigmask(signum); + sa->sa_flags = 0; + if ((ps->ps_sigonstack & bit) != 0) + sa->sa_flags |= SA_ONSTACK; + if ((ps->ps_sigintr & bit) == 0) + sa->sa_flags |= SA_RESTART; + if ((ps->ps_sigreset & bit) != 0) + sa->sa_flags |= SA_RESETHAND; + if ((ps->ps_signodefer & bit) != 0) + sa->sa_flags |= SA_NODEFER; +#ifndef COMPAT_LINUX_THREADS + if (signum == SIGCHLD && p->p_flag & P_NOCLDSTOP) +#else + if (signum == SIGCHLD && p->p_procsig->ps_flag & P_NOCLDSTOP) +#endif /* COMPAT_LINUX_THREADS */ + sa->sa_flags |= SA_NOCLDSTOP; +#ifndef COMPAT_LINUX_THREADS + if (signum == SIGCHLD && p->p_flag & P_NOCLDWAIT) +#else + if (signum == SIGCHLD && p->p_procsig->ps_flag & P_NOCLDWAIT) +#endif /* COMPAT_LINUX_THREADS */ + sa->sa_flags |= SA_NOCLDWAIT; + if ((error = copyout((caddr_t)sa, (caddr_t)uap->osa, + sizeof (vec)))) + return (error); + } + if (uap->nsa) { + if ((error = copyin((caddr_t)uap->nsa, (caddr_t)sa, + sizeof (vec)))) + return (error); + if ((signum == SIGKILL || signum == SIGSTOP) && + sa->sa_handler != SIG_DFL) + return (EINVAL); + setsigvec(p, signum, sa); + } + return (0); +} + +static void +setsigvec(p, signum, sa) + register struct proc *p; + int signum; + register struct sigaction *sa; +{ + register struct sigacts *ps = p->p_sigacts; + register int bit; + + bit = sigmask(signum); + /* + * Change setting atomically. + */ + (void) splhigh(); + ps->ps_sigact[signum] = sa->sa_handler; + ps->ps_catchmask[signum] = sa->sa_mask &~ sigcantmask; + if ((sa->sa_flags & SA_RESTART) == 0) + ps->ps_sigintr |= bit; + else + ps->ps_sigintr &= ~bit; + if (sa->sa_flags & SA_ONSTACK) + ps->ps_sigonstack |= bit; + else + ps->ps_sigonstack &= ~bit; + if (sa->sa_flags & SA_RESETHAND) + ps->ps_sigreset |= bit; + else + ps->ps_sigreset &= ~bit; + if (sa->sa_flags & SA_NODEFER) + ps->ps_signodefer |= bit; + else + ps->ps_signodefer &= ~bit; +#ifdef COMPAT_SUNOS + if (sa->sa_flags & SA_USERTRAMP) + ps->ps_usertramp |= bit; + else + ps->ps_usertramp &= ~bit; +#endif + if (signum == SIGCHLD) { + if (sa->sa_flags & SA_NOCLDSTOP) +#ifndef COMPAT_LINUX_THREADS + p->p_flag |= P_NOCLDSTOP; + else + p->p_flag &= ~P_NOCLDSTOP; +#else + p->p_procsig->ps_flag |= P_NOCLDSTOP; + else + p->p_procsig->ps_flag &= ~P_NOCLDSTOP; +#endif /* COMPAT_LINUX_THREADS */ + if (sa->sa_flags & SA_NOCLDWAIT) { + /* + * Paranoia: since SA_NOCLDWAIT is implemented by + * reparenting the dying child to PID 1 (and + * trust it to reap the zombie), PID 1 itself is + * forbidden to set SA_NOCLDWAIT. 
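+			 * For init the request is silently dropped (the
+			 * flag is cleared) instead of being rejected.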
+ */ + if (p->p_pid == 1) +#ifndef COMPAT_LINUX_THREADS + p->p_flag &= ~P_NOCLDWAIT; + else + p->p_flag |= P_NOCLDWAIT; +#else + p->p_procsig->ps_flag &= ~P_NOCLDWAIT; + else + p->p_procsig->ps_flag |= P_NOCLDWAIT; +#endif /* COMPAT_LINUX_THREADS */ + } else +#ifndef COMPAT_LINUX_THREADS + p->p_flag &= ~P_NOCLDWAIT; +#else + p->p_procsig->ps_flag &= ~P_NOCLDWAIT; +#endif /* COMPAT_LINUX_THREADS */ + } + /* + * Set bit in p_sigignore for signals that are set to SIG_IGN, + * and for signals set to SIG_DFL where the default is to ignore. + * However, don't put SIGCONT in p_sigignore, + * as we have to restart the process. + */ + if (sa->sa_handler == SIG_IGN || + (sigprop[signum] & SA_IGNORE && sa->sa_handler == SIG_DFL)) { + p->p_siglist &= ~bit; /* never to be seen again */ + if (signum != SIGCONT) + p->p_sigignore |= bit; /* easier in psignal */ + p->p_sigcatch &= ~bit; + } else { + p->p_sigignore &= ~bit; + if (sa->sa_handler == SIG_DFL) + p->p_sigcatch &= ~bit; + else + p->p_sigcatch |= bit; + } + (void) spl0(); +} + +/* + * Initialize signal state for process 0; + * set to ignore signals that are ignored by default. + */ +void +siginit(p) + struct proc *p; +{ + register int i; + + for (i = 0; i < NSIG; i++) + if (sigprop[i] & SA_IGNORE && i != SIGCONT) + p->p_sigignore |= sigmask(i); +} + +/* + * Reset signals for an exec of the specified process. + */ +void +execsigs(p) + register struct proc *p; +{ + register struct sigacts *ps = p->p_sigacts; + register int nc, mask; + + /* + * Reset caught signals. Held signals remain held + * through p_sigmask (unless they were caught, + * and are now ignored by default). + */ + while (p->p_sigcatch) { + nc = ffs((long)p->p_sigcatch); + mask = sigmask(nc); + p->p_sigcatch &= ~mask; + if (sigprop[nc] & SA_IGNORE) { + if (nc != SIGCONT) + p->p_sigignore |= mask; + p->p_siglist &= ~mask; + } + ps->ps_sigact[nc] = SIG_DFL; + } + /* + * Reset stack state to the user stack. + * Clear set of signals caught on the signal stack. + */ + ps->ps_sigstk.ss_flags = SS_DISABLE; + ps->ps_sigstk.ss_size = 0; + ps->ps_sigstk.ss_sp = 0; + ps->ps_flags = 0; +} + +/* + * Manipulate signal mask. + * Note that we receive new mask, not pointer, + * and return old mask as return value; + * the library stub does the rest. + */ +#ifndef _SYS_SYSPROTO_H_ +struct sigprocmask_args { + int how; + sigset_t mask; +}; +#endif +int +sigprocmask(p, uap) + register struct proc *p; + struct sigprocmask_args *uap; +{ + int error = 0; + + p->p_retval[0] = p->p_sigmask; + (void) splhigh(); + + switch (uap->how) { + case SIG_BLOCK: + p->p_sigmask |= uap->mask &~ sigcantmask; + break; + + case SIG_UNBLOCK: + p->p_sigmask &= ~uap->mask; + break; + + case SIG_SETMASK: + p->p_sigmask = uap->mask &~ sigcantmask; + break; + + default: + error = EINVAL; + break; + } + (void) spl0(); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct sigpending_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +sigpending(p, uap) + struct proc *p; + struct sigpending_args *uap; +{ + + p->p_retval[0] = p->p_siglist; + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Generalized interface signal handler, 4.3-compatible. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct osigvec_args { + int signum; + struct sigvec *nsv; + struct sigvec *osv; +}; +#endif +/* ARGSUSED */ +int +osigvec(p, uap) + struct proc *p; + register struct osigvec_args *uap; +{ + struct sigvec vec; + register struct sigacts *ps = p->p_sigacts; + register struct sigvec *sv; + register int signum; + int bit, error; + + signum = uap->signum; + if (signum <= 0 || signum >= NSIG) + return (EINVAL); + sv = &vec; + if (uap->osv) { + *(sig_t *)&sv->sv_handler = ps->ps_sigact[signum]; + sv->sv_mask = ps->ps_catchmask[signum]; + bit = sigmask(signum); + sv->sv_flags = 0; + if ((ps->ps_sigonstack & bit) != 0) + sv->sv_flags |= SV_ONSTACK; + if ((ps->ps_sigintr & bit) != 0) + sv->sv_flags |= SV_INTERRUPT; + if ((ps->ps_sigreset & bit) != 0) + sv->sv_flags |= SV_RESETHAND; + if ((ps->ps_signodefer & bit) != 0) + sv->sv_flags |= SV_NODEFER; +#ifndef COMPAT_SUNOS +#ifndef COMPAT_LINUX_THREADS + if (signum == SIGCHLD && p->p_flag & P_NOCLDSTOP) +#else + if (signum == SIGCHLD && p->p_procsig->ps_flag & P_NOCLDSTOP) +#endif /* COMPAT_LINUX_THREADS */ + sv->sv_flags |= SV_NOCLDSTOP; +#endif + if ((error = copyout((caddr_t)sv, (caddr_t)uap->osv, + sizeof (vec)))) + return (error); + } + if (uap->nsv) { + if ((error = copyin((caddr_t)uap->nsv, (caddr_t)sv, + sizeof (vec)))) + return (error); + if ((signum == SIGKILL || signum == SIGSTOP) && + sv->sv_handler != SIG_DFL) + return (EINVAL); +#ifdef COMPAT_SUNOS + sv->sv_flags |= SA_USERTRAMP; +#endif + sv->sv_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ + setsigvec(p, signum, (struct sigaction *)sv); + } + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct osigblock_args { + int mask; +}; +#endif +int +osigblock(p, uap) + register struct proc *p; + struct osigblock_args *uap; +{ + + (void) splhigh(); + p->p_retval[0] = p->p_sigmask; + p->p_sigmask |= uap->mask &~ sigcantmask; + (void) spl0(); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct osigsetmask_args { + int mask; +}; +#endif +int +osigsetmask(p, uap) + struct proc *p; + struct osigsetmask_args *uap; +{ + + (void) splhigh(); + p->p_retval[0] = p->p_sigmask; + p->p_sigmask = uap->mask &~ sigcantmask; + (void) spl0(); + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Suspend process until signal, providing mask to be set + * in the meantime. Note nonstandard calling convention: + * libc stub passes mask, not pointer, to save a copyin. + */ +#ifndef _SYS_SYSPROTO_H_ +struct sigsuspend_args { + sigset_t mask; +}; +#endif +/* ARGSUSED */ +int +sigsuspend(p, uap) + register struct proc *p; + struct sigsuspend_args *uap; +{ + register struct sigacts *ps = p->p_sigacts; + + /* + * When returning from sigpause, we want + * the old mask to be restored after the + * signal handler has finished. Thus, we + * save it here and mark the sigacts structure + * to indicate this. + */ +#ifndef COMPAT_LINUX_THREADS + ps->ps_oldmask = p->p_sigmask; + ps->ps_flags |= SAS_OLDMASK; +#else + p->p_oldsigmask = p->p_sigmask; +#endif /* COMPAT_LINUX_THREADS */ + p->p_sigmask = uap->mask &~ sigcantmask; + while (tsleep((caddr_t) ps, PPAUSE|PCATCH, "pause", 0) == 0) + /* void */; + /* always return EINTR rather than ERESTART... 
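(restarting the call would simply put the process back to sleep with the temporary mask installed, and the caller would never see that its handler had run) 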
*/ + return (EINTR); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct osigstack_args { + struct sigstack *nss; + struct sigstack *oss; +}; +#endif +/* ARGSUSED */ +int +osigstack(p, uap) + struct proc *p; + register struct osigstack_args *uap; +{ + struct sigstack ss; + struct sigacts *psp; + int error = 0; + + psp = p->p_sigacts; + ss.ss_sp = psp->ps_sigstk.ss_sp; + ss.ss_onstack = psp->ps_sigstk.ss_flags & SS_ONSTACK; + if (uap->oss && (error = copyout((caddr_t)&ss, (caddr_t)uap->oss, + sizeof (struct sigstack)))) + return (error); + if (uap->nss && (error = copyin((caddr_t)uap->nss, (caddr_t)&ss, + sizeof (ss))) == 0) { + psp->ps_sigstk.ss_sp = ss.ss_sp; + psp->ps_sigstk.ss_size = 0; + psp->ps_sigstk.ss_flags |= ss.ss_onstack & SS_ONSTACK; + psp->ps_flags |= SAS_ALTSTACK; + } + return (error); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifndef _SYS_SYSPROTO_H_ +struct sigaltstack_args { + struct sigaltstack *nss; + struct sigaltstack *oss; +}; +#endif +/* ARGSUSED */ +int +sigaltstack(p, uap) + struct proc *p; + register struct sigaltstack_args *uap; +{ + struct sigacts *psp; + struct sigaltstack ss; + int error; + + psp = p->p_sigacts; + if ((psp->ps_flags & SAS_ALTSTACK) == 0) + psp->ps_sigstk.ss_flags |= SS_DISABLE; + if (uap->oss && (error = copyout((caddr_t)&psp->ps_sigstk, + (caddr_t)uap->oss, sizeof (struct sigaltstack)))) + return (error); + if (uap->nss == 0) + return (0); + if ((error = copyin((caddr_t)uap->nss, (caddr_t)&ss, sizeof (ss)))) + return (error); + if (ss.ss_flags & SS_DISABLE) { + if (psp->ps_sigstk.ss_flags & SS_ONSTACK) + return (EINVAL); + psp->ps_flags &= ~SAS_ALTSTACK; + psp->ps_sigstk.ss_flags = ss.ss_flags; + return (0); + } + if (ss.ss_size < MINSIGSTKSZ) + return (ENOMEM); + psp->ps_flags |= SAS_ALTSTACK; + psp->ps_sigstk= ss; + return (0); +} + +/* + * Common code for kill process group/broadcast kill. + * cp is calling process. + */ +int +killpg1(cp, signum, pgid, all) + register struct proc *cp; + int signum, pgid, all; +{ + register struct proc *p; + register struct pcred *pc = cp->p_cred; + struct pgrp *pgrp; + int nfound = 0; + + if (all) + /* + * broadcast + */ + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || + p == cp || !CANSIGNAL(cp, pc, p, signum)) + continue; + nfound++; + if (signum) + psignal(p, signum); + } + else { + if (pgid == 0) + /* + * zero pgid means send to my process group. + */ + pgrp = cp->p_pgrp; + else { + pgrp = pgfind(pgid); + if (pgrp == NULL) + return (ESRCH); + } + for (p = pgrp->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) { + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || + p->p_stat == SZOMB || + !CANSIGNAL(cp, pc, p, signum)) + continue; + nfound++; + if (signum) + psignal(p, signum); + } + } + return (nfound ? 
0 : ESRCH); +} + +#ifndef _SYS_SYSPROTO_H_ +struct kill_args { + int pid; + int signum; +}; +#endif +/* ARGSUSED */ +int +kill(cp, uap) + register struct proc *cp; + register struct kill_args *uap; +{ + register struct proc *p; + register struct pcred *pc = cp->p_cred; + + if ((u_int)uap->signum >= NSIG) + return (EINVAL); + if (uap->pid > 0) { + /* kill single process */ + if ((p = pfind(uap->pid)) == NULL) + return (ESRCH); + if (!CANSIGNAL(cp, pc, p, uap->signum)) + return (EPERM); + if (uap->signum) + psignal(p, uap->signum); + return (0); + } + switch (uap->pid) { + case -1: /* broadcast signal */ + return (killpg1(cp, uap->signum, 0, 1)); + case 0: /* signal own process group */ + return (killpg1(cp, uap->signum, 0, 0)); + default: /* negative explicit process group */ + return (killpg1(cp, uap->signum, -uap->pid, 0)); + } + /* NOTREACHED */ +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct okillpg_args { + int pgid; + int signum; +}; +#endif +/* ARGSUSED */ +int +okillpg(p, uap) + struct proc *p; + register struct okillpg_args *uap; +{ + + if ((u_int)uap->signum >= NSIG) + return (EINVAL); + return (killpg1(p, uap->signum, uap->pgid, 0)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Send a signal to a process group. + */ +void +gsignal(pgid, signum) + int pgid, signum; +{ + struct pgrp *pgrp; + + if (pgid && (pgrp = pgfind(pgid))) + pgsignal(pgrp, signum, 0); +} + +/* + * Send a signal to a process group. If checktty is 1, + * limit to members which have a controlling terminal. + */ +void +pgsignal(pgrp, signum, checkctty) + struct pgrp *pgrp; + int signum, checkctty; +{ + register struct proc *p; + + if (pgrp) + for (p = pgrp->pg_members.lh_first; p != 0; + p = p->p_pglist.le_next) + if (checkctty == 0 || p->p_flag & P_CONTROLT) + psignal(p, signum); +} + +/* + * Send a signal caused by a trap to the current process. + * If it will be caught immediately, deliver it with correct code. + * Otherwise, post it normally. + */ +void +trapsignal(p, signum, code) + struct proc *p; + register int signum; + u_long code; +{ + register struct sigacts *ps = p->p_sigacts; + int mask; + + mask = sigmask(signum); + if ((p->p_flag & P_TRACED) == 0 && (p->p_sigcatch & mask) != 0 && + (p->p_sigmask & mask) == 0) { + p->p_stats->p_ru.ru_nsignals++; +#ifdef KTRACE + if (KTRPOINT(p, KTR_PSIG)) + ktrpsig(p->p_tracep, signum, ps->ps_sigact[signum], + p->p_sigmask, code); +#endif + (*p->p_sysent->sv_sendsig)(ps->ps_sigact[signum], signum, + p->p_sigmask, code); + p->p_sigmask |= ps->ps_catchmask[signum] | + (mask & ~ps->ps_signodefer); + if ((ps->ps_sigreset & mask) != 0) { + /* + * See setsigvec() for origin of this code. + */ + p->p_sigcatch &= ~mask; + if (signum != SIGCONT && sigprop[signum] & SA_IGNORE) + p->p_sigignore |= mask; + ps->ps_sigact[signum] = SIG_DFL; + } + } else { +#ifndef COMPAT_LINUX_THREADS + ps->ps_code = code; /* XXX for core dump/debugger */ + ps->ps_sig = signum; /* XXX to verify code */ +#else + p->p_code = code; /* XXX for core dump/debugger */ + p->p_sig = signum; /* XXX to verify code */ +#endif /* COMPAT_LINUX_THREADS */ + psignal(p, signum); + } +} + +/* + * Send the signal to the process. If the signal has an action, the action + * is usually performed by the target process rather than the caller; we add + * the signal to the set of pending signals for the process. + * + * Exceptions: + * o When a stop signal is sent to a sleeping process that takes the + * default action, the process is stopped without awakening it. 
+ * o SIGCONT restarts stopped processes (or puts them back to sleep) + * regardless of the signal action (eg, blocked or ignored). + * + * Other ignored signals are discarded immediately. + */ +void +psignal(p, signum) + register struct proc *p; + register int signum; +{ + register int s, prop; + register sig_t action; + int mask; + + if ((u_int)signum >= NSIG || signum == 0) { + printf("psignal: signum %d\n", signum); + panic("psignal signal number"); + } + mask = sigmask(signum); + prop = sigprop[signum]; + + /* + * If proc is traced, always give parent a chance; + * if signal event is tracked by procfs, give *that* + * a chance, as well. + */ + if ((p->p_flag & P_TRACED) || (p->p_stops & S_SIG)) + action = SIG_DFL; + else { + /* + * If the signal is being ignored, + * then we forget about it immediately. + * (Note: we don't set SIGCONT in p_sigignore, + * and if it is set to SIG_IGN, + * action will be SIG_DFL here.) + */ +#ifndef COMPAT_LINUX_THREADS + if (p->p_sigignore & mask) +#else + if ((p->p_sigignore & mask) || (p->p_flag & P_WEXIT)) +#endif /* COMPAT_LINUX_THREADS */ + return; + if (p->p_sigmask & mask) + action = SIG_HOLD; + else if (p->p_sigcatch & mask) + action = SIG_CATCH; + else + action = SIG_DFL; + } + + if (p->p_nice > NZERO && action == SIG_DFL && (prop & SA_KILL) && + (p->p_flag & P_TRACED) == 0) + p->p_nice = NZERO; + + if (prop & SA_CONT) + p->p_siglist &= ~stopsigmask; + + if (prop & SA_STOP) { + /* + * If sending a tty stop signal to a member of an orphaned + * process group, discard the signal here if the action + * is default; don't stop the process below if sleeping, + * and don't clear any pending SIGCONT. + */ + if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 && + action == SIG_DFL) + return; + p->p_siglist &= ~contsigmask; + } + p->p_siglist |= mask; + + /* + * Defer further processing for signals which are held, + * except that stopped processes must be continued by SIGCONT. + */ + if (action == SIG_HOLD && ((prop & SA_CONT) == 0 || p->p_stat != SSTOP)) + return; + s = splhigh(); + switch (p->p_stat) { + + case SSLEEP: + /* + * If process is sleeping uninterruptibly + * we can't interrupt the sleep... the signal will + * be noticed when the process returns through + * trap() or syscall(). + */ + if ((p->p_flag & P_SINTR) == 0) + goto out; + /* + * Process is sleeping and traced... make it runnable + * so it can discover the signal in issignal() and stop + * for the parent. + */ + if (p->p_flag & P_TRACED) + goto run; + /* + * If SIGCONT is default (or ignored) and process is + * asleep, we are finished; the process should not + * be awakened. + */ + if ((prop & SA_CONT) && action == SIG_DFL) { + p->p_siglist &= ~mask; + goto out; + } + /* + * When a sleeping process receives a stop + * signal, process immediately if possible. + * All other (caught or default) signals + * cause the process to run. + */ + if (prop & SA_STOP) { + if (action != SIG_DFL) + goto runfast; + /* + * If a child holding parent blocked, + * stopping could cause deadlock. + */ + if (p->p_flag & P_PPWAIT) + goto out; + p->p_siglist &= ~mask; + p->p_xstat = signum; +#ifndef COMPAT_LINUX_THREADS + if ((p->p_pptr->p_flag & P_NOCLDSTOP) == 0) +#else + if ((p->p_pptr->p_procsig->ps_flag & P_NOCLDSTOP) == 0) +#endif /* COMPAT_LINUX_THREADS */ + psignal(p->p_pptr, SIGCHLD); + stop(p); + goto out; + } else + goto runfast; + /*NOTREACHED*/ + + case SSTOP: + /* + * If traced process is already stopped, + * then no further action is necessary. 
+ */ + if (p->p_flag & P_TRACED) + goto out; + + /* + * Kill signal always sets processes running. + */ + if (signum == SIGKILL) + goto runfast; + + if (prop & SA_CONT) { + /* + * If SIGCONT is default (or ignored), we continue the + * process but don't leave the signal in p_siglist, as + * it has no further action. If SIGCONT is held, we + * continue the process and leave the signal in + * p_siglist. If the process catches SIGCONT, let it + * handle the signal itself. If it isn't waiting on + * an event, then it goes back to run state. + * Otherwise, process goes back to sleep state. + */ + if (action == SIG_DFL) + p->p_siglist &= ~mask; + if (action == SIG_CATCH) + goto runfast; + if (p->p_wchan == 0) + goto run; + p->p_stat = SSLEEP; + goto out; + } + + if (prop & SA_STOP) { + /* + * Already stopped, don't need to stop again. + * (If we did the shell could get confused.) + */ + p->p_siglist &= ~mask; /* take it away */ + goto out; + } + + /* + * If process is sleeping interruptibly, then simulate a + * wakeup so that when it is continued, it will be made + * runnable and can look at the signal. But don't make + * the process runnable, leave it stopped. + */ + if (p->p_wchan && p->p_flag & P_SINTR) + unsleep(p); + goto out; + + default: + /* + * SRUN, SIDL, SZOMB do nothing with the signal, + * other than kicking ourselves if we are running. + * It will either never be noticed, or noticed very soon. + */ + if (p == curproc) + signotify(p); +#ifdef SMP + else if (p->p_stat == SRUN) + forward_signal(p); +#endif + goto out; + } + /*NOTREACHED*/ + +runfast: + /* + * Raise priority to at least PUSER. + */ + if (p->p_priority > PUSER) + p->p_priority = PUSER; +run: + setrunnable(p); +out: + splx(s); +} + +/* + * If the current process has received a signal (should be caught or cause + * termination, should interrupt current syscall), return the signal number. + * Stop signals with default action are processed immediately, then cleared; + * they aren't returned. This is checked after each entry to the system for + * a syscall or trap (though this can usually be done without calling issignal + * by checking the pending signal masks in the CURSIG macro.) The normal call + * sequence is + * + * while (signum = CURSIG(curproc)) + * postsig(signum); + */ +int +issignal(p) + register struct proc *p; +{ + register int signum, mask, prop; + + for (;;) { + int traced = (p->p_flag & P_TRACED) || (p->p_stops & S_SIG); + + mask = p->p_siglist & ~p->p_sigmask; + if (p->p_flag & P_PPWAIT) + mask &= ~stopsigmask; + if (mask == 0) /* no signal to send */ + return (0); + signum = ffs((long)mask); + mask = sigmask(signum); + prop = sigprop[signum]; + + STOPEVENT(p, S_SIG, signum); + + /* + * We should see pending but ignored signals + * only if P_TRACED was on when they were posted. + */ + if ((mask & p->p_sigignore) && (traced == 0)) { + p->p_siglist &= ~mask; + continue; + } + if (p->p_flag & P_TRACED && (p->p_flag & P_PPWAIT) == 0) { + /* + * If traced, always stop, and stay + * stopped until released by the parent. + */ + p->p_xstat = signum; + psignal(p->p_pptr, SIGCHLD); + do { + stop(p); + mi_switch(); + } while (!trace_req(p) + && p->p_flag & P_TRACED); + + /* + * If the traced bit got turned off, go back up + * to the top to rescan signals. This ensures + * that p_sig* and ps_sigact are consistent. + */ + if ((p->p_flag & P_TRACED) == 0) + continue; + + /* + * If parent wants us to take the signal, + * then it will leave it in p->p_xstat; + * otherwise we just look for signals again. 
+ */ + p->p_siglist &= ~mask; /* clear the old signal */ + signum = p->p_xstat; + if (signum == 0) + continue; + + /* + * Put the new signal into p_siglist. If the + * signal is being masked, look for other signals. + */ + mask = sigmask(signum); + p->p_siglist |= mask; + if (p->p_sigmask & mask) + continue; + } + + /* + * Decide whether the signal should be returned. + * Return the signal's number, or fall through + * to clear it from the pending mask. + */ + switch ((int)(intptr_t)p->p_sigacts->ps_sigact[signum]) { + + case (int)SIG_DFL: + /* + * Don't take default actions on system processes. + */ + if (p->p_pid <= 1) { +#ifdef DIAGNOSTIC + /* + * Are you sure you want to ignore SIGSEGV + * in init? XXX + */ + printf("Process (pid %lu) got signal %d\n", + (u_long)p->p_pid, signum); +#endif + break; /* == ignore */ + } + /* + * If there is a pending stop signal to process + * with default action, stop here, + * then clear the signal. However, + * if process is member of an orphaned + * process group, ignore tty stop signals. + */ + if (prop & SA_STOP) { + if (p->p_flag & P_TRACED || + (p->p_pgrp->pg_jobc == 0 && + prop & SA_TTYSTOP)) + break; /* == ignore */ + p->p_xstat = signum; + stop(p); +#ifndef COMPAT_LINUX_THREADS + if ((p->p_pptr->p_flag & P_NOCLDSTOP) == 0) +#else + if ((p->p_pptr->p_procsig->ps_flag & P_NOCLDSTOP) == 0) +#endif /* COMPAT_LINUX_THREADS */ + psignal(p->p_pptr, SIGCHLD); + mi_switch(); + break; + } else if (prop & SA_IGNORE) { + /* + * Except for SIGCONT, shouldn't get here. + * Default action is to ignore; drop it. + */ + break; /* == ignore */ + } else + return (signum); + /*NOTREACHED*/ + + case (int)SIG_IGN: + /* + * Masking above should prevent us ever trying + * to take action on an ignored signal other + * than SIGCONT, unless process is traced. + */ + if ((prop & SA_CONT) == 0 && + (p->p_flag & P_TRACED) == 0) + printf("issignal\n"); + break; /* == ignore */ + + default: + /* + * This signal has an action, let + * postsig() process it. + */ + return (signum); + } + p->p_siglist &= ~mask; /* take the signal! */ + } + /* NOTREACHED */ +} + +/* + * Put the argument process into the stopped state and notify the parent + * via wakeup. Signals are handled elsewhere. The process must not be + * on the run queue. + */ +void +stop(p) + register struct proc *p; +{ + + p->p_stat = SSTOP; + p->p_flag &= ~P_WAITED; + wakeup((caddr_t)p->p_pptr); +} + +/* + * Take the action for the specified signal + * from the current set of pending signals. + */ +void +postsig(signum) + register int signum; +{ + register struct proc *p = curproc; + register struct sigacts *ps = p->p_sigacts; + register sig_t action; + int code, mask, returnmask; + + KASSERT(signum != 0, ("postsig")); + + mask = sigmask(signum); + p->p_siglist &= ~mask; + action = ps->ps_sigact[signum]; +#ifdef KTRACE + if (KTRPOINT(p, KTR_PSIG)) + ktrpsig(p->p_tracep, +#ifndef COMPAT_LINUX_THREADS + signum, action, ps->ps_flags & SAS_OLDMASK ? + ps->ps_oldmask : p->p_sigmask, 0); +#else + signum, action, p->p_oldsigmask ? + p->p_oldsigmask : p->p_sigmask, 0); +#endif /* COMPAT_LINUX_THREADS */ +#endif + STOPEVENT(p, S_SIG, signum); + + if (action == SIG_DFL) { + /* + * Default action, where the default is to kill + * the process. (Other cases were ignored above.) + */ + sigexit(p, signum); + /* NOTREACHED */ + } else { + /* + * If we get here, the signal must be caught. 
+ */ + KASSERT(action != SIG_IGN && (p->p_sigmask & mask) == 0, + ("postsig action")); + /* + * Set the new mask value and also defer further + * occurences of this signal. + * + * Special case: user has done a sigpause. Here the + * current mask is not of interest, but rather the + * mask from before the sigpause is what we want + * restored after the signal processing is completed. + */ + (void) splhigh(); +#ifndef COMPAT_LINUX_THREADS + if (ps->ps_flags & SAS_OLDMASK) { + returnmask = ps->ps_oldmask; + ps->ps_flags &= ~SAS_OLDMASK; +#else + if (p->p_oldsigmask) { + returnmask = p->p_oldsigmask; + p->p_oldsigmask = 0; +#endif /* COMPAT_LINUX_THREADS */ + } else + returnmask = p->p_sigmask; + p->p_sigmask |= ps->ps_catchmask[signum] | + (mask & ~ps->ps_signodefer); + if ((ps->ps_sigreset & mask) != 0) { + /* + * See setsigvec() for origin of this code. + */ + p->p_sigcatch &= ~mask; + if (signum != SIGCONT && sigprop[signum] & SA_IGNORE) + p->p_sigignore |= mask; + ps->ps_sigact[signum] = SIG_DFL; + } + (void) spl0(); + p->p_stats->p_ru.ru_nsignals++; +#ifndef COMPAT_LINUX_THREADS + if (ps->ps_sig != signum) { +#else + if (p->p_sig != signum) { +#endif /* COMPAT_LINUX_THREADS */ + code = 0; + } else { +#ifndef COMPAT_LINUX_THREADS + code = ps->ps_code; + ps->ps_code = 0; + ps->ps_sig = 0; +#else + code = p->p_code; + p->p_code = 0; + p->p_sig = 0; +#endif /* COMPAT_LINUX_THREADS */ + } + (*p->p_sysent->sv_sendsig)(action, signum, returnmask, code); + } +} + +/* + * Kill the current process for stated reason. + */ +void +killproc(p, why) + struct proc *p; + char *why; +{ + log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, + p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1, why); + psignal(p, SIGKILL); +} + +/* + * Force the current process to exit with the specified signal, dumping core + * if appropriate. We bypass the normal tests for masked and caught signals, + * allowing unrecoverable failures to terminate the process without changing + * signal state. Mark the accounting record with the signal termination. + * If dumping core, save the signal number for the debugger. Calls exit and + * does not return. + */ +void +sigexit(p, signum) + register struct proc *p; + int signum; +{ + + p->p_acflag |= AXSIG; + if (sigprop[signum] & SA_CORE) { +#ifndef COMPAT_LINUX_THREADS + p->p_sigacts->ps_sig = signum; +#else + p->p_sig = signum; +#endif /* COMPAT_LINUX_THREADS */ + /* + * Log signals which would cause core dumps + * (Log as LOG_INFO to appease those who don't want + * these messages.) + * XXX : Todo, as well as euid, write out ruid too + */ + if (p->p_sysent->sv_coredump != NULL && + (*p->p_sysent->sv_coredump)(p) == 0) + signum |= WCOREFLAG; + if (kern_logsigexit) + log(LOG_INFO, + "pid %d (%s), uid %d: exited on signal %d%s\n", + p->p_pid, p->p_comm, + p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1, + signum &~ WCOREFLAG, + signum & WCOREFLAG ? " (core dumped)" : ""); + } + exit1(p, W_EXITCODE(0, signum)); + /* NOTREACHED */ +} + +static char corefilename[MAXPATHLEN+1] = {"%N.core"}; +SYSCTL_STRING(_kern, OID_AUTO, corefile, CTLFLAG_RW, corefilename, + sizeof(corefilename), "process corefile name format string"); + +/* + * expand_name(name, uid, pid) + * Expand the name described in corefilename, using name, uid, and pid. 
+ * corefilename is a printf-like string, with three format specifiers: + * %N name of process ("name") + * %P process id (pid) + * %U user id (uid) + * For example, "%N.core" is the default; they can be disabled completely + * by using "/dev/null", or all core files can be stored in "/cores/%U/%N-%P". + * This is controlled by the sysctl variable kern.corefile (see above). + */ + +char * +expand_name(name, uid, pid) +const char *name; int uid; int pid; { + char *temp; + char buf[11]; /* Buffer for pid/uid -- max 4B */ + int i, n; + char *format = corefilename; + + temp = malloc(MAXPATHLEN + 3, M_TEMP, M_NOWAIT); + if (temp == NULL) + return NULL; + bzero(temp, MAXPATHLEN+3); + for (i = 0, n = 0; i < MAXPATHLEN && format[i]; i++) { + int l; + switch (format[i]) { + case '%': /* Format character */ + i++; + switch (format[i]) { + case '%': + temp[n++] = '%'; + break; + case 'N': /* process name */ + l = strlen(name); + if ((n + l) > MAXPATHLEN) { + log(LOG_ERR, "pid %d (%s), uid (%d): Path `%s%s' is too long\n", + pid, name, uid, temp, name); + free(temp, M_TEMP); + return NULL; + } + memcpy(temp+n, name, l); + n += l; + break; + case 'P': /* process id */ + sprintf(buf, "%u", pid); + l = strlen(buf); + if ((n + l) > MAXPATHLEN) { + log(LOG_ERR, "pid %d (%s), uid (%d): Path `%s%s' is too long\n", + pid, name, uid, temp, name); + free(temp, M_TEMP); + return NULL; + } + memcpy(temp+n, buf, l); + n += l; + break; + case 'U': /* user id */ + sprintf(buf, "%u", uid); + l = strlen(buf); + if ((n + l) > MAXPATHLEN) { + log(LOG_ERR, "pid %d (%s), uid (%d): Path `%s%s' is too long\n", + pid, name, uid, temp, name); + free(temp, M_TEMP); + return NULL; + } + memcpy(temp+n, buf, l); + n += l; + break; + default: + log(LOG_ERR, "Unknown format character %c in `%s'\n", format[i], format); + } + break; + default: + temp[n++] = format[i]; + } + } + return temp; +} + +/* + * Nonexistent system call-- signal process (may want to handle it). + * Flag error in case process won't see signal immediately (blocked or ignored). + */ +#ifndef _SYS_SYSPROTO_H_ +struct nosys_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +nosys(p, args) + struct proc *p; + struct nosys_args *args; +{ + + psignal(p, SIGSYS); + return (EINVAL); +} + +/* + * Send a signal to a SIGIO or SIGURG to a process or process group using + * stored credentials rather than those of the current process. + */ +void +pgsigio(sigio, signum, checkctty) + struct sigio *sigio; + int signum, checkctty; +{ + if (sigio == NULL) + return; + + if (sigio->sio_pgid > 0) { + if (CANSIGIO(sigio->sio_ruid, sigio->sio_ucred, + sigio->sio_proc)) + psignal(sigio->sio_proc, signum); + } else if (sigio->sio_pgid < 0) { + struct proc *p; + + for (p = sigio->sio_pgrp->pg_members.lh_first; p != NULL; + p = p->p_pglist.le_next) + if (CANSIGIO(sigio->sio_ruid, sigio->sio_ucred, p) && + (checkctty == 0 || (p->p_flag & P_CONTROLT))) + psignal(p, signum); + } +} diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c new file mode 100644 index 0000000..a96d554 --- /dev/null +++ b/sys/kern/kern_subr.c @@ -0,0 +1,391 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 + * $Id: kern_subr.c,v 1.23 1999/01/08 17:31:10 eivind Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/lock.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_page.h> +#include <vm/vm_map.h> + +int +uiomove(cp, n, uio) + register caddr_t cp; + register int n; + register struct uio *uio; +{ + register struct iovec *iov; + u_int cnt; + int error; + + KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, + ("uiomove: mode")); + KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_procp == curproc, + ("uiomove proc")); + + while (n > 0 && uio->uio_resid) { + iov = uio->uio_iov; + cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + case UIO_USERISPACE: + if (uio->uio_rw == UIO_READ) + error = copyout(cp, iov->iov_base, cnt); + else + error = copyin(iov->iov_base, cp, cnt); + if (error) + return (error); + break; + + case UIO_SYSSPACE: + if (uio->uio_rw == UIO_READ) + bcopy((caddr_t)cp, iov->iov_base, cnt); + else + bcopy(iov->iov_base, (caddr_t)cp, cnt); + break; + case UIO_NOCOPY: + break; + } + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + cp += cnt; + n -= cnt; + } + return (0); +} + +int +uiomoveco(cp, n, uio, obj) + caddr_t cp; + int n; + struct uio *uio; + struct vm_object *obj; +{ + struct iovec *iov; + u_int cnt; + int error; + + KASSERT(uio->uio_rw == UIO_READ || uio->uio_rw == UIO_WRITE, + ("uiomoveco: mode")); + KASSERT(uio->uio_segflg != UIO_USERSPACE || uio->uio_procp == curproc, + ("uiomoveco proc")); + + while (n > 0 && uio->uio_resid) { + 
iov = uio->uio_iov; + cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + case UIO_USERISPACE: + if (uio->uio_rw == UIO_READ) { + if (vfs_ioopt && ((cnt & PAGE_MASK) == 0) && + ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) && + ((uio->uio_offset & PAGE_MASK) == 0) && + ((((intptr_t) cp) & PAGE_MASK) == 0)) { + error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, + uio->uio_offset, cnt, + (vm_offset_t) iov->iov_base, NULL); + } else { + error = copyout(cp, iov->iov_base, cnt); + } + } else { + error = copyin(iov->iov_base, cp, cnt); + } + if (error) + return (error); + break; + + case UIO_SYSSPACE: + if (uio->uio_rw == UIO_READ) + bcopy((caddr_t)cp, iov->iov_base, cnt); + else + bcopy(iov->iov_base, (caddr_t)cp, cnt); + break; + case UIO_NOCOPY: + break; + } + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + cp += cnt; + n -= cnt; + } + return (0); +} + +int +uioread(n, uio, obj, nread) + int n; + struct uio *uio; + struct vm_object *obj; + int *nread; +{ + int npagesmoved; + struct iovec *iov; + u_int cnt, tcnt; + int error; + + *nread = 0; + if (vfs_ioopt < 2) + return 0; + + error = 0; + + while (n > 0 && uio->uio_resid) { + iov = uio->uio_iov; + cnt = iov->iov_len; + if (cnt == 0) { + uio->uio_iov++; + uio->uio_iovcnt--; + continue; + } + if (cnt > n) + cnt = n; + + if ((uio->uio_segflg == UIO_USERSPACE) && + ((((intptr_t) iov->iov_base) & PAGE_MASK) == 0) && + ((uio->uio_offset & PAGE_MASK) == 0) ) { + + if (cnt < PAGE_SIZE) + break; + + cnt &= ~PAGE_MASK; + + error = vm_uiomove(&curproc->p_vmspace->vm_map, obj, + uio->uio_offset, cnt, + (vm_offset_t) iov->iov_base, &npagesmoved); + + if (npagesmoved == 0) + break; + + tcnt = npagesmoved * PAGE_SIZE; + cnt = tcnt; + + if (error) + break; + + iov->iov_base += cnt; + iov->iov_len -= cnt; + uio->uio_resid -= cnt; + uio->uio_offset += cnt; + *nread += cnt; + n -= cnt; + } else { + break; + } + } + return error; +} + +/* + * Give next character to user as result of read. + */ +int +ureadc(c, uio) + register int c; + register struct uio *uio; +{ + register struct iovec *iov; + +again: + if (uio->uio_iovcnt == 0 || uio->uio_resid == 0) + panic("ureadc"); + iov = uio->uio_iov; + if (iov->iov_len == 0) { + uio->uio_iovcnt--; + uio->uio_iov++; + goto again; + } + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + if (subyte(iov->iov_base, c) < 0) + return (EFAULT); + break; + + case UIO_SYSSPACE: + *iov->iov_base = c; + break; + + case UIO_USERISPACE: + if (suibyte(iov->iov_base, c) < 0) + return (EFAULT); + break; + case UIO_NOCOPY: + break; + } + iov->iov_base++; + iov->iov_len--; + uio->uio_resid--; + uio->uio_offset++; + return (0); +} + +#ifdef vax /* unused except by ct.c, other oddities XXX */ +/* + * Get next character written in by user from uio. 
+ */ +int +uwritec(uio) + struct uio *uio; +{ + register struct iovec *iov; + register int c; + + if (uio->uio_resid <= 0) + return (-1); +again: + if (uio->uio_iovcnt <= 0) + panic("uwritec"); + iov = uio->uio_iov; + if (iov->iov_len == 0) { + uio->uio_iov++; + if (--uio->uio_iovcnt == 0) + return (-1); + goto again; + } + switch (uio->uio_segflg) { + + case UIO_USERSPACE: + c = fubyte(iov->iov_base); + break; + + case UIO_SYSSPACE: + c = *(u_char *) iov->iov_base; + break; + + case UIO_USERISPACE: + c = fuibyte(iov->iov_base); + break; + } + if (c < 0) + return (-1); + iov->iov_base++; + iov->iov_len--; + uio->uio_resid--; + uio->uio_offset++; + return (c); +} +#endif /* vax */ + +/* + * General routine to allocate a hash table. + */ +void * +hashinit(elements, type, hashmask) + int elements; + struct malloc_type *type; + u_long *hashmask; +{ + long hashsize; + LIST_HEAD(generic, generic) *hashtbl; + int i; + + if (elements <= 0) + panic("hashinit: bad elements"); + for (hashsize = 1; hashsize <= elements; hashsize <<= 1) + continue; + hashsize >>= 1; + hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); + for (i = 0; i < hashsize; i++) + LIST_INIT(&hashtbl[i]); + *hashmask = hashsize - 1; + return (hashtbl); +} + +static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039, + 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653, + 7159, 7673, 8191, 12281, 16381, 24571, 32749 }; +#define NPRIMES (sizeof(primes) / sizeof(primes[0])) + +/* + * General routine to allocate a prime number sized hash table. + */ +void * +phashinit(elements, type, nentries) + int elements; + struct malloc_type *type; + u_long *nentries; +{ + long hashsize; + LIST_HEAD(generic, generic) *hashtbl; + int i; + + if (elements <= 0) + panic("phashinit: bad elements"); + for (i = 1, hashsize = primes[1]; hashsize <= elements;) { + i++; + if (i == NPRIMES) + break; + hashsize = primes[i]; + } + hashsize = primes[i - 1]; + hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); + for (i = 0; i < hashsize; i++) + LIST_INIT(&hashtbl[i]); + *nentries = hashsize; + return (hashtbl); +} diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c new file mode 100644 index 0000000..f8baf85 --- /dev/null +++ b/sys/kern/kern_synch.c @@ -0,0 +1,923 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 + * $Id: kern_synch.c,v 1.71 1999/01/08 17:31:10 eivind Exp $ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/signalvar.h> +#include <sys/resourcevar.h> +#include <sys/vmmeter.h> +#include <sys/sysctl.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#ifdef KTRACE +#include <sys/uio.h> +#include <sys/ktrace.h> +#endif + +#include <machine/cpu.h> +#ifdef SMP +#include <machine/smp.h> +#endif +#include <machine/limits.h> /* for UCHAR_MAX = typeof(p_priority)_MAX */ + +static void rqinit __P((void *)); +SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL) + +u_char curpriority; /* usrpri of curproc */ +int lbolt; /* once a second sleep address */ + +static void endtsleep __P((void *)); +static void roundrobin __P((void *arg)); +static void schedcpu __P((void *arg)); +static void updatepri __P((struct proc *p)); + +#define MAXIMUM_SCHEDULE_QUANTUM (1000000) /* arbitrary limit */ +#ifndef DEFAULT_SCHEDULE_QUANTUM +#define DEFAULT_SCHEDULE_QUANTUM 10 +#endif +static int quantum = DEFAULT_SCHEDULE_QUANTUM; /* default value */ + +static int +sysctl_kern_quantum SYSCTL_HANDLER_ARGS +{ + int error; + int new_val = quantum; + + new_val = quantum; + error = sysctl_handle_int(oidp, &new_val, 0, req); + if (error == 0) { + if ((new_val > 0) && (new_val < MAXIMUM_SCHEDULE_QUANTUM)) { + quantum = new_val; + } else { + error = EINVAL; + } + } + return (error); +} + +SYSCTL_PROC(_kern, OID_AUTO, quantum, CTLTYPE_INT|CTLFLAG_RW, + 0, sizeof quantum, sysctl_kern_quantum, "I", ""); + +/* maybe_resched: Decide if you need to reschedule or not + * taking the priorities and schedulers into account. + */ +static void maybe_resched(struct proc *chk) +{ + struct proc *p = curproc; /* XXX */ + + /* + * Compare priorities if the new process is on the same scheduler, + * otherwise the one on the more realtimeish scheduler wins. + * + * XXX idle scheduler still broken because proccess stays on idle + * scheduler during waits (such as when getting FS locks). If a + * standard process becomes runaway cpu-bound, the system can lockup + * due to idle-scheduler processes in wakeup never getting any cpu. 
+ */ + if (p == 0 || + (chk->p_priority < curpriority && RTP_PRIO_BASE(p->p_rtprio.type) == RTP_PRIO_BASE(chk->p_rtprio.type)) || + RTP_PRIO_BASE(chk->p_rtprio.type) < RTP_PRIO_BASE(p->p_rtprio.type) + ) { + need_resched(); + } +} + +#define ROUNDROBIN_INTERVAL (hz / quantum) +int roundrobin_interval(void) +{ + return ROUNDROBIN_INTERVAL; +} + +/* + * Force switch among equal priority processes every 100ms. + */ +/* ARGSUSED */ +static void +roundrobin(arg) + void *arg; +{ +#ifndef SMP + struct proc *p = curproc; /* XXX */ +#endif + +#ifdef SMP + need_resched(); + forward_roundrobin(); +#else + if (p == 0 || RTP_PRIO_NEED_RR(p->p_rtprio.type)) + need_resched(); +#endif + + timeout(roundrobin, NULL, ROUNDROBIN_INTERVAL); +} + +/* + * Constants for digital decay and forget: + * 90% of (p_estcpu) usage in 5 * loadav time + * 95% of (p_pctcpu) usage in 60 seconds (load insensitive) + * Note that, as ps(1) mentions, this can let percentages + * total over 100% (I've seen 137.9% for 3 processes). + * + * Note that statclock() updates p_estcpu and p_cpticks asynchronously. + * + * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds. + * That is, the system wants to compute a value of decay such + * that the following for loop: + * for (i = 0; i < (5 * loadavg); i++) + * p_estcpu *= decay; + * will compute + * p_estcpu *= 0.1; + * for all values of loadavg: + * + * Mathematically this loop can be expressed by saying: + * decay ** (5 * loadavg) ~= .1 + * + * The system computes decay as: + * decay = (2 * loadavg) / (2 * loadavg + 1) + * + * We wish to prove that the system's computation of decay + * will always fulfill the equation: + * decay ** (5 * loadavg) ~= .1 + * + * If we compute b as: + * b = 2 * loadavg + * then + * decay = b / (b + 1) + * + * We now need to prove two things: + * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) + * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) + * + * Facts: + * For x close to zero, exp(x) =~ 1 + x, since + * exp(x) = 0! + x**1/1! + x**2/2! + ... . + * therefore exp(-1/b) =~ 1 - (1/b) = (b-1)/b. + * For x close to zero, ln(1+x) =~ x, since + * ln(1+x) = x - x**2/2 + x**3/3 - ... -1 < x < 1 + * therefore ln(b/(b+1)) = ln(1 - 1/(b+1)) =~ -1/(b+1). + * ln(.1) =~ -2.30 + * + * Proof of (1): + * Solve (factor)**(power) =~ .1 given power (5*loadav): + * solving for factor, + * ln(factor) =~ (-2.30/5*loadav), or + * factor =~ exp(-1/((5/2.30)*loadav)) =~ exp(-1/(2*loadav)) = + * exp(-1/b) =~ (b-1)/b =~ b/(b+1). QED + * + * Proof of (2): + * Solve (factor)**(power) =~ .1 given factor == (b/(b+1)): + * solving for power, + * power*ln(b/(b+1)) =~ -2.30, or + * power =~ 2.3 * (b + 1) = 4.6*loadav + 2.3 =~ 5*loadav. 
QED + * + * Actual power values for the implemented algorithm are as follows: + * loadav: 1 2 3 4 + * power: 5.68 10.32 14.94 19.55 + */ + +/* calculations for digital decay to forget 90% of usage in 5*loadav sec */ +#define loadfactor(loadav) (2 * (loadav)) +#define decay_cpu(loadfac, cpu) (((loadfac) * (cpu)) / ((loadfac) + FSCALE)) + +/* decay 95% of `p_pctcpu' in 60 seconds; see CCPU_SHIFT before changing */ +static fixpt_t ccpu = 0.95122942450071400909 * FSCALE; /* exp(-1/20) */ +SYSCTL_INT(_kern, OID_AUTO, ccpu, CTLFLAG_RD, &ccpu, 0, ""); + +/* kernel uses `FSCALE', userland (SHOULD) use kern.fscale */ +static int fscale __unused = FSCALE; +SYSCTL_INT(_kern, OID_AUTO, fscale, CTLFLAG_RD, 0, FSCALE, ""); + +/* + * If `ccpu' is not equal to `exp(-1/20)' and you still want to use the + * faster/more-accurate formula, you'll have to estimate CCPU_SHIFT below + * and possibly adjust FSHIFT in "param.h" so that (FSHIFT >= CCPU_SHIFT). + * + * To estimate CCPU_SHIFT for exp(-1/20), the following formula was used: + * 1 - exp(-1/20) ~= 0.0487 ~= 0.0488 == 1 (fixed pt, *11* bits). + * + * If you don't want to bother with the faster/more-accurate formula, you + * can set CCPU_SHIFT to (FSHIFT + 1) which will use a slower/less-accurate + * (more general) method of calculating the %age of CPU used by a process. + */ +#define CCPU_SHIFT 11 + +/* + * Recompute process priorities, every hz ticks. + */ +/* ARGSUSED */ +static void +schedcpu(arg) + void *arg; +{ + register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); + register struct proc *p; + register int realstathz, s; + register unsigned int newcpu; + + realstathz = stathz ? stathz : hz; + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + /* + * Increment time in/out of memory and sleep time + * (if sleeping). We ignore overflow; with 16-bit int's + * (remember them?) overflow takes 45 days. + */ + p->p_swtime++; + if (p->p_stat == SSLEEP || p->p_stat == SSTOP) + p->p_slptime++; + p->p_pctcpu = (p->p_pctcpu * ccpu) >> FSHIFT; + /* + * If the process has slept the entire second, + * stop recalculating its priority until it wakes up. + */ + if (p->p_slptime > 1) + continue; + s = splhigh(); /* prevent state changes and protect run queue */ + /* + * p_pctcpu is only for ps. + */ +#if (FSHIFT >= CCPU_SHIFT) + p->p_pctcpu += (realstathz == 100)? + ((fixpt_t) p->p_cpticks) << (FSHIFT - CCPU_SHIFT): + 100 * (((fixpt_t) p->p_cpticks) + << (FSHIFT - CCPU_SHIFT)) / realstathz; +#else + p->p_pctcpu += ((FSCALE - ccpu) * + (p->p_cpticks * FSCALE / realstathz)) >> FSHIFT; +#endif + p->p_cpticks = 0; + newcpu = (u_int) decay_cpu(loadfac, p->p_estcpu) + p->p_nice; + p->p_estcpu = min(newcpu, UCHAR_MAX); + resetpriority(p); + if (p->p_priority >= PUSER) { +#define PPQ (128 / NQS) /* priorities per queue */ + if ((p != curproc) && +#ifdef SMP + (u_char)p->p_oncpu == 0xff && /* idle */ +#endif + p->p_stat == SRUN && + (p->p_flag & P_INMEM) && + (p->p_priority / PPQ) != (p->p_usrpri / PPQ)) { + remrq(p); + p->p_priority = p->p_usrpri; + setrunqueue(p); + } else + p->p_priority = p->p_usrpri; + } + splx(s); + } + vmmeter(); + wakeup((caddr_t)&lbolt); + timeout(schedcpu, (void *)0, hz); +} + +/* + * Recalculate the priority of a process after it has slept for a while. + * For all load averages >= 1 and max p_estcpu of 255, sleeping for at + * least six times the loadfactor will decay p_estcpu to zero. 
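
As an aside, the decay arithmetic derived above is easy to check numerically. The program below is a stand-alone user-space sketch, not part of this file; it assumes FSCALE is 2048 (FSHIFT of 11, as in param.h of this era) and simply re-applies the decay_cpu() formula that the derivation describes.

	#include <stdio.h>

	#define FSCALE	2048			/* assumed; see <sys/param.h> */
	#define loadfactor(loadav)	(2 * (loadav))
	#define decay_cpu(loadfac, cpu)	(((loadfac) * (cpu)) / ((loadfac) + FSCALE))

	int
	main(void)
	{
		long loadav = 2 * FSCALE;	/* load average of 2.0, fixed point */
		long loadfac = loadfactor(loadav);
		long estcpu = 255;		/* maximum p_estcpu */
		int i;

		/* one decay step per second, for 5 * loadav seconds */
		for (i = 0; i < 5 * (loadav / FSCALE); i++)
			estcpu = decay_cpu(loadfac, estcpu);
		printf("p_estcpu after 5*loadav steps: %ld (about 10%% of 255)\n",
		    estcpu);
		return (0);
	}

With these numbers the per-step factor is 8192/10240 = 0.8, and 0.8 to the tenth power is roughly 0.107, which matches the "decay 90% of p_estcpu in 5*loadav seconds" claim above.
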
+ */ +static void +updatepri(p) + register struct proc *p; +{ + register unsigned int newcpu = p->p_estcpu; + register fixpt_t loadfac = loadfactor(averunnable.ldavg[0]); + + if (p->p_slptime > 5 * loadfac) + p->p_estcpu = 0; + else { + p->p_slptime--; /* the first time was done in schedcpu */ + while (newcpu && --p->p_slptime) + newcpu = (int) decay_cpu(loadfac, newcpu); + p->p_estcpu = min(newcpu, UCHAR_MAX); + } + resetpriority(p); +} + +/* + * We're only looking at 7 bits of the address; everything is + * aligned to 4, lots of things are aligned to greater powers + * of 2. Shift right by 8, i.e. drop the bottom 256 worth. + */ +#define TABLESIZE 128 +static TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE]; +#define LOOKUP(x) (((intptr_t)(x) >> 8) & (TABLESIZE - 1)) + +/* + * During autoconfiguration or after a panic, a sleep will simply + * lower the priority briefly to allow interrupts, then return. + * The priority to be used (safepri) is machine-dependent, thus this + * value is initialized and maintained in the machine-dependent layers. + * This priority will typically be 0, or the lowest priority + * that is safe for use on the interrupt stack; it can be made + * higher to block network software interrupts after panics. + */ +int safepri; + +void +sleepinit() +{ + int i; + + for (i = 0; i < TABLESIZE; i++) + TAILQ_INIT(&slpque[i]); +} + +/* + * General sleep call. Suspends the current process until a wakeup is + * performed on the specified identifier. The process will then be made + * runnable with the specified priority. Sleeps at most timo/hz seconds + * (0 means no timeout). If pri includes PCATCH flag, signals are checked + * before and after sleeping, else signals are not checked. Returns 0 if + * awakened, EWOULDBLOCK if the timeout expires. If PCATCH is set and a + * signal needs to be delivered, ERESTART is returned if the current system + * call should be restarted if possible, and EINTR is returned if the system + * call should be interrupted by the signal (return EINTR). + */ +int +tsleep(ident, priority, wmesg, timo) + void *ident; + int priority, timo; + const char *wmesg; +{ + struct proc *p = curproc; + int s, sig, catch = priority & PCATCH; + struct callout_handle thandle; + +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 1, 0); +#endif + s = splhigh(); + if (cold || panicstr) { + /* + * After a panic, or during autoconfiguration, + * just give interrupts a chance, then just return; + * don't run any other procs or panic below, + * in case this is the idle process and already asleep. + */ + splx(safepri); + splx(s); + return (0); + } + KASSERT(p != NULL, ("tsleep1")); + KASSERT(ident != NULL && p->p_stat == SRUN, ("tsleep")); + /* + * Process may be sitting on a slpque if asleep() was called, remove + * it before re-adding. + */ + if (p->p_wchan != NULL) + unsleep(p); + + p->p_wchan = ident; + p->p_wmesg = wmesg; + p->p_slptime = 0; + p->p_priority = priority & PRIMASK; + TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq); + if (timo) + thandle = timeout(endtsleep, (void *)p, timo); + /* + * We put ourselves on the sleep queue and start our timeout + * before calling CURSIG, as we could stop there, and a wakeup + * or a SIGCONT (or both) could occur while we were stopped. + * A SIGCONT would cause us to be marked as SSLEEP + * without resuming us, thus we must be ready for sleep + * when CURSIG is called. If the wakeup happens while we're + * stopped, p->p_wchan will be 0 upon return from CURSIG. 
+ */ + if (catch) { + p->p_flag |= P_SINTR; + if ((sig = CURSIG(p))) { + if (p->p_wchan) + unsleep(p); + p->p_stat = SRUN; + goto resume; + } + if (p->p_wchan == 0) { + catch = 0; + goto resume; + } + } else + sig = 0; + p->p_stat = SSLEEP; + p->p_stats->p_ru.ru_nvcsw++; + mi_switch(); +resume: + curpriority = p->p_usrpri; + splx(s); + p->p_flag &= ~P_SINTR; + if (p->p_flag & P_TIMEOUT) { + p->p_flag &= ~P_TIMEOUT; + if (sig == 0) { +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 0, 0); +#endif + return (EWOULDBLOCK); + } + } else if (timo) + untimeout(endtsleep, (void *)p, thandle); + if (catch && (sig != 0 || (sig = CURSIG(p)))) { +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 0, 0); +#endif + if (p->p_sigacts->ps_sigintr & sigmask(sig)) + return (EINTR); + return (ERESTART); + } +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 0, 0); +#endif + return (0); +} + +/* + * asleep() - async sleep call. Place process on wait queue and return + * immediately without blocking. The process stays runnable until await() + * is called. If ident is NULL, remove process from wait queue if it is still + * on one. + * + * Only the most recent sleep condition is effective when making successive + * calls to asleep() or when calling tsleep(). + * + * The timeout, if any, is not initiated until await() is called. The sleep + * priority, signal, and timeout is specified in the asleep() call but may be + * overriden in the await() call. + * + * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>> + */ + +int +asleep(void *ident, int priority, const char *wmesg, int timo) +{ + struct proc *p = curproc; + int s; + + /* + * splhigh() while manipulating sleep structures and slpque. + * + * Remove preexisting wait condition (if any) and place process + * on appropriate slpque, but do not put process to sleep. + */ + + s = splhigh(); + + if (p->p_wchan != NULL) + unsleep(p); + + if (ident) { + p->p_wchan = ident; + p->p_wmesg = wmesg; + p->p_slptime = 0; + p->p_asleep.as_priority = priority; + p->p_asleep.as_timo = timo; + TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq); + } + + splx(s); + + return(0); +} + +/* + * await() - wait for async condition to occur. The process blocks until + * wakeup() is called on the most recent asleep() address. If wakeup is called + * priority to await(), await() winds up being a NOP. + * + * If await() is called more then once (without an intervening asleep() call), + * await() is still effectively a NOP but it calls mi_switch() to give other + * processes some cpu before returning. The process is left runnable. + * + * <<<<<<<< EXPERIMENTAL, UNTESTED >>>>>>>>>> + */ + +int +await(int priority, int timo) +{ + struct proc *p = curproc; + int s; + + s = splhigh(); + + if (p->p_wchan != NULL) { + struct callout_handle thandle; + int sig; + int catch; + + /* + * The call to await() can override defaults specified in + * the original asleep(). 
+ */ + if (priority < 0) + priority = p->p_asleep.as_priority; + if (timo < 0) + timo = p->p_asleep.as_timo; + + /* + * Install timeout + */ + + if (timo) + thandle = timeout(endtsleep, (void *)p, timo); + + sig = 0; + catch = priority & PCATCH; + + if (catch) { + p->p_flag |= P_SINTR; + if ((sig = CURSIG(p))) { + if (p->p_wchan) + unsleep(p); + p->p_stat = SRUN; + goto resume; + } + if (p->p_wchan == NULL) { + catch = 0; + goto resume; + } + } + p->p_stat = SSLEEP; + p->p_stats->p_ru.ru_nvcsw++; + mi_switch(); +resume: + curpriority = p->p_usrpri; + + splx(s); + p->p_flag &= ~P_SINTR; + if (p->p_flag & P_TIMEOUT) { + p->p_flag &= ~P_TIMEOUT; + if (sig == 0) { +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 0, 0); +#endif + return (EWOULDBLOCK); + } + } else if (timo) + untimeout(endtsleep, (void *)p, thandle); + if (catch && (sig != 0 || (sig = CURSIG(p)))) { +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 0, 0); +#endif + if (p->p_sigacts->ps_sigintr & sigmask(sig)) + return (EINTR); + return (ERESTART); + } +#ifdef KTRACE + if (KTRPOINT(p, KTR_CSW)) + ktrcsw(p->p_tracep, 0, 0); +#endif + } else { + /* + * If as_priority is 0, await() has been called without an + * intervening asleep(). We are still effectively a NOP, + * but we call mi_switch() for safety. + */ + + if (p->p_asleep.as_priority == 0) { + p->p_stats->p_ru.ru_nvcsw++; + mi_switch(); + } + splx(s); + } + + /* + * clear p_asleep.as_priority as an indication that await() has been + * called. If await() is called again without an intervening asleep(), + * await() is still effectively a NOP but the above mi_switch() code + * is triggered as a safety. + */ + p->p_asleep.as_priority = 0; + + return (0); +} + +/* + * Implement timeout for tsleep or asleep()/await() + * + * If process hasn't been awakened (wchan non-zero), + * set timeout flag and undo the sleep. If proc + * is stopped, just unsleep so it will remain stopped. + */ +static void +endtsleep(arg) + void *arg; +{ + register struct proc *p; + int s; + + p = (struct proc *)arg; + s = splhigh(); + if (p->p_wchan) { + if (p->p_stat == SSLEEP) + setrunnable(p); + else + unsleep(p); + p->p_flag |= P_TIMEOUT; + } + splx(s); +} + +/* + * Remove a process from its wait queue + */ +void +unsleep(p) + register struct proc *p; +{ + int s; + + s = splhigh(); + if (p->p_wchan) { + TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq); + p->p_wchan = 0; + } + splx(s); +} + +/* + * Make all processes sleeping on the specified identifier runnable. + */ +void +wakeup(ident) + register void *ident; +{ + register struct slpquehead *qp; + register struct proc *p; + int s; + + s = splhigh(); + qp = &slpque[LOOKUP(ident)]; +restart: + for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) { + if (p->p_wchan == ident) { + TAILQ_REMOVE(qp, p, p_procq); + p->p_wchan = 0; + if (p->p_stat == SSLEEP) { + /* OPTIMIZED EXPANSION OF setrunnable(p); */ + if (p->p_slptime > 1) + updatepri(p); + p->p_slptime = 0; + p->p_stat = SRUN; + if (p->p_flag & P_INMEM) { + setrunqueue(p); + maybe_resched(p); + } else { + p->p_flag |= P_SWAPINREQ; + wakeup((caddr_t)&proc0); + } + /* END INLINE EXPANSION */ + goto restart; + } + } + } + splx(s); +} + +/* + * Make a process sleeping on the specified identifier runnable. + * May wake more than one process if a target prcoess is currently + * swapped out. 
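
As an aside, the tsleep()/wakeup() pairing documented above is typically used as a simple busy-flag handshake. The fragment below is only a sketch of that idiom: the softc fields sc_busy and sc_wanted are hypothetical, and the code assumes the usual kernel headers rather than being buildable on its own. The memlock handshake in kernel_sysctl(), later in this change, follows the same pattern.

	/* acquire: sleep until the resource is free */
	s = splhigh();
	while (sc->sc_busy) {
		sc->sc_wanted = 1;
		(void) tsleep((caddr_t)sc, PRIBIO + 1, "scbusy", 0);
	}
	sc->sc_busy = 1;
	splx(s);

	/* ... use the shared resource ... */

	/* release: clear the flag and wake any waiters */
	s = splhigh();
	sc->sc_busy = 0;
	if (sc->sc_wanted) {
		sc->sc_wanted = 0;
		wakeup((caddr_t)sc);
	}
	splx(s);
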
+ */ +void +wakeup_one(ident) + register void *ident; +{ + register struct slpquehead *qp; + register struct proc *p; + int s; + + s = splhigh(); + qp = &slpque[LOOKUP(ident)]; + + for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) { + if (p->p_wchan == ident) { + TAILQ_REMOVE(qp, p, p_procq); + p->p_wchan = 0; + if (p->p_stat == SSLEEP) { + /* OPTIMIZED EXPANSION OF setrunnable(p); */ + if (p->p_slptime > 1) + updatepri(p); + p->p_slptime = 0; + p->p_stat = SRUN; + if (p->p_flag & P_INMEM) { + setrunqueue(p); + maybe_resched(p); + break; + } else { + p->p_flag |= P_SWAPINREQ; + wakeup((caddr_t)&proc0); + } + /* END INLINE EXPANSION */ + } + } + } + splx(s); +} + +/* + * The machine independent parts of mi_switch(). + * Must be called at splstatclock() or higher. + */ +void +mi_switch() +{ + register struct proc *p = curproc; /* XXX */ + register struct rlimit *rlim; + int x; + + /* + * XXX this spl is almost unnecessary. It is partly to allow for + * sloppy callers that don't do it (issignal() via CURSIG() is the + * main offender). It is partly to work around a bug in the i386 + * cpu_switch() (the ipl is not preserved). We ran for years + * without it. I think there was only a interrupt latency problem. + * The main caller, tsleep(), does an splx() a couple of instructions + * after calling here. The buggy caller, issignal(), usually calls + * here at spl0() and sometimes returns at splhigh(). The process + * then runs for a little too long at splhigh(). The ipl gets fixed + * when the process returns to user mode (or earlier). + * + * It would probably be better to always call here at spl0(). Callers + * are prepared to give up control to another process, so they must + * be prepared to be interrupted. The clock stuff here may not + * actually need splstatclock(). + */ + x = splstatclock(); + +#ifdef SIMPLELOCK_DEBUG + if (p->p_simple_locks) + printf("sleep: holding simple lock\n"); +#endif + /* + * Compute the amount of time during which the current + * process was running, and add that to its total so far. + */ + microuptime(&switchtime); + p->p_runtime += (switchtime.tv_usec - p->p_switchtime.tv_usec) + + (switchtime.tv_sec - p->p_switchtime.tv_sec) * (int64_t)1000000; + + /* + * Check if the process exceeds its cpu resource allocation. + * If over max, kill it. + */ + if (p->p_stat != SZOMB && p->p_limit->p_cpulimit != RLIM_INFINITY && + p->p_runtime > p->p_limit->p_cpulimit) { + rlim = &p->p_rlimit[RLIMIT_CPU]; + if (p->p_runtime / (rlim_t)1000000 >= rlim->rlim_max) { + killproc(p, "exceeded maximum CPU limit"); + } else { + psignal(p, SIGXCPU); + if (rlim->rlim_cur < rlim->rlim_max) { + /* XXX: we should make a private copy */ + rlim->rlim_cur += 5; + } + } + } + + /* + * Pick a new current process and record its start time. + */ + cnt.v_swtch++; + cpu_switch(p); + if (switchtime.tv_sec) + p->p_switchtime = switchtime; + else + microuptime(&p->p_switchtime); + splx(x); +} + +/* + * Initialize the (doubly-linked) run queues + * to be empty. + */ +/* ARGSUSED*/ +static void +rqinit(dummy) + void *dummy; +{ + register int i; + + for (i = 0; i < NQS; i++) { + qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i]; + rtqs[i].ph_link = rtqs[i].ph_rlink = (struct proc *)&rtqs[i]; + idqs[i].ph_link = idqs[i].ph_rlink = (struct proc *)&idqs[i]; + } +} + +/* + * Change process state to be runnable, + * placing it on the run queue if it is in memory, + * and awakening the swapper if it isn't in memory. 
+ */ +void +setrunnable(p) + register struct proc *p; +{ + register int s; + + s = splhigh(); + switch (p->p_stat) { + case 0: + case SRUN: + case SZOMB: + default: + panic("setrunnable"); + case SSTOP: + case SSLEEP: + unsleep(p); /* e.g. when sending signals */ + break; + + case SIDL: + break; + } + p->p_stat = SRUN; + if (p->p_flag & P_INMEM) + setrunqueue(p); + splx(s); + if (p->p_slptime > 1) + updatepri(p); + p->p_slptime = 0; + if ((p->p_flag & P_INMEM) == 0) { + p->p_flag |= P_SWAPINREQ; + wakeup((caddr_t)&proc0); + } + else + maybe_resched(p); +} + +/* + * Compute the priority of a process when running in user mode. + * Arrange to reschedule if the resulting priority is better + * than that of the current process. + */ +void +resetpriority(p) + register struct proc *p; +{ + register unsigned int newpriority; + + if (p->p_rtprio.type == RTP_PRIO_NORMAL) { + newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice; + newpriority = min(newpriority, MAXPRI); + p->p_usrpri = newpriority; + } + maybe_resched(p); +} + +/* ARGSUSED */ +static void sched_setup __P((void *dummy)); +static void +sched_setup(dummy) + void *dummy; +{ + /* Kick off timeout driven events by calling first time. */ + roundrobin(NULL); + schedcpu(NULL); +} +SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) + diff --git a/sys/kern/kern_syscalls.c b/sys/kern/kern_syscalls.c new file mode 100644 index 0000000..e1192a9 --- /dev/null +++ b/sys/kern/kern_syscalls.c @@ -0,0 +1,109 @@ +/*- + * Copyright (c) 1999 Assar Westerlund + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: kern_syscalls.c,v 1.2 1999/01/09 14:59:50 dfr Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> +#include <sys/syscall.h> +#include <sys/module.h> +#include <sys/linker.h> +#include <sys/proc.h> + +/* + * Acts like "nosys" but can be identified in sysent for dynamic call + * number assignment for a limited number of calls. + * + * Place holder for system call slots reserved for loadable modules. 
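
As an aside, the registration functions that follow are what a loadable module would use to claim one of these placeholder slots. The fragment below is only an illustrative sketch: the syscall body hello_syscall and its variables are hypothetical, and the struct sysent initializer assumes the two-member (narg, call) layout of this era.

	static int
	hello_syscall(struct proc *p, void *uap)
	{
		printf("hello from a dynamically assigned syscall slot\n");
		return (0);
	}

	static struct sysent hello_sysent = { 0, (sy_call_t *)hello_syscall };
	static struct sysent hello_old_sysent;
	static int hello_offset = NO_SYSCALL;	/* let syscall_register() pick a slot */

	static int
	hello_load(void)
	{
		return (syscall_register(&hello_offset, &hello_sysent,
		    &hello_old_sysent));
	}

	static int
	hello_unload(void)
	{
		return (syscall_deregister(&hello_offset, &hello_old_sysent));
	}
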
+ */ +int +lkmnosys(struct proc *p, struct nosys_args *args) +{ + return(nosys(p, args)); +} + +int +syscall_register(int *offset, struct sysent *new_sysent, + struct sysent *old_sysent) +{ + if (*offset == NO_SYSCALL) { + int i; + + for (i = 1; i < SYS_MAXSYSCALL; ++i) + if (sysent[i].sy_call == (sy_call_t *)lkmnosys) + break; + if (i == SYS_MAXSYSCALL) + return ENFILE; + *offset = i; + } else if (*offset < 0 || *offset >= SYS_MAXSYSCALL) + return EINVAL; + else if (sysent[*offset].sy_call != (sy_call_t *)lkmnosys) + return EEXIST; + + *old_sysent = sysent[*offset]; + sysent[*offset] = *new_sysent; + return 0; +} + +int +syscall_deregister(int *offset, struct sysent *old_sysent) +{ + if (*offset) + sysent[*offset] = *old_sysent; + return 0; +} + +int +syscall_module_handler(struct module *mod, int what, void *arg) +{ + struct syscall_module_data *data = (struct syscall_module_data*)arg; + modspecific_t ms; + int error; + + switch (what) { + case MOD_LOAD : + error = syscall_register(data->offset, data->new_sysent, + &data->old_sysent); + if (error) + return error; + ms.intval = *data->offset; + module_setspecific(mod, &ms); + break; + case MOD_UNLOAD : + error = syscall_deregister(data->offset, &data->old_sysent); + if (error) + return error; + break; + } + if (data->chainevh) + return data->chainevh(mod, what, data->chainarg); + else + return 0; +} diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c new file mode 100644 index 0000000..fbf2f6a --- /dev/null +++ b/sys/kern/kern_sysctl.c @@ -0,0 +1,1122 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Karels at Berkeley Software Design, Inc. + * + * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD + * project, to make these variables more userfriendly. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 + * $Id: kern_sysctl.c,v 1.81 1998/12/27 18:03:29 dfr Exp $ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> + +static MALLOC_DEFINE(M_SYSCTL, "sysctl", "sysctl internal magic"); + +/* + * Locking and stats + */ +static struct sysctl_lock { + int sl_lock; + int sl_want; + int sl_locked; +} memlock; + +static int sysctl_root SYSCTL_HANDLER_ARGS; + +extern struct linker_set sysctl_; + +/* + * Initialization of the MIB tree. + * + * Order by number in each linker_set. + */ + +static int +sysctl_order_cmp(const void *a, const void *b) +{ + struct sysctl_oid const * const *pa; + struct sysctl_oid const * const *pb; + + pa = (struct sysctl_oid const * const *)a; + pb = (struct sysctl_oid const * const *)b; + if (*pa == NULL && *pb == NULL) + return 0; + if (*pa == NULL) + return (1); + if (*pb == NULL) + return (-1); + return ((*pa)->oid_number - (*pb)->oid_number); +} + +static void +sysctl_order(void *arg) +{ + int j, k; + struct linker_set *l = (struct linker_set *) arg; + struct sysctl_oid **oidpp; + + /* First, find the highest oid we have */ + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + for (k = 0; j--; oidpp++) { + if (!*oidpp) + continue; + if ((*oidpp)->oid_arg1 == arg) { + *oidpp = 0; + continue; + } + if ((*oidpp)->oid_number > k) + k = (*oidpp)->oid_number; + } + + /* Next, replace all OID_AUTO oids with new numbers */ + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + k += 100; + for (; j--; oidpp++) + if (*oidpp && (*oidpp)->oid_number == OID_AUTO) + (*oidpp)->oid_number = k++; + + /* Finally: sort by oid */ + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + for (; j--; oidpp++) { + if (!*oidpp) + continue; + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) + if (!(*oidpp)->oid_handler) + sysctl_order((*oidpp)->oid_arg1); + } + qsort(l->ls_items, l->ls_length, sizeof l->ls_items[0], + sysctl_order_cmp); +} + +SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_order, &sysctl_); + +void +sysctl_order_all(void) +{ + sysctl_order(&sysctl_); +} + +/* + * "Staff-functions" + * + * These functions implement a presently undocumented interface + * used by the sysctl program to walk the tree, and get the type + * so it can print the value. + * This interface is under work and consideration, and should probably + * be killed with a big axe by the first person who can find the time. + * (be aware though, that the proper interface isn't as obvious as it + * may seem, there are various conflicting requirements. + * + * {0,0} printf the entire MIB-tree. + * {0,1,...} return the name of the "..." OID. + * {0,2,...} return the next OID. + * {0,3} return the OID of the name in "new" + * {0,4,...} return the kind & format info for the "..." OID. 
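
As an aside, the {0,3} name-to-OID entry listed above can be exercised directly from user space with sysctl(3); the short program below is an editorial illustration rather than part of this file.

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		int mib[2] = { 0, 3 };		/* the sysctl.name2oid node */
		int oid[CTL_MAXNAME];
		size_t len = sizeof(oid);
		char name[] = "kern.ostype";
		size_t i;

		/* pass the name as "new" data, read the numeric OID back as "old" */
		if (sysctl(mib, 2, oid, &len, name, strlen(name)) == -1) {
			perror("sysctl");
			return (1);
		}
		for (i = 0; i < len / sizeof(int); i++)
			printf("%d%s", oid[i],
			    i + 1 < len / sizeof(int) ? "." : "\n");
		return (0);
	}

For "kern.ostype" this prints "1.1" (CTL_KERN, KERN_OSTYPE).
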
+ */ + +static void +sysctl_sysctl_debug_dump_node(struct linker_set *l, int i) +{ + int j, k; + struct sysctl_oid **oidpp; + + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + for (; j--; oidpp++) { + + if (!*oidpp) + continue; + + for (k=0; k<i; k++) + printf(" "); + + printf("%d %s ", (*oidpp)->oid_number, (*oidpp)->oid_name); + + printf("%c%c", + (*oidpp)->oid_kind & CTLFLAG_RD ? 'R':' ', + (*oidpp)->oid_kind & CTLFLAG_WR ? 'W':' '); + + if ((*oidpp)->oid_handler) + printf(" *Handler"); + + switch ((*oidpp)->oid_kind & CTLTYPE) { + case CTLTYPE_NODE: + printf(" Node\n"); + if (!(*oidpp)->oid_handler) { + sysctl_sysctl_debug_dump_node( + (*oidpp)->oid_arg1, i+2); + } + break; + case CTLTYPE_INT: printf(" Int\n"); break; + case CTLTYPE_STRING: printf(" String\n"); break; + case CTLTYPE_QUAD: printf(" Quad\n"); break; + case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; + default: printf("\n"); + } + + } +} + +static int +sysctl_sysctl_debug SYSCTL_HANDLER_ARGS +{ + sysctl_sysctl_debug_dump_node(&sysctl_, 0); + return ENOENT; +} + +SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD, + 0, 0, sysctl_sysctl_debug, "-", ""); + +static int +sysctl_sysctl_name SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1; + u_int namelen = arg2; + int i, j, error = 0; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + char buf[10]; + + while (namelen) { + if (!lsp) { + snprintf(buf,sizeof(buf),"%d",*name); + if (req->oldidx) + error = SYSCTL_OUT(req, ".", 1); + if (!error) + error = SYSCTL_OUT(req, buf, strlen(buf)); + if (error) + return (error); + namelen--; + name++; + continue; + } + oidpp = (struct sysctl_oid **) lsp->ls_items; + j = lsp->ls_length; + lsp = 0; + for (i = 0; i < j; i++, oidpp++) { + if (*oidpp && ((*oidpp)->oid_number != *name)) + continue; + + if (req->oldidx) + error = SYSCTL_OUT(req, ".", 1); + if (!error) + error = SYSCTL_OUT(req, (*oidpp)->oid_name, + strlen((*oidpp)->oid_name)); + if (error) + return (error); + + namelen--; + name++; + + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + break; + + if ((*oidpp)->oid_handler) + break; + + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + break; + } + } + return (SYSCTL_OUT(req, "", 1)); +} + +SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, ""); + +static int +sysctl_sysctl_next_ls (struct linker_set *lsp, int *name, u_int namelen, + int *next, int *len, int level, struct sysctl_oid **oidp) +{ + int i, j; + struct sysctl_oid **oidpp; + + oidpp = (struct sysctl_oid **) lsp->ls_items; + j = lsp->ls_length; + *len = level; + for (i = 0; i < j; i++, oidpp++) { + if (!*oidpp) + continue; + + *next = (*oidpp)->oid_number; + *oidp = *oidpp; + + if (!namelen) { + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + return 0; + if ((*oidpp)->oid_handler) + /* We really should call the handler here...*/ + return 0; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + if (!sysctl_sysctl_next_ls (lsp, 0, 0, next+1, + len, level+1, oidp)) + return 0; + goto next; + } + + if ((*oidpp)->oid_number < *name) + continue; + + if ((*oidpp)->oid_number > *name) { + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + return 0; + if ((*oidpp)->oid_handler) + return 0; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + if (!sysctl_sysctl_next_ls (lsp, name+1, namelen-1, + next+1, len, level+1, oidp)) + return (0); + goto next; + } + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + continue; + + if ((*oidpp)->oid_handler) + continue; + + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + if 
(!sysctl_sysctl_next_ls (lsp, name+1, namelen-1, next+1, + len, level+1, oidp)) + return (0); + next: + namelen = 1; + *len = level; + } + return 1; +} + +static int +sysctl_sysctl_next SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1; + u_int namelen = arg2; + int i, j, error; + struct sysctl_oid *oid; + struct linker_set *lsp = &sysctl_; + int newoid[CTL_MAXNAME]; + + i = sysctl_sysctl_next_ls (lsp, name, namelen, newoid, &j, 1, &oid); + if (i) + return ENOENT; + error = SYSCTL_OUT(req, newoid, j * sizeof (int)); + return (error); +} + +SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, ""); + +static int +name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidp) +{ + int i, j; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + char *p; + + if (!*name) + return ENOENT; + + p = name + strlen(name) - 1 ; + if (*p == '.') + *p = '\0'; + + *len = 0; + + for (p = name; *p && *p != '.'; p++) + ; + i = *p; + if (i == '.') + *p = '\0'; + + j = lsp->ls_length; + oidpp = (struct sysctl_oid **) lsp->ls_items; + + while (j-- && *len < CTL_MAXNAME) { + if (!*oidpp) + continue; + if (strcmp(name, (*oidpp)->oid_name)) { + oidpp++; + continue; + } + *oid++ = (*oidpp)->oid_number; + (*len)++; + + if (!i) { + if (oidp) + *oidp = *oidpp; + return (0); + } + + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + break; + + if ((*oidpp)->oid_handler) + break; + + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + j = lsp->ls_length; + oidpp = (struct sysctl_oid **)lsp->ls_items; + name = p+1; + for (p = name; *p && *p != '.'; p++) + ; + i = *p; + if (i == '.') + *p = '\0'; + } + return ENOENT; +} + +static int +sysctl_sysctl_name2oid SYSCTL_HANDLER_ARGS +{ + char *p; + int error, oid[CTL_MAXNAME], len; + struct sysctl_oid *op = 0; + + if (!req->newlen) + return ENOENT; + + p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); + + error = SYSCTL_IN(req, p, req->newlen); + if (error) { + free(p, M_SYSCTL); + return (error); + } + + p [req->newlen] = '\0'; + + error = name2oid(p, oid, &len, &op); + + free(p, M_SYSCTL); + + if (error) + return (error); + + error = SYSCTL_OUT(req, oid, len * sizeof *oid); + return (error); +} + +SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0, + sysctl_sysctl_name2oid, "I", ""); + +static int +sysctl_sysctl_oidfmt SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1, error; + u_int namelen = arg2; + int indx, j; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + + j = lsp->ls_length; + oidpp = (struct sysctl_oid **) lsp->ls_items; + + indx = 0; + while (j-- && indx < CTL_MAXNAME) { + if (*oidpp && ((*oidpp)->oid_number == name[indx])) { + indx++; + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + if ((*oidpp)->oid_handler) + goto found; + if (indx == namelen) + goto found; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + j = lsp->ls_length; + oidpp = (struct sysctl_oid **)lsp->ls_items; + } else { + if (indx != namelen) + return EISDIR; + goto found; + } + } else { + oidpp++; + } + } + return ENOENT; +found: + if (!(*oidpp)->oid_fmt) + return ENOENT; + error = SYSCTL_OUT(req, + &(*oidpp)->oid_kind, sizeof((*oidpp)->oid_kind)); + if (!error) + error = SYSCTL_OUT(req, (*oidpp)->oid_fmt, + strlen((*oidpp)->oid_fmt)+1); + return (error); +} + + +SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, ""); + +/* + * Default "handler" functions. + */ + +/* + * Handle an int, signed or unsigned. + * Two cases: + * a variable: point arg1 at it. + * a constant: pass it in arg2. 
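+ *
+ * As an illustrative sketch (the knob below is hypothetical), the usual
+ * way to arrive here is the SYSCTL_INT() macro, which points arg1 at the
+ * variable:
+ *
+ *	static int example_knob;
+ *	SYSCTL_INT(_debug, OID_AUTO, example_knob, CTLFLAG_RW,
+ *	    &example_knob, 0, "");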
+ */ + +int +sysctl_handle_int SYSCTL_HANDLER_ARGS +{ + int error = 0; + + if (arg1) + error = SYSCTL_OUT(req, arg1, sizeof(int)); + else + error = SYSCTL_OUT(req, &arg2, sizeof(int)); + + if (error || !req->newptr) + return (error); + + if (!arg1) + error = EPERM; + else + error = SYSCTL_IN(req, arg1, sizeof(int)); + return (error); +} + +/* + * Handle a long, signed or unsigned. + * Two cases: + * a variable: point arg1 at it. + * a constant: pass it in arg2. + */ + +int +sysctl_handle_long SYSCTL_HANDLER_ARGS +{ + int error = 0; + + error = SYSCTL_OUT(req, arg1, sizeof(long)); + + if (error || !req->newptr) + return (error); + + if (!arg1) + error = EPERM; + else + error = SYSCTL_IN(req, arg1, sizeof(long)); + return (error); +} + +/* + * Handle our generic '\0' terminated 'C' string. + * Two cases: + * a variable string: point arg1 at it, arg2 is max length. + * a constant string: point arg1 at it, arg2 is zero. + */ + +int +sysctl_handle_string SYSCTL_HANDLER_ARGS +{ + int error=0; + + error = SYSCTL_OUT(req, arg1, strlen((char *)arg1)+1); + + if (error || !req->newptr || !arg2) + return (error); + + if ((req->newlen - req->newidx) > arg2) { + error = E2BIG; + } else { + arg2 = (req->newlen - req->newidx); + error = SYSCTL_IN(req, arg1, arg2); + ((char *)arg1)[arg2] = '\0'; + } + + return (error); +} + +/* + * Handle any kind of opaque data. + * arg1 points to it, arg2 is the size. + */ + +int +sysctl_handle_opaque SYSCTL_HANDLER_ARGS +{ + int error; + + error = SYSCTL_OUT(req, arg1, arg2); + + if (error || !req->newptr) + return (error); + + error = SYSCTL_IN(req, arg1, arg2); + + return (error); +} + +/* + * Transfer functions to/from kernel space. + * XXX: rather untested at this point + */ +static int +sysctl_old_kernel(struct sysctl_req *req, const void *p, size_t l) +{ + size_t i = 0; + + if (req->oldptr) { + i = l; + if (i > req->oldlen - req->oldidx) + i = req->oldlen - req->oldidx; + if (i > 0) + bcopy(p, (char *)req->oldptr + req->oldidx, i); + } + req->oldidx += l; + if (req->oldptr && i != l) + return (ENOMEM); + return (0); +} + +static int +sysctl_new_kernel(struct sysctl_req *req, void *p, size_t l) +{ + if (!req->newptr) + return 0; + if (req->newlen - req->newidx < l) + return (EINVAL); + bcopy((char *)req->newptr + req->newidx, p, l); + req->newidx += l; + return (0); +} + +int +kernel_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, size_t *retval) +{ + int error = 0; + struct sysctl_req req; + + bzero(&req, sizeof req); + + req.p = p; + + if (oldlenp) { + req.oldlen = *oldlenp; + } + + if (old) { + req.oldptr= old; + } + + if (newlen) { + req.newlen = newlen; + req.newptr = new; + } + + req.oldfunc = sysctl_old_kernel; + req.newfunc = sysctl_new_kernel; + req.lock = 1; + + /* XXX this should probably be done in a general way */ + while (memlock.sl_lock) { + memlock.sl_want = 1; + (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0); + memlock.sl_locked++; + } + memlock.sl_lock = 1; + + error = sysctl_root(0, name, namelen, &req); + + if (req.lock == 2) + vsunlock(req.oldptr, req.oldlen, B_WRITE); + + memlock.sl_lock = 0; + + if (memlock.sl_want) { + memlock.sl_want = 0; + wakeup((caddr_t)&memlock); + } + + if (error && error != ENOMEM) + return (error); + + if (retval) { + if (req.oldptr && req.oldidx > req.oldlen) + *retval = req.oldlen; + else + *retval = req.oldidx; + } + return (error); +} + +/* + * Transfer function to/from user space. 
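+ *
+ * Orientation notes on the code below: the first SYSCTL_OUT wires the user
+ * buffer with vslock() and advances req->lock to 2 so the root caller knows
+ * to vsunlock() it afterwards.  oldidx is advanced by the full length even
+ * when nothing is copied, so a size probe with a nil oldptr still
+ * accumulates the space the caller must allocate, while a buffer that
+ * fills up gets ENOMEM.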
+ */ +static int +sysctl_old_user(struct sysctl_req *req, const void *p, size_t l) +{ + int error = 0; + size_t i = 0; + + if (req->lock == 1 && req->oldptr) { + vslock(req->oldptr, req->oldlen); + req->lock = 2; + } + if (req->oldptr) { + i = l; + if (i > req->oldlen - req->oldidx) + i = req->oldlen - req->oldidx; + if (i > 0) + error = copyout(p, (char *)req->oldptr + req->oldidx, + i); + } + req->oldidx += l; + if (error) + return (error); + if (req->oldptr && i < l) + return (ENOMEM); + return (0); +} + +static int +sysctl_new_user(struct sysctl_req *req, void *p, size_t l) +{ + int error; + + if (!req->newptr) + return 0; + if (req->newlen - req->newidx < l) + return (EINVAL); + error = copyin((char *)req->newptr + req->newidx, p, l); + req->newidx += l; + return (error); +} + +/* + * Traverse our tree, and find the right node, execute whatever it points + * at, and return the resulting error code. + */ + +int +sysctl_root SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1; + u_int namelen = arg2; + int indx, i, j; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + + j = lsp->ls_length; + oidpp = (struct sysctl_oid **) lsp->ls_items; + + indx = 0; + while (j-- && indx < CTL_MAXNAME) { + if (*oidpp && ((*oidpp)->oid_number == name[indx])) { + indx++; + if ((*oidpp)->oid_kind & CTLFLAG_NOLOCK) + req->lock = 0; + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + if ((*oidpp)->oid_handler) + goto found; + if (indx == namelen) + return ENOENT; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + j = lsp->ls_length; + oidpp = (struct sysctl_oid **)lsp->ls_items; + } else { + if (indx != namelen) + return EISDIR; + goto found; + } + } else { + oidpp++; + } + } + return ENOENT; +found: + /* If writing isn't allowed */ + if (req->newptr && (!((*oidpp)->oid_kind & CTLFLAG_WR) || + (((*oidpp)->oid_kind & CTLFLAG_SECURE) && securelevel > 0))) + return (EPERM); + + /* Most likely only root can write */ + if (!((*oidpp)->oid_kind & CTLFLAG_ANYBODY) && + req->newptr && req->p && + (i = suser(req->p->p_ucred, &req->p->p_acflag))) + return (i); + + if (!(*oidpp)->oid_handler) + return EINVAL; + + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + i = ((*oidpp)->oid_handler) (*oidpp, + name + indx, namelen - indx, + req); + } else { + i = ((*oidpp)->oid_handler) (*oidpp, + (*oidpp)->oid_arg1, (*oidpp)->oid_arg2, + req); + } + return (i); +} + +#ifndef _SYS_SYSPROTO_H_ +struct sysctl_args { + int *name; + u_int namelen; + void *old; + size_t *oldlenp; + void *new; + size_t newlen; +}; +#endif + +int +__sysctl(struct proc *p, struct sysctl_args *uap) +{ + int error, i, name[CTL_MAXNAME]; + size_t j; + + if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) + return (EINVAL); + + error = copyin(uap->name, &name, uap->namelen * sizeof(int)); + if (error) + return (error); + + error = userland_sysctl(p, name, uap->namelen, + uap->old, uap->oldlenp, 0, + uap->new, uap->newlen, &j); + if (error && error != ENOMEM) + return (error); + if (uap->oldlenp) { + i = copyout(&j, uap->oldlenp, sizeof(j)); + if (i) + return (i); + } + return (error); +} + +/* + * This is used from various compatibility syscalls too. That's why name + * must be in kernel space. 
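+ *
+ * For instance (see ogetkerninfo() further down), a compatibility syscall
+ * builds name[] on its kernel stack -- e.g. { CTL_KERN, KERN_CLOCKRATE } --
+ * and passes the user-supplied buffer and size pointer straight through as
+ * "old" and "oldlenp" with inkernel == 0.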
+ */ +int +userland_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, void *new, size_t newlen, size_t *retval) +{ + int error = 0; + struct sysctl_req req, req2; + + bzero(&req, sizeof req); + + req.p = p; + + if (oldlenp) { + if (inkernel) { + req.oldlen = *oldlenp; + } else { + error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); + if (error) + return (error); + } + } + + if (old) { + if (!useracc(old, req.oldlen, B_WRITE)) + return (EFAULT); + req.oldptr= old; + } + + if (newlen) { + if (!useracc(new, req.newlen, B_READ)) + return (EFAULT); + req.newlen = newlen; + req.newptr = new; + } + + req.oldfunc = sysctl_old_user; + req.newfunc = sysctl_new_user; + req.lock = 1; + + /* XXX this should probably be done in a general way */ + while (memlock.sl_lock) { + memlock.sl_want = 1; + (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0); + memlock.sl_locked++; + } + memlock.sl_lock = 1; + + do { + req2 = req; + error = sysctl_root(0, name, namelen, &req2); + } while (error == EAGAIN); + + req = req2; + if (req.lock == 2) + vsunlock(req.oldptr, req.oldlen, B_WRITE); + + memlock.sl_lock = 0; + + if (memlock.sl_want) { + memlock.sl_want = 0; + wakeup((caddr_t)&memlock); + } + + if (error && error != ENOMEM) + return (error); + + if (retval) { + if (req.oldptr && req.oldidx > req.oldlen) + *retval = req.oldlen; + else + *retval = req.oldidx; + } + return (error); +} + +#ifdef COMPAT_43 +#include <sys/socket.h> +#include <vm/vm_param.h> + +#define KINFO_PROC (0<<8) +#define KINFO_RT (1<<8) +#define KINFO_VNODE (2<<8) +#define KINFO_FILE (3<<8) +#define KINFO_METER (4<<8) +#define KINFO_LOADAVG (5<<8) +#define KINFO_CLOCKRATE (6<<8) + +/* Non-standard BSDI extension - only present on their 4.3 net-2 releases */ +#define KINFO_BSDI_SYSINFO (101<<8) + +/* + * XXX this is bloat, but I hope it's better here than on the potentially + * limited kernel stack... -Peter + */ + +static struct { + int bsdi_machine; /* "i386" on BSD/386 */ +/* ^^^ this is an offset to the string, relative to the struct start */ + char *pad0; + long pad1; + long pad2; + long pad3; + u_long pad4; + u_long pad5; + u_long pad6; + + int bsdi_ostype; /* "BSD/386" on BSD/386 */ + int bsdi_osrelease; /* "1.1" on BSD/386 */ + long pad7; + long pad8; + char *pad9; + + long pad10; + long pad11; + int pad12; + long pad13; + quad_t pad14; + long pad15; + + struct timeval pad16; + /* we dont set this, because BSDI's uname used gethostname() instead */ + int bsdi_hostname; /* hostname on BSD/386 */ + + /* the actual string data is appended here */ + +} bsdi_si; +/* + * this data is appended to the end of the bsdi_si structure during copyout. + * The "char *" offsets are relative to the base of the bsdi_si struct. + * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings + * should not exceed the length of the buffer here... (or else!! :-) + */ +static char bsdi_strings[80]; /* It had better be less than this! 
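+				   Rough check: "FreeBSD\0" plus a release
+				   string and "i386\0" is a few dozen bytes,
+				   well inside the 80 available.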
*/ + +#ifndef _SYS_SYSPROTO_H_ +struct getkerninfo_args { + int op; + char *where; + size_t *size; + int arg; +}; +#endif + +int +ogetkerninfo(struct proc *p, struct getkerninfo_args *uap) +{ + int error, name[6]; + size_t size; + + switch (uap->op & 0xff00) { + + case KINFO_RT: + name[0] = CTL_NET; + name[1] = PF_ROUTE; + name[2] = 0; + name[3] = (uap->op & 0xff0000) >> 16; + name[4] = uap->op & 0xff; + name[5] = uap->arg; + error = userland_sysctl(p, name, 6, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_VNODE: + name[0] = CTL_KERN; + name[1] = KERN_VNODE; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_PROC: + name[0] = CTL_KERN; + name[1] = KERN_PROC; + name[2] = uap->op & 0xff; + name[3] = uap->arg; + error = userland_sysctl(p, name, 4, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_FILE: + name[0] = CTL_KERN; + name[1] = KERN_FILE; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_METER: + name[0] = CTL_VM; + name[1] = VM_METER; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_LOADAVG: + name[0] = CTL_VM; + name[1] = VM_LOADAVG; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_CLOCKRATE: + name[0] = CTL_KERN; + name[1] = KERN_CLOCKRATE; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); + break; + + case KINFO_BSDI_SYSINFO: { + /* + * this is pretty crude, but it's just enough for uname() + * from BSDI's 1.x libc to work. + * + * In particular, it doesn't return the same results when + * the supplied buffer is too small. BSDI's version apparently + * will return the amount copied, and set the *size to how + * much was needed. The emulation framework here isn't capable + * of that, so we just set both to the amount copied. + * BSDI's 2.x product apparently fails with ENOMEM in this + * scenario. + */ + + u_int needed; + u_int left; + char *s; + + bzero((char *)&bsdi_si, sizeof(bsdi_si)); + bzero(bsdi_strings, sizeof(bsdi_strings)); + + s = bsdi_strings; + + bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, ostype); + s += strlen(s) + 1; + + bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, osrelease); + s += strlen(s) + 1; + + bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, machine); + s += strlen(s) + 1; + + needed = sizeof(bsdi_si) + (s - bsdi_strings); + + if (uap->where == NULL) { + /* process is asking how much buffer to supply.. */ + size = needed; + error = 0; + break; + } + + + /* if too much buffer supplied, trim it down */ + if (size > needed) + size = needed; + + /* how much of the buffer is remaining */ + left = size; + + if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0) + break; + + /* is there any point in continuing? 
*/ + if (left > sizeof(bsdi_si)) { + left -= sizeof(bsdi_si); + error = copyout(&bsdi_strings, + uap->where + sizeof(bsdi_si), left); + } + break; + } + + default: + return (EOPNOTSUPP); + } + if (error) + return (error); + p->p_retval[0] = size; + if (uap->size) + error = copyout((caddr_t)&size, (caddr_t)uap->size, + sizeof(size)); + return (error); +} +#endif /* COMPAT_43 */ diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c new file mode 100644 index 0000000..2ea378f --- /dev/null +++ b/sys/kern/kern_tc.c @@ -0,0 +1,870 @@ +/*- + * Copyright (c) 1997, 1998 Poul-Henning Kamp <phk@FreeBSD.org> + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_clock.c,v 1.85 1998/11/23 09:58:53 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/dkstat.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/timex.h> +#include <vm/vm.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/sysctl.h> + +#include <machine/cpu.h> +#include <machine/limits.h> + +#ifdef GPROF +#include <sys/gmon.h> +#endif + +#if defined(SMP) && defined(BETTER_CLOCK) +#include <machine/smp.h> +#endif + +/* This is where the NTIMECOUNTER option hangs out */ +#include "opt_ntp.h" + +/* + * Number of timecounters used to implement stable storage + */ +#ifndef NTIMECOUNTER +#define NTIMECOUNTER 5 +#endif + +static MALLOC_DEFINE(M_TIMECOUNTER, "timecounter", + "Timecounter stable storage"); + +static void initclocks __P((void *dummy)); +SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) + +static void tco_forward __P((int force)); +static void tco_setscales __P((struct timecounter *tc)); +static __inline unsigned tco_delta __P((struct timecounter *tc)); + +/* Some of these don't belong here, but it's easiest to concentrate them. */ +#if defined(SMP) && defined(BETTER_CLOCK) +long cp_time[CPUSTATES]; +#else +static long cp_time[CPUSTATES]; +#endif + +long tk_cancc; +long tk_nin; +long tk_nout; +long tk_rawcc; + +time_t time_second; + +/* + * Which update policy to use. + * 0 - every tick, bad hardware may fail with "calcru negative..." + * 1 - more resistent to the above hardware, but less efficient. + */ +static int tco_method; + +/* + * Implement a dummy timecounter which we can use until we get a real one + * in the air. This allows the console and other early stuff to use + * timeservices. + */ + +static unsigned +dummy_get_timecount(struct timecounter *tc) +{ + static unsigned now; + return (++now); +} + +static struct timecounter dummy_timecounter = { + dummy_get_timecount, + 0, + ~0u, + 1000000, + "dummy" +}; + +struct timecounter *timecounter = &dummy_timecounter; + +/* + * Clock handling routines. + * + * This code is written to operate with two timers that run independently of + * each other. + * + * The main timer, running hz times per second, is used to trigger interval + * timers, timeouts and rescheduling as needed. + * + * The second timer handles kernel and user profiling, + * and does resource use estimation. If the second timer is programmable, + * it is randomized to avoid aliasing between the two clocks. For example, + * the randomization prevents an adversary from always giving up the cpu + * just before its quantum expires. Otherwise, it would never accumulate + * cpu ticks. The mean frequency of the second timer is stathz. + * + * If no second timer exists, stathz will be zero; in this case we drive + * profiling and statistics off the main clock. This WILL NOT be accurate; + * do not do it unless absolutely necessary. + * + * The statistics clock may (or may not) be run at a higher rate while + * profiling. This profile clock runs at profhz. We require that profhz + * be an integral multiple of stathz. + * + * If the statistics clock is running fast, it must be divided by the ratio + * profhz/stathz for statistics. (For profiling, every tick counts.) 
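+ *
+ * For example (illustrative numbers): with stathz = 128 and profhz = 1024,
+ * psratio is 8; while profiling, statclock() runs at 1024 Hz but only every
+ * 8th invocation (when pscnt reaches 0) is charged to the cp_time[] and
+ * per-process statistics.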
+ * + * Time-of-day is maintained using a "timecounter", which may or may + * not be related to the hardware generating the above mentioned + * interrupts. + */ + +int stathz; +int profhz; +static int profprocs; +int ticks; +static int psdiv, pscnt; /* prof => stat divider */ +int psratio; /* ratio: prof / stat */ + +/* + * Initialize clock frequencies and start both clocks running. + */ +/* ARGSUSED*/ +static void +initclocks(dummy) + void *dummy; +{ + register int i; + + /* + * Set divisors to 1 (normal case) and let the machine-specific + * code do its bit. + */ + psdiv = pscnt = 1; + cpu_initclocks(); + + /* + * Compute profhz/stathz, and fix profhz if needed. + */ + i = stathz ? stathz : hz; + if (profhz == 0) + profhz = i; + psratio = profhz / i; +} + +/* + * The real-time timer, interrupting hz times per second. + */ +void +hardclock(frame) + register struct clockframe *frame; +{ + register struct proc *p; + + p = curproc; + if (p) { + register struct pstats *pstats; + + /* + * Run current process's virtual and profile time, as needed. + */ + pstats = p->p_stats; + if (CLKF_USERMODE(frame) && + timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) + psignal(p, SIGVTALRM); + if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) + psignal(p, SIGPROF); + } + +#if defined(SMP) && defined(BETTER_CLOCK) + forward_hardclock(pscnt); +#endif + + /* + * If no separate statistics clock is available, run it from here. + */ + if (stathz == 0) + statclock(frame); + + tco_forward(0); + ticks++; + + /* + * Process callouts at a very low cpu priority, so we don't keep the + * relatively high clock interrupt priority any longer than necessary. + */ + if (TAILQ_FIRST(&callwheel[ticks & callwheelmask]) != NULL) { + if (CLKF_BASEPRI(frame)) { + /* + * Save the overhead of a software interrupt; + * it will happen as soon as we return, so do it now. + */ + (void)splsoftclock(); + softclock(); + } else + setsoftclock(); + } else if (softticks + 1 == ticks) + ++softticks; +} + +/* + * Compute number of ticks in the specified amount of time. + */ +int +tvtohz(tv) + struct timeval *tv; +{ + register unsigned long ticks; + register long sec, usec; + + /* + * If the number of usecs in the whole seconds part of the time + * difference fits in a long, then the total number of usecs will + * fit in an unsigned long. Compute the total and convert it to + * ticks, rounding up and adding 1 to allow for the current tick + * to expire. Rounding also depends on unsigned long arithmetic + * to avoid overflow. + * + * Otherwise, if the number of ticks in the whole seconds part of + * the time difference fits in a long, then convert the parts to + * ticks separately and add, using similar rounding methods and + * overflow avoidance. This method would work in the previous + * case but it is slightly slower and assumes that hz is integral. + * + * Otherwise, round the time difference down to the maximum + * representable value. + * + * If ints have 32 bits, then the maximum value for any timeout in + * 10ms ticks is 248 days. 
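+ *
+ * (Check: 2^31 - 1 ticks of 10ms is about 21,474,836 seconds, i.e. roughly
+ * 248.5 days.)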
+ */ + sec = tv->tv_sec; + usec = tv->tv_usec; + if (usec < 0) { + sec--; + usec += 1000000; + } + if (sec < 0) { +#ifdef DIAGNOSTIC + if (usec > 0) { + sec++; + usec -= 1000000; + } + printf("tvotohz: negative time difference %ld sec %ld usec\n", + sec, usec); +#endif + ticks = 1; + } else if (sec <= LONG_MAX / 1000000) + ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) + / tick + 1; + else if (sec <= LONG_MAX / hz) + ticks = sec * hz + + ((unsigned long)usec + (tick - 1)) / tick + 1; + else + ticks = LONG_MAX; + if (ticks > INT_MAX) + ticks = INT_MAX; + return ((int)ticks); +} + +/* + * Start profiling on a process. + * + * Kernel profiling passes proc0 which never exits and hence + * keeps the profile clock running constantly. + */ +void +startprofclock(p) + register struct proc *p; +{ + int s; + + if ((p->p_flag & P_PROFIL) == 0) { + p->p_flag |= P_PROFIL; + if (++profprocs == 1 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = psratio; + setstatclockrate(profhz); + splx(s); + } + } +} + +/* + * Stop profiling on a process. + */ +void +stopprofclock(p) + register struct proc *p; +{ + int s; + + if (p->p_flag & P_PROFIL) { + p->p_flag &= ~P_PROFIL; + if (--profprocs == 0 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = 1; + setstatclockrate(stathz); + splx(s); + } + } +} + +/* + * Statistics clock. Grab profile sample, and if divider reaches 0, + * do process and kernel statistics. + */ +void +statclock(frame) + register struct clockframe *frame; +{ +#ifdef GPROF + register struct gmonparam *g; + int i; +#endif + register struct proc *p; + struct pstats *pstats; + long rss; + struct rusage *ru; + struct vmspace *vm; + + if (curproc != NULL && CLKF_USERMODE(frame)) { + p = curproc; + if (p->p_flag & P_PROFIL) + addupc_intr(p, CLKF_PC(frame), 1); +#if defined(SMP) && defined(BETTER_CLOCK) + if (stathz != 0) + forward_statclock(pscnt); +#endif + if (--pscnt > 0) + return; + /* + * Came from user mode; CPU was in user state. + * If this process is being profiled record the tick. + */ + p->p_uticks++; + if (p->p_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + } else { +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = CLKF_PC(frame) - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif +#if defined(SMP) && defined(BETTER_CLOCK) + if (stathz != 0) + forward_statclock(pscnt); +#endif + if (--pscnt > 0) + return; + /* + * Came from kernel mode, so we were: + * - handling an interrupt, + * - doing syscall or trap work on behalf of the current + * user process, or + * - spinning in the idle loop. + * Whichever it is, charge the time as appropriate. + * Note that we charge interrupts to the current process, + * regardless of whether they are ``for'' that process, + * so that we know how much of its real time was spent + * in ``non-process'' (i.e., interrupt) work. + */ + p = curproc; + if (CLKF_INTR(frame)) { + if (p != NULL) + p->p_iticks++; + cp_time[CP_INTR]++; + } else if (p != NULL) { + p->p_sticks++; + cp_time[CP_SYS]++; + } else + cp_time[CP_IDLE]++; + } + pscnt = psdiv; + + /* + * We maintain statistics shown by user-level statistics + * programs: the amount of time in each cpu state. + */ + + /* + * We adjust the priority of the current process. The priority of + * a process gets worse as it accumulates CPU time. The cpu usage + * estimator (p_estcpu) is increased here. 
The formula for computing + * priorities (in kern_synch.c) will compute a different value each + * time p_estcpu increases by 4. The cpu usage estimator ramps up + * quite quickly when the process is running (linearly), and decays + * away exponentially, at a rate which is proportionally slower when + * the system is busy. The basic principal is that the system will + * 90% forget that the process used a lot of CPU time in 5 * loadav + * seconds. This causes the system to favor processes which haven't + * run much recently, and to round-robin among other processes. + */ + if (p != NULL) { + p->p_cpticks++; + if (++p->p_estcpu == 0) + p->p_estcpu--; + if ((p->p_estcpu & 3) == 0) { + resetpriority(p); + if (p->p_priority >= PUSER) + p->p_priority = p->p_usrpri; + } + + /* Update resource usage integrals and maximums. */ + if ((pstats = p->p_stats) != NULL && + (ru = &pstats->p_ru) != NULL && + (vm = p->p_vmspace) != NULL) { + ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024; + ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024; + ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024; + rss = vm->vm_pmap.pm_stats.resident_count * + PAGE_SIZE / 1024; + if (ru->ru_maxrss < rss) + ru->ru_maxrss = rss; + } + } +} + +/* + * Return information about system clocks. + */ +static int +sysctl_kern_clockrate SYSCTL_HANDLER_ARGS +{ + struct clockinfo clkinfo; + /* + * Construct clockinfo structure. + */ + clkinfo.hz = hz; + clkinfo.tick = tick; + clkinfo.tickadj = tickadj; + clkinfo.profhz = profhz; + clkinfo.stathz = stathz ? stathz : hz; + return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); +} + +SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, + 0, 0, sysctl_kern_clockrate, "S,clockinfo",""); + +static __inline unsigned +tco_delta(struct timecounter *tc) +{ + + return ((tc->tc_get_timecount(tc) - tc->tc_offset_count) & + tc->tc_counter_mask); +} + +/* + * We have four functions for looking at the clock, two for microseconds + * and two for nanoseconds. For each there is fast but less precise + * version "get{nano|micro}time" which will return a time which is up + * to 1/HZ previous to the call, whereas the raw version "{nano|micro}time" + * will return a timestamp which is as precise as possible. 
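+ *
+ * Rule of thumb following from the above: with the default update policy
+ * the get*() variants merely copy the timestamp cached in the current
+ * timecounter and are cheap, but may lag by up to 1/HZ; the raw variants
+ * read the hardware counter through tco_delta() on every call.  Prefer the
+ * get*() forms unless the extra precision is actually needed.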
+ */ + +void +getmicrotime(struct timeval *tvp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + *tvp = tc->tc_microtime; + } else { + microtime(tvp); + } +} + +void +getnanotime(struct timespec *tsp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + *tsp = tc->tc_nanotime; + } else { + nanotime(tsp); + } +} + +void +microtime(struct timeval *tv) +{ + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + tv->tv_sec = tc->tc_offset_sec; + tv->tv_usec = tc->tc_offset_micro; + tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32; + tv->tv_usec += boottime.tv_usec; + tv->tv_sec += boottime.tv_sec; + while (tv->tv_usec >= 1000000) { + tv->tv_usec -= 1000000; + tv->tv_sec++; + } +} + +void +nanotime(struct timespec *ts) +{ + unsigned count; + u_int64_t delta; + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + ts->tv_sec = tc->tc_offset_sec; + count = tco_delta(tc); + delta = tc->tc_offset_nano; + delta += ((u_int64_t)count * tc->tc_scale_nano_f); + delta >>= 32; + delta += ((u_int64_t)count * tc->tc_scale_nano_i); + delta += boottime.tv_usec * 1000; + ts->tv_sec += boottime.tv_sec; + while (delta >= 1000000000) { + delta -= 1000000000; + ts->tv_sec++; + } + ts->tv_nsec = delta; +} + +void +timecounter_timespec(unsigned count, struct timespec *ts) +{ + u_int64_t delta; + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + ts->tv_sec = tc->tc_offset_sec; + count -= tc->tc_offset_count; + count &= tc->tc_counter_mask; + delta = tc->tc_offset_nano; + delta += ((u_int64_t)count * tc->tc_scale_nano_f); + delta >>= 32; + delta += ((u_int64_t)count * tc->tc_scale_nano_i); + delta += boottime.tv_usec * 1000; + ts->tv_sec += boottime.tv_sec; + while (delta >= 1000000000) { + delta -= 1000000000; + ts->tv_sec++; + } + ts->tv_nsec = delta; +} + +void +getmicrouptime(struct timeval *tvp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + tvp->tv_sec = tc->tc_offset_sec; + tvp->tv_usec = tc->tc_offset_micro; + } else { + microuptime(tvp); + } +} + +void +getnanouptime(struct timespec *tsp) +{ + struct timecounter *tc; + + if (!tco_method) { + tc = timecounter; + tsp->tv_sec = tc->tc_offset_sec; + tsp->tv_nsec = tc->tc_offset_nano >> 32; + } else { + nanouptime(tsp); + } +} + +void +microuptime(struct timeval *tv) +{ + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + tv->tv_sec = tc->tc_offset_sec; + tv->tv_usec = tc->tc_offset_micro; + tv->tv_usec += ((u_int64_t)tco_delta(tc) * tc->tc_scale_micro) >> 32; + if (tv->tv_usec >= 1000000) { + tv->tv_usec -= 1000000; + tv->tv_sec++; + } +} + +void +nanouptime(struct timespec *ts) +{ + unsigned count; + u_int64_t delta; + struct timecounter *tc; + + tc = (struct timecounter *)timecounter; + ts->tv_sec = tc->tc_offset_sec; + count = tco_delta(tc); + delta = tc->tc_offset_nano; + delta += ((u_int64_t)count * tc->tc_scale_nano_f); + delta >>= 32; + delta += ((u_int64_t)count * tc->tc_scale_nano_i); + if (delta >= 1000000000) { + delta -= 1000000000; + ts->tv_sec++; + } + ts->tv_nsec = delta; +} + +static void +tco_setscales(struct timecounter *tc) +{ + u_int64_t scale; + + scale = 1000000000LL << 32; + if (tc->tc_adjustment > 0) + scale += (tc->tc_adjustment * 1000LL) << 10; + else + scale -= (-tc->tc_adjustment * 1000LL) << 10; + scale /= tc->tc_frequency; + tc->tc_scale_micro = scale / 1000; + tc->tc_scale_nano_f = scale & 0xffffffff; + tc->tc_scale_nano_i = scale >> 32; +} + +void +init_timecounter(struct 
timecounter *tc) +{ + struct timespec ts1; + struct timecounter *t1, *t2, *t3; + int i; + + tc->tc_adjustment = 0; + tco_setscales(tc); + tc->tc_offset_count = tc->tc_get_timecount(tc); + tc->tc_tweak = tc; + MALLOC(t1, struct timecounter *, sizeof *t1, M_TIMECOUNTER, M_WAITOK); + *t1 = *tc; + t2 = t1; + for (i = 1; i < NTIMECOUNTER; i++) { + MALLOC(t3, struct timecounter *, sizeof *t3, + M_TIMECOUNTER, M_WAITOK); + *t3 = *tc; + t3->tc_other = t2; + t2 = t3; + } + t1->tc_other = t3; + tc = t1; + + printf("Timecounter \"%s\" frequency %lu Hz\n", + tc->tc_name, (u_long)tc->tc_frequency); + + /* XXX: For now always start using the counter. */ + tc->tc_offset_count = tc->tc_get_timecount(tc); + nanouptime(&ts1); + tc->tc_offset_nano = (u_int64_t)ts1.tv_nsec << 32; + tc->tc_offset_micro = ts1.tv_nsec / 1000; + tc->tc_offset_sec = ts1.tv_sec; + timecounter = tc; +} + +void +set_timecounter(struct timespec *ts) +{ + struct timespec ts2; + + nanouptime(&ts2); + boottime.tv_sec = ts->tv_sec - ts2.tv_sec; + boottime.tv_usec = (ts->tv_nsec - ts2.tv_nsec) / 1000; + if (boottime.tv_usec < 0) { + boottime.tv_usec += 1000000; + boottime.tv_sec--; + } + /* fiddle all the little crinkly bits around the fiords... */ + tco_forward(1); +} + + +#if 0 /* Currently unused */ +void +switch_timecounter(struct timecounter *newtc) +{ + int s; + struct timecounter *tc; + struct timespec ts; + + s = splclock(); + tc = timecounter; + if (newtc == tc || newtc == tc->tc_other) { + splx(s); + return; + } + nanouptime(&ts); + newtc->tc_offset_sec = ts.tv_sec; + newtc->tc_offset_nano = (u_int64_t)ts.tv_nsec << 32; + newtc->tc_offset_micro = ts.tv_nsec / 1000; + newtc->tc_offset_count = newtc->tc_get_timecount(newtc); + timecounter = newtc; + splx(s); +} +#endif + +static struct timecounter * +sync_other_counter(void) +{ + struct timecounter *tc, *tcn, *tco; + unsigned delta; + + tco = timecounter; + tc = tco->tc_other; + tcn = tc->tc_other; + *tc = *tco; + tc->tc_other = tcn; + delta = tco_delta(tc); + tc->tc_offset_count += delta; + tc->tc_offset_count &= tc->tc_counter_mask; + tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_f; + tc->tc_offset_nano += (u_int64_t)delta * tc->tc_scale_nano_i << 32; + return (tc); +} + +static void +tco_forward(int force) +{ + struct timecounter *tc, *tco; + + tco = timecounter; + tc = sync_other_counter(); + /* + * We may be inducing a tiny error here, the tc_poll_pps() may + * process a latched count which happens after the tco_delta() + * in sync_other_counter(), which would extend the previous + * counters parameters into the domain of this new one. + * Since the timewindow is very small for this, the error is + * going to be only a few weenieseconds (as Dave Mills would + * say), so lets just not talk more about it, OK ? 
+ */ + if (tco->tc_poll_pps) + tco->tc_poll_pps(tco); + if (timedelta != 0) { + tc->tc_offset_nano += (u_int64_t)(tickdelta * 1000) << 32; + timedelta -= tickdelta; + force++; + } + + while (tc->tc_offset_nano >= 1000000000ULL << 32) { + tc->tc_offset_nano -= 1000000000ULL << 32; + tc->tc_offset_sec++; + tc->tc_frequency = tc->tc_tweak->tc_frequency; + tc->tc_adjustment = tc->tc_tweak->tc_adjustment; + ntp_update_second(tc); /* XXX only needed if xntpd runs */ + tco_setscales(tc); + force++; + } + + if (tco_method && !force) + return; + + tc->tc_offset_micro = (tc->tc_offset_nano / 1000) >> 32; + + /* Figure out the wall-clock time */ + tc->tc_nanotime.tv_sec = tc->tc_offset_sec + boottime.tv_sec; + tc->tc_nanotime.tv_nsec = + (tc->tc_offset_nano >> 32) + boottime.tv_usec * 1000; + tc->tc_microtime.tv_usec = tc->tc_offset_micro + boottime.tv_usec; + if (tc->tc_nanotime.tv_nsec >= 1000000000) { + tc->tc_nanotime.tv_nsec -= 1000000000; + tc->tc_microtime.tv_usec -= 1000000; + tc->tc_nanotime.tv_sec++; + } + time_second = tc->tc_microtime.tv_sec = tc->tc_nanotime.tv_sec; + + timecounter = tc; +} + +static int +sysctl_kern_timecounter_frequency SYSCTL_HANDLER_ARGS +{ + + return (sysctl_handle_opaque(oidp, + &timecounter->tc_tweak->tc_frequency, + sizeof(timecounter->tc_tweak->tc_frequency), req)); +} + +static int +sysctl_kern_timecounter_adjustment SYSCTL_HANDLER_ARGS +{ + + return (sysctl_handle_opaque(oidp, + &timecounter->tc_tweak->tc_adjustment, + sizeof(timecounter->tc_tweak->tc_adjustment), req)); +} + +SYSCTL_NODE(_kern, OID_AUTO, timecounter, CTLFLAG_RW, 0, ""); + +SYSCTL_INT(_kern_timecounter, KERN_ARGMAX, method, CTLFLAG_RW, &tco_method, 0, + "This variable determines the method used for updating timecounters. " + "If the default algorithm (0) fails with \"calcru negative...\" messages " + "try the alternate algorithm (1) which handles bad hardware better." + +); + +SYSCTL_PROC(_kern_timecounter, OID_AUTO, frequency, CTLTYPE_INT | CTLFLAG_RW, + 0, sizeof(u_int), sysctl_kern_timecounter_frequency, "I", ""); + +SYSCTL_PROC(_kern_timecounter, OID_AUTO, adjustment, CTLTYPE_INT | CTLFLAG_RW, + 0, sizeof(int), sysctl_kern_timecounter_adjustment, "I", ""); diff --git a/sys/kern/kern_threads.c b/sys/kern/kern_threads.c new file mode 100644 index 0000000..57e8d96 --- /dev/null +++ b/sys/kern/kern_threads.c @@ -0,0 +1,154 @@ +/* + * + * Portions of this code was derived from the file kern_fork.c and as such + * is subject to the copyrights below. + * + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * Copyright (c) 1996 Douglas Santry + * + * This code is subject to the beer copyright. If I chance to meet you in a + * bar and this code helped you in some way, you owe me a beer. Only + * in Germany will I accept domestic beer. This code may or may not work + * and I certainly make no claims as to its fitness for *any* purpose. + * + * $Id: kern_threads.c,v 1.9 1998/10/25 17:44:51 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/sysproto.h> + +/* + * Low level support for sleep/wakeup paradigm + * If a timeout is specified: + * returns 0 if wakeup + * returns EAGAIN if timed out + * returns EINVAL if error + * + * If a timeout is not specified: + * + * returns time waiting in ticks. 
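+ *
+ * Illustrative usage (callers are hypothetical): one thread calls
+ * thr_sleep() with a timeout while another thread sharing the same
+ * p_leader later calls thr_wakeup() with the sleeper's pid; if the wakeup
+ * arrives first, p_wakeup is already non-zero and thr_sleep() returns
+ * without blocking.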
+ */ +int +thr_sleep(struct proc *p, struct thr_sleep_args *uap) { + int sleepstart; + struct timespec ts; + struct timeval atv; + int error, timo; + + timo = 0; + if (uap->timeout != 0) { + /* + * Get timespec struct + */ + if (error = copyin((caddr_t) uap->timeout, (caddr_t) &ts, sizeof ts)) { + p->p_wakeup = 0; + return error; + } + if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) { + p->p_wakeup = 0; + return (EINVAL); + } + TIMESPEC_TO_TIMEVAL(&atv, &ts); + if (itimerfix(&atv)) { + p->p_wakeup = 0; + return (EINVAL); + } + timo = tvtohz(&atv); + } + + p->p_retval[0] = 0; + if (p->p_wakeup == 0) { + sleepstart = ticks; + p->p_flag |= P_SINTR; + error = tsleep(p, PRIBIO, "thrslp", timo); + p->p_flag &= ~P_SINTR; + if (error == EWOULDBLOCK) { + p->p_wakeup = 0; + p->p_retval[0] = EAGAIN; + return 0; + } + if (uap->timeout == 0) + p->p_retval[0] = ticks - sleepstart; + } + p->p_wakeup = 0; + return (0); +} + +int +thr_wakeup(struct proc *p, struct thr_wakeup_args *uap) { + struct proc *pSlave = p->p_leader; + + while(pSlave && (pSlave->p_pid != uap->pid)) + pSlave = pSlave->p_peers; + + if(pSlave == 0) { + p->p_retval[0] = ESRCH; + return(0); + } + + pSlave->p_wakeup++; + if((pSlave->p_stat == SSLEEP) && (pSlave->p_wchan == pSlave)) { + wakeup(pSlave); + return(0); + } + + p->p_retval[0] = EAGAIN; + return 0; +} + +/* + * General purpose yield system call + */ +int +yield(struct proc *p, struct yield_args *uap) { + int s; + + p->p_retval[0] = 0; + + s = splhigh(); + p->p_priority = MAXPRI; + setrunqueue(p); + mi_switch(); + splx(s); + + return(0); +} + diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c new file mode 100644 index 0000000..2bd17bb --- /dev/null +++ b/sys/kern/kern_time.c @@ -0,0 +1,644 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_time.c 8.1 (Berkeley) 6/10/93 + * $Id: kern_time.c,v 1.58 1998/06/09 13:10:53 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/sysproto.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/proc.h> +#include <sys/time.h> +#include <sys/vnode.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> + +struct timezone tz; + +/* + * Time of day and interval timer support. + * + * These routines provide the kernel entry points to get and set + * the time-of-day and per-process interval timers. Subroutines + * here provide support for adding and subtracting timeval structures + * and decrementing interval timers, optionally reloading the interval + * timers when they expire. + */ + +static int nanosleep1 __P((struct proc *p, struct timespec *rqt, + struct timespec *rmt)); +static int settime __P((struct timeval *)); +static void timevalfix __P((struct timeval *)); +static void no_lease_updatetime __P((int)); + +static void +no_lease_updatetime(deltat) + int deltat; +{ +} + +void (*lease_updatetime) __P((int)) = no_lease_updatetime; + +static int +settime(tv) + struct timeval *tv; +{ + struct timeval delta, tv1; + struct timespec ts; + int s; + + s = splclock(); + microtime(&tv1); + delta = *tv; + timevalsub(&delta, &tv1); + + /* + * If the system is secure, we do not allow the time to be + * set to an earlier value (it may be slowed using adjtime, + * but not set back). This feature prevent interlopers from + * setting arbitrary time stamps on files. 
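+ *
+ * Concretely: at securelevel > 1 a settimeofday() or clock_settime() that
+ * would step the clock backwards fails with EPERM, while adjtime() (below)
+ * can still slew the clock gradually.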
+ */ + if (delta.tv_sec < 0 && securelevel > 1) { + splx(s); + return (EPERM); + } + + ts.tv_sec = tv->tv_sec; + ts.tv_nsec = tv->tv_usec * 1000; + set_timecounter(&ts); + (void) splsoftclock(); + lease_updatetime(delta.tv_sec); + splx(s); + resettodr(); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct clock_gettime_args { + clockid_t clock_id; + struct timespec *tp; +}; +#endif + +/* ARGSUSED */ +int +clock_gettime(p, uap) + struct proc *p; + struct clock_gettime_args *uap; +{ + struct timespec ats; + + if (SCARG(uap, clock_id) != CLOCK_REALTIME) + return (EINVAL); + nanotime(&ats); + return (copyout(&ats, SCARG(uap, tp), sizeof(ats))); +} + +#ifndef _SYS_SYSPROTO_H_ +struct clock_settime_args { + clockid_t clock_id; + const struct timespec *tp; +}; +#endif + +/* ARGSUSED */ +int +clock_settime(p, uap) + struct proc *p; + struct clock_settime_args *uap; +{ + struct timeval atv; + struct timespec ats; + int error; + + if ((error = suser(p->p_ucred, &p->p_acflag)) != 0) + return (error); + if (SCARG(uap, clock_id) != CLOCK_REALTIME) + return (EINVAL); + if ((error = copyin(SCARG(uap, tp), &ats, sizeof(ats))) != 0) + return (error); + if (ats.tv_nsec < 0 || ats.tv_nsec >= 1000000000) + return (EINVAL); + /* XXX Don't convert nsec->usec and back */ + TIMESPEC_TO_TIMEVAL(&atv, &ats); + if ((error = settime(&atv))) + return (error); + return (0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct clock_getres_args { + clockid_t clock_id; + struct timespec *tp; +}; +#endif + +int +clock_getres(p, uap) + struct proc *p; + struct clock_getres_args *uap; +{ + struct timespec ts; + int error; + + if (SCARG(uap, clock_id) != CLOCK_REALTIME) + return (EINVAL); + error = 0; + if (SCARG(uap, tp)) { + ts.tv_sec = 0; + ts.tv_nsec = 1000000000 / timecounter->tc_frequency; + error = copyout(&ts, SCARG(uap, tp), sizeof(ts)); + } + return (error); +} + +static int nanowait; + +static int +nanosleep1(p, rqt, rmt) + struct proc *p; + struct timespec *rqt, *rmt; +{ + struct timespec ts, ts2, ts3; + struct timeval tv; + int error; + + if (rqt->tv_nsec < 0 || rqt->tv_nsec >= 1000000000) + return (EINVAL); + if (rqt->tv_sec < 0 || rqt->tv_sec == 0 && rqt->tv_nsec == 0) + return (0); + getnanouptime(&ts); + timespecadd(&ts, rqt); + TIMESPEC_TO_TIMEVAL(&tv, rqt); + for (;;) { + error = tsleep(&nanowait, PWAIT | PCATCH, "nanslp", + tvtohz(&tv)); + getnanouptime(&ts2); + if (error != EWOULDBLOCK) { + if (error == ERESTART) + error = EINTR; + if (rmt != NULL) { + timespecsub(&ts, &ts2); + if (ts.tv_sec < 0) + timespecclear(&ts); + *rmt = ts; + } + return (error); + } + if (timespeccmp(&ts2, &ts, >=)) + return (0); + ts3 = ts; + timespecsub(&ts3, &ts2); + TIMESPEC_TO_TIMEVAL(&tv, &ts3); + } +} + +#ifndef _SYS_SYSPROTO_H_ +struct nanosleep_args { + struct timespec *rqtp; + struct timespec *rmtp; +}; +#endif + +/* ARGSUSED */ +int +nanosleep(p, uap) + struct proc *p; + struct nanosleep_args *uap; +{ + struct timespec rmt, rqt; + int error, error2; + + error = copyin(SCARG(uap, rqtp), &rqt, sizeof(rqt)); + if (error) + return (error); + if (SCARG(uap, rmtp)) + if (!useracc((caddr_t)SCARG(uap, rmtp), sizeof(rmt), B_WRITE)) + return (EFAULT); + error = nanosleep1(p, &rqt, &rmt); + if (error && SCARG(uap, rmtp)) { + error2 = copyout(&rmt, SCARG(uap, rmtp), sizeof(rmt)); + if (error2) /* XXX shouldn't happen, did useracc() above */ + return (error2); + } + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct gettimeofday_args { + struct timeval *tp; + struct timezone *tzp; +}; +#endif +/* ARGSUSED */ +int +gettimeofday(p, uap) + 
struct proc *p; + register struct gettimeofday_args *uap; +{ + struct timeval atv; + int error = 0; + + if (uap->tp) { + microtime(&atv); + if ((error = copyout((caddr_t)&atv, (caddr_t)uap->tp, + sizeof (atv)))) + return (error); + } + if (uap->tzp) + error = copyout((caddr_t)&tz, (caddr_t)uap->tzp, + sizeof (tz)); + return (error); +} + +#ifndef _SYS_SYSPROTO_H_ +struct settimeofday_args { + struct timeval *tv; + struct timezone *tzp; +}; +#endif +/* ARGSUSED */ +int +settimeofday(p, uap) + struct proc *p; + struct settimeofday_args *uap; +{ + struct timeval atv; + struct timezone atz; + int error; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + /* Verify all parameters before changing time. */ + if (uap->tv) { + if ((error = copyin((caddr_t)uap->tv, (caddr_t)&atv, + sizeof(atv)))) + return (error); + if (atv.tv_usec < 0 || atv.tv_usec >= 1000000) + return (EINVAL); + } + if (uap->tzp && + (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz)))) + return (error); + if (uap->tv && (error = settime(&atv))) + return (error); + if (uap->tzp) + tz = atz; + return (0); +} + +int tickdelta; /* current clock skew, us. per tick */ +long timedelta; /* unapplied time correction, us. */ +static long bigadj = 1000000; /* use 10x skew above bigadj us. */ + +#ifndef _SYS_SYSPROTO_H_ +struct adjtime_args { + struct timeval *delta; + struct timeval *olddelta; +}; +#endif +/* ARGSUSED */ +int +adjtime(p, uap) + struct proc *p; + register struct adjtime_args *uap; +{ + struct timeval atv; + register long ndelta, ntickdelta, odelta; + int s, error; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + if ((error = + copyin((caddr_t)uap->delta, (caddr_t)&atv, sizeof(struct timeval)))) + return (error); + + /* + * Compute the total correction and the rate at which to apply it. + * Round the adjustment down to a whole multiple of the per-tick + * delta, so that after some number of incremental changes in + * hardclock(), tickdelta will become zero, lest the correction + * overshoot and start taking us away from the desired final time. + */ + ndelta = atv.tv_sec * 1000000 + atv.tv_usec; + if (ndelta > bigadj || ndelta < -bigadj) + ntickdelta = 10 * tickadj; + else + ntickdelta = tickadj; + if (ndelta % ntickdelta) + ndelta = ndelta / ntickdelta * ntickdelta; + + /* + * To make hardclock()'s job easier, make the per-tick delta negative + * if we want time to run slower; then hardclock can simply compute + * tick + tickdelta, and subtract tickdelta from timedelta. + */ + if (ndelta < 0) + ntickdelta = -ntickdelta; + s = splclock(); + odelta = timedelta; + timedelta = ndelta; + tickdelta = ntickdelta; + splx(s); + + if (uap->olddelta) { + atv.tv_sec = odelta / 1000000; + atv.tv_usec = odelta % 1000000; + (void) copyout((caddr_t)&atv, (caddr_t)uap->olddelta, + sizeof(struct timeval)); + } + return (0); +} + +/* + * Get value of an interval timer. The process virtual and + * profiling virtual time timers are kept in the p_stats area, since + * they can be swapped out. These are kept internally in the + * way they are specified externally: in time until they expire. + * + * The real time interval timer is kept in the process table slot + * for the process, and its value (it_value) is kept as an + * absolute time rather than as a delta, so that it is easy to keep + * periodic real-time signals from drifting. + * + * Virtual time timers are processed in the hardclock() routine of + * kern_clock.c. 
The real time timer is processed by a timeout + * routine, called from the softclock() routine. Since a callout + * may be delayed in real time due to interrupt processing in the system, + * it is possible for the real time timeout routine (realitexpire, given below), + * to be delayed in real time past when it is supposed to occur. It + * does not suffice, therefore, to reload the real timer .it_value from the + * real time timers .it_interval. Rather, we compute the next time in + * absolute time the timer should go off. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getitimer_args { + u_int which; + struct itimerval *itv; +}; +#endif +/* ARGSUSED */ +int +getitimer(p, uap) + struct proc *p; + register struct getitimer_args *uap; +{ + struct timeval ctv; + struct itimerval aitv; + int s; + + if (uap->which > ITIMER_PROF) + return (EINVAL); + s = splclock(); /* XXX still needed ? */ + if (uap->which == ITIMER_REAL) { + /* + * Convert from absolute to relative time in .it_value + * part of real time timer. If time for real time timer + * has passed return 0, else return difference between + * current time and time for the timer to go off. + */ + aitv = p->p_realtimer; + if (timevalisset(&aitv.it_value)) { + getmicrouptime(&ctv); + if (timevalcmp(&aitv.it_value, &ctv, <)) + timevalclear(&aitv.it_value); + else + timevalsub(&aitv.it_value, &ctv); + } + } else + aitv = p->p_stats->p_timer[uap->which]; + splx(s); + return (copyout((caddr_t)&aitv, (caddr_t)uap->itv, + sizeof (struct itimerval))); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setitimer_args { + u_int which; + struct itimerval *itv, *oitv; +}; +#endif +/* ARGSUSED */ +int +setitimer(p, uap) + struct proc *p; + register struct setitimer_args *uap; +{ + struct itimerval aitv; + struct timeval ctv; + register struct itimerval *itvp; + int s, error; + + if (uap->which > ITIMER_PROF) + return (EINVAL); + itvp = uap->itv; + if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv, + sizeof(struct itimerval)))) + return (error); + if ((uap->itv = uap->oitv) && + (error = getitimer(p, (struct getitimer_args *)uap))) + return (error); + if (itvp == 0) + return (0); + if (itimerfix(&aitv.it_value)) + return (EINVAL); + if (!timevalisset(&aitv.it_value)) + timevalclear(&aitv.it_interval); + else if (itimerfix(&aitv.it_interval)) + return (EINVAL); + s = splclock(); /* XXX: still needed ? */ + if (uap->which == ITIMER_REAL) { + if (timevalisset(&p->p_realtimer.it_value)) + untimeout(realitexpire, (caddr_t)p, p->p_ithandle); + if (timevalisset(&aitv.it_value)) + p->p_ithandle = timeout(realitexpire, (caddr_t)p, + tvtohz(&aitv.it_value)); + getmicrouptime(&ctv); + timevaladd(&aitv.it_value, &ctv); + p->p_realtimer = aitv; + } else + p->p_stats->p_timer[uap->which] = aitv; + splx(s); + return (0); +} + +/* + * Real interval timer expired: + * send process whose timer expired an alarm signal. + * If time is not set up to reload, then just return. + * Else compute next time timer should go off which is > current time. + * This is where delay in processing this timeout causes multiple + * SIGALRM calls to be compressed into one. + * tvtohz() always adds 1 to allow for the time until the next clock + * interrupt being strictly less than 1 clock tick, but we don't want + * that here since we want to appear to be in sync with the clock + * interrupt even when we're delayed. 
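+ *
+ * Worked example: with an it_interval of 10 ticks, a timeout that fires 35
+ * ticks late advances it_value by it_interval four times before it again
+ * exceeds the current uptime, so the missed expirations collapse into the
+ * single SIGALRM realitexpire() sends; the "tvtohz(&ntv) - 1" below strips
+ * the extra tick tvtohz() adds.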
+ */ +void +realitexpire(arg) + void *arg; +{ + register struct proc *p; + struct timeval ctv, ntv; + int s; + + p = (struct proc *)arg; + psignal(p, SIGALRM); + if (!timevalisset(&p->p_realtimer.it_interval)) { + timevalclear(&p->p_realtimer.it_value); + return; + } + for (;;) { + s = splclock(); /* XXX: still neeeded ? */ + timevaladd(&p->p_realtimer.it_value, + &p->p_realtimer.it_interval); + getmicrouptime(&ctv); + if (timevalcmp(&p->p_realtimer.it_value, &ctv, >)) { + ntv = p->p_realtimer.it_value; + timevalsub(&ntv, &ctv); + p->p_ithandle = timeout(realitexpire, (caddr_t)p, + tvtohz(&ntv) - 1); + splx(s); + return; + } + splx(s); + } +} + +/* + * Check that a proposed value to load into the .it_value or + * .it_interval part of an interval timer is acceptable, and + * fix it to have at least minimal value (i.e. if it is less + * than the resolution of the clock, round it up.) + */ +int +itimerfix(tv) + struct timeval *tv; +{ + + if (tv->tv_sec < 0 || tv->tv_sec > 100000000 || + tv->tv_usec < 0 || tv->tv_usec >= 1000000) + return (EINVAL); + if (tv->tv_sec == 0 && tv->tv_usec != 0 && tv->tv_usec < tick) + tv->tv_usec = tick; + return (0); +} + +/* + * Decrement an interval timer by a specified number + * of microseconds, which must be less than a second, + * i.e. < 1000000. If the timer expires, then reload + * it. In this case, carry over (usec - old value) to + * reduce the value reloaded into the timer so that + * the timer does not drift. This routine assumes + * that it is called in a context where the timers + * on which it is operating cannot change in value. + */ +int +itimerdecr(itp, usec) + register struct itimerval *itp; + int usec; +{ + + if (itp->it_value.tv_usec < usec) { + if (itp->it_value.tv_sec == 0) { + /* expired, and already in next interval */ + usec -= itp->it_value.tv_usec; + goto expire; + } + itp->it_value.tv_usec += 1000000; + itp->it_value.tv_sec--; + } + itp->it_value.tv_usec -= usec; + usec = 0; + if (timevalisset(&itp->it_value)) + return (1); + /* expired, exactly at end of interval */ +expire: + if (timevalisset(&itp->it_interval)) { + itp->it_value = itp->it_interval; + itp->it_value.tv_usec -= usec; + if (itp->it_value.tv_usec < 0) { + itp->it_value.tv_usec += 1000000; + itp->it_value.tv_sec--; + } + } else + itp->it_value.tv_usec = 0; /* sec is already 0 */ + return (0); +} + +/* + * Add and subtract routines for timevals. + * N.B.: subtract routine doesn't deal with + * results which are before the beginning, + * it just gets very confused in this case. + * Caveat emptor. + */ +void +timevaladd(t1, t2) + struct timeval *t1, *t2; +{ + + t1->tv_sec += t2->tv_sec; + t1->tv_usec += t2->tv_usec; + timevalfix(t1); +} + +void +timevalsub(t1, t2) + struct timeval *t1, *t2; +{ + + t1->tv_sec -= t2->tv_sec; + t1->tv_usec -= t2->tv_usec; + timevalfix(t1); +} + +static void +timevalfix(t1) + struct timeval *t1; +{ + + if (t1->tv_usec < 0) { + t1->tv_sec--; + t1->tv_usec += 1000000; + } + if (t1->tv_usec >= 1000000) { + t1->tv_sec++; + t1->tv_usec -= 1000000; + } +} diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c new file mode 100644 index 0000000..278fcce --- /dev/null +++ b/sys/kern/kern_timeout.c @@ -0,0 +1,286 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. 
or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_timeout.c,v 1.54 1998/02/25 06:13:32 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/callout.h> +#include <sys/kernel.h> + +/* + * TODO: + * allocate more timeout table slots when table overflows. + */ + +/* Exported to machdep.c and/or kern_clock.c. */ +struct callout *callout; +struct callout_list callfree; +int callwheelsize, callwheelbits, callwheelmask; +struct callout_tailq *callwheel; +int softticks; /* Like ticks, but for softclock(). */ + +static struct callout *nextsoftcheck; /* Next callout to be checked. */ + +/* + * The callout mechanism is based on the work of Adam M. Costello and + * George Varghese, published in a technical report entitled "Redesigning + * the BSD Callout and Timer Facilities" and modified slightly for inclusion + * in FreeBSD by Justin T. Gibbs. The original work on the data structures + * used in this implementation was published by G.Varghese and A. Lauck in + * the paper "Hashed and Hierarchical Timing Wheels: Data Structures for + * the Efficient Implementation of a Timer Facility" in the Proceedings of + * the 11th ACM Annual Symposium on Operating Systems Principles, + * Austin, Texas Nov 1987. + */ + +/* + * Software (low priority) clock interrupt. + * Run periodic events from timeout queue. + */ +void +softclock() +{ + register struct callout *c; + register struct callout_tailq *bucket; + register int s; + register int curticks; + register int steps; /* #steps since we last allowed interrupts */ + +#ifndef MAX_SOFTCLOCK_STEPS +#define MAX_SOFTCLOCK_STEPS 100 /* Maximum allowed value of steps. 
*/ +#endif /* MAX_SOFTCLOCK_STEPS */ + + steps = 0; + s = splhigh(); + while (softticks != ticks) { + softticks++; + /* + * softticks may be modified by hard clock, so cache + * it while we work on a given bucket. + */ + curticks = softticks; + bucket = &callwheel[curticks & callwheelmask]; + c = TAILQ_FIRST(bucket); + while (c) { + if (c->c_time != curticks) { + c = TAILQ_NEXT(c, c_links.tqe); + ++steps; + if (steps >= MAX_SOFTCLOCK_STEPS) { + nextsoftcheck = c; + /* Give interrupts a chance. */ + splx(s); + s = splhigh(); + c = nextsoftcheck; + steps = 0; + } + } else { + void (*c_func)(void *); + void *c_arg; + + nextsoftcheck = TAILQ_NEXT(c, c_links.tqe); + TAILQ_REMOVE(bucket, c, c_links.tqe); + c_func = c->c_func; + c_arg = c->c_arg; + c->c_func = NULL; + SLIST_INSERT_HEAD(&callfree, c, c_links.sle); + splx(s); + c_func(c_arg); + s = splhigh(); + steps = 0; + c = nextsoftcheck; + } + } + } + nextsoftcheck = NULL; + splx(s); +} + +/* + * timeout -- + * Execute a function after a specified length of time. + * + * untimeout -- + * Cancel previous timeout function call. + * + * callout_handle_init -- + * Initialize a handle so that using it with untimeout is benign. + * + * See AT&T BCI Driver Reference Manual for specification. This + * implementation differs from that one in that although an + * identification value is returned from timeout, the original + * arguments to timeout as well as the identifier are used to + * identify entries for untimeout. + */ +struct callout_handle +timeout(ftn, arg, to_ticks) + timeout_t *ftn; + void *arg; + register int to_ticks; +{ + int s; + struct callout *new; + struct callout_handle handle; + + if (to_ticks <= 0) + to_ticks = 1; + + /* Lock out the clock. */ + s = splhigh(); + + /* Fill in the next free callout structure. */ + new = SLIST_FIRST(&callfree); + if (new == NULL) + /* XXX Attempt to malloc first */ + panic("timeout table full"); + + SLIST_REMOVE_HEAD(&callfree, c_links.sle); + new->c_arg = arg; + new->c_func = ftn; + new->c_time = ticks + to_ticks; + TAILQ_INSERT_TAIL(&callwheel[new->c_time & callwheelmask], + new, c_links.tqe); + + splx(s); + handle.callout = new; + return (handle); +} + +void +untimeout(ftn, arg, handle) + timeout_t *ftn; + void *arg; + struct callout_handle handle; +{ + register int s; + + /* + * Check for a handle that was initialized + * by callout_handle_init, but never used + * for a real timeout. + */ + if (handle.callout == NULL) + return; + + s = splhigh(); + if ((handle.callout->c_func == ftn) + && (handle.callout->c_arg == arg)) { + if (nextsoftcheck == handle.callout) { + nextsoftcheck = TAILQ_NEXT(handle.callout, c_links.tqe); + } + TAILQ_REMOVE(&callwheel[handle.callout->c_time & callwheelmask], + handle.callout, c_links.tqe); + handle.callout->c_func = NULL; + SLIST_INSERT_HEAD(&callfree, handle.callout, c_links.sle); + } + splx(s); +} + +void +callout_handle_init(struct callout_handle *handle) +{ + handle->callout = NULL; +} + +#ifdef APM_FIXUP_CALLTODO +/* + * Adjust the kernel calltodo timeout list. This routine is used after + * an APM resume to recalculate the calltodo timer list values with the + * number of hz's we have been sleeping. The next hardclock() will detect + * that there are fired timers and run softclock() to execute them. + * + * Please note, I have not done an exhaustive analysis of what code this + * might break. 
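A small self-contained illustration of the callwheel indexing used by timeout() and softclock() above (a sketch; WHEEL_SIZE and the sample delays are made up): the bucket is a power-of-two hash of the absolute expiry tick, so several future expiry times share a bucket, which is why softclock() skips entries whose c_time does not match the tick currently being swept.

#include <stdio.h>

#define WHEEL_SIZE	8			/* must be a power of two */
#define WHEEL_MASK	(WHEEL_SIZE - 1)

int
main(void)
{
	int ticks = 100;			/* pretend current tick count */
	int delays[] = { 1, 8, 9, 17 };		/* requested timeouts in ticks */
	int i;

	for (i = 0; i < 4; i++) {
		int c_time = ticks + delays[i];	/* absolute expiry tick */
		printf("delay %2d -> bucket %d, fires when softticks == %d\n",
		    delays[i], c_time & WHEEL_MASK, c_time);
	}
	/*
	 * Delays 1, 9 and 17 all land in bucket 5 because their absolute
	 * expiry ticks differ by multiples of WHEEL_SIZE; only the entry
	 * whose c_time equals the tick being swept is removed and run, the
	 * others wait for a later revolution of the wheel.
	 */
	return 0;
}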
I am motivated to have my select()'s and alarm()'s that + * have expired during suspend firing upon resume so that the applications + * which set the timer can do the maintanence the timer was for as close + * as possible to the originally intended time. Testing this code for a + * week showed that resuming from a suspend resulted in 22 to 25 timers + * firing, which seemed independant on whether the suspend was 2 hours or + * 2 days. Your milage may vary. - Ken Key <key@cs.utk.edu> + */ +void +adjust_timeout_calltodo(time_change) + struct timeval *time_change; +{ + register struct callout *p; + unsigned long delta_ticks; + int s; + + /* + * How many ticks were we asleep? + * (stolen from tvtohz()). + */ + + /* Don't do anything */ + if (time_change->tv_sec < 0) + return; + else if (time_change->tv_sec <= LONG_MAX / 1000000) + delta_ticks = (time_change->tv_sec * 1000000 + + time_change->tv_usec + (tick - 1)) / tick + 1; + else if (time_change->tv_sec <= LONG_MAX / hz) + delta_ticks = time_change->tv_sec * hz + + (time_change->tv_usec + (tick - 1)) / tick + 1; + else + delta_ticks = LONG_MAX; + + if (delta_ticks > INT_MAX) + delta_ticks = INT_MAX; + + /* + * Now rip through the timer calltodo list looking for timers + * to expire. + */ + + /* don't collide with softclock() */ + s = splhigh(); + for (p = calltodo.c_next; p != NULL; p = p->c_next) { + p->c_time -= delta_ticks; + + /* Break if the timer had more time on it than delta_ticks */ + if (p->c_time > 0) + break; + + /* take back the ticks the timer didn't use (p->c_time <= 0) */ + delta_ticks = -p->c_time; + } + splx(s); + + return; +} +#endif /* APM_FIXUP_CALLTODO */ diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c new file mode 100644 index 0000000..b7cb83b --- /dev/null +++ b/sys/kern/kern_xxx.c @@ -0,0 +1,256 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93 + * $Id: kern_xxx.c,v 1.27 1997/12/16 17:40:21 eivind Exp $ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/sysctl.h> +#include <sys/utsname.h> + + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + +#ifndef _SYS_SYSPROTO_H_ +struct gethostname_args { + char *hostname; + u_int len; +}; +#endif +/* ARGSUSED */ +int +ogethostname(p, uap) + struct proc *p; + struct gethostname_args *uap; +{ + int name[2]; + size_t len = uap->len; + + name[0] = CTL_KERN; + name[1] = KERN_HOSTNAME; + return (userland_sysctl(p, name, 2, uap->hostname, &len, + 1, 0, 0, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct sethostname_args { + char *hostname; + u_int len; +}; +#endif +/* ARGSUSED */ +int +osethostname(p, uap) + struct proc *p; + register struct sethostname_args *uap; +{ + int name[2]; + int error; + + name[0] = CTL_KERN; + name[1] = KERN_HOSTNAME; + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + return (userland_sysctl(p, name, 2, 0, 0, 0, + uap->hostname, uap->len, 0)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct ogethostid_args { + int dummy; +}; +#endif +/* ARGSUSED */ +int +ogethostid(p, uap) + struct proc *p; + struct ogethostid_args *uap; +{ + + *(long *)(p->p_retval) = hostid; + return (0); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +#ifdef COMPAT_43 +#ifndef _SYS_SYSPROTO_H_ +struct osethostid_args { + long hostid; +}; +#endif +/* ARGSUSED */ +int +osethostid(p, uap) + struct proc *p; + struct osethostid_args *uap; +{ + int error; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + hostid = uap->hostid; + return (0); +} + +int +oquota(p, uap) + struct proc *p; + struct oquota_args *uap; +{ + + return (ENOSYS); +} +#endif /* COMPAT_43 */ + +#ifndef _SYS_SYSPROTO_H_ +struct uname_args { + struct utsname *name; +}; +#endif + +/* ARGSUSED */ +int +uname(p, uap) + struct proc *p; + struct uname_args *uap; +{ + int name[2], rtval; + size_t len; + char *s, *us; + + name[0] = CTL_KERN; + name[1] = KERN_OSTYPE; + len = sizeof uap->name->sysname; + rtval = userland_sysctl(p, name, 2, uap->name->sysname, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0); + + name[1] = KERN_HOSTNAME; + len = sizeof uap->name->nodename; + rtval = userland_sysctl(p, name, 2, uap->name->nodename, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0); + + name[1] = KERN_OSRELEASE; + len = sizeof uap->name->release; + rtval = userland_sysctl(p, name, 2, uap->name->release, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->release + sizeof(uap->name->release) - 1, 0); + +/* + name = KERN_VERSION; + len = sizeof uap->name->version; + rtval = userland_sysctl(p, name, 2, uap->name->version, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->version + 
sizeof(uap->name->version) - 1, 0); +*/ + +/* + * this stupid hackery to make the version field look like FreeBSD 1.1 + */ + for(s = version; *s && *s != '#'; s++); + + for(us = uap->name->version; *s && *s != ':'; s++) { + rtval = subyte( us++, *s); + if( rtval) + return rtval; + } + rtval = subyte( us++, 0); + if( rtval) + return rtval; + + name[0] = CTL_HW; + name[1] = HW_MACHINE; + len = sizeof uap->name->machine; + rtval = userland_sysctl(p, name, 2, uap->name->machine, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0); + + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct getdomainname_args { + char *domainname; + int len; +}; +#endif + +/* ARGSUSED */ +int +getdomainname(p, uap) + struct proc *p; + struct getdomainname_args *uap; +{ + int domainnamelen = strlen(domainname) + 1; + if ((u_int)uap->len > domainnamelen + 1) + uap->len = domainnamelen + 1; + return (copyout((caddr_t)domainname, (caddr_t)uap->domainname, uap->len)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setdomainname_args { + char *domainname; + int len; +}; +#endif + +/* ARGSUSED */ +int +setdomainname(p, uap) + struct proc *p; + struct setdomainname_args *uap; +{ + int error, domainnamelen; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + if ((u_int)uap->len > sizeof (domainname) - 1) + return EINVAL; + domainnamelen = uap->len; + error = copyin((caddr_t)uap->domainname, domainname, uap->len); + domainname[domainnamelen] = 0; + return (error); +} + diff --git a/sys/kern/ksched.c b/sys/kern/ksched.c new file mode 100644 index 0000000..3718e253 --- /dev/null +++ b/sys/kern/ksched.c @@ -0,0 +1,262 @@ +/* + * Copyright (c) 1996, 1997 + * HD Associates, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by HD Associates, Inc + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* ksched: Soft real time scheduling based on "rtprio". 
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/kernel.h> +#include <sys/resource.h> +#include <machine/cpu.h> /* For need_resched */ + +#include <posix4/posix4.h> + +/* ksched: Real-time extension to support POSIX priority scheduling. + */ + +struct ksched { + struct timespec rr_interval; +}; + +int ksched_attach(struct ksched **p) +{ + struct ksched *ksched= p31b_malloc(sizeof(*ksched)); + + ksched->rr_interval.tv_sec = 0; + ksched->rr_interval.tv_nsec = 1000000000L / roundrobin_interval(); + + *p = ksched; + return 0; +} + +int ksched_detach(struct ksched *p) +{ + p31b_free(p); + + return 0; +} + +/* + * XXX About priorities + * + * POSIX 1003.1b requires that numerically higher priorities be of + * higher priority. It also permits sched_setparam to be + * implementation defined for SCHED_OTHER. I don't like + * the notion of inverted priorites for normal processes when + * you can use "setpriority" for that. + * + * I'm rejecting sched_setparam for SCHED_OTHER with EINVAL. + */ + +/* Macros to convert between the unix (lower numerically is higher priority) + * and POSIX 1003.1b (higher numerically is higher priority) + */ + +#define p4prio_to_rtpprio(P) (RTP_PRIO_MAX - (P)) +#define rtpprio_to_p4prio(P) (RTP_PRIO_MAX - (P)) + +/* These improve readability a bit for me: + */ +#define P1B_PRIO_MIN rtpprio_to_p4prio(RTP_PRIO_MAX) +#define P1B_PRIO_MAX rtpprio_to_p4prio(RTP_PRIO_MIN) + +static __inline int +getscheduler(int *ret, struct ksched *ksched, struct proc *p) +{ + int e = 0; + + switch (p->p_rtprio.type) + { + case RTP_PRIO_FIFO: + *ret = SCHED_FIFO; + break; + + case RTP_PRIO_REALTIME: + *ret = SCHED_RR; + break; + + default: + *ret = SCHED_OTHER; + break; + } + + return e; +} + +int ksched_setparam(int *ret, struct ksched *ksched, + struct proc *p, const struct sched_param *param) +{ + int e, policy; + + e = getscheduler(&policy, ksched, p); + + if (e == 0) + { + if (policy == SCHED_OTHER) + e = EINVAL; + else + e = ksched_setscheduler(ret, ksched, p, policy, param); + } + + return e; +} + +int ksched_getparam(int *ret, struct ksched *ksched, + struct proc *p, struct sched_param *param) +{ + if (RTP_PRIO_IS_REALTIME(p->p_rtprio.type)) + param->sched_priority = rtpprio_to_p4prio(p->p_rtprio.prio); + + return 0; +} + +/* + * XXX The priority and scheduler modifications should + * be moved into published interfaces in kern/kern_sync. + * + * The permissions to modify process p were checked in "p31b_proc()". + * + */ +int ksched_setscheduler(int *ret, struct ksched *ksched, + struct proc *p, int policy, const struct sched_param *param) +{ + int e = 0; + struct rtprio rtp; + + switch(policy) + { + case SCHED_RR: + case SCHED_FIFO: + + if (param->sched_priority >= P1B_PRIO_MIN && + param->sched_priority <= P1B_PRIO_MAX) + { + rtp.prio = p4prio_to_rtpprio(param->sched_priority); + rtp.type = (policy == SCHED_FIFO) + ? RTP_PRIO_FIFO : RTP_PRIO_REALTIME; + + p->p_rtprio = rtp; + need_resched(); + } + else + e = EPERM; + + + break; + + case SCHED_OTHER: + { + rtp.type = RTP_PRIO_NORMAL; + rtp.prio = p4prio_to_rtpprio(param->sched_priority); + p->p_rtprio = rtp; + + /* XXX Simply revert to whatever we had for last + * normal scheduler priorities. + * This puts a requirement + * on the scheduling code: You must leave the + * scheduling info alone. 
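The p4prio_to_rtpprio()/rtpprio_to_p4prio() macros above are a single reflection about RTP_PRIO_MAX, so applying the mapping twice returns the original value and numerically higher POSIX priorities become numerically lower (stronger) rtprio values. A quick standalone check (a sketch; 31 is only an illustrative stand-in for RTP_PRIO_MAX):

#include <assert.h>

#define PRIO_MAX_ILLUST	31			/* stand-in for RTP_PRIO_MAX */
#define P4_TO_RTP(P)	(PRIO_MAX_ILLUST - (P))
#define RTP_TO_P4(P)	(PRIO_MAX_ILLUST - (P))

int
main(void)
{
	int p;

	for (p = 0; p <= PRIO_MAX_ILLUST; p++) {
		/* mapping twice gets the original POSIX priority back */
		assert(RTP_TO_P4(P4_TO_RTP(p)) == p);
		/* higher POSIX priority -> numerically lower rtprio value */
		assert(p == 0 || P4_TO_RTP(p) < P4_TO_RTP(p - 1));
	}
	return 0;
}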
+ */ + need_resched(); + } + break; + } + + return e; +} + +int ksched_getscheduler(int *ret, struct ksched *ksched, struct proc *p) +{ + return getscheduler(ret, ksched, p); +} + +/* ksched_yield: Yield the CPU. + */ +int ksched_yield(int *ret, struct ksched *ksched) +{ + need_resched(); + return 0; +} + +int ksched_get_priority_max(int *ret, struct ksched *ksched, int policy) +{ + int e = 0; + + switch (policy) + { + case SCHED_FIFO: + case SCHED_RR: + *ret = RTP_PRIO_MAX; + break; + + case SCHED_OTHER: + *ret = PRIO_MAX; + break; + + default: + e = EINVAL; + } + + return e; +} + +int ksched_get_priority_min(int *ret, struct ksched *ksched, int policy) +{ + int e = 0; + + switch (policy) + { + case SCHED_FIFO: + case SCHED_RR: + *ret = P1B_PRIO_MIN; + break; + + case SCHED_OTHER: + *ret = PRIO_MIN; + break; + + default: + e = EINVAL; + } + + return e; +} + +int ksched_rr_get_interval(int *ret, struct ksched *ksched, + struct proc *p, struct timespec *timespec) +{ + *timespec = ksched->rr_interval; + + return 0; +} diff --git a/sys/kern/link_aout.c b/sys/kern/link_aout.c new file mode 100644 index 0000000..29b5884 --- /dev/null +++ b/sys/kern/link_aout.c @@ -0,0 +1,585 @@ +/*- + * Copyright (c) 1997 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id: link_aout.c,v 1.16 1998/11/03 14:25:21 peter Exp $ + */ + +#ifndef __alpha__ + +#define FREEBSD_AOUT 1 + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/linker.h> +#include <a.out.h> +#include <link.h> + +static int link_aout_load_module(const char*, linker_file_t*); + +static int link_aout_load_file(const char*, linker_file_t*); + +static int link_aout_lookup_symbol(linker_file_t, const char*, + linker_sym_t*); +static int link_aout_symbol_values(linker_file_t file, linker_sym_t sym, + linker_symval_t* symval); +static int link_aout_search_symbol(linker_file_t lf, caddr_t value, + linker_sym_t* sym, long* diffp); +static void link_aout_unload_file(linker_file_t); +static void link_aout_unload_module(linker_file_t); + +static struct linker_class_ops link_aout_class_ops = { + link_aout_load_module, +}; + +static struct linker_file_ops link_aout_file_ops = { + link_aout_lookup_symbol, + link_aout_symbol_values, + link_aout_search_symbol, + link_aout_unload_file, +}; +static struct linker_file_ops link_aout_module_ops = { + link_aout_lookup_symbol, + link_aout_symbol_values, + link_aout_search_symbol, + link_aout_unload_module, +}; + +typedef struct aout_file { + char* address; /* Load address */ + struct _dynamic* dynamic; /* Symbol table etc. */ +} *aout_file_t; + +static int load_dependancies(linker_file_t lf); +static int relocate_file(linker_file_t lf); + +/* + * The kernel symbol table starts here. + */ +extern struct _dynamic _DYNAMIC; + +static void +link_aout_init(void* arg) +{ +#ifndef __ELF__ + struct _dynamic* dp = &_DYNAMIC; +#endif + + linker_add_class("a.out", NULL, &link_aout_class_ops); + +#ifndef __ELF__ + if (dp) { + aout_file_t af; + + af = malloc(sizeof(struct aout_file), M_LINKER, M_NOWAIT); + if (af == NULL) + panic("link_aout_init: Can't create linker structures for kernel"); + bzero(af, sizeof(*af)); + + af->address = 0; + af->dynamic = dp; + linker_kernel_file = + linker_make_file(kernelname, af, &link_aout_file_ops); + if (linker_kernel_file == NULL) + panic("link_aout_init: Can't create linker structures for kernel"); + /* + * XXX there must be a better way of getting these constants. + */ + linker_kernel_file->address = (caddr_t) 0xf0100000; + linker_kernel_file->size = -0xf0100000; + linker_current_file = linker_kernel_file; + } +#endif +} + +SYSINIT(link_aout, SI_SUB_KLD, SI_ORDER_THIRD, link_aout_init, 0); + +static int +link_aout_load_module(const char* filename, linker_file_t* result) +{ + caddr_t modptr, baseptr; + char *type; + struct exec *ehdr; + aout_file_t af; + linker_file_t lf; + int error; + + /* Look to see if we have the module preloaded. */ + if ((modptr = preload_search_by_name(filename)) == NULL) + return(link_aout_load_file(filename, result)); + + /* It's preloaded, check we can handle it and collect information. */ + if (((type = (char *)preload_search_info(modptr, MODINFO_TYPE)) == NULL) || + strcmp(type, "a.out module") || + ((baseptr = preload_search_info(modptr, MODINFO_ADDR)) == NULL) || + ((ehdr = (struct exec *)preload_search_info(modptr, MODINFO_METADATA | MODINFOMD_AOUTEXEC)) == NULL)) + return(0); /* we can't handle this */ + + /* Looks like we can handle this one */ + af = malloc(sizeof(struct aout_file), M_LINKER, M_WAITOK); + bzero(af, sizeof(*af)); + af->address = baseptr; + + /* Assume _DYNAMIC is the first data item. 
*/ + af->dynamic = (struct _dynamic*)(af->address + ehdr->a_text); + if (af->dynamic->d_version != LD_VERSION_BSD) { + free(af, M_LINKER); + return(0); /* we can't handle this */ + } + af->dynamic->d_un.d_sdt = (struct section_dispatch_table *) + ((char *)af->dynamic->d_un.d_sdt + (vm_offset_t)af->address); + + /* Register with kld */ + lf = linker_make_file(filename, af, &link_aout_module_ops); + if (lf == NULL) { + free(af, M_LINKER); + return(ENOMEM); + } + lf->address = af->address; + lf->size = ehdr->a_text + ehdr->a_data + ehdr->a_bss; + + /* Try to load dependancies */ + if (((error = load_dependancies(lf)) != 0) || + ((error = relocate_file(lf)) != 0)) { + linker_file_unload(lf); + return(error); + } + *result = lf; + return(0); +} + +static int +link_aout_load_file(const char* filename, linker_file_t* result) +{ + struct nameidata nd; + struct proc* p = curproc; /* XXX */ + int error = 0; + int resid; + struct exec header; + aout_file_t af; + linker_file_t lf; + char *pathname; + + pathname = linker_search_path(filename); + if (pathname == NULL) + return ENOENT; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, pathname, p); + error = vn_open(&nd, FREAD, 0); + free(pathname, M_LINKER); + if (error) + return error; + + /* + * Read the a.out header from the file. + */ + error = vn_rdwr(UIO_READ, nd.ni_vp, (void*) &header, sizeof header, 0, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + + if (N_BADMAG(header) || !(N_GETFLAG(header) & EX_DYNAMIC)) + goto out; + + /* + * We have an a.out file, so make some space to read it in. + */ + af = malloc(sizeof(struct aout_file), M_LINKER, M_WAITOK); + bzero(af, sizeof(*af)); + af->address = malloc(header.a_text + header.a_data + header.a_bss, + M_LINKER, M_WAITOK); + + /* + * Read the text and data sections and zero the bss. + */ + error = vn_rdwr(UIO_READ, nd.ni_vp, (void*) af->address, + header.a_text + header.a_data, 0, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + bzero(af->address + header.a_text + header.a_data, header.a_bss); + + /* + * Assume _DYNAMIC is the first data item. 
+ */ + af->dynamic = (struct _dynamic*) (af->address + header.a_text); + if (af->dynamic->d_version != LD_VERSION_BSD) { + free(af->address, M_LINKER); + free(af, M_LINKER); + goto out; + } + af->dynamic->d_un.d_sdt = (struct section_dispatch_table *) + ((char *)af->dynamic->d_un.d_sdt + (vm_offset_t)af->address); + + lf = linker_make_file(filename, af, &link_aout_file_ops); + if (lf == NULL) { + free(af->address, M_LINKER); + free(af, M_LINKER); + error = ENOMEM; + goto out; + } + lf->address = af->address; + lf->size = header.a_text + header.a_data + header.a_bss; + + if ((error = load_dependancies(lf)) != 0 + || (error = relocate_file(lf)) != 0) { + linker_file_unload(lf); + goto out; + } + + *result = lf; + +out: + VOP_UNLOCK(nd.ni_vp, 0, p); + vn_close(nd.ni_vp, FREAD, p->p_ucred, p); + + return error; +} + +static void +link_aout_unload_file(linker_file_t file) +{ + aout_file_t af = file->priv; + + if (af) { + if (af->address) + free(af->address, M_LINKER); + free(af, M_LINKER); + } +} + +static void +link_aout_unload_module(linker_file_t file) +{ + aout_file_t af = file->priv; + + if (af) + free(af, M_LINKER); + if (file->filename) + preload_delete_name(file->filename); +} + +#define AOUT_RELOC(af, type, off) (type*) ((af)->address + (off)) + +static int +load_dependancies(linker_file_t lf) +{ + aout_file_t af = lf->priv; + linker_file_t lfdep; + long off; + struct sod* sodp; + char* name; + char* filename = 0; + int error = 0; + + /* + * All files are dependant on /kernel. + */ + if (linker_kernel_file) { + linker_kernel_file->refs++; + linker_file_add_dependancy(lf, linker_kernel_file); + } + + off = LD_NEED(af->dynamic); + + /* + * Load the dependancies. + */ + while (off != 0) { + sodp = AOUT_RELOC(af, struct sod, off); + name = AOUT_RELOC(af, char, sodp->sod_name); + + error = linker_load_file(name, &lfdep); + if (error) + goto out; + error = linker_file_add_dependancy(lf, lfdep); + if (error) + goto out; + off = sodp->sod_next; + } + +out: + if (filename) + free(filename, M_TEMP); + return error; +} + +/* + * XXX i386 dependant. 
+ */ +static long +read_relocation(struct relocation_info* r, char* addr) +{ + int length = r->r_length; + if (length == 0) + return *(u_char*) addr; + else if (length == 1) + return *(u_short*) addr; + else if (length == 2) + return *(u_int*) addr; + else + printf("link_aout: unsupported relocation size %d\n", r->r_length); + return 0; +} + +static void +write_relocation(struct relocation_info* r, char* addr, long value) +{ + int length = r->r_length; + if (length == 0) + *(u_char*) addr = value; + else if (length == 1) + *(u_short*) addr = value; + else if (length == 2) + *(u_int*) addr = value; + else + printf("link_aout: unsupported relocation size %d\n", r->r_length); +} + +static int +relocate_file(linker_file_t lf) +{ + aout_file_t af = lf->priv; + struct relocation_info* rel; + struct relocation_info* erel; + struct relocation_info* r; + struct nzlist* symbolbase; + char* stringbase; + struct nzlist* np; + char* sym; + long relocation; + + rel = AOUT_RELOC(af, struct relocation_info, LD_REL(af->dynamic)); + erel = AOUT_RELOC(af, struct relocation_info, + LD_REL(af->dynamic) + LD_RELSZ(af->dynamic)); + symbolbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)); + stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic)); + + for (r = rel; r < erel; r++) { + char* addr; + + if (r->r_address == 0) + break; + + addr = AOUT_RELOC(af, char, r->r_address); + if (r->r_extern) { + np = &symbolbase[r->r_symbolnum]; + sym = &stringbase[np->nz_strx]; + + if (sym[0] != '_') { + printf("link_aout: bad symbol name %s\n", sym); + relocation = 0; + } else + relocation = (intptr_t) + linker_file_lookup_symbol(lf, sym + 1, + np->nz_type != (N_SETV+N_EXT)); + if (!relocation) { + printf("link_aout: symbol %s not found\n", sym); + return ENOENT; + } + + relocation += read_relocation(r, addr); + + if (r->r_jmptable) { + printf("link_aout: can't cope with jump table relocations\n"); + continue; + } + + if (r->r_pcrel) + relocation -= (intptr_t) af->address; + + if (r->r_copy) { + printf("link_aout: can't cope with copy relocations\n"); + continue; + } + + write_relocation(r, addr, relocation); + } else { + write_relocation(r, addr, + (intptr_t)(read_relocation(r, addr) + af->address)); + } + + } + + return 0; +} + +static long +symbol_hash_value(aout_file_t af, const char* name) +{ + long hashval; + const char* p; + + hashval = '_'; /* fake a starting '_' for C symbols */ + for (p = name; *p; p++) + hashval = (hashval << 1) + *p; + + return (hashval & 0x7fffffff) % LD_BUCKETS(af->dynamic); +} + +int +link_aout_lookup_symbol(linker_file_t file, const char* name, + linker_sym_t* sym) +{ + aout_file_t af = file->priv; + long hashval; + struct rrs_hash* hashbase; + struct nzlist* symbolbase; + char* stringbase; + struct rrs_hash* hp; + struct nzlist* np; + char* cp; + + if (LD_BUCKETS(af->dynamic) == 0) + return NULL; + + hashbase = AOUT_RELOC(af, struct rrs_hash, LD_HASH(af->dynamic)); + symbolbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)); + stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic)); + +restart: + hashval = symbol_hash_value(af, name); + hp = &hashbase[hashval]; + if (hp->rh_symbolnum == -1) + return ENOENT; + + while (hp) { + np = (struct nzlist *) &symbolbase[hp->rh_symbolnum]; + cp = stringbase + np->nz_strx; + /* + * Note: we fake the leading '_' for C symbols. + */ + if (cp[0] == '_' && !strcmp(cp + 1, name)) + break; + + if (hp->rh_next == 0) + hp = NULL; + else + hp = &hashbase[hp->rh_next]; + } + + if (hp == NULL) + /* + * Not found. 
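The symbol_hash_value() routine above, restated as a standalone fragment for clarity (a sketch; the bucket count of 151 is illustrative, the real count comes from LD_BUCKETS): C symbols are stored with a leading underscore in a.out, so the hash is seeded with '_' and callers pass the name without it.

#include <stdio.h>

static long
rrs_hash(const char *name, long nbuckets)
{
	long hashval = '_';		/* fake the leading '_' of C symbols */
	const char *p;

	for (p = name; *p; p++)
		hashval = (hashval << 1) + *p;
	return (hashval & 0x7fffffff) % nbuckets;
}

int
main(void)
{
	printf("printf -> bucket %ld of 151\n", rrs_hash("printf", 151));
	printf("malloc -> bucket %ld of 151\n", rrs_hash("malloc", 151));
	return 0;
}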
+ */ + return ENOENT; + + /* + * Check for an aliased symbol, whatever that is. + */ + if (np->nz_type == N_INDR+N_EXT) { + name = stringbase + (++np)->nz_strx + 1; /* +1 for '_' */ + goto restart; + } + + /* + * Check this is an actual definition of the symbol. + */ + if (np->nz_value == 0) + return ENOENT; + + if (np->nz_type == N_UNDF+N_EXT && np->nz_value != 0) { + if (np->nz_other == AUX_FUNC) + /* weak function */ + return ENOENT; + } + + *sym = (linker_sym_t) np; + + return 0; +} + + +static int +link_aout_symbol_values(linker_file_t file, linker_sym_t sym, + linker_symval_t* symval) +{ + aout_file_t af = file->priv; + struct nzlist* np = (struct nzlist*) sym; + char* stringbase; + long numsym = LD_STABSZ(af->dynamic) / sizeof(struct nzlist); + struct nzlist *symbase; + + /* Is it one of ours? It could be another module... */ + symbase = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)); + if (np < symbase) + return ENOENT; + if ((np - symbase) > numsym) + return ENOENT; + + stringbase = AOUT_RELOC(af, char, LD_STRINGS(af->dynamic)); + + symval->name = stringbase + np->nz_strx + 1; /* +1 for '_' */ + if (np->nz_type == N_UNDF+N_EXT && np->nz_value != 0) { + symval->value = 0; + symval->size = np->nz_value; + } else { + symval->value = AOUT_RELOC(af, char, np->nz_value); + symval->size = np->nz_size; + } + return 0; +} + +static int +link_aout_search_symbol(linker_file_t lf, caddr_t value, + linker_sym_t* sym, long* diffp) +{ + aout_file_t af = lf->priv; + u_long off = (uintptr_t) (void *) value; + u_long diff = off; + struct nzlist* sp; + struct nzlist* ep; + struct nzlist* best = 0; + + for (sp = AOUT_RELOC(af, struct nzlist, LD_SYMBOL(af->dynamic)), + ep = (struct nzlist *) ((caddr_t) sp + LD_STABSZ(af->dynamic)); + sp < ep; sp++) { + if (sp->nz_name == 0) + continue; + if (off >= sp->nz_value) { + if (off - sp->nz_value < diff) { + diff = off - sp->nz_value; + best = sp; + if (diff == 0) + break; + } else if (off - sp->nz_value == diff) { + best = sp; + } + } + } + if (best == 0) + *diffp = off; + else + *diffp = diff; + *sym = (linker_sym_t) best; + + return 0; +} + +#endif /* !__alpha__ */ diff --git a/sys/kern/link_elf.c b/sys/kern/link_elf.c new file mode 100644 index 0000000..c5e84da --- /dev/null +++ b/sys/kern/link_elf.c @@ -0,0 +1,981 @@ +/*- + * Copyright (c) 1998 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: link_elf.c,v 1.10 1998/11/06 15:16:07 peter Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/linker.h> +#include <machine/elf.h> + +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +static int link_elf_load_module(const char*, linker_file_t*); +static int link_elf_load_file(const char*, linker_file_t*); +static int link_elf_lookup_symbol(linker_file_t, const char*, + linker_sym_t*); +static int link_elf_symbol_values(linker_file_t, linker_sym_t, linker_symval_t*); +static int link_elf_search_symbol(linker_file_t, caddr_t value, + linker_sym_t* sym, long* diffp); + +static void link_elf_unload_file(linker_file_t); +static void link_elf_unload_module(linker_file_t); + +static struct linker_class_ops link_elf_class_ops = { + link_elf_load_module, +}; + +static struct linker_file_ops link_elf_file_ops = { + link_elf_lookup_symbol, + link_elf_symbol_values, + link_elf_search_symbol, + link_elf_unload_file, +}; + +static struct linker_file_ops link_elf_module_ops = { + link_elf_lookup_symbol, + link_elf_symbol_values, + link_elf_search_symbol, + link_elf_unload_module, +}; +typedef struct elf_file { + caddr_t address; /* Relocation address */ +#ifdef SPARSE_MAPPING + vm_object_t object; /* VM object to hold file pages */ +#endif + const Elf_Dyn* dynamic; /* Symbol table etc. */ + Elf_Off nbuckets; /* DT_HASH info */ + Elf_Off nchains; + const Elf_Off* buckets; + const Elf_Off* chains; + caddr_t hash; + caddr_t strtab; /* DT_STRTAB */ + int strsz; /* DT_STRSZ */ + const Elf_Sym* symtab; /* DT_SYMTAB */ + Elf_Addr* got; /* DT_PLTGOT */ + const Elf_Rel* pltrel; /* DT_JMPREL */ + int pltrelsize; /* DT_PLTRELSZ */ + const Elf_Rela* pltrela; /* DT_JMPREL */ + int pltrelasize; /* DT_PLTRELSZ */ + const Elf_Rel* rel; /* DT_REL */ + int relsize; /* DT_RELSZ */ + const Elf_Rela* rela; /* DT_RELA */ + int relasize; /* DT_RELASZ */ + caddr_t modptr; + const Elf_Sym* ddbsymtab; /* The symbol table we are using */ + long ddbsymcnt; /* Number of symbols */ + caddr_t ddbstrtab; /* String table */ + long ddbstrcnt; /* number of bytes in string table */ + caddr_t symbase; /* malloc'ed symbold base */ + caddr_t strbase; /* malloc'ed string base */ +} *elf_file_t; + +static int parse_dynamic(linker_file_t lf); +static int load_dependancies(linker_file_t lf); +static int relocate_file(linker_file_t lf); +static int parse_module_symbols(linker_file_t lf); + +/* + * The kernel symbol table starts here. 
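For reference, the DT_HASH section that parse_dynamic() below unpacks into nbuckets/nchains/buckets/chains is simply an array of Elf_Off words laid out as [nbuckets][nchains][bucket heads...][one chain link per symbol]. A tiny model of that layout and of walking one chain (a sketch with made-up values, using unsigned int in place of Elf_Off):

#include <stdio.h>

typedef unsigned int Off;		/* stand-in for Elf_Off */

int
main(void)
{
	/* 2 buckets, 5 symbols; bucket heads 1 and 3; chain links end at 0 */
	Off hashtab[] = { 2, 5,  1, 3,  0, 2, 0, 4, 0 };
	Off nbuckets = hashtab[0];
	Off nchains = hashtab[1];
	const Off *buckets = hashtab + 2;
	const Off *chains = buckets + nbuckets;
	Off sym;

	/* walk the chain rooted at bucket 0; index 0 (STN_UNDEF) terminates */
	for (sym = buckets[0]; sym != 0 && sym < nchains; sym = chains[sym])
		printf("bucket 0 -> symbol index %u\n", sym);
	return 0;
}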
+ */ +extern struct _dynamic _DYNAMIC; + +static void +link_elf_init(void* arg) +{ +#ifdef __ELF__ + Elf_Dyn *dp; + caddr_t modptr, baseptr, sizeptr; + elf_file_t ef; + char *modname; +#endif + +#if ELF_TARG_CLASS == ELFCLASS32 + linker_add_class("elf32", NULL, &link_elf_class_ops); +#else + linker_add_class("elf64", NULL, &link_elf_class_ops); +#endif + +#ifdef __ELF__ + dp = (Elf_Dyn*) &_DYNAMIC; + if (dp) { + ef = malloc(sizeof(struct elf_file), M_LINKER, M_NOWAIT); + if (ef == NULL) + panic("link_elf_init: Can't create linker structures for kernel"); + bzero(ef, sizeof(*ef)); + + ef->address = 0; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + ef->dynamic = dp; + modname = NULL; + modptr = preload_search_by_type("elf kernel"); + if (modptr) + modname = (char *)preload_search_info(modptr, MODINFO_NAME); + if (modname == NULL) + modname = "kernel"; + linker_kernel_file = linker_make_file(modname, ef, &link_elf_file_ops); + if (linker_kernel_file == NULL) + panic("link_elf_init: Can't create linker structures for kernel"); + parse_dynamic(linker_kernel_file); + /* Sigh, magic constants. */ +#ifdef __alpha__ + linker_kernel_file->address = (caddr_t) 0xfffffc0000300000; +#else + linker_kernel_file->address = (caddr_t) 0xf0100000; +#endif + linker_kernel_file->size = -(long)linker_kernel_file->address; + + if (modptr) { + ef->modptr = modptr; + baseptr = preload_search_info(modptr, MODINFO_ADDR); + if (baseptr) + linker_kernel_file->address = *(caddr_t *)baseptr; + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + if (sizeptr) + linker_kernel_file->size = *(size_t *)sizeptr; + } + (void)parse_module_symbols(linker_kernel_file); + linker_current_file = linker_kernel_file; + } +#endif +} + +SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0); + +static int +parse_module_symbols(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + caddr_t pointer; + caddr_t ssym, esym, base; + caddr_t strtab; + int strcnt; + Elf_Sym* symtab; + int symcnt; + + if (ef->modptr == NULL) + return 0; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_SSYM); + if (pointer == NULL) + return 0; + ssym = *(caddr_t *)pointer; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_ESYM); + if (pointer == NULL) + return 0; + esym = *(caddr_t *)pointer; + + base = ssym; + + symcnt = *(long *)base; + base += sizeof(long); + symtab = (Elf_Sym *)base; + base += roundup(symcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + strcnt = *(long *)base; + base += sizeof(long); + strtab = base; + base += roundup(strcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + ef->ddbsymtab = symtab; + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbstrtab = strtab; + ef->ddbstrcnt = strcnt; + + return 0; +} + +static int +parse_dynamic(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + const Elf_Dyn *dp; + int plttype = DT_REL; + + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + switch (dp->d_tag) { + case DT_HASH: + { + /* From src/libexec/rtld-elf/rtld.c */ + const Elf_Off *hashtab = (const Elf_Off *) + (ef->address + dp->d_un.d_ptr); + ef->nbuckets = hashtab[0]; + ef->nchains = hashtab[1]; + ef->buckets = hashtab + 2; + ef->chains = ef->buckets + ef->nbuckets; + break; + } + case DT_STRTAB: + ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr); + break; + case DT_STRSZ: + ef->strsz = dp->d_un.d_val; + break; + case DT_SYMTAB: + ef->symtab = 
(Elf_Sym*) (ef->address + dp->d_un.d_ptr); + break; + case DT_SYMENT: + if (dp->d_un.d_val != sizeof(Elf_Sym)) + return ENOEXEC; + break; + case DT_PLTGOT: + ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr); + break; + case DT_REL: + ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELSZ: + ef->relsize = dp->d_un.d_val; + break; + case DT_RELENT: + if (dp->d_un.d_val != sizeof(Elf_Rel)) + return ENOEXEC; + break; + case DT_JMPREL: + ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_PLTRELSZ: + ef->pltrelsize = dp->d_un.d_val; + break; + case DT_RELA: + ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELASZ: + ef->relasize = dp->d_un.d_val; + break; + case DT_RELAENT: + if (dp->d_un.d_val != sizeof(Elf_Rela)) + return ENOEXEC; + break; + case DT_PLTREL: + plttype = dp->d_un.d_val; + if (plttype != DT_REL && plttype != DT_RELA) + return ENOEXEC; + break; + } + } + + if (plttype == DT_RELA) { + ef->pltrela = (const Elf_Rela *) ef->pltrel; + ef->pltrel = NULL; + ef->pltrelasize = ef->pltrelsize; + ef->pltrelsize = 0; + } + + ef->ddbsymtab = ef->symtab; + ef->ddbsymcnt = ef->nchains; + ef->ddbstrtab = ef->strtab; + ef->ddbstrcnt = ef->strsz; + + return 0; +} + +static void +link_elf_error(const char *s) +{ + printf("kldload: %s\n", s); +} + +static int +link_elf_load_module(const char *filename, linker_file_t *result) +{ + caddr_t modptr, baseptr, sizeptr, dynptr; + char *type; + elf_file_t ef; + linker_file_t lf; + int error; + vm_offset_t dp; + + /* Look to see if we have the module preloaded */ + modptr = preload_search_by_name(filename); + if (modptr == NULL) + return (link_elf_load_file(filename, result)); + + /* It's preloaded, check we can handle it and collect information */ + type = (char *)preload_search_info(modptr, MODINFO_TYPE); + baseptr = preload_search_info(modptr, MODINFO_ADDR); + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + dynptr = preload_search_info(modptr, MODINFO_METADATA|MODINFOMD_DYNAMIC); + if (type == NULL || strcmp(type, "elf module") != 0) + return (EFTYPE); + if (baseptr == NULL || sizeptr == NULL || dynptr == NULL) + return (EINVAL); + + ef = malloc(sizeof(struct elf_file), M_LINKER, M_WAITOK); + if (ef == NULL) + return (ENOMEM); + bzero(ef, sizeof(*ef)); + ef->modptr = modptr; + ef->address = *(caddr_t *)baseptr; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr; + ef->dynamic = (Elf_Dyn *)dp; + lf = linker_make_file(filename, ef, &link_elf_module_ops); + if (lf == NULL) { + free(ef, M_LINKER); + return ENOMEM; + } + lf->address = ef->address; + lf->size = *(size_t *)sizeptr; + + error = parse_dynamic(lf); + if (error) { + linker_file_unload(lf); + return error; + } + error = load_dependancies(lf); + if (error) { + linker_file_unload(lf); + return error; + } + error = relocate_file(lf); + if (error) { + linker_file_unload(lf); + return error; + } + (void)parse_module_symbols(lf); + *result = lf; + return (0); +} + +static int +link_elf_load_file(const char* filename, linker_file_t* result) +{ + struct nameidata nd; + struct proc* p = curproc; /* XXX */ + Elf_Ehdr *hdr; + caddr_t firstpage; + int nbytes, i; + Elf_Phdr *phdr; + Elf_Phdr *phlimit; + Elf_Phdr *segs[2]; + int nsegs; + Elf_Phdr *phdyn; + Elf_Phdr *phphdr; + caddr_t mapbase; + size_t mapsize; + Elf_Off base_offset; + Elf_Addr base_vaddr; + Elf_Addr base_vlimit; + int error = 0; + int resid; + elf_file_t ef; + linker_file_t lf; + char 
*pathname; + Elf_Shdr *shdr; + int symtabindex; + int symstrindex; + int symcnt; + int strcnt; + + shdr = NULL; + lf = NULL; + + pathname = linker_search_path(filename); + if (pathname == NULL) + return ENOENT; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, pathname, p); + error = vn_open(&nd, FREAD, 0); + free(pathname, M_LINKER); + if (error) + return error; + + /* + * Read the elf header from the file. + */ + firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK); + if (firstpage == NULL) { + error = ENOMEM; + goto out; + } + hdr = (Elf_Ehdr *)firstpage; + error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + nbytes = PAGE_SIZE - resid; + if (error) + goto out; + + if (!IS_ELF(*hdr)) { + error = ENOEXEC; + goto out; + } + + if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS + || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { + link_elf_error("Unsupported file layout"); + error = ENOEXEC; + goto out; + } + if (hdr->e_ident[EI_VERSION] != EV_CURRENT + || hdr->e_version != EV_CURRENT) { + link_elf_error("Unsupported file version"); + error = ENOEXEC; + goto out; + } + if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { + link_elf_error("Unsupported file type"); + error = ENOEXEC; + goto out; + } + if (hdr->e_machine != ELF_TARG_MACH) { + link_elf_error("Unsupported machine"); + error = ENOEXEC; + goto out; + } + + /* + * We rely on the program header being in the first page. This is + * not strictly required by the ABI specification, but it seems to + * always true in practice. And, it simplifies things considerably. + */ + if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) + link_elf_error("Unreadable program headers"); + + /* + * Scan the program header entries, and save key information. + * + * We rely on there being exactly two load segments, text and data, + * in that order. + */ + phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); + phlimit = phdr + hdr->e_phnum; + nsegs = 0; + phdyn = NULL; + phphdr = NULL; + while (phdr < phlimit) { + switch (phdr->p_type) { + + case PT_LOAD: + if (nsegs == 2) { + link_elf_error("Too many sections"); + error = ENOEXEC; + goto out; + } + segs[nsegs] = phdr; + ++nsegs; + break; + + case PT_PHDR: + phphdr = phdr; + break; + + case PT_DYNAMIC: + phdyn = phdr; + break; + } + + ++phdr; + } + if (phdyn == NULL) { + link_elf_error("Object is not dynamically-linked"); + error = ENOEXEC; + goto out; + } + + /* + * Allocate the entire address space of the object, to stake out our + * contiguous region, and to establish the base address for relocation. 
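To make the address arithmetic below concrete (an illustration with hypothetical numbers and a PAGE_SIZE of 4096): for a text segment with p_vaddr 0x1000 and a data segment with p_vaddr 0x5000 and p_memsz 0x1800, base_vaddr = trunc_page(0x1000) = 0x1000, base_vlimit = round_page(0x5000 + 0x1800) = 0x7000, and mapsize = 0x6000, i.e. one contiguous 24 KB region covering text, data and bss, whose start also becomes the base address that every later relocation is made relative to.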
+ */ + base_offset = trunc_page(segs[0]->p_offset); + base_vaddr = trunc_page(segs[0]->p_vaddr); + base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz); + mapsize = base_vlimit - base_vaddr; + + ef = malloc(sizeof(struct elf_file), M_LINKER, M_WAITOK); + bzero(ef, sizeof(*ef)); +#ifdef SPARSE_MAPPING + ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT); + if (ef->object == NULL) { + free(ef, M_LINKER); + error = ENOMEM; + goto out; + } + vm_object_reference(ef->object); + ef->address = (caddr_t) vm_map_min(kernel_map); + error = vm_map_find(kernel_map, ef->object, 0, + (vm_offset_t *) &ef->address, + mapsize, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + vm_object_deallocate(ef->object); + free(ef, M_LINKER); + goto out; + } +#else + ef->address = malloc(mapsize, M_LINKER, M_WAITOK); +#endif + mapbase = ef->address; + + /* + * Read the text and data sections and zero the bss. + */ + for (i = 0; i < 2; i++) { + caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr; + error = vn_rdwr(UIO_READ, nd.ni_vp, + segbase, segs[i]->p_filesz, segs[i]->p_offset, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) { +#ifdef SPARSE_MAPPING + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); +#else + free(ef->address, M_LINKER); +#endif + free(ef, M_LINKER); + goto out; + } + bzero(segbase + segs[i]->p_filesz, + segs[i]->p_memsz - segs[i]->p_filesz); + +#ifdef SPARSE_MAPPING + /* + * Wire down the pages + */ + vm_map_pageable(kernel_map, + (vm_offset_t) segbase, + (vm_offset_t) segbase + segs[i]->p_memsz, + FALSE); +#endif + } + + ef->dynamic = (const Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); + + lf = linker_make_file(filename, ef, &link_elf_file_ops); + if (lf == NULL) { +#ifdef SPARSE_MAPPING + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); +#else + free(ef->address, M_LINKER); +#endif + free(ef, M_LINKER); + error = ENOMEM; + goto out; + } + lf->address = ef->address; + lf->size = mapsize; + + error = parse_dynamic(lf); + if (error) + goto out; + error = load_dependancies(lf); + if (error) + goto out; + error = relocate_file(lf); + if (error) + goto out; + + /* Try and load the symbol table if it's present. (you can strip it!) 
*/ + nbytes = hdr->e_shnum * hdr->e_shentsize; + if (nbytes == 0 || hdr->e_shoff == 0) + goto nosyms; + shdr = malloc(nbytes, M_LINKER, M_WAITOK); + if (shdr == NULL) { + error = ENOMEM; + goto out; + } + bzero(shdr, nbytes); + error = vn_rdwr(UIO_READ, nd.ni_vp, + (caddr_t)shdr, nbytes, hdr->e_shoff, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + symtabindex = -1; + symstrindex = -1; + for (i = 0; i < hdr->e_shnum; i++) { + if (shdr[i].sh_type == SHT_SYMTAB) { + symtabindex = i; + symstrindex = shdr[i].sh_link; + } + } + if (symtabindex < 0 || symstrindex < 0) + goto nosyms; + + symcnt = shdr[symtabindex].sh_size; + ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK); + strcnt = shdr[symstrindex].sh_size; + ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK); + + if (ef->symbase == NULL || ef->strbase == NULL) { + error = ENOMEM; + goto out; + } + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->symbase, symcnt, shdr[symtabindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->strbase, strcnt, shdr[symstrindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbsymtab = (const Elf_Sym *)ef->symbase; + ef->ddbstrcnt = strcnt; + ef->ddbstrtab = ef->strbase; + +nosyms: + + *result = lf; + +out: + if (error && lf) + linker_file_unload(lf); + if (shdr) + free(shdr, M_LINKER); + if (firstpage) + free(firstpage, M_LINKER); + VOP_UNLOCK(nd.ni_vp, 0, p); + vn_close(nd.ni_vp, FREAD, p->p_ucred, p); + + return error; +} + +static void +link_elf_unload_file(linker_file_t file) +{ + elf_file_t ef = file->priv; + + if (ef) { +#ifdef SPARSE_MAPPING + if (ef->object) { + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); + } +#else + if (ef->address) + free(ef->address, M_LINKER); +#endif + if (ef->symbase) + free(ef->symbase, M_LINKER); + if (ef->strbase) + free(ef->strbase, M_LINKER); + free(ef, M_LINKER); + } +} + +static void +link_elf_unload_module(linker_file_t file) +{ + elf_file_t ef = file->priv; + + if (ef) + free(ef, M_LINKER); + if (file->filename) + preload_delete_name(file->filename); +} + +static int +load_dependancies(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + linker_file_t lfdep; + char* name; + const Elf_Dyn *dp; + int error = 0; + + /* + * All files are dependant on /kernel. 
+ */ + if (linker_kernel_file) { + linker_kernel_file->refs++; + linker_file_add_dependancy(lf, linker_kernel_file); + } + + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + if (dp->d_tag == DT_NEEDED) { + name = ef->strtab + dp->d_un.d_val; + + error = linker_load_file(name, &lfdep); + if (error) + goto out; + error = linker_file_add_dependancy(lf, lfdep); + if (error) + goto out; + } + } + +out: + return error; +} + +static const char * +symbol_name(elf_file_t ef, Elf_Word r_info) +{ + const Elf_Sym *ref; + + if (ELF_R_SYM(r_info)) { + ref = ef->symtab + ELF_R_SYM(r_info); + return ef->strtab + ref->st_name; + } else + return NULL; +} + +static int +relocate_file(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + const Elf_Rel *rellim; + const Elf_Rel *rel; + const Elf_Rela *relalim; + const Elf_Rela *rela; + const char *symname; + + /* Perform relocations without addend if there are any: */ + rel = ef->rel; + if (rel) { + rellim = (const Elf_Rel *) ((caddr_t) ef->rel + ef->relsize); + while (rel < rellim) { + symname = symbol_name(ef, rel->r_info); + if (elf_reloc(lf, rel, ELF_RELOC_REL, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->rela; + if (rela) { + relalim = (const Elf_Rela *) ((caddr_t) ef->rela + ef->relasize); + while (rela < relalim) { + symname = symbol_name(ef, rela->r_info); + if (elf_reloc(lf, rela, ELF_RELOC_RELA, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + /* Perform PLT relocations without addend if there are any: */ + rel = ef->pltrel; + if (rel) { + rellim = (const Elf_Rel *) ((caddr_t) ef->pltrel + ef->pltrelsize); + while (rel < rellim) { + symname = symbol_name(ef, rel->r_info); + if (elf_reloc(lf, rel, ELF_RELOC_REL, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->pltrela; + if (rela) { + relalim = (const Elf_Rela *) ((caddr_t) ef->pltrela + ef->pltrelasize); + while (rela < relalim) { + symname = symbol_name(ef, rela->r_info); + if (elf_reloc(lf, rela, ELF_RELOC_RELA, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + return 0; +} + +/* + * Hash function for symbol table lookup. Don't even think about changing + * this. It is specified by the System V ABI. 
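As an illustrative hand computation (not part of the original source), hashing the name "exit" with the function below proceeds one character at a time:

	'e' (0x65): h = 0x00065
	'x' (0x78): h = 0x006c8
	'i' (0x69): h = 0x06ce9
	't' (0x74): h = 0x6cf04

so elf_hash("exit") is 0x6cf04; the high-nibble folding through g never fires for a name this short. link_elf_lookup_symbol() then starts its search at buckets[0x6cf04 % nbuckets] and follows chains[] until the name matches or STN_UNDEF is reached.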
+ */ +static unsigned long +elf_hash(const char *name) +{ + const unsigned char *p = (const unsigned char *) name; + unsigned long h = 0; + unsigned long g; + + while (*p != '\0') { + h = (h << 4) + *p++; + if ((g = h & 0xf0000000) != 0) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +int +link_elf_lookup_symbol(linker_file_t lf, const char* name, linker_sym_t* sym) +{ + elf_file_t ef = lf->priv; + unsigned long symnum; + const Elf_Sym* symp; + const char *strp; + unsigned long hash; + int i; + + /* First, search hashed global symbols */ + hash = elf_hash(name); + symnum = ef->buckets[hash % ef->nbuckets]; + + while (symnum != STN_UNDEF) { + if (symnum >= ef->nchains) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + symp = ef->symtab + symnum; + if (symp->st_name == 0) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + strp = ef->strtab + symp->st_name; + + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + + symnum = ef->chains[symnum]; + } + + /* If we have not found it, look at the full table (if loaded) */ + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + + /* Exhaustive search */ + for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { + strp = ef->ddbstrtab + symp->st_name; + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + } + + return ENOENT; +} + +static int +link_elf_symbol_values(linker_file_t lf, linker_sym_t sym, linker_symval_t* symval) +{ + elf_file_t ef = lf->priv; + Elf_Sym* es = (Elf_Sym*) sym; + + if (es >= ef->symtab && ((es - ef->symtab) < ef->nchains)) { + symval->name = ef->strtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + if (es >= ef->ddbsymtab && ((es - ef->ddbsymtab) < ef->ddbsymcnt)) { + symval->name = ef->ddbstrtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + return ENOENT; +} + +static int +link_elf_search_symbol(linker_file_t lf, caddr_t value, + linker_sym_t* sym, long* diffp) +{ + elf_file_t ef = lf->priv; + u_long off = (u_long) value; + u_long diff = off; + const Elf_Sym* es; + const Elf_Sym* best = 0; + int i; + + for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { + if (es->st_name == 0) + continue; + if (off >= es->st_value) { + if (off - es->st_value < diff) { + diff = off - es->st_value; + best = es; + if (diff == 0) + break; + } else if (off - es->st_value == diff) { + best = es; + } + } + } + if (best == 0) + *diffp = off; + else + *diffp = diff; + *sym = (linker_sym_t) best; + + return 0; +} diff --git a/sys/kern/link_elf_obj.c b/sys/kern/link_elf_obj.c new file mode 100644 index 0000000..c5e84da --- /dev/null +++ b/sys/kern/link_elf_obj.c @@ -0,0 +1,981 @@ +/*- + * Copyright (c) 1998 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: link_elf.c,v 1.10 1998/11/06 15:16:07 peter Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/namei.h> +#include <sys/fcntl.h> +#include <sys/vnode.h> +#include <sys/linker.h> +#include <machine/elf.h> + +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +static int link_elf_load_module(const char*, linker_file_t*); +static int link_elf_load_file(const char*, linker_file_t*); +static int link_elf_lookup_symbol(linker_file_t, const char*, + linker_sym_t*); +static int link_elf_symbol_values(linker_file_t, linker_sym_t, linker_symval_t*); +static int link_elf_search_symbol(linker_file_t, caddr_t value, + linker_sym_t* sym, long* diffp); + +static void link_elf_unload_file(linker_file_t); +static void link_elf_unload_module(linker_file_t); + +static struct linker_class_ops link_elf_class_ops = { + link_elf_load_module, +}; + +static struct linker_file_ops link_elf_file_ops = { + link_elf_lookup_symbol, + link_elf_symbol_values, + link_elf_search_symbol, + link_elf_unload_file, +}; + +static struct linker_file_ops link_elf_module_ops = { + link_elf_lookup_symbol, + link_elf_symbol_values, + link_elf_search_symbol, + link_elf_unload_module, +}; +typedef struct elf_file { + caddr_t address; /* Relocation address */ +#ifdef SPARSE_MAPPING + vm_object_t object; /* VM object to hold file pages */ +#endif + const Elf_Dyn* dynamic; /* Symbol table etc. 
*/ + Elf_Off nbuckets; /* DT_HASH info */ + Elf_Off nchains; + const Elf_Off* buckets; + const Elf_Off* chains; + caddr_t hash; + caddr_t strtab; /* DT_STRTAB */ + int strsz; /* DT_STRSZ */ + const Elf_Sym* symtab; /* DT_SYMTAB */ + Elf_Addr* got; /* DT_PLTGOT */ + const Elf_Rel* pltrel; /* DT_JMPREL */ + int pltrelsize; /* DT_PLTRELSZ */ + const Elf_Rela* pltrela; /* DT_JMPREL */ + int pltrelasize; /* DT_PLTRELSZ */ + const Elf_Rel* rel; /* DT_REL */ + int relsize; /* DT_RELSZ */ + const Elf_Rela* rela; /* DT_RELA */ + int relasize; /* DT_RELASZ */ + caddr_t modptr; + const Elf_Sym* ddbsymtab; /* The symbol table we are using */ + long ddbsymcnt; /* Number of symbols */ + caddr_t ddbstrtab; /* String table */ + long ddbstrcnt; /* number of bytes in string table */ + caddr_t symbase; /* malloc'ed symbold base */ + caddr_t strbase; /* malloc'ed string base */ +} *elf_file_t; + +static int parse_dynamic(linker_file_t lf); +static int load_dependancies(linker_file_t lf); +static int relocate_file(linker_file_t lf); +static int parse_module_symbols(linker_file_t lf); + +/* + * The kernel symbol table starts here. + */ +extern struct _dynamic _DYNAMIC; + +static void +link_elf_init(void* arg) +{ +#ifdef __ELF__ + Elf_Dyn *dp; + caddr_t modptr, baseptr, sizeptr; + elf_file_t ef; + char *modname; +#endif + +#if ELF_TARG_CLASS == ELFCLASS32 + linker_add_class("elf32", NULL, &link_elf_class_ops); +#else + linker_add_class("elf64", NULL, &link_elf_class_ops); +#endif + +#ifdef __ELF__ + dp = (Elf_Dyn*) &_DYNAMIC; + if (dp) { + ef = malloc(sizeof(struct elf_file), M_LINKER, M_NOWAIT); + if (ef == NULL) + panic("link_elf_init: Can't create linker structures for kernel"); + bzero(ef, sizeof(*ef)); + + ef->address = 0; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + ef->dynamic = dp; + modname = NULL; + modptr = preload_search_by_type("elf kernel"); + if (modptr) + modname = (char *)preload_search_info(modptr, MODINFO_NAME); + if (modname == NULL) + modname = "kernel"; + linker_kernel_file = linker_make_file(modname, ef, &link_elf_file_ops); + if (linker_kernel_file == NULL) + panic("link_elf_init: Can't create linker structures for kernel"); + parse_dynamic(linker_kernel_file); + /* Sigh, magic constants. 
*/ +#ifdef __alpha__ + linker_kernel_file->address = (caddr_t) 0xfffffc0000300000; +#else + linker_kernel_file->address = (caddr_t) 0xf0100000; +#endif + linker_kernel_file->size = -(long)linker_kernel_file->address; + + if (modptr) { + ef->modptr = modptr; + baseptr = preload_search_info(modptr, MODINFO_ADDR); + if (baseptr) + linker_kernel_file->address = *(caddr_t *)baseptr; + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + if (sizeptr) + linker_kernel_file->size = *(size_t *)sizeptr; + } + (void)parse_module_symbols(linker_kernel_file); + linker_current_file = linker_kernel_file; + } +#endif +} + +SYSINIT(link_elf, SI_SUB_KLD, SI_ORDER_SECOND, link_elf_init, 0); + +static int +parse_module_symbols(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + caddr_t pointer; + caddr_t ssym, esym, base; + caddr_t strtab; + int strcnt; + Elf_Sym* symtab; + int symcnt; + + if (ef->modptr == NULL) + return 0; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_SSYM); + if (pointer == NULL) + return 0; + ssym = *(caddr_t *)pointer; + pointer = preload_search_info(ef->modptr, MODINFO_METADATA|MODINFOMD_ESYM); + if (pointer == NULL) + return 0; + esym = *(caddr_t *)pointer; + + base = ssym; + + symcnt = *(long *)base; + base += sizeof(long); + symtab = (Elf_Sym *)base; + base += roundup(symcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + strcnt = *(long *)base; + base += sizeof(long); + strtab = base; + base += roundup(strcnt, sizeof(long)); + + if (base > esym || base < ssym) { + printf("Symbols are corrupt!\n"); + return EINVAL; + } + + ef->ddbsymtab = symtab; + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbstrtab = strtab; + ef->ddbstrcnt = strcnt; + + return 0; +} + +static int +parse_dynamic(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + const Elf_Dyn *dp; + int plttype = DT_REL; + + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + switch (dp->d_tag) { + case DT_HASH: + { + /* From src/libexec/rtld-elf/rtld.c */ + const Elf_Off *hashtab = (const Elf_Off *) + (ef->address + dp->d_un.d_ptr); + ef->nbuckets = hashtab[0]; + ef->nchains = hashtab[1]; + ef->buckets = hashtab + 2; + ef->chains = ef->buckets + ef->nbuckets; + break; + } + case DT_STRTAB: + ef->strtab = (caddr_t) (ef->address + dp->d_un.d_ptr); + break; + case DT_STRSZ: + ef->strsz = dp->d_un.d_val; + break; + case DT_SYMTAB: + ef->symtab = (Elf_Sym*) (ef->address + dp->d_un.d_ptr); + break; + case DT_SYMENT: + if (dp->d_un.d_val != sizeof(Elf_Sym)) + return ENOEXEC; + break; + case DT_PLTGOT: + ef->got = (Elf_Addr *) (ef->address + dp->d_un.d_ptr); + break; + case DT_REL: + ef->rel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELSZ: + ef->relsize = dp->d_un.d_val; + break; + case DT_RELENT: + if (dp->d_un.d_val != sizeof(Elf_Rel)) + return ENOEXEC; + break; + case DT_JMPREL: + ef->pltrel = (const Elf_Rel *) (ef->address + dp->d_un.d_ptr); + break; + case DT_PLTRELSZ: + ef->pltrelsize = dp->d_un.d_val; + break; + case DT_RELA: + ef->rela = (const Elf_Rela *) (ef->address + dp->d_un.d_ptr); + break; + case DT_RELASZ: + ef->relasize = dp->d_un.d_val; + break; + case DT_RELAENT: + if (dp->d_un.d_val != sizeof(Elf_Rela)) + return ENOEXEC; + break; + case DT_PLTREL: + plttype = dp->d_un.d_val; + if (plttype != DT_REL && plttype != DT_RELA) + return ENOEXEC; + break; + } + } + + if (plttype == DT_RELA) { + ef->pltrela = (const Elf_Rela *) ef->pltrel; + ef->pltrel = NULL; + ef->pltrelasize = ef->pltrelsize; + 
ef->pltrelsize = 0; + } + + ef->ddbsymtab = ef->symtab; + ef->ddbsymcnt = ef->nchains; + ef->ddbstrtab = ef->strtab; + ef->ddbstrcnt = ef->strsz; + + return 0; +} + +static void +link_elf_error(const char *s) +{ + printf("kldload: %s\n", s); +} + +static int +link_elf_load_module(const char *filename, linker_file_t *result) +{ + caddr_t modptr, baseptr, sizeptr, dynptr; + char *type; + elf_file_t ef; + linker_file_t lf; + int error; + vm_offset_t dp; + + /* Look to see if we have the module preloaded */ + modptr = preload_search_by_name(filename); + if (modptr == NULL) + return (link_elf_load_file(filename, result)); + + /* It's preloaded, check we can handle it and collect information */ + type = (char *)preload_search_info(modptr, MODINFO_TYPE); + baseptr = preload_search_info(modptr, MODINFO_ADDR); + sizeptr = preload_search_info(modptr, MODINFO_SIZE); + dynptr = preload_search_info(modptr, MODINFO_METADATA|MODINFOMD_DYNAMIC); + if (type == NULL || strcmp(type, "elf module") != 0) + return (EFTYPE); + if (baseptr == NULL || sizeptr == NULL || dynptr == NULL) + return (EINVAL); + + ef = malloc(sizeof(struct elf_file), M_LINKER, M_WAITOK); + if (ef == NULL) + return (ENOMEM); + bzero(ef, sizeof(*ef)); + ef->modptr = modptr; + ef->address = *(caddr_t *)baseptr; +#ifdef SPARSE_MAPPING + ef->object = 0; +#endif + dp = (vm_offset_t)ef->address + *(vm_offset_t *)dynptr; + ef->dynamic = (Elf_Dyn *)dp; + lf = linker_make_file(filename, ef, &link_elf_module_ops); + if (lf == NULL) { + free(ef, M_LINKER); + return ENOMEM; + } + lf->address = ef->address; + lf->size = *(size_t *)sizeptr; + + error = parse_dynamic(lf); + if (error) { + linker_file_unload(lf); + return error; + } + error = load_dependancies(lf); + if (error) { + linker_file_unload(lf); + return error; + } + error = relocate_file(lf); + if (error) { + linker_file_unload(lf); + return error; + } + (void)parse_module_symbols(lf); + *result = lf; + return (0); +} + +static int +link_elf_load_file(const char* filename, linker_file_t* result) +{ + struct nameidata nd; + struct proc* p = curproc; /* XXX */ + Elf_Ehdr *hdr; + caddr_t firstpage; + int nbytes, i; + Elf_Phdr *phdr; + Elf_Phdr *phlimit; + Elf_Phdr *segs[2]; + int nsegs; + Elf_Phdr *phdyn; + Elf_Phdr *phphdr; + caddr_t mapbase; + size_t mapsize; + Elf_Off base_offset; + Elf_Addr base_vaddr; + Elf_Addr base_vlimit; + int error = 0; + int resid; + elf_file_t ef; + linker_file_t lf; + char *pathname; + Elf_Shdr *shdr; + int symtabindex; + int symstrindex; + int symcnt; + int strcnt; + + shdr = NULL; + lf = NULL; + + pathname = linker_search_path(filename); + if (pathname == NULL) + return ENOENT; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, pathname, p); + error = vn_open(&nd, FREAD, 0); + free(pathname, M_LINKER); + if (error) + return error; + + /* + * Read the elf header from the file. 
+ */ + firstpage = malloc(PAGE_SIZE, M_LINKER, M_WAITOK); + if (firstpage == NULL) { + error = ENOMEM; + goto out; + } + hdr = (Elf_Ehdr *)firstpage; + error = vn_rdwr(UIO_READ, nd.ni_vp, firstpage, PAGE_SIZE, 0, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + nbytes = PAGE_SIZE - resid; + if (error) + goto out; + + if (!IS_ELF(*hdr)) { + error = ENOEXEC; + goto out; + } + + if (hdr->e_ident[EI_CLASS] != ELF_TARG_CLASS + || hdr->e_ident[EI_DATA] != ELF_TARG_DATA) { + link_elf_error("Unsupported file layout"); + error = ENOEXEC; + goto out; + } + if (hdr->e_ident[EI_VERSION] != EV_CURRENT + || hdr->e_version != EV_CURRENT) { + link_elf_error("Unsupported file version"); + error = ENOEXEC; + goto out; + } + if (hdr->e_type != ET_EXEC && hdr->e_type != ET_DYN) { + link_elf_error("Unsupported file type"); + error = ENOEXEC; + goto out; + } + if (hdr->e_machine != ELF_TARG_MACH) { + link_elf_error("Unsupported machine"); + error = ENOEXEC; + goto out; + } + + /* + * We rely on the program header being in the first page. This is + * not strictly required by the ABI specification, but it seems to + * always true in practice. And, it simplifies things considerably. + */ + if (!((hdr->e_phentsize == sizeof(Elf_Phdr)) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= PAGE_SIZE) && + (hdr->e_phoff + hdr->e_phnum*sizeof(Elf_Phdr) <= nbytes))) + link_elf_error("Unreadable program headers"); + + /* + * Scan the program header entries, and save key information. + * + * We rely on there being exactly two load segments, text and data, + * in that order. + */ + phdr = (Elf_Phdr *) (firstpage + hdr->e_phoff); + phlimit = phdr + hdr->e_phnum; + nsegs = 0; + phdyn = NULL; + phphdr = NULL; + while (phdr < phlimit) { + switch (phdr->p_type) { + + case PT_LOAD: + if (nsegs == 2) { + link_elf_error("Too many sections"); + error = ENOEXEC; + goto out; + } + segs[nsegs] = phdr; + ++nsegs; + break; + + case PT_PHDR: + phphdr = phdr; + break; + + case PT_DYNAMIC: + phdyn = phdr; + break; + } + + ++phdr; + } + if (phdyn == NULL) { + link_elf_error("Object is not dynamically-linked"); + error = ENOEXEC; + goto out; + } + + /* + * Allocate the entire address space of the object, to stake out our + * contiguous region, and to establish the base address for relocation. + */ + base_offset = trunc_page(segs[0]->p_offset); + base_vaddr = trunc_page(segs[0]->p_vaddr); + base_vlimit = round_page(segs[1]->p_vaddr + segs[1]->p_memsz); + mapsize = base_vlimit - base_vaddr; + + ef = malloc(sizeof(struct elf_file), M_LINKER, M_WAITOK); + bzero(ef, sizeof(*ef)); +#ifdef SPARSE_MAPPING + ef->object = vm_object_allocate(OBJT_DEFAULT, mapsize >> PAGE_SHIFT); + if (ef->object == NULL) { + free(ef, M_LINKER); + error = ENOMEM; + goto out; + } + vm_object_reference(ef->object); + ef->address = (caddr_t) vm_map_min(kernel_map); + error = vm_map_find(kernel_map, ef->object, 0, + (vm_offset_t *) &ef->address, + mapsize, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + vm_object_deallocate(ef->object); + free(ef, M_LINKER); + goto out; + } +#else + ef->address = malloc(mapsize, M_LINKER, M_WAITOK); +#endif + mapbase = ef->address; + + /* + * Read the text and data sections and zero the bss. 
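A hypothetical worked example of the mapping arithmetic above, assuming 4KB pages, a text segment at p_vaddr 0x0 and a data segment with p_vaddr 0x3100, p_filesz 0x0600 and p_memsz 0x0a00:

	base_vaddr  = trunc_page(0x0)             = 0x0
	base_vlimit = round_page(0x3100 + 0x0a00) = 0x4000
	mapsize     = 0x4000 - 0x0                = 0x4000
	segbase     = mapbase + 0x3100 - 0x0      (data segment)
	bzero(segbase + 0x0600, 0x0400)           (the bss tail)

The loop below performs exactly this per segment: copy p_filesz bytes from the file, then clear the remaining p_memsz - p_filesz bytes.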
+ */ + for (i = 0; i < 2; i++) { + caddr_t segbase = mapbase + segs[i]->p_vaddr - base_vaddr; + error = vn_rdwr(UIO_READ, nd.ni_vp, + segbase, segs[i]->p_filesz, segs[i]->p_offset, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) { +#ifdef SPARSE_MAPPING + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); +#else + free(ef->address, M_LINKER); +#endif + free(ef, M_LINKER); + goto out; + } + bzero(segbase + segs[i]->p_filesz, + segs[i]->p_memsz - segs[i]->p_filesz); + +#ifdef SPARSE_MAPPING + /* + * Wire down the pages + */ + vm_map_pageable(kernel_map, + (vm_offset_t) segbase, + (vm_offset_t) segbase + segs[i]->p_memsz, + FALSE); +#endif + } + + ef->dynamic = (const Elf_Dyn *) (mapbase + phdyn->p_vaddr - base_vaddr); + + lf = linker_make_file(filename, ef, &link_elf_file_ops); + if (lf == NULL) { +#ifdef SPARSE_MAPPING + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); +#else + free(ef->address, M_LINKER); +#endif + free(ef, M_LINKER); + error = ENOMEM; + goto out; + } + lf->address = ef->address; + lf->size = mapsize; + + error = parse_dynamic(lf); + if (error) + goto out; + error = load_dependancies(lf); + if (error) + goto out; + error = relocate_file(lf); + if (error) + goto out; + + /* Try and load the symbol table if it's present. (you can strip it!) */ + nbytes = hdr->e_shnum * hdr->e_shentsize; + if (nbytes == 0 || hdr->e_shoff == 0) + goto nosyms; + shdr = malloc(nbytes, M_LINKER, M_WAITOK); + if (shdr == NULL) { + error = ENOMEM; + goto out; + } + bzero(shdr, nbytes); + error = vn_rdwr(UIO_READ, nd.ni_vp, + (caddr_t)shdr, nbytes, hdr->e_shoff, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + symtabindex = -1; + symstrindex = -1; + for (i = 0; i < hdr->e_shnum; i++) { + if (shdr[i].sh_type == SHT_SYMTAB) { + symtabindex = i; + symstrindex = shdr[i].sh_link; + } + } + if (symtabindex < 0 || symstrindex < 0) + goto nosyms; + + symcnt = shdr[symtabindex].sh_size; + ef->symbase = malloc(symcnt, M_LINKER, M_WAITOK); + strcnt = shdr[symstrindex].sh_size; + ef->strbase = malloc(strcnt, M_LINKER, M_WAITOK); + + if (ef->symbase == NULL || ef->strbase == NULL) { + error = ENOMEM; + goto out; + } + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->symbase, symcnt, shdr[symtabindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + error = vn_rdwr(UIO_READ, nd.ni_vp, + ef->strbase, strcnt, shdr[symstrindex].sh_offset, + UIO_SYSSPACE, IO_NODELOCKED, p->p_ucred, &resid, p); + if (error) + goto out; + + ef->ddbsymcnt = symcnt / sizeof(Elf_Sym); + ef->ddbsymtab = (const Elf_Sym *)ef->symbase; + ef->ddbstrcnt = strcnt; + ef->ddbstrtab = ef->strbase; + +nosyms: + + *result = lf; + +out: + if (error && lf) + linker_file_unload(lf); + if (shdr) + free(shdr, M_LINKER); + if (firstpage) + free(firstpage, M_LINKER); + VOP_UNLOCK(nd.ni_vp, 0, p); + vn_close(nd.ni_vp, FREAD, p->p_ucred, p); + + return error; +} + +static void +link_elf_unload_file(linker_file_t file) +{ + elf_file_t ef = file->priv; + + if (ef) { +#ifdef SPARSE_MAPPING + if (ef->object) { + vm_map_remove(kernel_map, (vm_offset_t) ef->address, + (vm_offset_t) ef->address + + (ef->object->size << PAGE_SHIFT)); + vm_object_deallocate(ef->object); + } +#else + if (ef->address) + free(ef->address, M_LINKER); +#endif + if (ef->symbase) + 
free(ef->symbase, M_LINKER); + if (ef->strbase) + free(ef->strbase, M_LINKER); + free(ef, M_LINKER); + } +} + +static void +link_elf_unload_module(linker_file_t file) +{ + elf_file_t ef = file->priv; + + if (ef) + free(ef, M_LINKER); + if (file->filename) + preload_delete_name(file->filename); +} + +static int +load_dependancies(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + linker_file_t lfdep; + char* name; + const Elf_Dyn *dp; + int error = 0; + + /* + * All files are dependant on /kernel. + */ + if (linker_kernel_file) { + linker_kernel_file->refs++; + linker_file_add_dependancy(lf, linker_kernel_file); + } + + for (dp = ef->dynamic; dp->d_tag != DT_NULL; dp++) { + if (dp->d_tag == DT_NEEDED) { + name = ef->strtab + dp->d_un.d_val; + + error = linker_load_file(name, &lfdep); + if (error) + goto out; + error = linker_file_add_dependancy(lf, lfdep); + if (error) + goto out; + } + } + +out: + return error; +} + +static const char * +symbol_name(elf_file_t ef, Elf_Word r_info) +{ + const Elf_Sym *ref; + + if (ELF_R_SYM(r_info)) { + ref = ef->symtab + ELF_R_SYM(r_info); + return ef->strtab + ref->st_name; + } else + return NULL; +} + +static int +relocate_file(linker_file_t lf) +{ + elf_file_t ef = lf->priv; + const Elf_Rel *rellim; + const Elf_Rel *rel; + const Elf_Rela *relalim; + const Elf_Rela *rela; + const char *symname; + + /* Perform relocations without addend if there are any: */ + rel = ef->rel; + if (rel) { + rellim = (const Elf_Rel *) ((caddr_t) ef->rel + ef->relsize); + while (rel < rellim) { + symname = symbol_name(ef, rel->r_info); + if (elf_reloc(lf, rel, ELF_RELOC_REL, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->rela; + if (rela) { + relalim = (const Elf_Rela *) ((caddr_t) ef->rela + ef->relasize); + while (rela < relalim) { + symname = symbol_name(ef, rela->r_info); + if (elf_reloc(lf, rela, ELF_RELOC_RELA, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + /* Perform PLT relocations without addend if there are any: */ + rel = ef->pltrel; + if (rel) { + rellim = (const Elf_Rel *) ((caddr_t) ef->pltrel + ef->pltrelsize); + while (rel < rellim) { + symname = symbol_name(ef, rel->r_info); + if (elf_reloc(lf, rel, ELF_RELOC_REL, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rel++; + } + } + + /* Perform relocations with addend if there are any: */ + rela = ef->pltrela; + if (rela) { + relalim = (const Elf_Rela *) ((caddr_t) ef->pltrela + ef->pltrelasize); + while (rela < relalim) { + symname = symbol_name(ef, rela->r_info); + if (elf_reloc(lf, rela, ELF_RELOC_RELA, symname)) { + printf("link_elf: symbol %s undefined\n", symname); + return ENOENT; + } + rela++; + } + } + + return 0; +} + +/* + * Hash function for symbol table lookup. Don't even think about changing + * this. It is specified by the System V ABI. 
+ */ +static unsigned long +elf_hash(const char *name) +{ + const unsigned char *p = (const unsigned char *) name; + unsigned long h = 0; + unsigned long g; + + while (*p != '\0') { + h = (h << 4) + *p++; + if ((g = h & 0xf0000000) != 0) + h ^= g >> 24; + h &= ~g; + } + return h; +} + +int +link_elf_lookup_symbol(linker_file_t lf, const char* name, linker_sym_t* sym) +{ + elf_file_t ef = lf->priv; + unsigned long symnum; + const Elf_Sym* symp; + const char *strp; + unsigned long hash; + int i; + + /* First, search hashed global symbols */ + hash = elf_hash(name); + symnum = ef->buckets[hash % ef->nbuckets]; + + while (symnum != STN_UNDEF) { + if (symnum >= ef->nchains) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + symp = ef->symtab + symnum; + if (symp->st_name == 0) { + printf("link_elf_lookup_symbol: corrupt symbol table\n"); + return ENOENT; + } + + strp = ef->strtab + symp->st_name; + + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + + symnum = ef->chains[symnum]; + } + + /* If we have not found it, look at the full table (if loaded) */ + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + + /* Exhaustive search */ + for (i = 0, symp = ef->ddbsymtab; i < ef->ddbsymcnt; i++, symp++) { + strp = ef->ddbstrtab + symp->st_name; + if (strcmp(name, strp) == 0) { + if (symp->st_shndx != SHN_UNDEF || + (symp->st_value != 0 && + ELF_ST_TYPE(symp->st_info) == STT_FUNC)) { + *sym = (linker_sym_t) symp; + return 0; + } else + return ENOENT; + } + } + + return ENOENT; +} + +static int +link_elf_symbol_values(linker_file_t lf, linker_sym_t sym, linker_symval_t* symval) +{ + elf_file_t ef = lf->priv; + Elf_Sym* es = (Elf_Sym*) sym; + + if (es >= ef->symtab && ((es - ef->symtab) < ef->nchains)) { + symval->name = ef->strtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + if (ef->symtab == ef->ddbsymtab) + return ENOENT; + if (es >= ef->ddbsymtab && ((es - ef->ddbsymtab) < ef->ddbsymcnt)) { + symval->name = ef->ddbstrtab + es->st_name; + symval->value = (caddr_t) ef->address + es->st_value; + symval->size = es->st_size; + return 0; + } + return ENOENT; +} + +static int +link_elf_search_symbol(linker_file_t lf, caddr_t value, + linker_sym_t* sym, long* diffp) +{ + elf_file_t ef = lf->priv; + u_long off = (u_long) value; + u_long diff = off; + const Elf_Sym* es; + const Elf_Sym* best = 0; + int i; + + for (i = 0, es = ef->ddbsymtab; i < ef->ddbsymcnt; i++, es++) { + if (es->st_name == 0) + continue; + if (off >= es->st_value) { + if (off - es->st_value < diff) { + diff = off - es->st_value; + best = es; + if (diff == 0) + break; + } else if (off - es->st_value == diff) { + best = es; + } + } + } + if (best == 0) + *diffp = off; + else + *diffp = diff; + *sym = (linker_sym_t) best; + + return 0; +} diff --git a/sys/kern/makedevops.pl b/sys/kern/makedevops.pl new file mode 100644 index 0000000..24e0b14 --- /dev/null +++ b/sys/kern/makedevops.pl @@ -0,0 +1,394 @@ +#!/usr/bin/perl +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. 
Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# From @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 +# From @(#)makedevops.sh 1.1 1998/06/14 13:53:12 dfr Exp $ +# From @(#)makedevops.sh ?.? 1998/10/05 + +# +# Script to produce device front-end sugar. +# + +$debug = 0; +$cfile = 0; # by default do not produce any file type +$hfile = 0; + +$keepcurrentdir = 1; + +$line_width = 80; + +use File::Basename; + +# Process the command line +# +while ( $arg = shift @ARGV ) { + if ( $arg eq '-c' ) { + warn "Producing .c output files" + if $debug; + $cfile = 1; + } elsif ( $arg eq '-h' ) { + warn "Producing .h output files" + if $debug; + $hfile = 1; + } elsif ( $arg eq '-ch' || $arg eq '-hc' ) { + warn "Producing .c and .h output files" + if $debug; + $cfile = 1; + $hfile = 1; + } elsif ( $arg eq '-d' ) { + $debug = 1; + } elsif ( $arg eq '-p' ) { + warn "Will produce files in original not in current directory" + if $debug; + $keepcurrentdir = 0; + } elsif ( $arg eq '-l' ) { + if ( $line_width = shift @ARGV and $line_width > 0 ) { + warn "Line width set to $line_width" + if $debug; + } else { + die "Please specify a valid line width after -l"; + } + } elsif ( $arg =~ m/\.m$/ ) { + warn "Filename: $arg" + if $debug; + push @filenames, $arg; + } else { + warn "$arg ignored" + if $debug; + } +} + + +# Validate the command line parameters +# +die "usage: $0 [-d] [-p] [-c|-h] srcfile +where -c produce only .c files + -h produce only .h files + -p use the path component in the source file for destination dir + -l set line width for output files [80] + -d switch on debugging +" + unless ($cfile or $hfile) + and $#filenames != -1; + +# FIXME should be able to do this more easily +# +$tmpdir = $ENV{'TMPDIR'}; # environment variables +$tmpdir = $ENV{'TMP'} + if !$tmpdir; +$tmpdir = $ENV{'TEMP'} + if !$tmpdir; +$tmpdir = '/tmp' # look for a physical directory + if !$tmpdir and -d '/tmp'; +$tmpdir = '/usr/tmp' + if !$tmpdir and -d '/usr/tmp'; +$tmpdir = '/var/tmp' + if !$tmpdir and -d '/var/tmp'; +$tmpdir = '.' 
# give up and use current dir + if !$tmpdir; + +foreach $src ( @filenames ) { + # Names of the created files + $ctmpname = "$tmpdir/ctmp.$$"; + $htmpname = "$tmpdir/htmp.$$"; + + ($name, $path, $suffix) = &fileparse($src, '.m'); + $path = '.' + if $keepcurrentdir; + $cfilename="$path/$name.c"; + $hfilename="$path/$name.h"; + + warn "Processing from $src to $cfile / $hfile via $ctmp / $htmp" + if $debug; + + die "Could not open $src, $!" + if !open SRC, "$src"; + die "Could not open $ctmpname, $!" + if $cfile and !open CFILE, ">$ctmpname"; + die "Could not open $htmpname, $!" + if $hfile and !open HFILE, ">$htmpname"; + + if ( $cfile ) { + # Produce the header of the C file + # + print CFILE "/*\n"; + print CFILE " * This file is produced automatically.\n"; + print CFILE " * Do not modify anything in here by hand.\n"; + print CFILE " *\n"; + print CFILE " * Created from\n"; + print CFILE " * $src\n"; + print CFILE " * with\n"; + print CFILE " * $0\n"; + print CFILE " */\n"; + print CFILE "\n"; + print CFILE "#include <sys/param.h>\n"; + print CFILE "#include <sys/queue.h>\n"; + print CFILE "#include <sys/bus_private.h>\n"; + } + + if ( $hfile ) { + # Produce the header of the H file + # + print HFILE "/*\n"; + print HFILE " * This file is produced automatically.\n"; + print HFILE " * Do not modify anything in here by hand.\n"; + print HFILE " *\n"; + print HFILE " * Created from\n"; + print HFILE " * $src\n"; + print HFILE " * with\n"; + print HFILE " * $0\n"; + print HFILE " */\n"; + print HFILE "\n"; + } + + %methods = (); # clear list of methods + $lineno = 0; + $error = 0; # to signal clean up and gerror setting + + LINE: while ( $line = <SRC> ) { + $lineno++; + + # take special notice of include directives. + # + if ( $line =~ m/^#\s*include\s+(["<])([^">]+)([">]).*/i ) { + warn "Included file: $1$2" . ($1 eq '<'? '>':'"') + if $debug; + print CFILE "#include $1$2" . ($1 eq '<'? '>':'"') . "\n" + if $cfile; + } + + $line =~ s/#.*//; # remove comments + $line =~ s/^\s+//; # remove leading ... + $line =~ s/\s+$//; # remove trailing whitespace + + if ( $line =~ m/^$/ ) { # skip empty lines + # nop + + } elsif ( $line =~ m/^INTERFACE\s*([^\s;]*)(\s*;?)/i ) { + $intname = $1; + $semicolon = $2; + unless ( $intname =~ m/^[a-z_][a-z0-9_]*$/ ) { + warn $line + if $debug; + warn "$src:$lineno: Invalid interface name '$intname', use [a-z_][a-z0-9_]*"; + $error = 1; + last LINE; + } + + warn "$src:$lineno: semicolon missing at end of line, no problem" + if $semicolon !~ s/;$//; + + warn "Interface $intname" + if $debug; + + print HFILE '#ifndef _'.$intname."_if_h_\n" + if $hfile; + print HFILE '#define _'.$intname."_if_h_\n\n" + if $hfile; + print CFILE '#include "'.$intname.'_if.h"'."\n\n" + if $cfile; + + } elsif ( $line =~ m/^METHOD/i ) { + # Get the return type function name and delete that from + # the line. What is left is the possibly first function argument + # if it is on the same line. + # + # FIXME For compatibilities sake METHOD and METHODE is accepted. 
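To make the output templates further down easier to follow, here is roughly what the script aims to emit for a hypothetical interface foo containing METHOD int bar { device_t dev; int unit; }; (the names are invented for illustration; real input comes from the .m interface description files):

/* foo_if.h */
extern struct device_op_desc foo_bar_desc;
typedef int foo_bar_t(device_t dev, int unit);
foo_bar_t FOO_BAR;

/* foo_if.c */
#include "foo_if.h"

struct device_op_desc foo_bar_desc = {
	0, "foo_bar"
};

int FOO_BAR(device_t dev, int unit)
{
	foo_bar_t *m = (foo_bar_t *) DEVOPMETH(dev, foo_bar);
	return m(dev, unit);
}

The generated dispatcher simply looks the method up in the device's operation table via DEVOPMETH() and calls it with the original arguments.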
+ # + if ( !$intname ) { + warn "$src:$lineno: No interface name defined"; + $error = 1; + last LINE; + } + $line =~ s/^METHODE?\s+([^{]+?)\s*{\s*//i; + @ret = split m/\s+/, $1; + $name = pop @ret; # last element is name of method + $ret = join(" ", @ret); # return type + + warn "Method: name=$name return type=$ret" + if $debug; + + if ( !$name or !$ret ) { + warn $line + if $debug; + warn "$src:$lineno: Invalid method specification"; + $error = 1; + last LINE; + } + + unless ( $name =~ m/^[a-z_][a-z_0-9]*$/ ) { + warn $line + if $debug; + warn "$src:$lineno: Invalid method name '$name', use [a-z_][a-z0-9_]*"; + $error = 1; + last LINE; + } + + if ( defined($methods{$name}) ) { + warn "$src:$lineno: Duplicate method name"; + $error = 1; + last LINE; + } + + $methods{$name} = 'VIS'; + + while ( $line !~ m/}/ and $line .= <SRC> ) { } + + if ( $line !~ s/};?(.*)// ) { # remove first '}' and trailing garbage + # The '}' was not there (the rest is optional), so complain + warn "$src:$lineno: Premature end of file"; + $error = 1; + last LINE; + } + warn "$src:$lineno: Ignored '$1'" # warn about garbage at end of line + if $debug and $1; + + # Create a list of variables without the types prepended + # + $line =~ s/^\s+//; # remove leading ... + $line =~ s/\s+$//; # ... and trailing whitespace + $line =~ s/\s+/ /; # remove double spaces + + @arguments = split m/\s*;\s*/, $line; + @varnames = (); # list of varnames + foreach $argument (@arguments) { + next # skip argument if argument is empty + if !$argument; + + @ar = split m/[*\s]+/, $argument; + if ( $#ar == 0 ) { # only 1 word in argument? + warn "$src:$lineno: no type for '$argument'"; + $error = 1; + last LINE; + } + + push @varnames, $ar[-1]; # last element is name of variable + }; + + warn 'Arguments: ' . join(', ', @arguments) . "\n" + . 'Varnames: ' . join(', ', @varnames) + if $debug; + + $mname = $intname.'_'.$name; # method name + $umname = uc($mname); # uppercase method name + + $arguments = join(", ", @arguments); + $varnames = join(", ", @varnames); + + if ( $hfile ) { + # the method description + print HFILE "extern struct device_op_desc $mname\_desc;\n"; + # the method typedef + print HFILE &format_line("typedef $ret $mname\_t($arguments);", + $line_width, ', ', + ',',' ' x length("typedef $ret $mname\_t(")) + . "\n"; + # the method declaration + print HFILE "$mname\_t $umname;\n\n"; + } + + if ( $cfile ) { + # Print out the method desc + print CFILE "struct device_op_desc $mname\_desc = {\n"; + print CFILE "\t0, \"$mname\"\n"; + print CFILE "};\n\n"; + + # Print out the method itself + if ( 0 ) { # haven't chosen the format yet + print CFILE "$ret $umname($varnames)\n"; + print CFILE "\t".join(";\n\t", @arguments).";\n"; + } else { + print CFILE &format_line("$ret $umname($arguments)", + $line_width, ', ', + ',', ' ' x length("$ret $umname(")) . "\n"; + } + print CFILE "{\n"; + print CFILE &format_line("\t$mname\_t *m = ($mname\_t *) DEVOPMETH(dev, $mname);", + $line_width-8, ' = ', ' =', "\t\t") + . "\n"; + print CFILE "\t".($ret eq 'void'? '':'return ') . 
"m($varnames);\n"; + print CFILE "}\n\n"; + } + } else { + warn $line + if $debug; + warn "$src:$lineno: Invalid line encountered"; + $error = 1; + last LINE; + } + } # end LINE + + # print the final '#endif' in the header file + # + print HFILE "#endif /* _".$intname."_if_h_ */\n" + if $hfile; + + close SRC; + close CFILE + if $cfile; + close HFILE + if $hfile; + + if ( !$error ) { + if ( $cfile ) { + ($rc = system("mv $ctmpname $cfilename")) + and warn "mv $ctmpname $cfilename failed, $rc"; + } + + if ( $hfile ) { + ($rc = system("mv $htmpname $hfilename")) + and warn "mv $htmpname $hfilename failed, $rc"; + } + } else { + warn 'File' . ($hfile and $cfile? 's':'') . ' skipped'; + ($rc = system("rm -f $htmpname $ctmpname")) + and warn "rm -f $htmpname $ctmpname failed, $rc"; + $gerror = 1; + } +} + +exit $gerror; + + +sub format_line { + my ($line, $maxlength, $break, $new_end, $new_start) = @_; + my $rline = ""; + + while ( length($line) > $maxlength + and ($i = rindex $line, $break, $maxlength-length($new_end)) != -1 ) { + $rline .= substr($line, 0, $i) . $new_end . "\n"; + $line = $new_start . substr($line, $i+length($break)); + } + + return $rline . $line; +} diff --git a/sys/kern/makedevops.sh b/sys/kern/makedevops.sh new file mode 100644 index 0000000..a5e9ebd --- /dev/null +++ b/sys/kern/makedevops.sh @@ -0,0 +1,232 @@ +#!/bin/sh - +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# From @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 +# $Id: makedevops.sh,v 1.1 1998/06/14 13:53:12 dfr Exp $ +# + +# Script to produce device front-end sugar. +# +# usage: makedevops.sh srcfile +# +# These awk scripts are not particularly well written, specifically they +# don't use arrays well and figure out the same information repeatedly. 
+# Please rewrite them if you actually understand how to use awk. Note, +# they use nawk extensions and gawk's toupper. + +if [ $# -ne 2 ] ; then + echo 'usage: makedevops.sh [-c|-h] srcfile' + exit 1 +fi + +makec=0 +makeh=0 + +if [ "$1" = "-c" ]; then + makec=1 +fi + +if [ "$1" = "-h" ]; then + makeh=1 +fi + +# Name of the source file. +SRC=$2 + +# Names of the created files. +CTMP=ctmp$$ +HTMP=htmp$$ + +CFILE=`basename $SRC .m`.c +HFILE=`basename $SRC .m`.h + +# Awk program (must support nawk extensions and gawk's "toupper") +# Use "awk" at Berkeley, "gawk" elsewhere. +AWK=awk + +# Awk script to take file.do and turn it into file.h and file.c +$AWK " + BEGIN { + src = \"$SRC\"; + header = \"$HTMP\"; + cfile = \"$CTMP\"; + hfile = \"$HFILE\"; + "' + + printf("/*\n") > header; + printf(" * This file is produced automatically.\n") > header; + printf(" * Do not modify anything in here by hand.\n") > header; + printf(" *\n") > header; + printf(" * Created from %s with makedevops.sh\n", src) > header; + printf(" */\n\n") > header; + + printf("/*\n") > cfile; + printf(" * This file is produced automatically.\n") > cfile; + printf(" * Do not modify anything in here by hand.\n") > cfile; + printf(" *\n") > cfile; + printf(" * Created from %s with makedevops.sh\n", src) > cfile; + printf(" */\n\n") > cfile; + printf("#include <sys/param.h>\n") > cfile; + printf("#include <sys/queue.h>\n") > cfile; + printf("#include <sys/bus_private.h>\n") > cfile; + + methodcount = 0 + } + NF == 0 { + next; + } + /^#include/ { + print $0 > cfile; + } + /^#/ { + next; + } + /^INTERFACE/ { + intname = $2; + printf("#ifndef _%s_if_h_\n", intname) > header; + printf("#define _%s_if_h_\n\n", intname) > header; + printf("#include \"%s\"\n\n", hfile) > cfile; + } + /^METHOD/ { + # Get the function name and return type. + ret = ""; + sep = ""; + for (i = 2; i < NF - 1; i++) { + ret = sep $i; + sep = " "; + } + name = $i; + + # Get the function arguments. 
+ for (c1 = 0;; ++c1) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + a[c1] = $0; + } + + methods[methodcount++] = name; + + mname = intname "_" name; + umname = toupper(mname); + + # Print out the method declaration + printf("extern struct device_op_desc %s_desc;\n", mname) > header; + printf("%s %s(", ret, umname) > header; + sep = ", "; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = " );\n"; + c3 = split(a[c2], t); + for (c4 = 0; c4 < c3; ++c4) + printf("%s ", t[c4]) > header; + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep) > header; + } + + # Print the method desc + printf("struct device_op_desc %s_desc = {\n", mname) > cfile; + printf("\t0,\n") > cfile; + printf("\t\"%s\"\n", mname) > cfile; + printf("};\n\n") > cfile; + + # Print out the method typedef + printf("typedef %s %s_t(\n", ret, mname) > cfile; + sep = ",\n"; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ");\n"; + c3 = split(a[c2], t); + printf("\t") > cfile; + for (c4 = 0; c4 < c3; ++c4) + printf("%s ", t[c4]) > cfile; + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep) > cfile; + } + + # Print out the method itself + printf("%s %s(\n", ret, umname) > cfile; + sep = ",\n"; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ")\n"; + c3 = split(a[c2], t); + printf("\t") > cfile; + for (c4 = 0; c4 < c3; ++c4) + printf("%s ", t[c4]) > cfile; + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep) > cfile; + } + printf("{\n") > cfile; + printf("\t%s_t *m = (%s_t *) DEVOPMETH(dev, %s);\n", + mname, mname, mname) > cfile; + if (ret != "void") + printf("\treturn m(") > cfile; + else + printf("\tm(") > cfile; + sep = ", "; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ");\n"; + c3 = split(a[c2], t); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s", substr(t[c3], beg, end - beg), sep) > cfile; + } + printf("}\n\n") > cfile; + } + END { + printf("\n#endif /* _%s_if_h_ */\n", intname) > header; + }' < $SRC + +if [ $makec = 1 ]; then + mv $CTMP $CFILE +else + rm $CTMP +fi + +if [ $makeh = 1 ]; then + mv $HTMP $HFILE +else + rm $HTMP +fi diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh new file mode 100644 index 0000000..0cbd247 --- /dev/null +++ b/sys/kern/makesyscalls.sh @@ -0,0 +1,394 @@ +#! /bin/sh - +# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93 +# $Id: makesyscalls.sh,v 1.34 1998/06/09 03:32:05 bde Exp $ + +set -e + +# name of compat option: +compat=COMPAT_43 + +# output files: +sysnames="syscalls.c" +sysproto="../sys/sysproto.h" +sysproto_h=_SYS_SYSPROTO_H_ +syshdr="../sys/syscall.h" +sysmk="../sys/syscall.mk" +syssw="init_sysent.c" +syshide="../sys/syscall-hide.h" +syscallprefix="SYS_" +switchname="sysent" +namesname="syscallnames" + +# tmp files: +sysdcl="sysent.dcl.$$" +syscompat="sysent.compat.$$" +syscompatdcl="sysent.compatdcl.$$" +sysent="sysent.switch.$$" +sysinc="sysinc.switch.$$" +sysarg="sysarg.switch.$$" + +trap "rm $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg" 0 + +touch $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg + +case $# in + 0) echo "Usage: $0 input-file <config-file>" 1>&2 + exit 1 + ;; +esac + +if [ -n "$2" -a -f "$2" ]; then + . 
$2 +fi + +sed -e ' +s/\$//g +:join + /\\$/{a\ + + N + s/\\\n// + b join + } +2,${ + /^#/!s/\([{}()*,]\)/ \1 /g +} +' < $1 | awk " + BEGIN { + sysdcl = \"$sysdcl\" + sysproto = \"$sysproto\" + sysproto_h = \"$sysproto_h\" + syscompat = \"$syscompat\" + syscompatdcl = \"$syscompatdcl\" + sysent = \"$sysent\" + syssw = \"$syssw\" + sysinc = \"$sysinc\" + sysarg = \"$sysarg\" + sysnames = \"$sysnames\" + syshdr = \"$syshdr\" + sysmk = \"$sysmk\" + compat = \"$compat\" + syshide = \"$syshide\" + syscallprefix = \"$syscallprefix\" + switchname = \"$switchname\" + namesname = \"$namesname\" + infile = \"$1\" + "' + + printf "/*\n * System call switch table.\n *\n" > syssw + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syssw + + printf "/*\n * System call prototypes.\n *\n" > sysarg + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarg + + printf "\n#ifdef %s\n\n", compat > syscompat + + printf "/*\n * System call names.\n *\n" > sysnames + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames + + printf "/*\n * System call numbers.\n *\n" > syshdr + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr + printf "# FreeBSD system call names.\n" > sysmk + printf "# DO NOT EDIT-- this file is automatically generated.\n" > sysmk + printf "/*\n * System call hiders.\n *\n" > syshide + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshide + } + NR == 1 { + gsub("[$]Id: ", "", $0) + gsub(" [$]", "", $0) + + printf " * created from%s\n */\n\n", $0 > syssw + + printf "\n/* The casts are bogus but will do for now. */\n" > sysent + printf "struct sysent %s[] = {\n",switchname > sysent + + printf " * created from%s\n */\n\n", $0 > sysarg + printf "#ifndef %s\n", sysproto_h > sysarg + printf "#define\t%s\n\n", sysproto_h > sysarg + printf "#include <sys/signal.h>\n\n" > sysarg + printf "struct proc;\n\n" > sysarg + printf "#define\tPAD_(t)\t(sizeof(register_t) <= sizeof(t) ? 
\\\n" > sysarg + printf "\t\t0 : sizeof(register_t) - sizeof(t))\n\n" > sysarg + + printf " * created from%s\n */\n\n", $0 > sysnames + printf "char *%s[] = {\n", namesname > sysnames + + printf " * created from%s\n */\n\n", $0 > syshdr + + printf "# created from%s\nMIASM = ", $0 > sysmk + + printf " * created from%s\n */\n\n", $0 > syshide + next + } + NF == 0 || $1 ~ /^;/ { + next + } + $1 ~ /^#[ ]*include/ { + print > sysinc + next + } + $1 ~ /^#[ ]*if/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + print > syshide + savesyscall = syscall + next + } + $1 ~ /^#[ ]*else/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + print > syshide + syscall = savesyscall + next + } + $1 ~ /^#/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + print > syshide + next + } + syscall != $1 { + printf "%s: line %d: syscall number out of sync at %d\n", + infile, NR, syscall + printf "line is:\n" + print + exit 1 + } + function parserr(was, wanted) { + printf "%s: line %d: unexpected %s (expected %s)\n", + infile, NR, was, wanted + exit 1 + } + function parseline() { + f=4 # toss number and type + argc= 0; + bigargc = 0; + if ($NF != "}") { + funcalias=$(NF-2) + argalias=$(NF-1) + rettype=$NF + end=NF-3 + } else { + funcalias="" + argalias="" + rettype="int" + end=NF + } + if ($2 == "NODEF") { + funcname=$4 + return + } + if ($f != "{") + parserr($f, "{") + f++ + if ($end != "}") + parserr($end, "}") + end-- + if ($end != ";") + parserr($end, ";") + end-- + if ($end != ")") + parserr($end, ")") + end-- + + f++ #function return type + + funcname=$f + if (funcalias == "") + funcalias = funcname + if (argalias == "") { + argalias = funcname "_args" + if ($2 == "COMPAT") + argalias = "o" argalias + } + f++ + + if ($f != "(") + parserr($f, ")") + f++ + + if (f == end) { + if ($f != "void") + parserr($f, "argument definition") + return + } + + while (f <= end) { + argc++ + argtype[argc]="" + oldf="" + while (f < end && $(f+1) != ",") { + if (argtype[argc] != "" && oldf != "*") + argtype[argc] = argtype[argc]" "; + argtype[argc] = argtype[argc]$f; + oldf = $f; + f++ + } + if (argtype[argc] == "") + parserr($f, "argument definition") + if (argtype[argc] == "off_t") + bigargc++ + argname[argc]=$f; + f += 2; # skip name, and any comma + } + } + { comment = $4 + if (NF < 7) + for (i = 5; i <= NF; i++) + comment = comment " " $i + } + $2 == "STD" || $2 == "NODEF" || $2 == "NOARGS" || $2 == "NOPROTO" \ + || $2 == "NOIMPL" { + parseline() + if ((!nosys || funcname != "nosys") && \ + (funcname != "lkmnosys")) { + if (argc != 0 && $2 != "NOARGS" && $2 != "NOPROTO") { + printf("struct\t%s {\n", argalias) > sysarg + for (i = 1; i <= argc; i++) + printf("\t%s\t%s;\tchar %s_[PAD_(%s)];\n", + argtype[i], argname[i], + argname[i], argtype[i]) > sysarg + printf("};\n") > sysarg + } + else if($2 != "NOARGS" && $2 != "NOPROTO") + printf("struct\t%s {\n\tregister_t dummy;\n};\n", + argalias) > sysarg + } + if ($2 != "NOPROTO" && (!nosys || funcname != "nosys") && \ + (!lkmnosys || funcname != "lkmnosys")) { + printf("%s\t%s __P((struct proc *, struct %s *))", + rettype, funcname, argalias) > sysdcl + if (funcname == "exit") + printf(" __dead2") > sysdcl + printf(";\n") > sysdcl + } + if (funcname == "nosys") + nosys = 1 + if (funcname == "lkmnosys") + lkmnosys = 1 + if ($2 != "NOIMPL") { + printf("\t{ %d, (sy_call_t *)%s },\t\t", + argc+bigargc, funcname) > sysent + if(length(funcname) < 11) + 
printf("\t") > sysent + printf("/* %d = %s */\n", syscall, funcalias) > sysent + } else { + printf("\t{ %d, (sy_call_t *)%s },\t\t", + argc+bigargc, "nosys") > sysent + if(length("nosys") < 11) + printf("\t") > sysent + printf("/* %d = %s */\n", syscall, funcalias) > sysent + } + printf("\t\"%s\",\t\t\t/* %d = %s */\n", + funcalias, syscall, funcalias) > sysnames + if ($2 != "NODEF") { + printf("#define\t%s%s\t%d\n", syscallprefix, + funcalias, syscall) > syshdr + printf(" \\\n\t%s.o", funcalias) > sysmk + } + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, funcname) > syshide + syscall++ + next + } + $2 == "COMPAT" || $2 == "CPT_NOA" { + ncompat++ + parseline() + if (argc != 0 && $2 != "CPT_NOA") { + printf("struct\t%s {\n", argalias) > syscompat + for (i = 1; i <= argc; i++) + printf("\t%s\t%s;\tchar %s_[PAD_(%s)];\n", + argtype[i], argname[i], + argname[i], argtype[i]) > syscompat + printf("};\n") > syscompat + } + else if($2 != "CPT_NOA") + printf("struct\t%s {\n\tregister_t dummy;\n};\n", + argalias) > sysarg + printf("%s\to%s __P((struct proc *, struct %s *));\n", + rettype, funcname, argalias) > syscompatdcl + printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n", + argc+bigargc, funcname, syscall, funcalias) > sysent + printf("\t\"old.%s\",\t\t/* %d = old %s */\n", + funcalias, syscall, funcalias) > sysnames + printf("\t\t\t\t/* %d is old %s */\n", + syscall, funcalias) > syshdr + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, funcname) > syshide + syscall++ + next + } + $2 == "LIBCOMPAT" { + ncompat++ + parseline() + printf("%s\to%s();\n", rettype, funcname) > syscompatdcl + printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n", + argc+bigargc, funcname, syscall, funcalias) > sysent + printf("\t\"old.%s\",\t\t/* %d = old %s */\n", + funcalias, syscall, funcalias) > sysnames + printf("#define\t%s%s\t%d\t/* compatibility; still used by libc */\n", + syscallprefix, funcalias, syscall) > syshdr + printf(" \\\n\t%s.o", funcalias) > sysmk + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, funcname) > syshide + syscall++ + next + } + $2 == "OBSOL" { + printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = obsolete %s */\n", + syscall, comment) > sysent + printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n", + $4, syscall, comment) > sysnames + printf("\t\t\t\t/* %d is obsolete %s */\n", + syscall, comment) > syshdr + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, $4) > syshide + syscall++ + next + } + $2 == "UNIMPL" { + printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = %s */\n", + syscall, comment) > sysent + printf("\t\"#%d\",\t\t\t/* %d = %s */\n", + syscall, syscall, comment) > sysnames + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, $4) > syshide + syscall++ + next + } + { + printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2 + exit 1 + } + END { + if (ncompat != 0) { + printf "#include \"opt_compat.h\"\n\n" > syssw + printf "\n#ifdef %s\n", compat > sysinc + printf "#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)\n" > sysinc + printf "#else\n" > sysinc + printf "#define compat(n, name) 0, (sy_call_t *)nosys\n" > sysinc + printf "#endif\n" > sysinc + } + + printf("\n#endif /* %s */\n\n", compat) > syscompatdcl + printf("#undef PAD_\n") > syscompatdcl + printf("\n#endif /* !%s */\n", sysproto_h) > syscompatdcl + + printf("\n") > sysmk + printf("};\n") > sysent + printf("};\n") > sysnames + printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \ + > syshdr + } ' + +cat $sysinc $sysent >> $syssw +cat $sysarg $sysdcl $syscompat $syscompatdcl > $sysproto 
diff --git a/sys/kern/md5c.c b/sys/kern/md5c.c new file mode 100644 index 0000000..d6175ee --- /dev/null +++ b/sys/kern/md5c.c @@ -0,0 +1,342 @@ +/* + * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm + * + * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All + * rights reserved. + * + * License to copy and use this software is granted provided that it + * is identified as the "RSA Data Security, Inc. MD5 Message-Digest + * Algorithm" in all material mentioning or referencing this software + * or this function. + * + * License is also granted to make and use derivative works provided + * that such works are identified as "derived from the RSA Data + * Security, Inc. MD5 Message-Digest Algorithm" in all material + * mentioning or referencing the derived work. + * + * RSA Data Security, Inc. makes no representations concerning either + * the merchantability of this software or the suitability of this + * software for any particular purpose. It is provided "as is" + * without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this + * documentation and/or software. + * + * $Id: md5c.c,v 1.14 1998/05/01 16:40:19 bde Exp $ + * + * This code is the same as the code published by RSA Inc. It has been + * edited for clarity and style only. + */ + +#include <sys/types.h> + +#ifdef KERNEL +#include <sys/systm.h> +#else +#include <string.h> +#endif + +#include <sys/md5.h> + + +#ifdef KERNEL +#define memset(x,y,z) bzero(x,z); +#define memcpy(x,y,z) bcopy(y, x, z) +#endif + +#if defined(__i386__) || defined(__alpha__) +#define Encode memcpy +#define Decode memcpy +#else /* __i386__ */ + +/* + * Encodes input (u_int32_t) into output (unsigned char). Assumes len is + * a multiple of 4. + */ + +/* XXX not prototyped, and not compatible with memcpy(). */ +static void +Encode (output, input, len) + unsigned char *output; + u_int32_t *input; + unsigned int len; +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (unsigned char)(input[i] & 0xff); + output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); + output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); + output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); + } +} + +/* + * Decodes input (unsigned char) into output (u_int32_t). Assumes len is + * a multiple of 4. + */ + +static void +Decode (output, input, len) + u_int32_t *output; + const unsigned char *input; + unsigned int len; +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((u_int32_t)input[j]) | (((u_int32_t)input[j+1]) << 8) | + (((u_int32_t)input[j+2]) << 16) | (((u_int32_t)input[j+3]) << 24); +} +#endif /* i386 */ + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* F, G, H and I are basic MD5 functions. */ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits. */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* + * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. + * Rotation is separate from addition to prevent recomputation. 
+ */ +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +/* MD5 initialization. Begins an MD5 operation, writing a new context. */ + +void +MD5Init (context) + MD5_CTX *context; +{ + + context->count[0] = context->count[1] = 0; + + /* Load magic initialization constants. */ + context->state[0] = 0x67452301; + context->state[1] = 0xefcdab89; + context->state[2] = 0x98badcfe; + context->state[3] = 0x10325476; +} + +/* + * MD5 block update operation. Continues an MD5 message-digest + * operation, processing another message block, and updating the + * context. + */ + +void +MD5Update (context, input, inputLen) + MD5_CTX *context; + const unsigned char *input; + unsigned int inputLen; +{ + unsigned int i, index, partLen; + + /* Compute number of bytes mod 64 */ + index = (unsigned int)((context->count[0] >> 3) & 0x3F); + + /* Update number of bits */ + if ((context->count[0] += ((u_int32_t)inputLen << 3)) + < ((u_int32_t)inputLen << 3)) + context->count[1]++; + context->count[1] += ((u_int32_t)inputLen >> 29); + + partLen = 64 - index; + + /* Transform as many times as possible. */ + if (inputLen >= partLen) { + memcpy((void *)&context->buffer[index], (const void *)input, + partLen); + MD5Transform (context->state, context->buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD5Transform (context->state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + memcpy ((void *)&context->buffer[index], (const void *)&input[i], + inputLen-i); +} + +/* + * MD5 padding. Adds padding followed by original length. + */ + +void +MD5Pad (context) + MD5_CTX *context; +{ + unsigned char bits[8]; + unsigned int index, padLen; + + /* Save number of bits */ + Encode (bits, context->count, 8); + + /* Pad out to 56 mod 64. */ + index = (unsigned int)((context->count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + MD5Update (context, PADDING, padLen); + + /* Append length (before padding) */ + MD5Update (context, bits, 8); +} + +/* + * MD5 finalization. Ends an MD5 message-digest operation, writing the + * the message digest and zeroizing the context. + */ + +void +MD5Final (digest, context) + unsigned char digest[16]; + MD5_CTX *context; +{ + /* Do padding. */ + MD5Pad (context); + + /* Store state in digest */ + Encode (digest, context->state, 16); + + /* Zeroize sensitive information. */ + memset ((void *)context, 0, sizeof (*context)); +} + +/* MD5 basic transformation. Transforms state based on block. 
*/ + +void +MD5Transform (state, block) + u_int32_t state[4]; + const unsigned char block[64]; +{ + u_int32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */ + + /* Round 3 */ +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, x[ 6], 
S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + /* Zeroize sensitive information. */ + memset ((void *)x, 0, sizeof (x)); +} diff --git a/sys/kern/p1003_1b.c b/sys/kern/p1003_1b.c new file mode 100644 index 0000000..9a70d5c --- /dev/null +++ b/sys/kern/p1003_1b.c @@ -0,0 +1,260 @@ +/* + * Copyright (c) 1996, 1997, 1998 + * HD Associates, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by HD Associates, Inc + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +/* p1003_1b: Real Time common code. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysent.h> +#include <sys/proc.h> +#include <sys/syslog.h> +#include <sys/module.h> +#include <sys/sysproto.h> +#include <sys/sysctl.h> + +#include <posix4/posix4.h> + +MALLOC_DEFINE(M_P31B, "p1003.1b", "Posix 1003.1B"); + +/* p31b_proc: Return a proc struct corresponding to a pid to operate on. + * + * Enforce permission policy. + * + * The policy is the same as for sending signals except there + * is no notion of process groups. + * + * pid == 0 means my process. + * + * This is disabled until I've got a permission gate in again: + * only root can do this. + */ + +#if 0 +/* + * This is stolen from CANSIGNAL in kern_sig: + * + * Can process p, with pcred pc, do "write flavor" operations to process q? 
+ */ +#define CAN_AFFECT(p, pc, q) \ + ((pc)->pc_ucred->cr_uid == 0 || \ + (pc)->p_ruid == (q)->p_cred->p_ruid || \ + (pc)->pc_ucred->cr_uid == (q)->p_cred->p_ruid || \ + (pc)->p_ruid == (q)->p_ucred->cr_uid || \ + (pc)->pc_ucred->cr_uid == (q)->p_ucred->cr_uid) +#else +#define CAN_AFFECT(p, pc, q) ((pc)->pc_ucred->cr_uid == 0) +#endif + +/* + * p31b_proc: Look up a proc from a PID. If proc is 0 it is + * my own proc. + */ +int p31b_proc(struct proc *p, pid_t pid, struct proc **pp) +{ + int ret = 0; + struct proc *other_proc = 0; + + if (pid == 0) + other_proc = p; + else + other_proc = pfind(pid); + + if (other_proc) + { + /* Enforce permission policy. + */ + if (CAN_AFFECT(p, p->p_cred, other_proc)) + *pp = other_proc; + else + ret = EPERM; + } + else + ret = ESRCH; + + return ret; +} + +/* The system calls return ENOSYS if an entry is called that is + * not run-time supported. I am also logging since some programs + * start to use this when they shouldn't. That will be removed if annoying. + */ +int +syscall_not_present(struct proc *p, const char *s, struct nosys_args *uap) +{ + log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n", + p->p_comm, p->p_pid, s); + + /* a " return nosys(p, uap); " here causes a core dump. + */ + + return ENOSYS; +} + +#if !defined(_KPOSIX_PRIORITY_SCHEDULING) + +/* Not configured but loadable via an LKM: + */ + +static int sched_attach(void) +{ + return 0; +} + +SYSCALL_NOT_PRESENT_GEN(sched_setparam) +SYSCALL_NOT_PRESENT_GEN(sched_getparam) +SYSCALL_NOT_PRESENT_GEN(sched_setscheduler) +SYSCALL_NOT_PRESENT_GEN(sched_getscheduler) +SYSCALL_NOT_PRESENT_GEN(sched_yield) +SYSCALL_NOT_PRESENT_GEN(sched_get_priority_max) +SYSCALL_NOT_PRESENT_GEN(sched_get_priority_min) +SYSCALL_NOT_PRESENT_GEN(sched_rr_get_interval) + +#else + +/* Configured in kernel version: + */ +static struct ksched *ksched; + +static int sched_attach(void) +{ + int ret = ksched_attach(&ksched); + + if (ret == 0) + p31b_setcfg(CTL_P1003_1B_PRIORITY_SCHEDULING, 1); + + return ret; +} + +int sched_setparam(struct proc *p, + struct sched_setparam_args *uap) +{ + int e; + + struct sched_param sched_param; + copyin(uap->param, &sched_param, sizeof(sched_param)); + + (void) (0 + || (e = p31b_proc(p, uap->pid, &p)) + || (e = ksched_setparam(&p->p_retval[0], ksched, p, + (const struct sched_param *)&sched_param)) + ); + + return e; +} + +int sched_getparam(struct proc *p, + struct sched_getparam_args *uap) +{ + int e; + struct sched_param sched_param; + + (void) (0 + || (e = p31b_proc(p, uap->pid, &p)) + || (e = ksched_getparam(&p->p_retval[0], ksched, p, &sched_param)) + ); + + if (!e) + copyout(&sched_param, uap->param, sizeof(sched_param)); + + return e; +} +int sched_setscheduler(struct proc *p, + struct sched_setscheduler_args *uap) +{ + int e; + + struct sched_param sched_param; + copyin(uap->param, &sched_param, sizeof(sched_param)); + + (void) (0 + || (e = p31b_proc(p, uap->pid, &p)) + || (e = ksched_setscheduler(&p->p_retval[0], + ksched, p, uap->policy, + (const struct sched_param *)&sched_param)) + ); + + return e; +} +int sched_getscheduler(struct proc *p, + struct sched_getscheduler_args *uap) +{ + int e; + (void) (0 + || (e = p31b_proc(p, uap->pid, &p)) + || (e = ksched_getscheduler(&p->p_retval[0], ksched, p)) + ); + + return e; +} +int sched_yield(struct proc *p, + struct sched_yield_args *uap) +{ + return ksched_yield(&p->p_retval[0], ksched); +} +int sched_get_priority_max(struct proc *p, + struct sched_get_priority_max_args *uap) +{ + return 
ksched_get_priority_max(&p->p_retval[0], + ksched, uap->policy); +} +int sched_get_priority_min(struct proc *p, + struct sched_get_priority_min_args *uap) +{ + return ksched_get_priority_min(&p->p_retval[0], + ksched, uap->policy); +} +int sched_rr_get_interval(struct proc *p, + struct sched_rr_get_interval_args *uap) +{ + int e; + + (void) (0 + || (e = p31b_proc(p, uap->pid, &p)) + || (e = ksched_rr_get_interval(&p->p_retval[0], ksched, + p, uap->interval)) + ); + + return e; +} + +#endif + +static void p31binit(void *notused) +{ + (void) sched_attach(); + p31b_setcfg(CTL_P1003_1B_PAGESIZE, PAGE_SIZE); +} + +SYSINIT(p31b, SI_SUB_P1003_1B, SI_ORDER_FIRST, p31binit, NULL); diff --git a/sys/kern/posix4_mib.c b/sys/kern/posix4_mib.c new file mode 100644 index 0000000..523f76b --- /dev/null +++ b/sys/kern/posix4_mib.c @@ -0,0 +1,94 @@ +/*- + * Copyright (c) 1998 + * HD Associates, Inc. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by HD Associates, Inc + * 4. Neither the name of the author nor the names of any co-contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY HD ASSOCIATES AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL HD ASSOCIATES OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <posix4/posix4.h> + +static int facility[CTL_P1003_1B_MAXID - 1]; + +/* OID_AUTO isn't working with sysconf(3). I guess I'd have to + * modify it to do a lookup by name from the index. + * For now I've left it a top-level sysctl. 
+ */ + +#if 1 + +#define P1B_SYSCTL(num, name) \ +SYSCTL_INT(_p1003_1b, num, \ + name, CTLFLAG_RD, facility + num - 1, 0, ""); + +#else + +#define P1B_SYSCTL(num, name) \ +SYSCTL_INT(_kern_p1003_1b, OID_AUTO, \ + name, CTLFLAG_RD, facility + num - 1, 0, ""); +SYSCTL_NODE(_kern, OID_AUTO, p1003_1b, CTLFLAG_RW, 0, "P1003.1B"); + +#endif + + +P1B_SYSCTL(CTL_P1003_1B_ASYNCHRONOUS_IO, asynchronous_io); +P1B_SYSCTL(CTL_P1003_1B_MAPPED_FILES, mapped_files); +P1B_SYSCTL(CTL_P1003_1B_MEMLOCK, memlock); +P1B_SYSCTL(CTL_P1003_1B_MEMLOCK_RANGE, memlock_range); +P1B_SYSCTL(CTL_P1003_1B_MEMORY_PROTECTION, memory_protection); +P1B_SYSCTL(CTL_P1003_1B_MESSAGE_PASSING, message_passing); +P1B_SYSCTL(CTL_P1003_1B_PRIORITIZED_IO, prioritized_io); +P1B_SYSCTL(CTL_P1003_1B_PRIORITY_SCHEDULING, priority_scheduling); +P1B_SYSCTL(CTL_P1003_1B_REALTIME_SIGNALS, realtime_signals); +P1B_SYSCTL(CTL_P1003_1B_SEMAPHORES, semaphores); +P1B_SYSCTL(CTL_P1003_1B_FSYNC, fsync); +P1B_SYSCTL(CTL_P1003_1B_SHARED_MEMORY_OBJECTS, shared_memory_objects); +P1B_SYSCTL(CTL_P1003_1B_SYNCHRONIZED_IO, synchronized_io); +P1B_SYSCTL(CTL_P1003_1B_TIMERS, timers); +P1B_SYSCTL(CTL_P1003_1B_AIO_LISTIO_MAX, aio_listio_max); +P1B_SYSCTL(CTL_P1003_1B_AIO_MAX, aio_max); +P1B_SYSCTL(CTL_P1003_1B_AIO_PRIO_DELTA_MAX, aio_prio_delta_max); +P1B_SYSCTL(CTL_P1003_1B_DELAYTIMER_MAX, delaytimer_max); +P1B_SYSCTL(CTL_P1003_1B_MQ_OPEN_MAX, mq_open_max); +P1B_SYSCTL(CTL_P1003_1B_PAGESIZE, pagesize); +P1B_SYSCTL(CTL_P1003_1B_RTSIG_MAX, rtsig_max); +P1B_SYSCTL(CTL_P1003_1B_SEM_NSEMS_MAX, sem_nsems_max); +P1B_SYSCTL(CTL_P1003_1B_SEM_VALUE_MAX, sem_value_max); +P1B_SYSCTL(CTL_P1003_1B_SIGQUEUE_MAX, sigqueue_max); +P1B_SYSCTL(CTL_P1003_1B_TIMER_MAX, timer_max); + +/* p31b_setcfg: Set the configuration + */ +void p31b_setcfg(int num, int value) +{ + if (num >= 1 && num < CTL_P1003_1B_MAXID) + facility[num - 1] = value; +} diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c new file mode 100644 index 0000000..9234732 --- /dev/null +++ b/sys/kern/subr_autoconf.c @@ -0,0 +1,420 @@ +/* + * Copyright (c) 1992, 1993 + * The Regents of the University of California. All rights reserved. + * + * This software was developed by the Computer Systems Engineering group + * at Lawrence Berkeley Laboratory under DARPA contract BG 91-66 and + * contributed to Berkeley. + * + * All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Lawrence Berkeley Laboratories. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93 + * + * $Id: subr_autoconf.c,v 1.7 1998/12/04 22:54:51 archie Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/device.h> +#ifdef UNUSED +#include <sys/malloc.h> +#endif + +/* + * Autoconfiguration subroutines. + */ + +#ifdef UNUSED +/* + * ioconf.c exports exactly two names: cfdata and cfroots. All system + * devices and drivers are found via these tables. + */ +extern struct cfdata cfdata[]; +extern short cfroots[]; + +#define ROOT ((struct device *)NULL) + +struct matchinfo { + cfmatch_t fn; + struct device *parent; + void *aux; + struct cfdata *match; + int pri; +}; + +/* + * Apply the matching function and choose the best. This is used + * a few times and we want to keep the code small. + */ +static void +mapply(m, cf) + register struct matchinfo *m; + register struct cfdata *cf; +{ + register int pri; + + if (m->fn != NULL) + pri = (*m->fn)(m->parent, cf, m->aux); + else + pri = (*cf->cf_driver->cd_match)(m->parent, cf, m->aux); + if (pri > m->pri) { + m->match = cf; + m->pri = pri; + } +} + +/* + * Iterate over all potential children of some device, calling the given + * function (default being the child's match function) for each one. + * Nonzero returns are matches; the highest value returned is considered + * the best match. Return the `found child' if we got a match, or NULL + * otherwise. The `aux' pointer is simply passed on through. + * + * Note that this function is designed so that it can be used to apply + * an arbitrary function to all potential children (its return value + * can be ignored). + */ +struct cfdata * +config_search(fn, parent, aux) + cfmatch_t fn; + register struct device *parent; + void *aux; +{ + register struct cfdata *cf; + register short *p; + struct matchinfo m; + + m.fn = fn; + m.parent = parent; + m.aux = aux; + m.match = NULL; + m.pri = 0; + for (cf = cfdata; cf->cf_driver; cf++) { + /* + * Skip cf if no longer eligible, otherwise scan through + * parents for one matching `parent', and try match function. + */ + if (cf->cf_fstate == FSTATE_FOUND) + continue; + for (p = cf->cf_parents; *p >= 0; p++) + if (parent->dv_cfdata == &cfdata[*p]) + mapply(&m, cf); + } + return (m.match); +} + +/* + * Find the given root device. + * This is much like config_search, but there is no parent. + */ +struct cfdata * +config_rootsearch(fn, rootname, aux) + register cfmatch_t fn; + register char *rootname; + register void *aux; +{ + register struct cfdata *cf; + register short *p; + struct matchinfo m; + + m.fn = fn; + m.parent = ROOT; + m.aux = aux; + m.match = NULL; + m.pri = 0; + /* + * Look at root entries for matching name. 
We do not bother + * with found-state here since only one root should ever be + * searched (and it must be done first). + */ + for (p = cfroots; *p >= 0; p++) { + cf = &cfdata[*p]; + if (strcmp(cf->cf_driver->cd_name, rootname) == 0) + mapply(&m, cf); + } + return (m.match); +} + +static char *msgs[3] = { "", " not configured\n", " unsupported\n" }; + +/* + * The given `aux' argument describes a device that has been found + * on the given parent, but not necessarily configured. Locate the + * configuration data for that device (using the cd_match configuration + * driver function) and attach it, and return true. If the device was + * not configured, call the given `print' function and return 0. + */ +int +config_found(parent, aux, print) + struct device *parent; + void *aux; + cfprint_t print; +{ + struct cfdata *cf; + + if ((cf = config_search((cfmatch_t)NULL, parent, aux)) != NULL) { + config_attach(parent, cf, aux, print); + return (1); + } + printf(msgs[(*print)(aux, parent->dv_xname)]); + return (0); +} + +/* + * As above, but for root devices. + */ +int +config_rootfound(rootname, aux) + char *rootname; + void *aux; +{ + struct cfdata *cf; + + if ((cf = config_rootsearch((cfmatch_t)NULL, rootname, aux)) != NULL) { + config_attach(ROOT, cf, aux, (cfprint_t)NULL); + return (1); + } + printf("root device %s not configured\n", rootname); + return (0); +} + +/* just like sprintf(buf, "%d") except that it works from the end */ +static char * +number(ep, n) + register char *ep; + register int n; +{ + + *--ep = 0; + while (n >= 10) { + *--ep = (n % 10) + '0'; + n /= 10; + } + *--ep = n + '0'; + return (ep); +} + +/* + * Attach a found device. Allocates memory for device variables. + */ +void +config_attach(parent, cf, aux, print) + register struct device *parent; + register struct cfdata *cf; + register void *aux; + cfprint_t print; +{ + register struct device *dev; + register struct cfdriver *cd; + register size_t lname, lunit; + register char *xunit; + int myunit; + char num[10]; + static struct device **nextp = &alldevs; + + cd = cf->cf_driver; + if (cd->cd_devsize < sizeof(struct device)) + panic("config_attach"); + myunit = cf->cf_unit; + if (cf->cf_fstate == FSTATE_NOTFOUND) + cf->cf_fstate = FSTATE_FOUND; + else + cf->cf_unit++; + + /* compute length of name and decimal expansion of unit number */ + lname = strlen(cd->cd_name); + xunit = number(&num[sizeof num], myunit); + lunit = &num[sizeof num] - xunit; + if (lname + lunit >= sizeof(dev->dv_xname)) + panic("config_attach: device name too long"); + + /* get memory for all device vars */ + dev = (struct device *)malloc(cd->cd_devsize, M_DEVBUF, M_WAITOK); + /* XXX cannot wait! */ + bzero(dev, cd->cd_devsize); + *nextp = dev; /* link up */ + nextp = &dev->dv_next; + dev->dv_class = cd->cd_class; + dev->dv_cfdata = cf; + dev->dv_unit = myunit; + bcopy(cd->cd_name, dev->dv_xname, lname); + bcopy(xunit, dev->dv_xname + lname, lunit); + dev->dv_parent = parent; + if (parent == ROOT) + printf("%s (root)", dev->dv_xname); + else { + printf("%s at %s", dev->dv_xname, parent->dv_xname); + (void) (*print)(aux, (char *)0); + } + + /* put this device in the devices array */ + if (dev->dv_unit >= cd->cd_ndevs) { + /* + * Need to expand the array. 
+ */ + int old = cd->cd_ndevs, oldbytes, new, newbytes; + void **nsp; + + if (old == 0) { + nsp = malloc(MINALLOCSIZE, M_DEVBUF, M_WAITOK); /*XXX*/ + bzero(nsp, MINALLOCSIZE); + cd->cd_ndevs = MINALLOCSIZE / sizeof(void *); + } else { + new = cd->cd_ndevs; + do { + new *= 2; + } while (new <= dev->dv_unit); + cd->cd_ndevs = new; + oldbytes = old * sizeof(void *); + newbytes = new * sizeof(void *); + nsp = malloc(newbytes, M_DEVBUF, M_WAITOK); /*XXX*/ + bcopy(cd->cd_devs, nsp, oldbytes); + bzero(&nsp[old], newbytes - oldbytes); + free(cd->cd_devs, M_DEVBUF); + } + cd->cd_devs = nsp; + } + if (cd->cd_devs[dev->dv_unit]) + panic("config_attach: duplicate %s", dev->dv_xname); + cd->cd_devs[dev->dv_unit] = dev; + + /* + * Before attaching, clobber any unfound devices that are + * otherwise identical. + */ + for (cf = cfdata; cf->cf_driver; cf++) + if (cf->cf_driver == cd && cf->cf_unit == dev->dv_unit && + cf->cf_fstate == FSTATE_NOTFOUND) + cf->cf_fstate = FSTATE_FOUND; + (*cd->cd_attach)(parent, dev, aux); +} + +/* + * Attach an event. These must come from initially-zero space (see + * commented-out assignments below), but that occurs naturally for + * device instance variables. + */ +void +evcnt_attach(dev, name, ev) + struct device *dev; + const char *name; + struct evcnt *ev; +{ + static struct evcnt **nextp = &allevents; + + KASSERT(strlen(name) < sizeof(ev->ev_name), ("evcnt_attach")); + + /* ev->ev_next = NULL; */ + ev->ev_dev = dev; + /* ev->ev_count = 0; */ + snprintf(ev->ev_name, sizeof(ev->ev_name), "%s", name); + *nextp = ev; + nextp = &ev->ev_next; +} + +#endif + +/* + * "Interrupt driven config" functions. + */ +static TAILQ_HEAD(, intr_config_hook) intr_config_hook_list = + TAILQ_HEAD_INITIALIZER(intr_config_hook_list); + + +/* ARGSUSED */ +static void run_interrupt_driven_config_hooks __P((void *dummy)); +static void +run_interrupt_driven_config_hooks(dummy) + void *dummy; +{ + struct intr_config_hook *hook; + + for (hook = intr_config_hook_list.tqh_first; hook != NULL; + hook = hook->ich_links.tqe_next) { + (*hook->ich_func)(hook->ich_arg); + } + + while (intr_config_hook_list.tqh_first != NULL) { + tsleep(&intr_config_hook_list, PCONFIG, "conifhk", 0); + } +} +SYSINIT(intr_config_hooks, SI_SUB_INT_CONFIG_HOOKS, SI_ORDER_FIRST, + run_interrupt_driven_config_hooks, NULL) + +/* + * Register a hook that will be called after "cold" + * autoconfiguration is complete and interrupts can + * be used to complete initialization. + */ +int +config_intrhook_establish(hook) + struct intr_config_hook *hook; +{ + struct intr_config_hook *hook_entry; + + for (hook_entry = intr_config_hook_list.tqh_first; hook_entry != NULL; + hook_entry = hook_entry->ich_links.tqe_next) + if (hook_entry == hook) + break; + if (hook_entry != NULL) { + printf("config_intrhook_establish: establishing an " + "already established hook.\n"); + return (1); + } + TAILQ_INSERT_TAIL(&intr_config_hook_list, hook, ich_links); + if (cold == 0) + /* XXX Sufficient for LKMs loaded after initial config??? 
*/ + run_interrupt_driven_config_hooks(NULL); + return (0); +} + +void +config_intrhook_disestablish(hook) + struct intr_config_hook *hook; +{ + struct intr_config_hook *hook_entry; + + for (hook_entry = intr_config_hook_list.tqh_first; hook_entry != NULL; + hook_entry = hook_entry->ich_links.tqe_next) + if (hook_entry == hook) + break; + if (hook_entry == NULL) + panic("config_intrhook_disestablish: disestablishing an " + "unestablished hook"); + + TAILQ_REMOVE(&intr_config_hook_list, hook, ich_links); + /* Wakeup anyone watching the list */ + wakeup(&intr_config_hook_list); +} diff --git a/sys/kern/subr_blist.c b/sys/kern/subr_blist.c new file mode 100644 index 0000000..10af2ea --- /dev/null +++ b/sys/kern/subr_blist.c @@ -0,0 +1,928 @@ + +/* + * BLIST.C - Bitmap allocator/deallocator, using a radix tree with hinting + * + * (c)Copyright 1998, Matthew Dillon. Terms for use and redistribution + * are covered by the BSD Copyright as found in /usr/src/COPYRIGHT. + * + * This module implements a general bitmap allocator/deallocator. The + * allocator eats around 2 bits per 'block'. The module does not + * try to interpret the meaning of a 'block' other then to return + * SWAPBLK_NONE on an allocation failure. + * + * A radix tree is used to maintain the bitmap. Two radix constants are + * involved: One for the bitmaps contained in the leaf nodes (typically + * 32), and one for the meta nodes (typically 16). Both meta and leaf + * nodes have a hint field. This field gives us a hint as to the largest + * free contiguous range of blocks under the node. It may contain a + * value that is too high, but will never contain a value that is too + * low. When the radix tree is searched, allocation failures in subtrees + * update the hint. + * + * The radix tree also implements two collapsed states for meta nodes: + * the ALL-ALLOCATED state and the ALL-FREE state. If a meta node is + * in either of these two states, all information contained underneath + * the node is considered stale. These states are used to optimize + * allocation and freeing operations. + * + * The hinting greatly increases code efficiency for allocations while + * the general radix structure optimizes both allocations and frees. The + * radix tree should be able to operate well no matter how much + * fragmentation there is and no matter how large a bitmap is used. + * + * Unlike the rlist code, the blist code wires all necessary memory at + * creation time. Neither allocations nor frees require interaction with + * the memory subsystem. In contrast, the rlist code may allocate memory + * on an rlist_free() call. The non-blocking features of the blist code + * are used to great advantage in the swap code (vm/nswap_pager.c). The + * rlist code uses a little less overall memory then the blist code (but + * due to swap interleaving not all that much less), but the blist code + * scales much, much better. + * + * LAYOUT: The radix tree is layed out recursively using a + * linear array. Each meta node is immediately followed (layed out + * sequentially in memory) by BLIST_META_RADIX lower level nodes. This + * is a recursive structure but one that can be easily scanned through + * a very simple 'skip' calculation. In order to support large radixes, + * portions of the tree may reside outside our memory allocation. We + * handle this with an early-termination optimization (when bighint is + * set to -1) on the scan. 
The memory allocation is only large enough + * to cover the number of blocks requested at creation time even if it + * must be encompassed in larger root-node radix. + * + * NOTE: the allocator cannot currently allocate more then + * BLIST_BMAP_RADIX blocks per call. It will panic with 'allocation too + * large' if you try. This is an area that could use improvement. The + * radix is large enough that this restriction does not effect the swap + * system, though. Currently only the allocation code is effected by + * this algorithmic unfeature. The freeing code can handle arbitrary + * ranges. + * + * This code can be compiled stand-alone for debugging. + */ + +#ifdef KERNEL + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/lock.h> +#include <sys/kernel.h> +#include <sys/blist.h> +#include <sys/malloc.h> +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/vm_page.h> + +#else + +#ifndef BLIST_NO_DEBUG +#define BLIST_DEBUG +#endif + +#define SWAPBLK_NONE ((daddr_t)-1) + +#include <sys/types.h> +#include <stdio.h> +#include <string.h> +#include <stdlib.h> +#include <stdarg.h> + +#define malloc(a,b,c) malloc(a) +#define free(a,b) free(a) + +typedef unsigned int u_daddr_t; + +#include <sys/blist.h> + +void panic(const char *ctl, ...); + +#endif + +/* + * static support functions + */ + +static daddr_t blst_leaf_alloc(blmeta_t *scan, daddr_t blk, int count); +static daddr_t blst_meta_alloc(blmeta_t *scan, daddr_t blk, + daddr_t count, daddr_t radix, int skip); +static void blst_leaf_free(blmeta_t *scan, daddr_t relblk, int count); +static void blst_meta_free(blmeta_t *scan, daddr_t freeBlk, daddr_t count, + daddr_t radix, int skip, daddr_t blk); +static void blst_copy(blmeta_t *scan, daddr_t blk, daddr_t radix, + daddr_t skip, blist_t dest, daddr_t count); +static daddr_t blst_radix_init(blmeta_t *scan, daddr_t radix, + int skip, daddr_t count); +#ifndef KERNEL +static void blst_radix_print(blmeta_t *scan, daddr_t blk, + daddr_t radix, int skip, int tab); +#endif + +#ifdef KERNEL +static MALLOC_DEFINE(M_SWAP, "SWAP", "Swap space"); +#endif + +/* + * blist_create() - create a blist capable of handling up to the specified + * number of blocks + * + * blocks must be greater then 0 + * + * The smallest blist consists of a single leaf node capable of + * managing BLIST_BMAP_RADIX blocks. + */ + +blist_t +blist_create(daddr_t blocks) +{ + blist_t bl; + int radix; + int skip = 0; + + /* + * Calculate radix and skip field used for scanning. 
+ */ + radix = BLIST_BMAP_RADIX; + + while (radix < blocks) { + radix <<= BLIST_META_RADIX_SHIFT; + skip = (skip + 1) << BLIST_META_RADIX_SHIFT; + } + + bl = malloc(sizeof(struct blist), M_SWAP, M_WAITOK); + + bzero(bl, sizeof(*bl)); + + bl->bl_blocks = blocks; + bl->bl_radix = radix; + bl->bl_skip = skip; + bl->bl_rootblks = 1 + + blst_radix_init(NULL, bl->bl_radix, bl->bl_skip, blocks); + bl->bl_root = malloc(sizeof(blmeta_t) * bl->bl_rootblks, M_SWAP, M_WAITOK); + +#if defined(BLIST_DEBUG) + printf( + "BLIST representing %d blocks (%d MB of swap)" + ", requiring %dK of ram\n", + bl->bl_blocks, + bl->bl_blocks * 4 / 1024, + (bl->bl_rootblks * sizeof(blmeta_t) + 1023) / 1024 + ); + printf("BLIST raw radix tree contains %d records\n", bl->bl_rootblks); +#endif + blst_radix_init(bl->bl_root, bl->bl_radix, bl->bl_skip, blocks); + + return(bl); +} + +void +blist_destroy(blist_t bl) +{ + free(bl->bl_root, M_SWAP); + free(bl, M_SWAP); +} + +/* + * blist_alloc() - reserve space in the block bitmap. Return the base + * of a contiguous region or SWAPBLK_NONE if space could + * not be allocated. + */ + +daddr_t +blist_alloc(blist_t bl, daddr_t count) +{ + daddr_t blk = SWAPBLK_NONE; + + if (bl) { + if (bl->bl_radix == BLIST_BMAP_RADIX) + blk = blst_leaf_alloc(bl->bl_root, 0, count); + else + blk = blst_meta_alloc(bl->bl_root, 0, count, bl->bl_radix, bl->bl_skip); + if (blk != SWAPBLK_NONE) + bl->bl_free -= count; + } + return(blk); +} + +/* + * blist_free() - free up space in the block bitmap. Return the base + * of a contiguous region. Panic if an inconsistancy is + * found. + */ + +void +blist_free(blist_t bl, daddr_t blkno, daddr_t count) +{ + if (bl) { + if (bl->bl_radix == BLIST_BMAP_RADIX) + blst_leaf_free(bl->bl_root, blkno, count); + else + blst_meta_free(bl->bl_root, blkno, count, bl->bl_radix, bl->bl_skip, 0); + bl->bl_free += count; + } +} + +/* + * blist_resize() - resize an existing radix tree to handle the + * specified number of blocks. This will reallocate + * the tree and transfer the previous bitmap to the new + * one. When extending the tree you can specify whether + * the new blocks are to left allocated or freed. + */ + +void +blist_resize(blist_t *pbl, daddr_t count, int freenew) +{ + blist_t newbl = blist_create(count); + blist_t save = *pbl; + + *pbl = newbl; + if (count > save->bl_blocks) + count = save->bl_blocks; + blst_copy(save->bl_root, 0, save->bl_radix, save->bl_skip, newbl, count); + + /* + * If resizing upwards, should we free the new space or not? + */ + if (freenew && count < newbl->bl_blocks) { + blist_free(newbl, count, newbl->bl_blocks - count); + } + blist_destroy(save); +} + +#ifdef BLIST_DEBUG + +/* + * blist_print() - dump radix tree + */ + +void +blist_print(blist_t bl) +{ + printf("BLIST {\n"); + blst_radix_print(bl->bl_root, 0, bl->bl_radix, bl->bl_skip, 4); + printf("}\n"); +} + +#endif + +/************************************************************************ + * ALLOCATION SUPPORT FUNCTIONS * + ************************************************************************ + * + * These support functions do all the actual work. They may seem + * rather longish, but that's because I've commented them up. The + * actual code is straight forward. + * + */ + +/* + * blist_leaf_alloc() - allocate at a leaf in the radix tree (a bitmap). + * + * This is the core of the allocator and is optimized for the 1 block + * and the BLIST_BMAP_RADIX block allocation cases. Other cases are + * somewhat slower. The 1 block allocation case is log2 and extremely + * quick. 
+ */ + +static daddr_t +blst_leaf_alloc( + blmeta_t *scan, + daddr_t blk, + int count +) { + u_daddr_t orig = scan->u.bmu_bitmap; + + if (orig == 0) { + /* + * Optimize bitmap all-allocated case. Also, count = 1 + * case assumes at least 1 bit is free in the bitmap, so + * we have to take care of this case here. + */ + scan->bm_bighint = 0; + return(SWAPBLK_NONE); + } + if (count == 1) { + /* + * Optimized code to allocate one bit out of the bitmap + */ + u_daddr_t mask; + int j = BLIST_BMAP_RADIX/2; + int r = 0; + + mask = (u_daddr_t)-1 >> (BLIST_BMAP_RADIX/2); + + while (j) { + if ((orig & mask) == 0) { + r += j; + orig >>= j; + } + j >>= 1; + mask >>= j; + } + scan->u.bmu_bitmap &= ~(1 << r); + return(blk + r); + } + if (count <= BLIST_BMAP_RADIX) { + /* + * non-optimized code to allocate N bits out of the bitmap. + * The more bits, the faster the code runs. It will run + * the slowest allocating 2 bits, but since there aren't any + * memory ops in the core loop (or shouldn't be, anyway), + * you probably won't notice the difference. + */ + int j; + int n = BLIST_BMAP_RADIX - count; + u_daddr_t mask; + + mask = (u_daddr_t)-1 >> n; + + for (j = 0; j <= n; ++j) { + if ((orig & mask) == mask) { + scan->u.bmu_bitmap &= ~mask; + return(blk + j); + } + mask = (mask << 1); + } + } + /* + * We couldn't allocate count in this subtree, update bighint. + */ + scan->bm_bighint = count - 1; + return(SWAPBLK_NONE); +} + +/* + * blist_meta_alloc() - allocate at a meta in the radix tree. + * + * Attempt to allocate at a meta node. If we can't, we update + * bighint and return a failure. Updating bighint optimize future + * calls that hit this node. We have to check for our collapse cases + * and we have a few optimizations strewn in as well. + */ + +static daddr_t +blst_meta_alloc( + blmeta_t *scan, + daddr_t blk, + daddr_t count, + daddr_t radix, + int skip +) { + int i; + int next_skip = (skip >> BLIST_META_RADIX_SHIFT); + + if (scan->u.bmu_avail == 0) { + /* + * ALL-ALLOCATED special case + */ + scan->bm_bighint = count; + return(SWAPBLK_NONE); + } + + if (scan->u.bmu_avail == radix) { + radix >>= BLIST_META_RADIX_SHIFT; + + /* + * ALL-FREE special case, initialize uninitialize + * sublevel. + */ + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) + break; + if (next_skip == 1) { + scan[i].u.bmu_bitmap = (u_daddr_t)-1; + scan[i].bm_bighint = BLIST_BMAP_RADIX; + } else { + scan[i].bm_bighint = radix; + scan[i].u.bmu_avail = radix; + } + } + } else { + radix >>= BLIST_META_RADIX_SHIFT; + } + + for (i = 1; i <= skip; i += next_skip) { + if (count <= scan[i].bm_bighint) { + /* + * count fits in object + */ + daddr_t r; + if (next_skip == 1) { + r = blst_leaf_alloc(&scan[i], blk, count); + } else { + r = blst_meta_alloc(&scan[i], blk, count, radix, next_skip - 1); + } + if (r != SWAPBLK_NONE) { + scan->u.bmu_avail -= count; + if (scan->bm_bighint > scan->u.bmu_avail) + scan->bm_bighint = scan->u.bmu_avail; + return(r); + } + } else if (scan[i].bm_bighint == (daddr_t)-1) { + /* + * Terminator + */ + break; + } else if (count > radix) { + /* + * count does not fit in object even if it were + * complete free. + */ + panic("blist_meta_alloc: allocation too large"); + } + blk += radix; + } + + /* + * We couldn't allocate count in this subtree, update bighint. 
+ */ + if (scan->bm_bighint >= count) + scan->bm_bighint = count - 1; + return(SWAPBLK_NONE); +} + +/* + * BLST_LEAF_FREE() - free allocated block from leaf bitmap + * + */ + +static void +blst_leaf_free( + blmeta_t *scan, + daddr_t blk, + int count +) { + /* + * free some data in this bitmap + * + * e.g. + * 0000111111111110000 + * \_________/\__/ + * v n + */ + int n = blk & (BLIST_BMAP_RADIX - 1); + u_daddr_t mask; + + mask = ((u_daddr_t)-1 << n) & + ((u_daddr_t)-1 >> (BLIST_BMAP_RADIX - count - n)); + + if (scan->u.bmu_bitmap & mask) + panic("blst_radix_free: freeing free block"); + scan->u.bmu_bitmap |= mask; + + /* + * We could probably do a better job here. We are required to make + * bighint at least as large as the biggest contiguous block of + * data. If we just shoehorn it, a little extra overhead will + * be incured on the next allocation (but only that one typically). + */ + scan->bm_bighint = BLIST_BMAP_RADIX; +} + +/* + * BLST_META_FREE() - free allocated blocks from radix tree meta info + * + * This support routine frees a range of blocks from the bitmap. + * The range must be entirely enclosed by this radix node. If a + * meta node, we break the range down recursively to free blocks + * in subnodes (which means that this code can free an arbitrary + * range whereas the allocation code cannot allocate an arbitrary + * range). + */ + +static void +blst_meta_free( + blmeta_t *scan, + daddr_t freeBlk, + daddr_t count, + daddr_t radix, + int skip, + daddr_t blk +) { + int i; + int next_skip = (skip >> BLIST_META_RADIX_SHIFT); + +#if 0 + printf("FREE (%x,%d) FROM (%x,%d)\n", + freeBlk, count, + blk, radix + ); +#endif + + if (scan->u.bmu_avail == 0) { + /* + * ALL-ALLOCATED special case, with possible + * shortcut to ALL-FREE special case. + */ + scan->u.bmu_avail = count; + scan->bm_bighint = count; + + if (count != radix) { + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) + break; + scan[i].bm_bighint = 0; + if (next_skip == 1) { + scan[i].u.bmu_bitmap = 0; + } else { + scan[i].u.bmu_avail = 0; + } + } + /* fall through */ + } + } else { + scan->u.bmu_avail += count; + /* scan->bm_bighint = radix; */ + } + + /* + * ALL-FREE special case. + */ + + if (scan->u.bmu_avail == radix) + return; +#if !defined(MAX_PERF) + if (scan->u.bmu_avail > radix) + panic("blst_meta_free: freeing already free blocks (%d) %d/%d", count, scan->u.bmu_avail, radix); +#endif + + /* + * Break the free down into its components + */ + + radix >>= BLIST_META_RADIX_SHIFT; + + i = (freeBlk - blk) / radix; + blk += i * radix; + i = i * next_skip + 1; + + while (i <= skip && blk < freeBlk + count) { + daddr_t v; + + v = blk + radix - freeBlk; + if (v > count) + v = count; + + if (scan->bm_bighint == (daddr_t)-1) + panic("blst_meta_free: freeing unexpected range"); + + if (next_skip == 1) { + blst_leaf_free(&scan[i], freeBlk, v); + } else { + blst_meta_free(&scan[i], freeBlk, v, radix, next_skip - 1, blk); + } + if (scan->bm_bighint < scan[i].bm_bighint) + scan->bm_bighint = scan[i].bm_bighint; + count -= v; + freeBlk += v; + blk += radix; + i += next_skip; + } +} + +/* + * BLIST_RADIX_COPY() - copy one radix tree to another + * + * Locates free space in the source tree and frees it in the destination + * tree. The space may not already be free in the destination. 
+ */ + +static void blst_copy( + blmeta_t *scan, + daddr_t blk, + daddr_t radix, + daddr_t skip, + blist_t dest, + daddr_t count +) { + int next_skip; + int i; + + /* + * Leaf node + */ + + if (radix == BLIST_BMAP_RADIX) { + u_daddr_t v = scan->u.bmu_bitmap; + + if (v == (u_daddr_t)-1) { + blist_free(dest, blk, count); + } else if (v != 0) { + int i; + + for (i = 0; i < BLIST_BMAP_RADIX && i < count; ++i) { + if (v & (1 << i)) + blist_free(dest, blk + i, 1); + } + } + return; + } + + /* + * Meta node + */ + + if (scan->u.bmu_avail == 0) { + /* + * Source all allocated, leave dest allocated + */ + return; + } + if (scan->u.bmu_avail == radix) { + /* + * Source all free, free entire dest + */ + if (count < radix) + blist_free(dest, blk, count); + else + blist_free(dest, blk, radix); + return; + } + + + radix >>= BLIST_META_RADIX_SHIFT; + next_skip = (skip >> BLIST_META_RADIX_SHIFT); + + for (i = 1; count && i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) + break; + + if (count >= radix) { + blst_copy( + &scan[i], + blk, + radix, + next_skip - 1, + dest, + radix + ); + count -= radix; + } else { + if (count) { + blst_copy( + &scan[i], + blk, + radix, + next_skip - 1, + dest, + count + ); + } + count = 0; + } + blk += radix; + } +} + +/* + * BLST_RADIX_INIT() - initialize radix tree + * + * Initialize our meta structures and bitmaps and calculate the exact + * amount of space required to manage 'count' blocks - this space may + * be considerably less then the calculated radix due to the large + * RADIX values we use. + */ + +static daddr_t +blst_radix_init(blmeta_t *scan, daddr_t radix, int skip, daddr_t count) +{ + int i; + int next_skip; + daddr_t memindex = 0; + + /* + * Leaf node + */ + + if (radix == BLIST_BMAP_RADIX) { + if (scan) { + scan->bm_bighint = 0; + scan->u.bmu_bitmap = 0; + } + return(memindex); + } + + /* + * Meta node. If allocating the entire object we can special + * case it. However, we need to figure out how much memory + * is required to manage 'count' blocks, so we continue on anyway. + */ + + if (scan) { + scan->bm_bighint = 0; + scan->u.bmu_avail = 0; + } + + radix >>= BLIST_META_RADIX_SHIFT; + next_skip = (skip >> BLIST_META_RADIX_SHIFT); + + for (i = 1; i <= skip; i += next_skip) { + if (count >= radix) { + /* + * Allocate the entire object + */ + memindex = i + blst_radix_init( + ((scan) ? &scan[i] : NULL), + radix, + next_skip - 1, + radix + ); + count -= radix; + } else if (count > 0) { + /* + * Allocate a partial object + */ + memindex = i + blst_radix_init( + ((scan) ? 
&scan[i] : NULL), + radix, + next_skip - 1, + count + ); + count = 0; + } else { + /* + * Add terminator and break out + */ + if (scan) + scan[i].bm_bighint = (daddr_t)-1; + break; + } + } + if (memindex < i) + memindex = i; + return(memindex); +} + +#ifdef BLIST_DEBUG + +static void +blst_radix_print(blmeta_t *scan, daddr_t blk, daddr_t radix, int skip, int tab) +{ + int i; + int next_skip; + int lastState = 0; + + if (radix == BLIST_BMAP_RADIX) { + printf( + "%*.*s(%04x,%d): bitmap %08x big=%d\n", + tab, tab, "", + blk, radix, + scan->u.bmu_bitmap, + scan->bm_bighint + ); + return; + } + + if (scan->u.bmu_avail == 0) { + printf( + "%*.*s(%04x,%d) ALL ALLOCATED\n", + tab, tab, "", + blk, + radix + ); + return; + } + if (scan->u.bmu_avail == radix) { + printf( + "%*.*s(%04x,%d) ALL FREE\n", + tab, tab, "", + blk, + radix + ); + return; + } + + printf( + "%*.*s(%04x,%d): subtree (%d/%d) big=%d {\n", + tab, tab, "", + blk, radix, + scan->u.bmu_avail, + radix, + scan->bm_bighint + ); + + radix >>= BLIST_META_RADIX_SHIFT; + next_skip = (skip >> BLIST_META_RADIX_SHIFT); + tab += 4; + + for (i = 1; i <= skip; i += next_skip) { + if (scan[i].bm_bighint == (daddr_t)-1) { + printf( + "%*.*s(%04x,%d): Terminator\n", + tab, tab, "", + blk, radix + ); + lastState = 0; + break; + } + blst_radix_print( + &scan[i], + blk, + radix, + next_skip - 1, + tab + ); + blk += radix; + } + tab -= 4; + + printf( + "%*.*s}\n", + tab, tab, "" + ); +} + +#endif + +#ifdef BLIST_DEBUG + +int +main(int ac, char **av) +{ + int size = 1024; + int i; + blist_t bl; + + for (i = 1; i < ac; ++i) { + const char *ptr = av[i]; + if (*ptr != '-') { + size = strtol(ptr, NULL, 0); + continue; + } + ptr += 2; + fprintf(stderr, "Bad option: %s\n", ptr - 2); + exit(1); + } + bl = blist_create(size); + blist_free(bl, 0, size); + + for (;;) { + char buf[1024]; + daddr_t da = 0; + daddr_t count = 0; + + + printf("%d/%d/%d> ", bl->bl_free, size, bl->bl_radix); + fflush(stdout); + if (fgets(buf, sizeof(buf), stdin) == NULL) + break; + switch(buf[0]) { + case 'r': + if (sscanf(buf + 1, "%d", &count) == 1) { + blist_resize(&bl, count, 1); + } else { + printf("?\n"); + } + case 'p': + blist_print(bl); + break; + case 'a': + if (sscanf(buf + 1, "%d", &count) == 1) { + daddr_t blk = blist_alloc(bl, count); + printf(" R=%04x\n", blk); + } else { + printf("?\n"); + } + break; + case 'f': + if (sscanf(buf + 1, "%x %d", &da, &count) == 2) { + blist_free(bl, da, count); + } else { + printf("?\n"); + } + break; + case '?': + case 'h': + puts( + "p -print\n" + "a %d -allocate\n" + "f %x %d -free\n" + "r %d -resize\n" + "h/? -help" + ); + break; + default: + printf("?\n"); + break; + } + } + return(0); +} + +void +panic(const char *ctl, ...) +{ + va_list va; + + va_start(va, ctl); + vfprintf(stderr, ctl, va); + fprintf(stderr, "\n"); + va_end(va); + exit(1); +} + +#endif + diff --git a/sys/kern/subr_bus.c b/sys/kern/subr_bus.c new file mode 100644 index 0000000..dc4c88a --- /dev/null +++ b/sys/kern/subr_bus.c @@ -0,0 +1,1572 @@ +/*- + * Copyright (c) 1997,1998 Doug Rabson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: subr_bus.c,v 1.13 1999/01/10 22:04:05 n_hibma Exp $ + */ + +#include <sys/param.h> +#include <sys/queue.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/module.h> +#include <sys/bus_private.h> +#include <sys/systm.h> +#include <machine/stdarg.h> /* for device_printf() */ + +#include "opt_bus.h" + +#ifdef BUS_DEBUG +#define PDEBUG(a) (printf(__FUNCTION__ ":%d: ", __LINE__), printf a, printf("\n")) +#define DEVICENAME(d) ((d)? device_get_name(d): "no device") +#define DRIVERNAME(d) ((d)? d->name : "no driver") +#define DEVCLANAME(d) ((d)? d->name : "no devclass") + +/* Produce the indenting, indent*2 spaces plus a '.' ahead of that to + * prevent syslog from deleting initial spaces + */ +#define indentprintf(p) do { int iJ; printf("."); for (iJ=0; iJ<indent; iJ++) printf(" "); printf p ; } while(0) + +static void print_method_list(device_method_t *m, int indent); +static void print_device_ops(device_ops_t ops, int indent); +static void print_device_short(device_t dev, int indent); +static void print_device(device_t dev, int indent); +void print_device_tree_short(device_t dev, int indent); +void print_device_tree(device_t dev, int indent); +static void print_driver_short(driver_t *driver, int indent); +static void print_driver(driver_t *driver, int indent); +static void print_driver_list(driver_list_t drivers, int indent); +static void print_devclass_short(devclass_t dc, int indent); +static void print_devclass(devclass_t dc, int indent); +void print_devclass_list_short(void); +void print_devclass_list(void); + +#else +/* Make the compiler ignore the function calls */ +#define PDEBUG(a) /* nop */ +#define DEVICENAME(d) /* nop */ +#define DRIVERNAME(d) /* nop */ +#define DEVCLANAME(d) /* nop */ + +#define print_method_list(m,i) /* nop */ +#define print_device_ops(o,i) /* nop */ +#define print_device_short(d,i) /* nop */ +#define print_device(d,i) /* nop */ +#define print_device_tree_short(d,i) /* nop */ +#define print_device_tree(d,i) /* nop */ +#define print_driver_short(d,i) /* nop */ +#define print_driver(d,i) /* nop */ +#define print_driver_list(d,i) /* nop */ +#define print_devclass_short(d,i) /* nop */ +#define print_devclass(d,i) /* nop */ +#define print_devclass_list_short() /* nop */ +#define print_devclass_list() /* nop */ +#endif + + +/* + * Method table handling + */ +static int next_method_offset = 1; +static int methods_count = 0; +static int methods_size = 0; + +struct method { + int offset; + char* name; +}; + +static struct method *methods = 0; + +static void 
+register_method(struct device_op_desc *desc) +{ + int i; + struct method* m; + + for (i = 0; i < methods_count; i++) + if (!strcmp(methods[i].name, desc->name)) { + desc->offset = methods[i].offset; + PDEBUG(("methods[%d] has the same name, %s, with offset %d", + i, desc->name, desc->offset)); + return; + } + + if (methods_count == methods_size) { + struct method* p; + + methods_size += 10; + p = (struct method*) malloc(methods_size * sizeof(struct method), + M_DEVBUF, M_NOWAIT); + if (!p) + panic("register_method: out of memory"); + if (methods) { + bcopy(methods, p, methods_count * sizeof(struct method)); + free(methods, M_DEVBUF); + } + methods = p; + } + m = &methods[methods_count++]; + m->name = malloc(strlen(desc->name) + 1, M_DEVBUF, M_NOWAIT); + if (!m->name) + panic("register_method: out of memory"); + strcpy(m->name, desc->name); + desc->offset = m->offset = next_method_offset++; +} + +static int error_method(void) +{ + return ENXIO; +} + +static struct device_ops null_ops = { + 1, + { error_method } +}; + +static void +compile_methods(driver_t *driver) +{ + device_ops_t ops; + struct device_method *m; + int i; + + /* + * First register any methods which need it. + */ + for (i = 0, m = driver->methods; m->desc; i++, m++) + if (!m->desc->offset) + register_method(m->desc); + else + PDEBUG(("offset not equal to zero, method desc %d left as is", i)); + + /* + * Then allocate the compiled op table. + */ + ops = malloc(sizeof(struct device_ops) + (next_method_offset-1) * sizeof(devop_t), + M_DEVBUF, M_NOWAIT); + if (!ops) + panic("compile_methods: out of memory"); + + ops->maxoffset = next_method_offset; + for (i = 0; i < next_method_offset; i++) + ops->methods[i] = error_method; + for (i = 0, m = driver->methods; m->desc; i++, m++) + ops->methods[m->desc->offset] = m->func; + PDEBUG(("%s has %d method%s, wasting %d bytes", + DRIVERNAME(driver), i, (i==1?"":"s"), + (next_method_offset-i)*sizeof(devop_t))); + + driver->ops = ops; +} + +/* + * Devclass implementation + */ + +static devclass_list_t devclasses = TAILQ_HEAD_INITIALIZER(devclasses); + +static devclass_t +devclass_find_internal(const char *classname, int create) +{ + devclass_t dc; + + PDEBUG(("looking for %s", classname)); + if (!classname) + return NULL; + + for (dc = TAILQ_FIRST(&devclasses); dc; dc = TAILQ_NEXT(dc, link)) + if (!strcmp(dc->name, classname)) + return dc; + + PDEBUG(("%s not found%s", classname, (create? ", creating": ""))); + if (create) { + dc = malloc(sizeof(struct devclass) + strlen(classname) + 1, + M_DEVBUF, M_NOWAIT); + if (!dc) + return NULL; + dc->name = (char*) (dc + 1); + strcpy(dc->name, classname); + dc->devices = NULL; + dc->maxunit = 0; + dc->nextunit = 0; + TAILQ_INIT(&dc->drivers); + TAILQ_INSERT_TAIL(&devclasses, dc, link); + } + + return dc; +} + +devclass_t +devclass_find(const char *classname) +{ + return devclass_find_internal(classname, FALSE); +} + +int +devclass_add_driver(devclass_t dc, driver_t *driver) +{ + PDEBUG(("%s", DRIVERNAME(driver))); + /* + * Compile the drivers methods. + */ + compile_methods(driver); + + /* + * Make sure the devclass which the driver is implementing exists. 
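The method-table machinery above (register_method()/compile_methods()) can be easier to follow in a small, self-contained user-level model: sparse (descriptor, function) pairs are assigned global offsets and compiled into a dense table whose empty slots fall back to an error method. Everything below is an illustrative sketch written for this note, not code from the commit.

#include <stdio.h>

typedef int (*devop_t)(void);

struct methdesc  { const char *name; int offset; };	/* cf. struct device_op_desc */
struct methentry { struct methdesc *desc; devop_t fn; };	/* cf. struct device_method */

#define MAXOPS 8
static int next_offset = 1;	/* real methods are numbered from 1, as in the kernel code */

static int error_method(void) { return -1; }	/* default for unimplemented methods */
static int sample_probe(void)  { return 0; }
static int sample_attach(void) { return 0; }

/* Assign a stable offset to each method descriptor (cf. register_method()). */
static void
register_offset(struct methdesc *d)
{
	if (d->offset == 0)
		d->offset = next_offset++;
}

/* Compile a driver's sparse method list into a dense dispatch table. */
static void
compile(struct methentry *list, devop_t table[MAXOPS])
{
	int i;

	for (i = 0; i < MAXOPS; i++)
		table[i] = error_method;
	for (; list->desc != NULL; list++) {
		register_offset(list->desc);
		table[list->desc->offset] = list->fn;
	}
}

int
main(void)
{
	static struct methdesc probe_desc  = { "device_probe", 0 };
	static struct methdesc attach_desc = { "device_attach", 0 };
	struct methentry methods[] = {
		{ &probe_desc, sample_probe },
		{ &attach_desc, sample_attach },
		{ NULL, NULL }
	};
	devop_t table[MAXOPS];

	compile(methods, table);
	/* Dispatch through the table, as the DEVICE_*() macros do in the kernel. */
	printf("probe -> %d, unimplemented -> %d\n",
	    table[probe_desc.offset](), table[MAXOPS - 1]());
	return 0;
}

(The real code also deduplicates method names across drivers and grows its tables dynamically; the model keeps only the offset/dispatch idea.)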
+ */ + devclass_find_internal(driver->name, TRUE); + + TAILQ_INSERT_TAIL(&dc->drivers, driver, link); + + return 0; +} + +int +devclass_delete_driver(devclass_t busclass, driver_t *driver) +{ + devclass_t dc = devclass_find(driver->name); + device_t dev; + int i; + int error; + + PDEBUG(("%s from devclass %s", driver->name, DEVCLANAME(busclass))); + + if (!dc) + return 0; + + /* + * Disassociate from any devices. We iterate through all the + * devices in the devclass of the driver and detach any which are + * using the driver. + */ + for (i = 0; i < dc->maxunit; i++) { + if (dc->devices[i]) { + dev = dc->devices[i]; + if (dev->driver == driver) { + if (error = device_detach(dev)) + return error; + device_set_driver(dev, NULL); + } + } + } + + TAILQ_REMOVE(&busclass->drivers, driver, link); + return 0; +} + +driver_t * +devclass_find_driver(devclass_t dc, const char *classname) +{ + driver_t *driver; + + PDEBUG(("%s in devclass %s", classname, DEVCLANAME(dc))); + + for (driver = TAILQ_FIRST(&dc->drivers); driver; + driver = TAILQ_NEXT(driver, link)) { + if (!strcmp(driver->name, classname)) + return driver; + } + + PDEBUG(("not found")); + return NULL; +} + +const char * +devclass_get_name(devclass_t dc) +{ + return dc->name; +} + +device_t +devclass_get_device(devclass_t dc, int unit) +{ + if (unit < 0 || unit >= dc->maxunit) + return NULL; + return dc->devices[unit]; +} + +void * +devclass_get_softc(devclass_t dc, int unit) +{ + device_t dev; + + if (unit < 0 || unit >= dc->maxunit) + return NULL; + dev = dc->devices[unit]; + if (!dev || dev->state < DS_ATTACHED) + return NULL; + return dev->softc; +} + +int +devclass_get_devices(devclass_t dc, device_t **devlistp, int *devcountp) +{ + int i; + int count; + device_t *list; + + count = 0; + for (i = 0; i < dc->maxunit; i++) + if (dc->devices[i]) + count++; + + list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT); + if (!list) + return ENOMEM; + + count = 0; + for (i = 0; i < dc->maxunit; i++) + if (dc->devices[i]) { + list[count] = dc->devices[i]; + count++; + } + + *devlistp = list; + *devcountp = count; + + return 0; +} + +int +devclass_get_maxunit(devclass_t dc) +{ + return dc->maxunit; +} + +static int +devclass_alloc_unit(devclass_t dc, int *unitp) +{ + int unit = *unitp; + + PDEBUG(("unit %d in devclass %s", unit, DEVCLANAME(dc))); + + /* + * If we have been given a wired unit number, check for existing + * device. + */ + if (unit != -1) { + device_t dev; + dev = devclass_get_device(dc, unit); + if (dev) { + printf("devclass_alloc_unit: %s%d already exists, using next available unit number\n", dc->name, unit); + unit = -1; + } + } + + if (unit == -1) { + unit = dc->nextunit; + dc->nextunit++; + } else if (dc->nextunit <= unit) + dc->nextunit = unit + 1; + + if (unit >= dc->maxunit) { + device_t *newlist; + int newsize; + + newsize = (dc->maxunit ? 
2 * dc->maxunit + : MINALLOCSIZE / sizeof(device_t)); + newlist = malloc(sizeof(device_t) * newsize, M_DEVBUF, M_NOWAIT); + if (!newlist) + return ENOMEM; + bcopy(dc->devices, newlist, sizeof(device_t) * dc->maxunit); + bzero(newlist + dc->maxunit, + sizeof(device_t) * (newsize - dc->maxunit)); + if (dc->devices) + free(dc->devices, M_DEVBUF); + dc->devices = newlist; + dc->maxunit = newsize; + } + PDEBUG(("now: unit %d in devclass %s", unit, DEVCLANAME(dc))); + + *unitp = unit; + return 0; +} + +static int +devclass_add_device(devclass_t dc, device_t dev) +{ + int error; + + PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); + + if (error = devclass_alloc_unit(dc, &dev->unit)) + return error; + dc->devices[dev->unit] = dev; + dev->devclass = dc; + return 0; +} + +static int +devclass_delete_device(devclass_t dc, device_t dev) +{ + if (!dc || !dev) + return 0; + + PDEBUG(("%s in devclass %s", DEVICENAME(dev), DEVCLANAME(dc))); + + if (dev->devclass != dc + || dc->devices[dev->unit] != dev) + panic("devclass_delete_device: inconsistent device class"); + dc->devices[dev->unit] = NULL; + if (dev->flags & DF_WILDCARD) + dev->unit = -1; + dev->devclass = NULL; + while (dc->nextunit > 0 && dc->devices[dc->nextunit - 1] == NULL) + dc->nextunit--; + return 0; +} + +static device_t +make_device(device_t parent, const char *name, + int unit, void *ivars) +{ + device_t dev; + devclass_t dc; + int error; + + PDEBUG(("%s at %s as unit %d with%s ivars", + name, DEVICENAME(parent), unit, (ivars? "":"out"))); + + if (name) { + dc = devclass_find_internal(name, TRUE); + if (!dc) { + printf("make_device: can't find device class %s\n", name); + return NULL; + } + + if (error = devclass_alloc_unit(dc, &unit)) + return NULL; + } else + dc = NULL; + + dev = malloc(sizeof(struct device), M_DEVBUF, M_NOWAIT); + if (!dev) + return 0; + + dev->parent = parent; + TAILQ_INIT(&dev->children); + dev->ops = &null_ops; + dev->driver = NULL; + dev->devclass = dc; + dev->unit = unit; + dev->desc = NULL; + dev->busy = 0; + dev->flags = DF_ENABLED; + if (unit == -1) + dev->flags |= DF_WILDCARD; + if (name) + dev->flags |= DF_FIXEDCLASS; + dev->ivars = ivars; + dev->softc = NULL; + + if (dc) + dc->devices[unit] = dev; + + dev->state = DS_NOTPRESENT; + + return dev; +} + +static void +device_print_child(device_t dev, device_t child) +{ + printf("%s%d", device_get_name(child), device_get_unit(child)); + if (device_is_alive(child)) { + if (device_get_desc(child)) + printf(": <%s>", device_get_desc(child)); + BUS_PRINT_CHILD(dev, child); + } else + printf(" not found"); + printf("\n"); +} + +device_t +device_add_child(device_t dev, const char *name, int unit, void *ivars) +{ + device_t child; + + PDEBUG(("%s at %s as unit %d with%s ivars", + name, DEVICENAME(dev), unit, (ivars? "":"out"))); + + child = make_device(dev, name, unit, ivars); + + if (child) + TAILQ_INSERT_TAIL(&dev->children, child, link); + else + PDEBUG(("%s failed", name)); + + return child; +} + +device_t +device_add_child_after(device_t dev, device_t place, const char *name, + int unit, void *ivars) +{ + device_t child; + + PDEBUG(("%s at %s after %s as unit %d with%s ivars", + name, DEVICENAME(dev), DEVICENAME(place), unit, (ivars? 
"":"out"))); + + child = make_device(dev, name, unit, ivars); + + if (place) { + TAILQ_INSERT_AFTER(&dev->children, place, dev, link); + } else { + TAILQ_INSERT_HEAD(&dev->children, dev, link); + } + + return child; +} + +int +device_delete_child(device_t dev, device_t child) +{ + int error; + device_t grandchild; + + PDEBUG(("%s from %s", DEVICENAME(child), DEVICENAME(dev))); + + /* remove children first */ + while ( (grandchild = TAILQ_FIRST(&child->children)) ) { + error = device_delete_child(child, grandchild); + if (error) + return error; + } + + if (error = device_detach(child)) + return error; + if (child->devclass) + devclass_delete_device(child->devclass, child); + TAILQ_REMOVE(&dev->children, child, link); + free(child, M_DEVBUF); + + return 0; +} + +/* + * Find only devices attached to this bus. + */ +device_t +device_find_child(device_t dev, const char *classname, int unit) +{ + devclass_t dc; + device_t child; + + dc = devclass_find(classname); + if (!dc) + return NULL; + + child = devclass_get_device(dc, unit); + if (child && child->parent == dev) + return child; + return NULL; +} + +static driver_t * +first_matching_driver(devclass_t dc, device_t dev) +{ + if (dev->devclass) + return devclass_find_driver(dc, dev->devclass->name); + else + return TAILQ_FIRST(&dc->drivers); +} + +static driver_t * +next_matching_driver(devclass_t dc, device_t dev, driver_t *last) +{ + if (dev->devclass) { + driver_t *driver; + for (driver = TAILQ_NEXT(last, link); driver; + driver = TAILQ_NEXT(driver, link)) + if (!strcmp(dev->devclass->name, driver->name)) + return driver; + return NULL; + } else + return TAILQ_NEXT(last, link); +} + +static int +device_probe_child(device_t dev, device_t child) +{ + devclass_t dc; + driver_t *driver; + + dc = dev->devclass; + if (dc == NULL) + panic("device_probe_child: parent device has no devclass"); + + if (child->state == DS_ALIVE) + return 0; + + for (driver = first_matching_driver(dc, child); + driver; + driver = next_matching_driver(dc, child, driver)) { + PDEBUG(("Trying %s", DRIVERNAME(driver))); + device_set_driver(child, driver); + if (DEVICE_PROBE(child) == 0) { + if (!child->devclass) + device_set_devclass(child, driver->name); + child->state = DS_ALIVE; + return 0; + } + } + + return ENXIO; +} + +device_t +device_get_parent(device_t dev) +{ + return dev->parent; +} + +int +device_get_children(device_t dev, device_t **devlistp, int *devcountp) +{ + int count; + device_t child; + device_t *list; + + count = 0; + for (child = TAILQ_FIRST(&dev->children); child; + child = TAILQ_NEXT(child, link)) + count++; + + list = malloc(count * sizeof(device_t), M_TEMP, M_NOWAIT); + if (!list) + return ENOMEM; + + count = 0; + for (child = TAILQ_FIRST(&dev->children); child; + child = TAILQ_NEXT(child, link)) { + list[count] = child; + count++; + } + + *devlistp = list; + *devcountp = count; + + return 0; +} + +driver_t * +device_get_driver(device_t dev) +{ + return dev->driver; +} + +devclass_t +device_get_devclass(device_t dev) +{ + return dev->devclass; +} + +const char * +device_get_name(device_t dev) +{ + if (dev->devclass) + return devclass_get_name(dev->devclass); + return NULL; +} + +int +device_get_unit(device_t dev) +{ + return dev->unit; +} + +const char * +device_get_desc(device_t dev) +{ + return dev->desc; +} + +void +device_print_prettyname(device_t dev) +{ + const char *name = device_get_name(dev); + + if (name == 0) + name = "(no driver assigned)"; + printf("%s%d: ", name, device_get_unit(dev)); +} + +void +device_printf(device_t dev, const 
char * fmt, ...) +{ + va_list ap; + + device_print_prettyname(dev); + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); +} + +void +device_set_desc(device_t dev, const char* desc) +{ + dev->desc = desc; +} + +void * +device_get_softc(device_t dev) +{ + return dev->softc; +} + +void * +device_get_ivars(device_t dev) +{ + return dev->ivars; +} + +device_state_t +device_get_state(device_t dev) +{ + return dev->state; +} + +void +device_enable(device_t dev) +{ + dev->flags |= DF_ENABLED; +} + +void +device_disable(device_t dev) +{ + dev->flags &= ~DF_ENABLED; +} + +void +device_busy(device_t dev) +{ + if (dev->state < DS_ATTACHED) + panic("device_busy: called for unattached device"); + if (dev->busy == 0 && dev->parent) + device_busy(dev->parent); + dev->busy++; + dev->state = DS_BUSY; +} + +void +device_unbusy(device_t dev) +{ + if (dev->state != DS_BUSY) + panic("device_unbusy: called for non-busy device"); + dev->busy--; + if (dev->busy == 0) { + if (dev->parent) + device_unbusy(dev->parent); + dev->state = DS_ATTACHED; + } +} + +int +device_is_enabled(device_t dev) +{ + return (dev->flags & DF_ENABLED) != 0; +} + +int +device_is_alive(device_t dev) +{ + return dev->state >= DS_ALIVE; +} + +int +device_set_devclass(device_t dev, const char *classname) +{ + devclass_t dc; + + if (dev->devclass) { + printf("device_set_devclass: device class already set\n"); + return EINVAL; + } + + dc = devclass_find_internal(classname, TRUE); + if (!dc) + return ENOMEM; + + return devclass_add_device(dc, dev); +} + +int +device_set_driver(device_t dev, driver_t *driver) +{ + if (dev->state >= DS_ATTACHED) + return EBUSY; + + if (dev->driver == driver) + return 0; + + if (dev->softc) { + free(dev->softc, M_DEVBUF); + dev->softc = NULL; + } + dev->ops = &null_ops; + dev->driver = driver; + if (driver) { + dev->ops = driver->ops; + dev->softc = malloc(driver->softc, M_DEVBUF, M_NOWAIT); + if (!dev->softc) { + dev->ops = &null_ops; + dev->driver = NULL; + return ENOMEM; + } + bzero(dev->softc, driver->softc); + } + return 0; +} + +int +device_probe_and_attach(device_t dev) +{ + device_t bus = dev->parent; + int error = 0; + + if (dev->state >= DS_ALIVE) + return 0; + + if (dev->flags & DF_ENABLED) { + error = device_probe_child(bus, dev); + if (!error) { + device_print_child(bus, dev); + error = DEVICE_ATTACH(dev); + if (!error) + dev->state = DS_ATTACHED; + else { + printf("device_probe_and_attach: %s%d attach returned %d\n", + dev->driver->name, dev->unit, error); + device_set_driver(dev, NULL); + dev->state = DS_NOTPRESENT; + } + } + } else { + device_print_prettyname(dev); + printf("not probed (disabled)\n"); + } + + return error; +} + +int +device_detach(device_t dev) +{ + int error; + + PDEBUG(("%s", DEVICENAME(dev))); + if (dev->state == DS_BUSY) + return EBUSY; + if (dev->state != DS_ATTACHED) + return 0; + + if (error = DEVICE_DETACH(dev)) + return error; + + if (!(dev->flags & DF_FIXEDCLASS)) + devclass_delete_device(dev->devclass, dev); + + dev->state = DS_NOTPRESENT; + device_set_driver(dev, NULL); + + return 0; +} + +int +device_shutdown(device_t dev) +{ + if (dev->state < DS_ATTACHED) + return 0; + return DEVICE_SHUTDOWN(dev); +} + +/* + * Access functions for device resources. 
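For orientation, here is a hypothetical skeleton of the kind of driver that device_probe_and_attach() above ends up driving. The "foo" names, the parent bus name and the DEVMETHOD()/DRIVER_MODULE() glue assumed from <sys/bus.h> are placeholders for this sketch, not part of this commit, and it is not buildable on its own.

struct foo_softc {
	int	sc_dummy;		/* per-instance state, allocated by device_set_driver() */
};

static int
foo_probe(device_t dev)
{
	/* Claiming the device: returning 0 marks it DS_ALIVE. */
	device_set_desc(dev, "Foo widget");
	return 0;
}

static int
foo_attach(device_t dev)
{
	struct foo_softc *sc = device_get_softc(dev);

	sc->sc_dummy = 0;
	return 0;			/* non-zero would put the device back to DS_NOTPRESENT */
}

static device_method_t foo_methods[] = {
	DEVMETHOD(device_probe,		foo_probe),
	DEVMETHOD(device_attach,	foo_attach),
	{ 0, 0 }
};

static driver_t foo_driver = {
	"foo",				/* devclass name */
	foo_methods,
	DRIVER_TYPE_MISC,
	sizeof(struct foo_softc),	/* softc size */
};

static devclass_t foo_devclass;

DRIVER_MODULE(foo, isa, foo_driver, foo_devclass, 0, 0);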
+ */ +extern struct config_device devtab[]; +extern int devtab_count; + +static int +resource_match_string(int i, char *resname, char *value) +{ + int j; + struct config_resource *res; + + for (j = 0, res = devtab[i].resources; + j < devtab[i].resource_count; j++, res++) + if (!strcmp(res->name, resname) + && res->type == RES_STRING + && !strcmp(res->u.stringval, value)) + return TRUE; + return FALSE; +} + +static int +resource_find(const char *name, int unit, char *resname, + struct config_resource **result) +{ + int i, j; + struct config_resource *res; + + /* + * First check specific instances, then generic. + */ + for (i = 0; i < devtab_count; i++) { + if (devtab[i].unit < 0) + continue; + if (!strcmp(devtab[i].name, name) && devtab[i].unit == unit) { + res = devtab[i].resources; + for (j = 0; j < devtab[i].resource_count; j++, res++) + if (!strcmp(res->name, resname)) { + *result = res; + return 0; + } + } + } + for (i = 0; i < devtab_count; i++) { + if (devtab[i].unit >= 0) + continue; + if (!strcmp(devtab[i].name, name) && devtab[i].unit == unit) { + res = devtab[i].resources; + for (j = 0; j < devtab[i].resource_count; j++, res++) + if (!strcmp(res->name, resname)) { + *result = res; + return 0; + } + } + } + return ENOENT; +} + +int +resource_int_value(const char *name, int unit, char *resname, int *result) +{ + int error; + struct config_resource *res; + if ((error = resource_find(name, unit, resname, &res)) != 0) + return error; + if (res->type != RES_INT) + return EFTYPE; + *result = res->u.intval; + return 0; +} + +int +resource_long_value(const char *name, int unit, char *resname, long *result) +{ + int error; + struct config_resource *res; + if ((error = resource_find(name, unit, resname, &res)) != 0) + return error; + if (res->type != RES_LONG) + return EFTYPE; + *result = res->u.longval; + return 0; +} + +int +resource_string_value(const char *name, int unit, char *resname, char **result) +{ + int error; + struct config_resource *res; + if ((error = resource_find(name, unit, resname, &res)) != 0) + return error; + if (res->type != RES_STRING) + return EFTYPE; + *result = res->u.stringval; + return 0; +} + +int +resource_query_string(int i, char *resname, char *value) +{ + if (i < 0) + i = 0; + else + i = i + 1; + for (; i < devtab_count; i++) + if (resource_match_string(i, resname, value)) + return i; + return -1; +} + +char * +resource_query_name(int i) +{ + return devtab[i].name; +} + +int +resource_query_unit(int i) +{ + return devtab[i].unit; +} + + +/* + * Some useful method implementations to make life easier for bus drivers. 
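The resource_*_value() lookups above are how per-device configuration from the kernel config file (the generated devtab[]) reaches drivers. A hedged usage sketch follows; the device name "foo", the hint names and the fallback values are placeholders invented for this note.

static void
foo_read_config(void)
{
	int port, irq;

	/* Fall back to built-in defaults when the config file says nothing. */
	if (resource_int_value("foo", 0, "port", &port) != 0)
		port = 0x300;
	if (resource_int_value("foo", 0, "irq", &irq) != 0)
		irq = 10;
	printf("foo0: configured at port 0x%x irq %d\n", port, irq);
}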
+ */ +int +bus_generic_attach(device_t dev) +{ + device_t child; + + for (child = TAILQ_FIRST(&dev->children); + child; child = TAILQ_NEXT(child, link)) + device_probe_and_attach(child); + + return 0; +} + +int +bus_generic_detach(device_t dev) +{ + device_t child; + int error; + + if (dev->state != DS_ATTACHED) + return EBUSY; + + for (child = TAILQ_FIRST(&dev->children); + child; child = TAILQ_NEXT(child, link)) + if (error = device_detach(child)) + return error; + + return 0; +} + +int +bus_generic_shutdown(device_t dev) +{ + device_t child; + + for (child = TAILQ_FIRST(&dev->children); + child; child = TAILQ_NEXT(child, link)) + DEVICE_SHUTDOWN(child); + + return 0; +} + +int +bus_generic_suspend(device_t dev) +{ + int error; + device_t child, child2; + + for (child = TAILQ_FIRST(&dev->children); + child; child = TAILQ_NEXT(child, link)) { + error = DEVICE_SUSPEND(child); + if (error) { + for (child2 = TAILQ_FIRST(&dev->children); + child2 && child2 != child; + child2 = TAILQ_NEXT(child2, link)) + DEVICE_RESUME(child2); + return (error); + } + } + return 0; +} + +int +bus_generic_resume(device_t dev) +{ + device_t child; + + for (child = TAILQ_FIRST(&dev->children); + child; child = TAILQ_NEXT(child, link)) { + DEVICE_RESUME(child); + /* if resume fails, there's nothing we can usefully do... */ + } + return 0; +} + +void +bus_generic_print_child(device_t dev, device_t child) +{ + printf(" on %s%d", device_get_name(dev), device_get_unit(dev)); +} + +int +bus_generic_read_ivar(device_t dev, device_t child, int index, + uintptr_t * result) +{ + return ENOENT; +} + +int +bus_generic_write_ivar(device_t dev, device_t child, int index, + uintptr_t value) +{ + return ENOENT; +} + +int +bus_generic_setup_intr(device_t dev, device_t child, struct resource *irq, + driver_intr_t *intr, void *arg, void **cookiep) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_SETUP_INTR(dev->parent, child, irq, intr, arg, + cookiep)); + else + return (EINVAL); +} + +int +bus_generic_teardown_intr(device_t dev, device_t child, struct resource *irq, + void *cookie) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_TEARDOWN_INTR(dev->parent, child, irq, cookie)); + else + return (EINVAL); +} + +struct resource * +bus_generic_alloc_resource(device_t dev, device_t child, int type, int *rid, + u_long start, u_long end, u_long count, u_int flags) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_ALLOC_RESOURCE(dev->parent, child, type, rid, + start, end, count, flags)); + else + return (NULL); +} + +int +bus_generic_release_resource(device_t dev, device_t child, int type, int rid, + struct resource *r) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_RELEASE_RESOURCE(dev->parent, child, type, rid, + r)); + else + return (EINVAL); +} + +int +bus_generic_activate_resource(device_t dev, device_t child, int type, int rid, + struct resource *r) +{ + /* Propagate up the bus hierarchy until someone handles it. */ + if (dev->parent) + return (BUS_ACTIVATE_RESOURCE(dev->parent, child, type, rid, + r)); + else + return (EINVAL); +} + +int +bus_generic_deactivate_resource(device_t dev, device_t child, int type, + int rid, struct resource *r) +{ + /* Propagate up the bus hierarchy until someone handles it. 
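The bus_generic_*() resource methods above simply delegate toward the root bus; a child driver normally reaches them through the convenience wrappers defined just below. A hedged sketch of such use: SYS_RES_IOPORT and RF_ACTIVE are the usual constants assumed from <machine/resource.h> and <sys/rman.h>, and the "bar" driver is a placeholder.

static int
bar_attach(device_t dev)
{
	struct resource *port;
	int rid = 0;

	/* Any 8 consecutive I/O ports; 0UL/~0UL means "no range restriction". */
	port = bus_alloc_resource(dev, SYS_RES_IOPORT, &rid,
	    0UL, ~0UL, 8, RF_ACTIVE);
	if (port == NULL)
		return ENXIO;

	/* ... program the hardware through the allocated range ... */

	/* Released again right away only to keep the sketch short. */
	bus_release_resource(dev, SYS_RES_IOPORT, rid, port);
	return 0;
}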
*/ + if (dev->parent) + return (BUS_DEACTIVATE_RESOURCE(dev->parent, child, type, rid, + r)); + else + return (EINVAL); +} + +/* + * Some convenience functions to make it easier for drivers to use the + * resource-management functions. All these really do is hide the + * indirection through the parent's method table, making for slightly + * less-wordy code. In the future, it might make sense for this code + * to maintain some sort of a list of resources allocated by each device. + */ +struct resource * +bus_alloc_resource(device_t dev, int type, int *rid, u_long start, u_long end, + u_long count, u_int flags) +{ + if (dev->parent == 0) + return (0); + return (BUS_ALLOC_RESOURCE(dev->parent, dev, type, rid, start, end, + count, flags)); +} + +int +bus_activate_resource(device_t dev, int type, int rid, struct resource *r) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_ACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); +} + +int +bus_deactivate_resource(device_t dev, int type, int rid, struct resource *r) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_DEACTIVATE_RESOURCE(dev->parent, dev, type, rid, r)); +} + +int +bus_release_resource(device_t dev, int type, int rid, struct resource *r) +{ + if (dev->parent == 0) + return (EINVAL); + return (BUS_RELEASE_RESOURCE(dev->parent, dev, + type, rid, r)); +} + +static void +root_print_child(device_t dev, device_t child) +{ +} + +static int +root_setup_intr(device_t dev, device_t child, driver_intr_t *intr, void *arg, + void **cookiep) +{ + /* + * If an interrupt mapping gets to here something bad has happened. + */ + panic("root_setup_intr"); +} + +static device_method_t root_methods[] = { + /* Device interface */ + DEVMETHOD(device_suspend, bus_generic_suspend), + DEVMETHOD(device_resume, bus_generic_resume), + + /* Bus interface */ + DEVMETHOD(bus_print_child, root_print_child), + DEVMETHOD(bus_read_ivar, bus_generic_read_ivar), + DEVMETHOD(bus_write_ivar, bus_generic_write_ivar), + DEVMETHOD(bus_setup_intr, root_setup_intr), + + { 0, 0 } +}; + +static driver_t root_driver = { + "root", + root_methods, + DRIVER_TYPE_MISC, + 1, /* no softc */ +}; + +device_t root_bus; +devclass_t root_devclass; + +static int +root_bus_module_handler(module_t mod, int what, void* arg) +{ + switch (what) { + case MOD_LOAD: + compile_methods(&root_driver); + root_bus = make_device(NULL, "root", 0, NULL); + root_bus->desc = "System root bus"; + root_bus->ops = root_driver.ops; + root_bus->driver = &root_driver; + root_bus->state = DS_ATTACHED; + root_devclass = devclass_find_internal("root", FALSE); + return 0; + } + + return 0; +} + +static moduledata_t root_bus_mod = { + "rootbus", + root_bus_module_handler, + 0 +}; +DECLARE_MODULE(rootbus, root_bus_mod, SI_SUB_DRIVERS, SI_ORDER_FIRST); + +void +root_bus_configure(void) +{ + device_t dev; + + PDEBUG((".")); + + for (dev = TAILQ_FIRST(&root_bus->children); dev; + dev = TAILQ_NEXT(dev, link)) { + device_probe_and_attach(dev); + } +} + +int +driver_module_handler(module_t mod, int what, void *arg) +{ + int error, i; + struct driver_module_data *dmd; + devclass_t bus_devclass; + + dmd = (struct driver_module_data *)arg; + bus_devclass = devclass_find_internal(dmd->dmd_busname, TRUE); + error = 0; + + switch (what) { + case MOD_LOAD: + for (i = 0; !error && i < dmd->dmd_ndrivers; i++) { + PDEBUG(("Loading module: driver %s on bus %s", + DRIVERNAME(dmd->dmd_drivers[i]), + dmd->dmd_busname)); + error = devclass_add_driver(bus_devclass, + dmd->dmd_drivers[i]); + } + if (error) + break; + + /* + * 
The drivers loaded in this way are assumed to all + * implement the same devclass. + */ + *dmd->dmd_devclass = + devclass_find_internal(dmd->dmd_drivers[0]->name, + TRUE); + break; + + case MOD_UNLOAD: + for (i = 0; !error && i < dmd->dmd_ndrivers; i++) { + PDEBUG(("Unloading module: driver %s from bus %s", + DRIVERNAME(dmd->dmd_drivers[i]), + dmd->dmd_busname)); + error = devclass_delete_driver(bus_devclass, + dmd->dmd_drivers[i]); + } + break; + } + + if (!error && dmd->dmd_chainevh) + error = dmd->dmd_chainevh(mod, what, dmd->dmd_chainarg); + return (error); +} + +#ifdef BUS_DEBUG + +/* the _short versions avoid iteration by not calling anything that prints + * more than oneliners. I love oneliners. + */ + +static void +print_method_list(device_method_t *m, int indent) +{ + int i; + + if (!m) + return; + + for (i = 0; m->desc; i++, m++) + indentprintf(("method %d: %s, offset=%d\n", + i, m->desc->name, m->desc->offset)); +} + +static void +print_device_ops(device_ops_t ops, int indent) +{ + int i; + int count = 0; + + if (!ops) + return; + + /* we present a list of the methods that are pointing to the + * error_method, but ignore the 0'th elements; it is always + * error_method. + */ + for (i = 1; i < ops->maxoffset; i++) { + if (ops->methods[i] == error_method) { + if (count == 0) + indentprintf(("error_method:")); + printf(" %d", i); + count++; + } + } + if (count) + printf("\n"); + + indentprintf(("(%d method%s, %d valid, %d error_method%s)\n", + ops->maxoffset-1, (ops->maxoffset-1 == 1? "":"s"), + ops->maxoffset-1-count, + count, (count == 1? "":"'s"))); +} + +static void +print_device_short(device_t dev, int indent) +{ + if (!dev) + return; + + indentprintf(("device %d: <%s> %sparent,%schildren,%s%s%s%sivars,%ssoftc,busy=%d\n", + dev->unit, dev->desc, + (dev->parent? "":"no "), + (TAILQ_EMPTY(&dev->children)? "no ":""), + (dev->flags&DF_ENABLED? "enabled,":"disabled,"), + (dev->flags&DF_FIXEDCLASS? "fixed,":""), + (dev->flags&DF_WILDCARD? "wildcard,":""), + (dev->ivars? "":"no "), + (dev->softc? "":"no "), + dev->busy)); +} + +static void +print_device(device_t dev, int indent) +{ + if (!dev) + return; + + print_device_short(dev, indent); + + indentprintf(("Parent:\n")); + print_device_short(dev->parent, indent+1); + indentprintf(("Methods:\n")); + print_device_ops(dev->ops, indent+1); + indentprintf(("Driver:\n")); + print_driver_short(dev->driver, indent+1); + indentprintf(("Devclass:\n")); + print_devclass_short(dev->devclass, indent+1); +} + +void +print_device_tree_short(device_t dev, int indent) +/* print the device and all its children (indented) */ +{ + device_t child; + + if (!dev) + return; + + print_device_short(dev, indent); + + for (child = TAILQ_FIRST(&dev->children); child; + child = TAILQ_NEXT(child, link)) + print_device_tree_short(child, indent+1); +} + +void +print_device_tree(device_t dev, int indent) +/* print the device and all its children (indented) */ +{ + device_t child; + + if (!dev) + return; + + print_device(dev, indent); + + for (child = TAILQ_FIRST(&dev->children); child; + child = TAILQ_NEXT(child, link)) + print_device_tree(child, indent+1); +} + +static void +print_driver_short(driver_t *driver, int indent) +{ + if (!driver) + return; + + indentprintf(("driver %s: type = %s%s%s%s, softc size = %d\n", + driver->name, + /* yes, I know this looks silly, but going to bed at + * two o'clock and having to get up at 7:30 again is silly + * as well. As is sticking your head in a bucket of water. + */ + (driver->type == DRIVER_TYPE_TTY? 
"tty":""), + (driver->type == DRIVER_TYPE_BIO? "bio":""), + (driver->type == DRIVER_TYPE_NET? "net":""), + (driver->type == DRIVER_TYPE_MISC? "misc":""), + driver->softc)); +} + +static void +print_driver(driver_t *driver, int indent) +{ + if (!driver) + return; + + print_driver_short(driver, indent); + indentprintf(("Methods:\n")); + print_method_list(driver->methods, indent+1); + indentprintf(("Operations:\n")); + print_device_ops(driver->ops, indent+1); +} + + +static void +print_driver_list(driver_list_t drivers, int indent) +{ + driver_t *driver; + + for (driver = TAILQ_FIRST(&drivers); driver; + driver = TAILQ_NEXT(driver, link)) + print_driver(driver, indent); +} + +static void +print_devclass_short(devclass_t dc, int indent) +{ + if ( !dc ) + return; + + indentprintf(("devclass %s: max units = %d, next unit = %d\n", + dc->name, dc->maxunit, dc->nextunit)); +} + +static void +print_devclass(devclass_t dc, int indent) +{ + int i; + + if ( !dc ) + return; + + print_devclass_short(dc, indent); + indentprintf(("Drivers:\n")); + print_driver_list(dc->drivers, indent+1); + + indentprintf(("Devices:\n")); + for (i = 0; i < dc->maxunit; i++) + if (dc->devices[i]) + print_device(dc->devices[i], indent+1); +} + +void +print_devclass_list_short(void) +{ + devclass_t dc; + + printf("Short listing of devclasses, drivers & devices:\n"); + for (dc = TAILQ_FIRST(&devclasses); dc; dc = TAILQ_NEXT(dc, link)) + print_devclass_short(dc, 0); +} + +void +print_devclass_list(void) +{ + devclass_t dc; + + printf("Full listing of devclasses, drivers & devices:\n"); + for (dc = TAILQ_FIRST(&devclasses); dc; dc = TAILQ_NEXT(dc, link)) + print_devclass(dc, 0); +} + +#endif diff --git a/sys/kern/subr_clist.c b/sys/kern/subr_clist.c new file mode 100644 index 0000000..593d00c --- /dev/null +++ b/sys/kern/subr_clist.c @@ -0,0 +1,694 @@ +/* + * Copyright (c) 1994, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id: tty_subr.c,v 1.29 1998/04/15 17:46:27 bde Exp $ + */ + +/* + * clist support routines + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/tty.h> +#include <sys/clist.h> + +static void clist_init __P((void *)); +SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL) + +static struct cblock *cfreelist = 0; +int cfreecount = 0; +static int cslushcount; +static int ctotcount; + +#ifndef INITIAL_CBLOCKS +#define INITIAL_CBLOCKS 50 +#endif + +static struct cblock *cblock_alloc __P((void)); +static void cblock_alloc_cblocks __P((int number)); +static void cblock_free __P((struct cblock *cblockp)); +static void cblock_free_cblocks __P((int number)); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(cbstat, cbstat) +{ + printf( + "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n", + ctotcount * CBSIZE, ctotcount * CBSIZE - cfreecount, cfreecount, + cfreecount - cslushcount * CBSIZE, cslushcount * CBSIZE); +} +#endif /* DDB */ + +/* + * Called from init_main.c + */ +/* ARGSUSED*/ +static void +clist_init(dummy) + void *dummy; +{ + /* + * Allocate an initial base set of cblocks as a 'slush'. + * We allocate non-slush cblocks with each initial ttyopen() and + * deallocate them with each ttyclose(). + * We should adjust the slush allocation. This can't be done in + * the i/o routines because they are sometimes called from + * interrupt handlers when it may be unsafe to call malloc(). + */ + cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS); +} + +/* + * Remove a cblock from the cfreelist queue and return a pointer + * to it. + */ +static __inline struct cblock * +cblock_alloc() +{ + struct cblock *cblockp; + + cblockp = cfreelist; + if (cblockp == NULL) + panic("clist reservation botch"); + cfreelist = cblockp->c_next; + cblockp->c_next = NULL; + cfreecount -= CBSIZE; + return (cblockp); +} + +/* + * Add a cblock to the cfreelist queue. + */ +static __inline void +cblock_free(cblockp) + struct cblock *cblockp; +{ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) + bzero(cblockp->c_quote, sizeof cblockp->c_quote); + cblockp->c_next = cfreelist; + cfreelist = cblockp; + cfreecount += CBSIZE; +} + +/* + * Allocate some cblocks for the cfreelist queue. + */ +static void +cblock_alloc_cblocks(number) + int number; +{ + int i; + struct cblock *cbp; + + for (i = 0; i < number; ++i) { + cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT); + if (cbp == NULL) { + printf( +"clist_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n"); + cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK); + } + /* + * Freed cblocks have zero quotes and garbage elsewhere. + * Set the may-have-quote bit to force zeroing the quotes. + */ + setbit(cbp->c_quote, CBQSIZE * NBBY - 1); + cblock_free(cbp); + } + ctotcount += number; +} + +/* + * Set the cblock allocation policy for a a clist. + * Must be called in process context at spltty(). + */ +void +clist_alloc_cblocks(clistp, ccmax, ccreserved) + struct clist *clistp; + int ccmax; + int ccreserved; +{ + int dcbr; + + /* + * Allow for wasted space at the head. 
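A hedged sketch of how the reservation interface is meant to be used from a tty-style driver: reserve cblocks for each queue at first open and release them (clist_free_cblocks(), a little further down) on last close, once the queues have been drained. The queue sizes and "foo" names are purely illustrative.

static void
foo_tty_alloc_queues(struct tty *tp)
{
	int s = spltty();

	clist_alloc_cblocks(&tp->t_rawq, 1024, 512);	/* ccmax, ccreserved */
	clist_alloc_cblocks(&tp->t_canq, 1024, 512);
	clist_alloc_cblocks(&tp->t_outq, 1024, 512);
	splx(s);
}

static void
foo_tty_free_queues(struct tty *tp)
{
	int s = spltty();

	/* The queues must be empty here, or clist_free_cblocks() panics. */
	clist_free_cblocks(&tp->t_rawq);
	clist_free_cblocks(&tp->t_canq);
	clist_free_cblocks(&tp->t_outq);
	splx(s);
}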
+ */ + if (ccmax != 0) + ccmax += CBSIZE - 1; + if (ccreserved != 0) + ccreserved += CBSIZE - 1; + + clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE; + dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved; + if (dcbr >= 0) + cblock_alloc_cblocks(dcbr); + else { + if (clistp->c_cbreserved + dcbr < clistp->c_cbcount) + dcbr = clistp->c_cbcount - clistp->c_cbreserved; + cblock_free_cblocks(-dcbr); + } + clistp->c_cbreserved += dcbr; +} + +/* + * Free some cblocks from the cfreelist queue back to the + * system malloc pool. + */ +static void +cblock_free_cblocks(number) + int number; +{ + int i; + + for (i = 0; i < number; ++i) + free(cblock_alloc(), M_TTYS); + ctotcount -= number; +} + +/* + * Free the cblocks reserved for a clist. + * Must be called at spltty(). + */ +void +clist_free_cblocks(clistp) + struct clist *clistp; +{ + if (clistp->c_cbcount != 0) + panic("freeing active clist cblocks"); + cblock_free_cblocks(clistp->c_cbreserved); + clistp->c_cbmax = 0; + clistp->c_cbreserved = 0; +} + +/* + * Get a character from the head of a clist. + */ +int +getc(clistp) + struct clist *clistp; +{ + int chr = -1; + int s; + struct cblock *cblockp; + + s = spltty(); + + /* If there are characters in the list, get one */ + if (clistp->c_cc) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + chr = (u_char)*clistp->c_cf; + + /* + * If this char is quoted, set the flag. + */ + if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * Advance to next character. + */ + clistp->c_cf++; + clistp->c_cc--; + /* + * If we have advanced the 'first' character pointer + * past the end of this cblock, advance to the next one. + * If there are no more characters, set the first and + * last pointers to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (chr); +} + +/* + * Copy 'amount' of chars, beginning at head of clist 'clistp' to + * destination linear buffer 'dest'. Return number of characters + * actually copied. + */ +int +q_to_b(clistp, dest, amount) + struct clist *clistp; + char *dest; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + char *dest_orig = dest; + int numc; + int s; + + s = spltty(); + + while (clistp && amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + bcopy(clistp->c_cf, dest, numc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + dest += numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (dest - dest_orig); +} + +/* + * Flush 'amount' of chars, beginning at head of clist 'clistp'. 
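A hedged sketch of a typical q_to_b() consumer: a transmit-start routine that pulls up to one FIFO's worth of output into a linear bounce buffer for the hardware. The FIFO size and the "foo" names are placeholders; q_to_b() raises spltty() internally.

#define FOO_FIFO_SIZE	16

static void
foo_start_tx(struct tty *tp)
{
	char buf[FOO_FIFO_SIZE];
	int n;

	n = q_to_b(&tp->t_outq, buf, sizeof(buf));
	if (n > 0) {
		/* ... hand buf[0..n-1] to the device's transmit FIFO ... */
	}
}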
+ */ +void +ndflush(clistp, amount) + struct clist *clistp; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + int numc; + int s; + + s = spltty(); + + while (amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); +} + +/* + * Add a character to the end of a clist. Return -1 is no + * more clists, or 0 for success. + */ +int +putc(chr, clistp) + int chr; + struct clist *clistp; +{ + struct cblock *cblockp; + int s; + + s = spltty(); + + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("putc to a clist with no reserved cblocks\n"); + return (-1); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = (cblockp - 1); + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (-1); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + } + + /* + * If this character is quoted, set the quote bit, if not, clear it. + */ + if (chr & TTY_QUOTE) { + setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + /* + * Use one of the spare quote bits to record that something + * may be quoted. + */ + setbit(cblockp->c_quote, CBQSIZE * NBBY - 1); + } else + clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + + *clistp->c_cl++ = chr; + clistp->c_cc++; + + splx(s); + return (0); +} + +/* + * Copy data from linear buffer to clist chain. Return the + * number of characters not copied. + */ +int +b_to_q(src, amount, clistp) + char *src; + int amount; + struct clist *clistp; +{ + struct cblock *cblockp; + char *firstbyte, *lastbyte; + u_char startmask, endmask; + int startbit, endbit, num_between, numc; + int s; + + /* + * Avoid allocating an initial cblock and then not using it. + * c_cc == 0 must imply c_cbount == 0. + */ + if (amount <= 0) + return (amount); + + s = spltty(); + + /* + * If there are no cblocks assigned to this clist yet, + * then get one. + */ + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("b_to_q to a clist with no reserved cblocks.\n"); + return (amount); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + } + + while (amount) { + /* + * Get another cblock if needed. 
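 * (The invariant in this loop is that c_cl points at the next free byte;
 * cblocks are allocated so that they sit on boundaries equal to their own
 * size, which is what makes the CROUND masking valid, so once c_cl reaches
 * a boundary -- (intptr_t)c_cl & CROUND == 0 -- the current cblock is full.
 * A fresh cblock is then taken from the free list, subject to the
 * reserved/slush limits, and chained onto the previous one via prev->c_next.)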
+ */ + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = cblockp - 1; + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (amount); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + + /* + * Copy a chunk of the linear buffer up to the end + * of this cblock. + */ + numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl); + bcopy(src, clistp->c_cl, numc); + + /* + * Clear quote bits if they aren't known to be clear. + * The following could probably be made into a seperate + * "bitzero()" routine, but why bother? + */ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) { + startbit = clistp->c_cl - (char *)cblockp->c_info; + endbit = startbit + numc - 1; + + firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY); + lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY); + + /* + * Calculate mask of bits to preserve in first and + * last bytes. + */ + startmask = NBBY - (startbit % NBBY); + startmask = 0xff >> startmask; + endmask = (endbit % NBBY); + endmask = 0xff << (endmask + 1); + + if (firstbyte != lastbyte) { + *firstbyte &= startmask; + *lastbyte &= endmask; + + num_between = lastbyte - firstbyte - 1; + if (num_between) + bzero(firstbyte + 1, num_between); + } else { + *firstbyte &= (startmask | endmask); + } + } + + /* + * ...and update pointer for the next chunk. + */ + src += numc; + clistp->c_cl += numc; + clistp->c_cc += numc; + amount -= numc; + /* + * If we go through the loop again, it's always + * for data in the next cblock, so by adding one (cblock), + * (which makes the pointer 1 beyond the end of this + * cblock) we prepare for the assignment of 'prev' + * above. + */ + cblockp += 1; + + } + + splx(s); + return (amount); +} + +/* + * Get the next character in the clist. Store it at dst. Don't + * advance any clist pointers, but return a pointer to the next + * character position. + */ +char * +nextc(clistp, cp, dst) + struct clist *clistp; + char *cp; + int *dst; +{ + struct cblock *cblockp; + + ++cp; + /* + * See if the next character is beyond the end of + * the clist. + */ + if (clistp->c_cc && (cp != clistp->c_cl)) { + /* + * If the next character is beyond the end of this + * cblock, advance to the next cblock. + */ + if (((intptr_t)cp & CROUND) == 0) + cp = ((struct cblock *)cp - 1)->c_next->c_info; + cblockp = (struct cblock *)((intptr_t)cp & ~CROUND); + + /* + * Get the character. Set the quote flag if this character + * is quoted. + */ + *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0); + + return (cp); + } + + return (NULL); +} + +/* + * "Unput" a character from a clist. + */ +int +unputc(clistp) + struct clist *clistp; +{ + struct cblock *cblockp = 0, *cbp = 0; + int s; + int chr = -1; + + + s = spltty(); + + if (clistp->c_cc) { + --clistp->c_cc; + --clistp->c_cl; + + chr = (u_char)*clistp->c_cl; + + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + + /* + * Set quote flag if this character was quoted. + */ + if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * If all of the characters have been unput in this + * cblock, then find the previous one and free this + * one. 
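The quote-bit clearing in b_to_q() above packs some fiddly mask arithmetic; the following self-contained user-level model (written for this note, not kernel code) reproduces it and prints a worked example.

#include <stdio.h>
#include <string.h>

#define NBBY 8

/*
 * Clear bits startbit..endbit (inclusive) in a bit array, using the same
 * first/last-byte mask arithmetic as the quote-bit clearing above.
 */
static void
clrbits(unsigned char *map, int startbit, int endbit)
{
	unsigned char startmask, endmask;
	unsigned char *firstbyte = map + startbit / NBBY;
	unsigned char *lastbyte = map + endbit / NBBY;

	startmask = 0xff >> (NBBY - (startbit % NBBY));	/* keep bits below startbit */
	endmask = 0xff << ((endbit % NBBY) + 1);	/* keep bits above endbit */

	if (firstbyte != lastbyte) {
		*firstbyte &= startmask;
		*lastbyte &= endmask;
		if (lastbyte - firstbyte > 1)
			memset(firstbyte + 1, 0, lastbyte - firstbyte - 1);
	} else
		*firstbyte &= (startmask | endmask);
}

int
main(void)
{
	unsigned char map[4];
	int i;

	memset(map, 0xff, sizeof(map));
	clrbits(map, 3, 21);	/* worked example: clear bits 3..21 */
	for (i = 0; i < 4; i++)
		printf("byte %d: 0x%02x\n", i, map[i]);
	/* expected: 0x07 0x00 0xc0 0xff */
	return 0;
}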
+ */ + if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) { + cbp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + + while (cbp->c_next != cblockp) + cbp = cbp->c_next; + + /* + * When the previous cblock is at the end, the 'last' + * pointer always points (invalidly) one past. + */ + clistp->c_cl = (char *)(cbp+1); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + cbp->c_next = NULL; + } + } + + /* + * If there are no more characters on the list, then + * free the last cblock. + */ + if ((clistp->c_cc == 0) && clistp->c_cl) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + clistp->c_cf = clistp->c_cl = NULL; + } + + splx(s); + return (chr); +} + +/* + * Move characters in source clist to destination clist, + * preserving quote bits. + */ +void +catq(src_clistp, dest_clistp) + struct clist *src_clistp, *dest_clistp; +{ + int chr, s; + + s = spltty(); + /* + * If the destination clist is empty (has no cblocks atttached), + * and there are no possible complications with the resource counters, + * then we simply assign the current clist to the destination. + */ + if (!dest_clistp->c_cf + && src_clistp->c_cbcount <= src_clistp->c_cbmax + && src_clistp->c_cbcount <= dest_clistp->c_cbmax) { + dest_clistp->c_cf = src_clistp->c_cf; + dest_clistp->c_cl = src_clistp->c_cl; + src_clistp->c_cf = src_clistp->c_cl = NULL; + + dest_clistp->c_cc = src_clistp->c_cc; + src_clistp->c_cc = 0; + dest_clistp->c_cbcount = src_clistp->c_cbcount; + src_clistp->c_cbcount = 0; + + splx(s); + return; + } + + splx(s); + + /* + * XXX This should probably be optimized to more than one + * character at a time. + */ + while ((chr = getc(src_clistp)) != -1) + putc(chr, dest_clistp); +} diff --git a/sys/kern/subr_devstat.c b/sys/kern/subr_devstat.c new file mode 100644 index 0000000..5fcf88e --- /dev/null +++ b/sys/kern/subr_devstat.c @@ -0,0 +1,248 @@ +/* + * Copyright (c) 1997, 1998 Kenneth D. Merry. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
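A hedged sketch of how a disk driver is expected to use the devstat interface declared below: register one zeroed devstat per unit at attach time and bracket each transfer with start/end calls. DEVSTAT_NO_ORDERED_TAGS and DEVSTAT_TYPE_DIRECT are assumed here to come from <sys/devicestat.h>; the "foo" driver and the 512-byte block size are placeholders.

static struct devstat foo_stats;	/* static, hence zeroed, as required */

static void
foo_disk_attach(int unit)
{
	devstat_add_entry(&foo_stats, "foo", unit, 512,
	    DEVSTAT_NO_ORDERED_TAGS, DEVSTAT_TYPE_DIRECT);
}

static void
foo_disk_strategy(struct buf *bp)
{
	devstat_start_transaction(&foo_stats);
	/* ... queue bp to the hardware; foo_disk_done() runs on completion ... */
}

static void
foo_disk_done(struct buf *bp)
{
	devstat_end_transaction(&foo_stats, bp->b_bcount - bp->b_resid,
	    DEVSTAT_TAG_NONE,
	    (bp->b_flags & B_READ) ? DEVSTAT_READ : DEVSTAT_WRITE);
}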
+ * + * $Id: subr_devstat.c,v 1.7 1998/12/04 22:54:51 archie Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/time.h> +#include <sys/types.h> +#include <sys/sysctl.h> + +#include <sys/devicestat.h> + +static int devstat_num_devs; +static long devstat_generation; +static int devstat_version = DEVSTAT_VERSION; +static int devstat_current_devnumber; + +STAILQ_HEAD(devstatlist, devstat) device_statq; + +/* + * Take a malloced and zeroed devstat structure given to us, fill it in + * and add it to the queue of devices. + */ +void +devstat_add_entry(struct devstat *ds, const char *dev_name, + int unit_number, u_int32_t block_size, + devstat_support_flags flags, + devstat_type_flags device_type) +{ + int s; + struct devstatlist *devstat_head; + + if (ds == NULL) + return; + + if (devstat_num_devs == 0) + STAILQ_INIT(&device_statq); + + devstat_generation++; + devstat_num_devs++; + + devstat_head = &device_statq; + + STAILQ_INSERT_TAIL(devstat_head, ds, dev_links); + + ds->device_number = devstat_current_devnumber++; + ds->unit_number = unit_number; + strncpy(ds->device_name, dev_name, DEVSTAT_NAME_LEN); + ds->device_name[DEVSTAT_NAME_LEN - 1] = 0; + ds->block_size = block_size; + ds->flags = flags; + ds->device_type = device_type; + + s = splclock(); + getmicrotime(&ds->dev_creation_time); + splx(s); +} + +/* + * Remove a devstat structure from the list of devices. + */ +void +devstat_remove_entry(struct devstat *ds) +{ + struct devstatlist *devstat_head; + + if (ds == NULL) + return; + + devstat_generation++; + devstat_num_devs--; + + devstat_head = &device_statq; + + /* Remove this entry from the devstat queue */ + STAILQ_REMOVE(devstat_head, ds, devstat, dev_links); +} + +/* + * Record a transaction start. + */ +void +devstat_start_transaction(struct devstat *ds) +{ + int s; + + /* sanity check */ + if (ds == NULL) + return; + + /* + * We only want to set the start time when we are going from idle + * to busy. The start time is really the start of the latest busy + * period. + */ + if (ds->busy_count == 0) { + s = splclock(); + getmicrouptime(&ds->start_time); + splx(s); + } + ds->busy_count++; +} + +/* + * Record the ending of a transaction, and incrment the various counters. + */ +void +devstat_end_transaction(struct devstat *ds, u_int32_t bytes, + devstat_tag_type tag_type, devstat_trans_flags flags) +{ + int s; + struct timeval busy_time; + + /* sanity check */ + if (ds == NULL) + return; + + s = splclock(); + getmicrouptime(&ds->last_comp_time); + splx(s); + + ds->busy_count--; + + /* + * There might be some transactions (DEVSTAT_NO_DATA) that don't + * transfer any data. + */ + if (flags == DEVSTAT_READ) { + ds->bytes_read += bytes; + ds->num_reads++; + } else if (flags == DEVSTAT_WRITE) { + ds->bytes_written += bytes; + ds->num_writes++; + } else + ds->num_other++; + + /* + * Keep a count of the various tag types sent. + */ + if (tag_type != DEVSTAT_TAG_NONE) + ds->tag_types[tag_type]++; + + /* + * We only update the busy time when we go idle. Otherwise, this + * calculation would require many more clock cycles. + */ + if (ds->busy_count == 0) { + /* Calculate how long we were busy */ + busy_time = ds->last_comp_time; + timevalsub(&busy_time, &ds->start_time); + + /* Add our busy time to the total busy time. */ + timevaladd(&ds->busy_time, &busy_time); + } else if (ds->busy_count < 0) + printf("devstat_end_transaction: HELP!! 
busy_count " + "for %s%d is < 0 (%d)!\n", ds->device_name, + ds->unit_number, ds->busy_count); +} + +/* + * This is the sysctl handler for the devstat package. The data pushed out + * on the kern.devstat.all sysctl variable consists of the current devstat + * generation number, and then an array of devstat structures, one for each + * device in the system. + * + * I'm really not too fond of this method of doing things, but there really + * aren't that many alternatives. We must have some method of making sure + * that the generation number the user gets corresponds with the data the + * user gets. If the user makes a separate sysctl call to get the + * generation, and then a sysctl call to get the device statistics, the + * device list could have changed in that brief period of time. By + * supplying the generation number along with the statistics output, we can + * guarantee that the generation number and the statistics match up. + */ +static int +sysctl_devstat SYSCTL_HANDLER_ARGS +{ + int error, i; + struct devstat *nds; + struct devstatlist *devstat_head; + + if (devstat_num_devs == 0) + return(EINVAL); + + error = 0; + devstat_head = &device_statq; + + /* + * First push out the generation number. + */ + error = SYSCTL_OUT(req, &devstat_generation, sizeof(long)); + + /* + * Now push out all the devices. + */ + for (i = 0, nds = devstat_head->stqh_first; + (nds != NULL) && (i < devstat_num_devs) && (error == 0); + nds = nds->dev_links.stqe_next, i++) + error = SYSCTL_OUT(req, nds, sizeof(struct devstat)); + + return(error); +} + +/* + * Sysctl entries for devstat. The first one is a node that all the rest + * hang off of. + */ +SYSCTL_NODE(_kern, OID_AUTO, devstat, CTLFLAG_RD, 0, "Device Statistics"); + +SYSCTL_PROC(_kern_devstat, OID_AUTO, all, CTLFLAG_RD|CTLTYPE_OPAQUE, + 0, 0, sysctl_devstat, "S,devstat", "All Devices"); +/* + * Export the number of devices in the system so that userland utilities + * can determine how much memory to allocate to hold all the devices. + */ +SYSCTL_INT(_kern_devstat, OID_AUTO, numdevs, CTLFLAG_RD, &devstat_num_devs, + 0, "Number of devices in the devstat list"); +SYSCTL_LONG(_kern_devstat, OID_AUTO, generation, CTLFLAG_RD, + &devstat_generation, "Devstat list generation"); +SYSCTL_INT(_kern_devstat, OID_AUTO, version, CTLFLAG_RD, &devstat_version, + 0, "Devstat list version number"); diff --git a/sys/kern/subr_disklabel.c b/sys/kern/subr_disklabel.c new file mode 100644 index 0000000..33f1d2a --- /dev/null +++ b/sys/kern/subr_disklabel.c @@ -0,0 +1,410 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 + * $Id: ufs_disksubr.c,v 1.38 1998/10/17 07:49:04 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/disklabel.h> +#include <sys/diskslice.h> +#include <sys/syslog.h> + +/* + * Seek sort for disks. + * + * The buf_queue keep two queues, sorted in ascending block order. The first + * queue holds those requests which are positioned after the current block + * (in the first request); the second, which starts at queue->switch_point, + * holds requests which came in after their block number was passed. Thus + * we implement a one way scan, retracting after reaching the end of the drive + * to the first request on the second queue, at which time it becomes the + * first queue. + * + * A one-way scan is natural because of the way UNIX read-ahead blocks are + * allocated. + */ + +void +bufqdisksort(bufq, bp) + struct buf_queue_head *bufq; + struct buf *bp; +{ + struct buf *bq; + struct buf *bn; + struct buf *be; + + be = TAILQ_LAST(&bufq->queue, buf_queue); + /* + * If the queue is empty or we are an + * ordered transaction, then it's easy. + */ + if ((bq = bufq_first(bufq)) == NULL + || (bp->b_flags & B_ORDERED) != 0) { + bufq_insert_tail(bufq, bp); + return; + } else if (bufq->insert_point != NULL) { + + /* + * A certain portion of the list is + * "locked" to preserve ordering, so + * we can only insert after the insert + * point. + */ + bq = bufq->insert_point; + } else { + + /* + * If we lie before the last removed (currently active) + * request, and are not inserting ourselves into the + * "locked" portion of the list, then we must add ourselves + * to the second request list. + */ + if (bp->b_pblkno < bufq->last_pblkno) { + + bq = bufq->switch_point; + /* + * If we are starting a new secondary list, + * then it's easy. + */ + if (bq == NULL) { + bufq->switch_point = bp; + bufq_insert_tail(bufq, bp); + return; + } + /* + * If we lie ahead of the current switch point, + * insert us before the switch point and move + * the switch point. 
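 * (A worked example with made-up block numbers: with the head currently at
 * block 500, a first list holding 600 and 700 and a second list of 100 and
 * 200 after the switch point, a new request for block 50 goes onto the
 * second list ahead of 100 and becomes the new switch point, while a
 * request for 650 is sorted into the first list between 600 and 700.)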
+ */ + if (bp->b_pblkno < bq->b_pblkno) { + bufq->switch_point = bp; + TAILQ_INSERT_BEFORE(bq, bp, b_act); + return; + } + } else { + if (bufq->switch_point != NULL) + be = TAILQ_PREV(bufq->switch_point, + buf_queue, b_act); + /* + * If we lie between last_pblkno and bq, + * insert before bq. + */ + if (bp->b_pblkno < bq->b_pblkno) { + TAILQ_INSERT_BEFORE(bq, bp, b_act); + return; + } + } + } + + /* + * Request is at/after our current position in the list. + * Optimize for sequential I/O by seeing if we go at the tail. + */ + if (bp->b_pblkno > be->b_pblkno) { + TAILQ_INSERT_AFTER(&bufq->queue, be, bp, b_act); + return; + } + + /* Otherwise, insertion sort */ + while ((bn = TAILQ_NEXT(bq, b_act)) != NULL) { + + /* + * We want to go after the current request if it is the end + * of the first request list, or if the next request is a + * larger cylinder than our request. + */ + if (bn == bufq->switch_point + || bp->b_pblkno < bn->b_pblkno) + break; + bq = bn; + } + TAILQ_INSERT_AFTER(&bufq->queue, bq, bp, b_act); +} + + +/* + * Attempt to read a disk label from a device using the indicated strategy + * routine. The label must be partly set up before this: secpercyl, secsize + * and anything required in the strategy routine (e.g., dummy bounds for the + * partition containing the label) must be filled in before calling us. + * Returns NULL on success and an error string on failure. + */ +char * +readdisklabel(dev, strat, lp) + dev_t dev; + d_strategy_t *strat; + register struct disklabel *lp; +{ + register struct buf *bp; + struct disklabel *dlp; + char *msg = NULL; + + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE); + bp->b_bcount = lp->d_secsize; + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + if (biowait(bp)) + msg = "I/O error"; + else for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *)((char *)bp->b_data + + lp->d_secsize - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC) { + if (msg == NULL) + msg = "no disk label"; + } else if (dlp->d_npartitions > MAXPARTITIONS || + dkcksum(dlp) != 0) + msg = "disk label corrupted"; + else { + *lp = *dlp; + msg = NULL; + break; + } + } + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (msg); +} + +/* + * Check new disk label for sensibility before setting it. + */ +int +setdisklabel(olp, nlp, openmask) + register struct disklabel *olp, *nlp; + u_long openmask; +{ + register int i; + register struct partition *opp, *npp; + + /* + * Check it is actually a disklabel we are looking at. + */ + if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC || + dkcksum(nlp) != 0) + return (EINVAL); + /* + * For each partition that we think is open, + */ + while ((i = ffs((long)openmask)) != 0) { + i--; + /* + * Check it is not changing.... + */ + openmask &= ~(1 << i); + if (nlp->d_npartitions <= i) + return (EBUSY); + opp = &olp->d_partitions[i]; + npp = &nlp->d_partitions[i]; + if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size) + return (EBUSY); + /* + * Copy internally-set partition information + * if new label doesn't include it. 
XXX + * (If we are using it then we had better stay the same type) + * This is possibly dubious, as someone else noted (XXX) + */ + if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) { + npp->p_fstype = opp->p_fstype; + npp->p_fsize = opp->p_fsize; + npp->p_frag = opp->p_frag; + npp->p_cpg = opp->p_cpg; + } + } + nlp->d_checksum = 0; + nlp->d_checksum = dkcksum(nlp); + *olp = *nlp; + return (0); +} + +/* + * Write disk label back to device after modification. + */ +int +writedisklabel(dev, strat, lp) + dev_t dev; + d_strategy_t *strat; + register struct disklabel *lp; +{ + struct buf *bp; + struct disklabel *dlp; + int error = 0; + + if (lp->d_partitions[RAW_PART].p_offset != 0) + return (EXDEV); /* not quite right */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dkmodpart(dev, RAW_PART); + bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE); + bp->b_bcount = lp->d_secsize; +#if 1 + /* + * We read the label first to see if it's there, + * in which case we will put ours at the same offset into the block.. + * (I think this is stupid [Julian]) + * Note that you can't write a label out over a corrupted label! + * (also stupid.. how do you write the first one? by raw writes?) + */ + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + error = biowait(bp); + if (error) + goto done; + for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *) + ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC && + dkcksum(dlp) == 0) { + *dlp = *lp; + bp->b_flags &= ~(B_DONE | B_READ); + bp->b_flags |= B_BUSY | B_WRITE; +#ifdef __alpha__ + alpha_fix_srm_checksum(bp); +#endif + (*strat)(bp); + error = biowait(bp); + goto done; + } + } + error = ESRCH; +done: +#else + bzero(bp->b_data, lp->d_secsize); + dlp = (struct disklabel *)bp->b_data; + *dlp = *lp; + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_BUSY | B_WRITE; + (*strat)(bp); + error = biowait(bp); +#endif + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (error); +} + +/* + * Compute checksum for disk label. + */ +u_int +dkcksum(lp) + register struct disklabel *lp; +{ + register u_short *start, *end; + register u_short sum = 0; + + start = (u_short *)lp; + end = (u_short *)&lp->d_partitions[lp->d_npartitions]; + while (start < end) + sum ^= *start++; + return (sum); +} + +/* + * Disk error is the preface to plaintive error messages + * about failing disk transfers. It prints messages of the form + +hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) + + * if the offset of the error in the transfer and a disk label + * are both available. blkdone should be -1 if the position of the error + * is unknown; the disklabel pointer may be null from drivers that have not + * been converted to use them. The message is printed with printf + * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. + * The message should be completed (with at least a newline) with printf + * or addlog, respectively. There is no trailing space. 
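+ * For instance, a driver that hit an unrecoverable read error and does not
+ * know where in the transfer it occurred might call
+ *	diskerr(bp, "wd", "hard error", LOG_PRINTF, -1, lp);
+ *	printf("\n");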
+ */ +void +diskerr(bp, dname, what, pri, blkdone, lp) + register struct buf *bp; + char *dname, *what; + int pri, blkdone; + register struct disklabel *lp; +{ + int unit = dkunit(bp->b_dev); + int slice = dkslice(bp->b_dev); + int part = dkpart(bp->b_dev); + register int (*pr) __P((const char *, ...)); + char partname[2]; + char *sname; + daddr_t sn; + + if (pri != LOG_PRINTF) { + log(pri, "%s", ""); + pr = addlog; + } else + pr = printf; + sname = dsname(dname, unit, slice, part, partname); + (*pr)("%s%s: %s %sing fsbn ", sname, partname, what, + bp->b_flags & B_READ ? "read" : "writ"); + sn = bp->b_blkno; + if (bp->b_bcount <= DEV_BSIZE) + (*pr)("%ld", (long)sn); + else { + if (blkdone >= 0) { + sn += blkdone; + (*pr)("%ld of ", (long)sn); + } + (*pr)("%ld-%ld", (long)bp->b_blkno, + (long)(bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE)); + } + if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { +#ifdef tahoe + sn *= DEV_BSIZE / lp->d_secsize; /* XXX */ +#endif + sn += lp->d_partitions[part].p_offset; + /* + * XXX should add slice offset and not print the slice, + * but we don't know the slice pointer. + * XXX should print bp->b_pblkno so that this will work + * independent of slices, labels and bad sector remapping, + * but some drivers don't set bp->b_pblkno. + */ + (*pr)(" (%s bn %ld; cn %ld", sname, (long)sn, + (long)(sn / lp->d_secpercyl)); + sn %= (long)lp->d_secpercyl; + (*pr)(" tn %ld sn %ld)", (long)(sn / lp->d_nsectors), + (long)(sn % lp->d_nsectors)); + } +} diff --git a/sys/kern/subr_diskmbr.c b/sys/kern/subr_diskmbr.c new file mode 100644 index 0000000..adfd39c --- /dev/null +++ b/sys/kern/subr_diskmbr.c @@ -0,0 +1,445 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $Id: diskslice_machdep.c,v 1.31 1998/08/10 07:22:14 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/disklabel.h> +#define DOSPTYP_EXTENDED 5 +#define DOSPTYP_EXTENDEDX 15 +#define DOSPTYP_ONTRACK 84 +#include <sys/diskslice.h> +#include <sys/malloc.h> +#include <sys/syslog.h> +#include <sys/systm.h> + +#define TRACE(str) do { if (dsi_debug) printf str; } while (0) + +static volatile u_char dsi_debug; + +static struct dos_partition historical_bogus_partition_table[NDOSPART] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0x80, 0, 1, 0, DOSPTYP_386BSD, 255, 255, 255, 0, 50000, }, +}; + +static int check_part __P((char *sname, struct dos_partition *dp, + u_long offset, int nsectors, int ntracks, + u_long mbr_offset)); +static void extended __P((char *dname, dev_t dev, d_strategy_t *strat, + struct disklabel *lp, struct diskslices *ssp, + u_long ext_offset, u_long ext_size, + u_long base_ext_offset, int nsectors, int ntracks, + u_long mbr_offset)); + +static int +check_part(sname, dp, offset, nsectors, ntracks, mbr_offset ) + char *sname; + struct dos_partition *dp; + u_long offset; + int nsectors; + int ntracks; + u_long mbr_offset; +{ + int chs_ecyl; + int chs_esect; + int chs_scyl; + int chs_ssect; + int error; + u_long esector; + u_long esector1; + u_long secpercyl; + u_long ssector; + u_long ssector1; + + secpercyl = (u_long)nsectors * ntracks; + chs_scyl = DPCYL(dp->dp_scyl, dp->dp_ssect); + chs_ssect = DPSECT(dp->dp_ssect); + ssector = chs_ssect - 1 + dp->dp_shd * nsectors + chs_scyl * secpercyl + + mbr_offset; + ssector1 = offset + dp->dp_start; + + /* + * If ssector1 is on a cylinder >= 1024, then ssector can't be right. + * Allow the C/H/S for it to be 1023/ntracks-1/nsectors, or correct + * apart from the cylinder being reduced modulo 1024. Always allow + * 1023/255/63. + */ + if (ssector < ssector1 + && ((chs_ssect == nsectors && dp->dp_shd == ntracks - 1 + && chs_scyl == 1023) + || (secpercyl != 0 + && (ssector1 - ssector) % (1024 * secpercyl) == 0)) + || (dp->dp_scyl == 255 && dp->dp_shd == 255 + && dp->dp_ssect == 255)) { + TRACE(("%s: C/H/S start %d/%d/%d, start %lu: allow\n", + sname, chs_scyl, dp->dp_shd, chs_ssect, ssector1)); + ssector = ssector1; + } + + chs_ecyl = DPCYL(dp->dp_ecyl, dp->dp_esect); + chs_esect = DPSECT(dp->dp_esect); + esector = chs_esect - 1 + dp->dp_ehd * nsectors + chs_ecyl * secpercyl + + mbr_offset; + esector1 = ssector1 + dp->dp_size - 1; + + /* Allow certain bogus C/H/S values for esector, as above. 
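+ * (For example, with 63 sectors per track and 16 heads, C/H/S 2/1/1 maps to
+ * absolute sector 2 * 1008 + 1 * 63 + 0 = 2079 when mbr_offset is 0.)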
*/ + if (esector < esector1 + && ((chs_esect == nsectors && dp->dp_ehd == ntracks - 1 + && chs_ecyl == 1023) + || (secpercyl != 0 + && (esector1 - esector) % (1024 * secpercyl) == 0)) + || (dp->dp_ecyl == 255 && dp->dp_ehd == 255 + && dp->dp_esect == 255)) { + TRACE(("%s: C/H/S end %d/%d/%d, end %lu: allow\n", + sname, chs_ecyl, dp->dp_ehd, chs_esect, esector1)); + esector = esector1; + } + + error = (ssector == ssector1 && esector == esector1) ? 0 : EINVAL; + if (bootverbose) + printf("%s: type 0x%x, start %lu, end = %lu, size %lu %s\n", + sname, dp->dp_typ, ssector1, esector1, + (u_long)dp->dp_size, error ? "" : ": OK"); + if (ssector != ssector1 && bootverbose) + printf("%s: C/H/S start %d/%d/%d (%lu) != start %lu: invalid\n", + sname, chs_scyl, dp->dp_shd, chs_ssect, + ssector, ssector1); + if (esector != esector1 && bootverbose) + printf("%s: C/H/S end %d/%d/%d (%lu) != end %lu: invalid\n", + sname, chs_ecyl, dp->dp_ehd, chs_esect, + esector, esector1); + return (error); +} + +int +dsinit(dname, dev, strat, lp, sspp) + char *dname; + dev_t dev; + d_strategy_t *strat; + struct disklabel *lp; + struct diskslices **sspp; +{ + struct buf *bp; + u_char *cp; + int dospart; + struct dos_partition *dp; + struct dos_partition *dp0; + int error; + int max_ncyls; + int max_nsectors; + int max_ntracks; + u_long mbr_offset; + char partname[2]; + u_long secpercyl; + char *sname; + struct diskslice *sp; + struct diskslices *ssp; + + mbr_offset = DOSBBSECTOR; +reread_mbr: + /* Read master boot record. */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + bp->b_blkno = mbr_offset; + bp->b_bcount = lp->d_secsize; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + if (biowait(bp) != 0) { + diskerr(bp, dname, "error reading primary partition table", + LOG_PRINTF, 0, (struct disklabel *)NULL); + printf("\n"); + error = EIO; + goto done; + } + + /* Weakly verify it. */ + cp = bp->b_data; + sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART, + partname); + if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) { + if (bootverbose) + printf("%s: invalid primary partition table: no magic\n", + sname); + error = EINVAL; + goto done; + } + dp0 = (struct dos_partition *)(cp + DOSPARTOFF); + + /* Check for "Ontrack Diskmanager". */ + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + if (dp->dp_typ == DOSPTYP_ONTRACK) { + if (bootverbose) + printf( + "%s: Found \"Ontrack Disk Manager\" on this disk.\n", sname); + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + mbr_offset = 63; + goto reread_mbr; + } + } + + if (bcmp(dp0, historical_bogus_partition_table, + sizeof historical_bogus_partition_table) == 0) { + TRACE(("%s: invalid primary partition table: historical\n", + sname)); + error = EINVAL; + goto done; + } + + /* Guess the geometry. */ + /* + * TODO: + * Perhaps skip entries with 0 size. + * Perhaps only look at entries of type DOSPTYP_386BSD. + */ + max_ncyls = 0; + max_nsectors = 0; + max_ntracks = 0; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + int ncyls; + int nsectors; + int ntracks; + + ncyls = DPCYL(dp->dp_ecyl, dp->dp_esect) + 1; + if (max_ncyls < ncyls) + max_ncyls = ncyls; + nsectors = DPSECT(dp->dp_esect); + if (max_nsectors < nsectors) + max_nsectors = nsectors; + ntracks = dp->dp_ehd + 1; + if (max_ntracks < ntracks) + max_ntracks = ntracks; + } + + /* + * Check that we have guessed the geometry right by checking the + * partition entries. + */ + /* + * TODO: + * As above. + * Check for overlaps. 
+ * Check against d_secperunit if the latter is reliable. + */ + error = 0; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0 + && dp->dp_start == 0 && dp->dp_size == 0) + continue; + sname = dsname(dname, dkunit(dev), BASE_SLICE + dospart, + RAW_PART, partname); + + /* + * Temporarily ignore errors from this check. We could + * simplify things by accepting the table eariler if we + * always ignore errors here. Perhaps we should always + * accept the table if the magic is right but not let + * bad entries affect the geometry. + */ + check_part(sname, dp, mbr_offset, max_nsectors, max_ntracks, + mbr_offset); + } + if (error != 0) + goto done; + + /* + * Accept the DOS partition table. + * First adjust the label (we have been careful not to change it + * before we can guarantee success). + */ + secpercyl = (u_long)max_nsectors * max_ntracks; + if (secpercyl != 0) { + u_long secperunit; + + lp->d_nsectors = max_nsectors; + lp->d_ntracks = max_ntracks; + lp->d_secpercyl = secpercyl; + secperunit = secpercyl * max_ncyls; + if (lp->d_secperunit < secperunit) + lp->d_secperunit = secperunit; + lp->d_ncylinders = lp->d_secperunit / secpercyl; + } + + /* + * We are passed a pointer to a suitably initialized minimal + * slices "struct" with no dangling pointers in it. Replace it + * by a maximal one. This usually oversizes the "struct", but + * enlarging it while searching for logical drives would be + * inconvenient. + */ + free(*sspp, M_DEVBUF); + ssp = dsmakeslicestruct(MAX_SLICES, lp); + *sspp = ssp; + + /* Initialize normal slices. */ + sp = &ssp->dss_slices[BASE_SLICE]; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++, sp++) { + sp->ds_offset = mbr_offset + dp->dp_start; + sp->ds_size = dp->dp_size; + sp->ds_type = dp->dp_typ; +#if 0 + lp->d_subtype |= (lp->d_subtype & 3) | dospart + | DSTYPE_INDOSPART; +#endif + } + ssp->dss_nslices = BASE_SLICE + NDOSPART; + + /* Handle extended partitions. */ + sp -= NDOSPART; + for (dospart = 0; dospart < NDOSPART; dospart++, sp++) + if (sp->ds_type == DOSPTYP_EXTENDED || + sp->ds_type == DOSPTYP_EXTENDEDX) + extended(dname, bp->b_dev, strat, lp, ssp, + sp->ds_offset, sp->ds_size, sp->ds_offset, + max_nsectors, max_ntracks, mbr_offset); + +done: + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + if (error == EINVAL) + error = 0; + return (error); +} + +void +extended(dname, dev, strat, lp, ssp, ext_offset, ext_size, base_ext_offset, + nsectors, ntracks, mbr_offset) + char *dname; + dev_t dev; + struct disklabel *lp; + d_strategy_t *strat; + struct diskslices *ssp; + u_long ext_offset; + u_long ext_size; + u_long base_ext_offset; + int nsectors; + int ntracks; + u_long mbr_offset; +{ + struct buf *bp; + u_char *cp; + int dospart; + struct dos_partition *dp; + u_long ext_offsets[NDOSPART]; + u_long ext_sizes[NDOSPART]; + char partname[2]; + int slice; + char *sname; + struct diskslice *sp; + + /* Read extended boot record. */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = ext_offset; + bp->b_bcount = lp->d_secsize; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + if (biowait(bp) != 0) { + diskerr(bp, dname, "error reading extended partition table", + LOG_PRINTF, 0, (struct disklabel *)NULL); + printf("\n"); + goto done; + } + + /* Weakly verify it. 
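+ * (as for the primary table, only the 0x55 0xaa signature in the last two
+ * bytes of the sector is checked)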
*/ + cp = bp->b_data; + if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) { + sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART, + partname); + if (bootverbose) + printf("%s: invalid extended partition table: no magic\n", + sname); + goto done; + } + + for (dospart = 0, + dp = (struct dos_partition *)(bp->b_data + DOSPARTOFF), + slice = ssp->dss_nslices, sp = &ssp->dss_slices[slice]; + dospart < NDOSPART; dospart++, dp++) { + ext_sizes[dospart] = 0; + if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0 + && dp->dp_start == 0 && dp->dp_size == 0) + continue; + if (dp->dp_typ == DOSPTYP_EXTENDED || + dp->dp_typ == DOSPTYP_EXTENDEDX) { + char buf[32]; + + sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, + RAW_PART, partname); + snprintf(buf, sizeof(buf), "%s", sname); + if (strlen(buf) < sizeof buf - 11) + strcat(buf, "<extended>"); + check_part(buf, dp, base_ext_offset, nsectors, + ntracks, mbr_offset); + ext_offsets[dospart] = base_ext_offset + dp->dp_start; + ext_sizes[dospart] = dp->dp_size; + } else { + sname = dsname(dname, dkunit(dev), slice, RAW_PART, + partname); + check_part(sname, dp, ext_offset, nsectors, ntracks, + mbr_offset); + if (slice >= MAX_SLICES) { + printf("%s: too many slices\n", sname); + slice++; + continue; + } + sp->ds_offset = ext_offset + dp->dp_start; + sp->ds_size = dp->dp_size; + sp->ds_type = dp->dp_typ; + ssp->dss_nslices++; + slice++; + sp++; + } + } + + /* If we found any more slices, recursively find all the subslices. */ + for (dospart = 0; dospart < NDOSPART; dospart++) + if (ext_sizes[dospart] != 0) + extended(dname, dev, strat, lp, ssp, + ext_offsets[dospart], ext_sizes[dospart], + base_ext_offset, nsectors, ntracks, + mbr_offset); + +done: + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); +} diff --git a/sys/kern/subr_diskslice.c b/sys/kern/subr_diskslice.c new file mode 100644 index 0000000..fa0e4a4 --- /dev/null +++ b/sys/kern/subr_diskslice.c @@ -0,0 +1,1192 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. + * + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)wd.c 7.2 (Berkeley) 5/9/91 + * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $ + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $Id: subr_diskslice.c,v 1.60 1998/12/04 22:54:51 archie Exp $ + */ + +#include "opt_devfs.h" + +#include <stddef.h> + +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/conf.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif +#include <sys/disklabel.h> +#include <sys/diskslice.h> +#include <sys/dkbad.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/stat.h> +#include <sys/syslog.h> +#include <sys/systm.h> +#include <sys/vnode.h> + +#include <ufs/ffs/fs.h> + +#define TRACE(str) do { if (ds_debug) printf str; } while (0) + +typedef u_char bool_t; + +static volatile bool_t ds_debug; + +static struct disklabel *clone_label __P((struct disklabel *lp)); +static void dsiodone __P((struct buf *bp)); +static char *fixlabel __P((char *sname, struct diskslice *sp, + struct disklabel *lp, int writeflag)); +static void free_ds_label __P((struct diskslices *ssp, int slice)); +#ifdef DEVFS +static void free_ds_labeldevs __P((struct diskslices *ssp, int slice)); +#endif +static void partition_info __P((char *sname, int part, struct partition *pp)); +static void slice_info __P((char *sname, struct diskslice *sp)); +static void set_ds_bad __P((struct diskslices *ssp, int slice, + struct dkbad_intern *btp)); +static void set_ds_label __P((struct diskslices *ssp, int slice, + struct disklabel *lp)); +#ifdef DEVFS +static void set_ds_labeldevs __P((char *dname, dev_t dev, + struct diskslices *ssp)); +static void set_ds_labeldevs_unaliased __P((char *dname, dev_t dev, + struct diskslices *ssp)); +#endif +static void set_ds_wlabel __P((struct diskslices *ssp, int slice, + int wlabel)); + +/* + * Duplicate a label for the whole disk, and initialize defaults in the + * copy for fields that are not already initialized. The caller only + * needs to initialize d_secsize and d_secperunit, and zero the fields + * that are to be defaulted. 
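+ * A driver that knows nothing but the sector size and capacity can, for
+ * example, bzero() a label, fill in d_secsize and d_secperunit, and pass it
+ * here to get back a malloc()ed copy with usable defaults and a valid
+ * checksum.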
+ */ +static struct disklabel * +clone_label(lp) + struct disklabel *lp; +{ + struct disklabel *lp1; + + lp1 = malloc(sizeof *lp1, M_DEVBUF, M_WAITOK); + *lp1 = *lp; + lp = NULL; + if (lp1->d_typename[0] == '\0') + strncpy(lp1->d_typename, "amnesiac", sizeof(lp1->d_typename)); + if (lp1->d_packname[0] == '\0') + strncpy(lp1->d_packname, "fictitious", sizeof(lp1->d_packname)); + if (lp1->d_nsectors == 0) + lp1->d_nsectors = 32; + if (lp1->d_ntracks == 0) + lp1->d_ntracks = 64; + lp1->d_secpercyl = lp1->d_nsectors * lp1->d_ntracks; + lp1->d_ncylinders = lp1->d_secperunit / lp1->d_secpercyl; + if (lp1->d_rpm == 0) + lp1->d_rpm = 3600; + if (lp1->d_interleave == 0) + lp1->d_interleave = 1; + if (lp1->d_npartitions < RAW_PART + 1) + lp1->d_npartitions = MAXPARTITIONS; + if (lp1->d_bbsize == 0) + lp1->d_bbsize = BBSIZE; + if (lp1->d_sbsize == 0) + lp1->d_sbsize = SBSIZE; + lp1->d_partitions[RAW_PART].p_size = lp1->d_secperunit; + lp1->d_magic = DISKMAGIC; + lp1->d_magic2 = DISKMAGIC; + lp1->d_checksum = dkcksum(lp1); + return (lp1); +} + +/* + * Determine the size of the transfer, and make sure it is + * within the boundaries of the partition. Adjust transfer + * if needed, and signal errors or early completion. + * + * XXX TODO: + * o Do bad sector remapping. May need to split buffer. + * o Split buffers that are too big for the device. + * o Check for overflow. + * o Finish cleaning this up. + */ +int +dscheck(bp, ssp) + struct buf *bp; + struct diskslices *ssp; +{ + daddr_t blkno; + u_long endsecno; + daddr_t labelsect; + struct disklabel *lp; + char *msg; + long nsec; + struct partition *pp; + daddr_t secno; + daddr_t slicerel_secno; + struct diskslice *sp; + int s; + + blkno = bp->b_blkno; + if (blkno < 0) { + printf("dscheck: negative b_blkno %ld\n", (long)blkno); + bp->b_error = EINVAL; + goto bad; + } + sp = &ssp->dss_slices[dkslice(bp->b_dev)]; + lp = sp->ds_label; + if (ssp->dss_secmult == 1) { + if (bp->b_bcount % (u_long)DEV_BSIZE) + goto bad_bcount; + secno = blkno; + nsec = bp->b_bcount >> DEV_BSHIFT; + } else if (ssp->dss_secshift != -1) { + if (bp->b_bcount & (ssp->dss_secsize - 1)) + goto bad_bcount; + if (blkno & (ssp->dss_secmult - 1)) + goto bad_blkno; + secno = blkno >> ssp->dss_secshift; + nsec = bp->b_bcount >> (DEV_BSHIFT + ssp->dss_secshift); + } else { + if (bp->b_bcount % ssp->dss_secsize) + goto bad_bcount; + if (blkno % ssp->dss_secmult) + goto bad_blkno; + secno = blkno / ssp->dss_secmult; + nsec = bp->b_bcount / ssp->dss_secsize; + } + if (lp == NULL) { + labelsect = -LABELSECTOR - 1; + endsecno = sp->ds_size; + slicerel_secno = secno; + } else { + labelsect = lp->d_partitions[LABEL_PART].p_offset; +if (labelsect != 0) Debugger("labelsect != 0 in dscheck()"); + pp = &lp->d_partitions[dkpart(bp->b_dev)]; + endsecno = pp->p_size; + slicerel_secno = pp->p_offset + secno; + if (sp->ds_bad != NULL && ds_debug) { + daddr_t newsecno; + + newsecno = transbad144(sp->ds_bad, slicerel_secno); + if (newsecno != slicerel_secno) + printf("should map bad sector %ld -> %ld\n", + (long)slicerel_secno, (long)newsecno); + } + } + + /* overwriting disk label ? */ + /* XXX should also protect bootstrap in first 8K */ + if (slicerel_secno <= LABELSECTOR + labelsect && +#if LABELSECTOR != 0 + slicerel_secno + nsec > LABELSECTOR + labelsect && +#endif + (bp->b_flags & B_READ) == 0 && sp->ds_wlabel == 0) { + bp->b_error = EROFS; + goto bad; + } + +#if defined(DOSBBSECTOR) && defined(notyet) + /* overwriting master boot record? 
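+ * (slice-relative sector DOSBBSECTOR; like the label, it would only be
+ * writable while ds_wlabel is set)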
*/ + if (slicerel_secno <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 && + sp->ds_wlabel == 0) { + bp->b_error = EROFS; + goto bad; + } +#endif + + /* beyond partition? */ + if (secno + nsec > endsecno) { + /* if exactly at end of disk, return an EOF */ + if (secno == endsecno) { + bp->b_resid = bp->b_bcount; + return (0); + } + /* or truncate if part of it fits */ + nsec = endsecno - secno; + if (nsec <= 0) { + bp->b_error = EINVAL; + goto bad; + } + bp->b_bcount = nsec * ssp->dss_secsize; + } + + bp->b_pblkno = sp->ds_offset + slicerel_secno; + + /* + * Snoop on label accesses if the slice offset is nonzero. Fudge + * offsets in the label to keep the in-core label coherent with + * the on-disk one. + */ + if (slicerel_secno <= LABELSECTOR + labelsect +#if LABELSECTOR != 0 + && slicerel_secno + nsec > LABELSECTOR + labelsect +#endif + && sp->ds_offset != 0) { + struct iodone_chain *ic; + + ic = malloc(sizeof *ic , M_DEVBUF, M_WAITOK); + ic->ic_prev_flags = bp->b_flags; + ic->ic_prev_iodone = bp->b_iodone; + ic->ic_prev_iodone_chain = bp->b_iodone_chain; + ic->ic_args[0].ia_long = (LABELSECTOR + labelsect - + slicerel_secno) * ssp->dss_secsize; + ic->ic_args[1].ia_ptr = sp; + bp->b_flags |= B_CALL; + bp->b_iodone = dsiodone; + bp->b_iodone_chain = ic; + if (!(bp->b_flags & B_READ)) { + /* + * XXX even disklabel(8) writes directly so we need + * to adjust writes. Perhaps we should drop support + * for DIOCWLABEL (always write protect labels) and + * require the use of DIOCWDINFO. + * + * XXX probably need to copy the data to avoid even + * temporarily corrupting the in-core copy. + */ + if (bp->b_vp != NULL) { + s = splbio(); + bp->b_vp->v_numoutput++; + splx(s); + } + /* XXX need name here. */ + msg = fixlabel((char *)NULL, sp, + (struct disklabel *) + (bp->b_data + ic->ic_args[0].ia_long), + TRUE); + if (msg != NULL) { + printf("%s\n", msg); + bp->b_error = EROFS; + goto bad; + } + } + } + return (1); + +bad_bcount: + printf("dscheck: b_bcount %ld is not on a sector boundary (ssize %d)\n", + bp->b_bcount, ssp->dss_secsize); + bp->b_error = EINVAL; + goto bad; + +bad_blkno: + printf("dscheck: b_blkno %ld is not on a sector boundary (ssize %d)\n", + (long)blkno, ssp->dss_secsize); + bp->b_error = EINVAL; + goto bad; + +bad: + bp->b_resid = bp->b_bcount; + bp->b_flags |= B_ERROR; + return (-1); +} + +void +dsclose(dev, mode, ssp) + dev_t dev; + int mode; + struct diskslices *ssp; +{ + u_char mask; + struct diskslice *sp; + + sp = &ssp->dss_slices[dkslice(dev)]; + mask = 1 << dkpart(dev); + switch (mode) { + case S_IFBLK: + sp->ds_bopenmask &= ~mask; + break; + case S_IFCHR: + sp->ds_copenmask &= ~mask; + break; + } + sp->ds_openmask = sp->ds_bopenmask | sp->ds_copenmask; +} + +void +dsgone(sspp) + struct diskslices **sspp; +{ + int slice; + struct diskslice *sp; + struct diskslices *ssp; + + for (slice = 0, ssp = *sspp; slice < ssp->dss_nslices; slice++) { + sp = &ssp->dss_slices[slice]; + if (sp->ds_bad != NULL) { + free(sp->ds_bad, M_DEVBUF); + set_ds_bad(ssp, slice, (struct dkbad_intern *)NULL); + } +#ifdef DEVFS + if (sp->ds_bdev != NULL) + devfs_remove_dev(sp->ds_bdev); + if (sp->ds_cdev != NULL) + devfs_remove_dev(sp->ds_cdev); +#endif + free_ds_label(ssp, slice); + } + free(ssp, M_DEVBUF); + *sspp = NULL; +} + +/* + * For the "write" commands (DIOCSBAD, DIOCSDINFO and DIOCWDINFO), this + * is subject to the same restriction as dsopen(). 
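+ * DIOCWDINFO, for example, first installs the new label exactly as
+ * DIOCSDINFO does and then calls writedisklabel() with label write
+ * protection temporarily lifted.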
+ */ +int +dsioctl(dname, dev, cmd, data, flags, sspp, strat, setgeom) + char *dname; + dev_t dev; + u_long cmd; + caddr_t data; + int flags; + struct diskslices **sspp; + d_strategy_t *strat; + ds_setgeom_t *setgeom; +{ + int error; + struct disklabel *lp; + int old_wlabel; + u_char openmask; + int part; + int slice; + struct diskslice *sp; + struct diskslices *ssp; + + slice = dkslice(dev); + ssp = *sspp; + sp = &ssp->dss_slices[slice]; + lp = sp->ds_label; + switch (cmd) { + + case DIOCGDINFO: + if (lp == NULL) + return (EINVAL); + *(struct disklabel *)data = *lp; + return (0); + +#ifdef notyet + case DIOCGDINFOP: + if (lp == NULL) + return (EINVAL); + *(struct disklabel **)data = lp; + return (0); +#endif + + case DIOCGPART: + if (lp == NULL) + return (EINVAL); + ((struct partinfo *)data)->disklab = lp; + ((struct partinfo *)data)->part + = &lp->d_partitions[dkpart(dev)]; + return (0); + + case DIOCGSLICEINFO: + bcopy(ssp, data, (char *)&ssp->dss_slices[ssp->dss_nslices] - + (char *)ssp); + return (0); + + case DIOCSBAD: + if (slice == WHOLE_DISK_SLICE) + return (ENODEV); + if (!(flags & FWRITE)) + return (EBADF); + if (lp == NULL) + return (EINVAL); + if (sp->ds_bad != NULL) + free(sp->ds_bad, M_DEVBUF); + set_ds_bad(ssp, slice, internbad144((struct dkbad *)data, lp)); + return (0); + + case DIOCSDINFO: + if (slice == WHOLE_DISK_SLICE) + return (ENODEV); + if (!(flags & FWRITE)) + return (EBADF); + lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK); + if (sp->ds_label == NULL) + bzero(lp, sizeof *lp); + else + bcopy(sp->ds_label, lp, sizeof *lp); + if (sp->ds_label == NULL) + openmask = 0; + else { + openmask = sp->ds_openmask; + if (slice == COMPATIBILITY_SLICE) + openmask |= ssp->dss_slices[ + ssp->dss_first_bsd_slice].ds_openmask; + else if (slice == ssp->dss_first_bsd_slice) + openmask |= ssp->dss_slices[ + COMPATIBILITY_SLICE].ds_openmask; + } + error = setdisklabel(lp, (struct disklabel *)data, + (u_long)openmask); + /* XXX why doesn't setdisklabel() check this? */ + if (error == 0 && lp->d_partitions[RAW_PART].p_offset != 0) + error = EXDEV; + if (error == 0) { + if (lp->d_secperunit > sp->ds_size) + error = ENOSPC; + for (part = 0; part < lp->d_npartitions; part++) + if (lp->d_partitions[part].p_size > sp->ds_size) + error = ENOSPC; + } +#if 0 /* XXX */ + if (error != 0 && setgeom != NULL) + error = setgeom(lp); +#endif + if (error != 0) { + free(lp, M_DEVBUF); + return (error); + } + free_ds_label(ssp, slice); + set_ds_label(ssp, slice, lp); +#ifdef DEVFS + set_ds_labeldevs(dname, dev, ssp); +#endif + return (0); + + case DIOCSYNCSLICEINFO: + if (slice != WHOLE_DISK_SLICE || dkpart(dev) != RAW_PART) + return (EINVAL); + if (!*(int *)data) + for (slice = 0; slice < ssp->dss_nslices; slice++) { + openmask = ssp->dss_slices[slice].ds_openmask; + if (openmask + && (slice != WHOLE_DISK_SLICE + || openmask & ~(1 << RAW_PART))) + return (EBUSY); + } + + /* + * Temporarily forget the current slices struct and read + * the current one. + * XXX should wait for current accesses on this disk to + * complete, then lock out future accesses and opens. + */ + *sspp = NULL; + lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK); + *lp = *ssp->dss_slices[WHOLE_DISK_SLICE].ds_label; + error = dsopen(dname, dev, + ssp->dss_slices[WHOLE_DISK_SLICE].ds_copenmask + & (1 << RAW_PART) ? S_IFCHR : S_IFBLK, + ssp->dss_oflags, sspp, lp, strat, setgeom, + ssp->dss_cdevsw); + if (error != 0) { + free(lp, M_DEVBUF); + *sspp = ssp; + return (error); + } + + /* + * Reopen everything. 
This is a no-op except in the "force" + * case and when the raw bdev and cdev are both open. Abort + * if anything fails. + */ + for (slice = 0; slice < ssp->dss_nslices; slice++) { + for (openmask = ssp->dss_slices[slice].ds_bopenmask, + part = 0; openmask; openmask >>= 1, part++) { + if (!(openmask & 1)) + continue; + error = dsopen(dname, + dkmodslice(dkmodpart(dev, part), + slice), + S_IFBLK, ssp->dss_oflags, sspp, + lp, strat, setgeom, + ssp->dss_cdevsw); + if (error != 0) { + /* XXX should free devfs toks. */ + free(lp, M_DEVBUF); + /* XXX should restore devfs toks. */ + *sspp = ssp; + return (EBUSY); + } + } + for (openmask = ssp->dss_slices[slice].ds_copenmask, + part = 0; openmask; openmask >>= 1, part++) { + if (!(openmask & 1)) + continue; + error = dsopen(dname, + dkmodslice(dkmodpart(dev, part), + slice), + S_IFCHR, ssp->dss_oflags, sspp, + lp, strat, setgeom, + ssp->dss_cdevsw); + if (error != 0) { + /* XXX should free devfs toks. */ + free(lp, M_DEVBUF); + /* XXX should restore devfs toks. */ + *sspp = ssp; + return (EBUSY); + } + } + } + + /* XXX devfs tokens? */ + free(lp, M_DEVBUF); + dsgone(&ssp); + return (0); + + case DIOCWDINFO: + error = dsioctl(dname, dev, DIOCSDINFO, data, flags, &ssp, + strat, setgeom); + if (error != 0) + return (error); + /* + * XXX this used to hack on dk_openpart to fake opening + * partition 0 in case that is used instead of dkpart(dev). + */ + old_wlabel = sp->ds_wlabel; + set_ds_wlabel(ssp, slice, TRUE); + error = writedisklabel(dev, strat, sp->ds_label); + /* XXX should invalidate in-core label if write failed. */ + set_ds_wlabel(ssp, slice, old_wlabel); + return (error); + + case DIOCWLABEL: + if (slice == WHOLE_DISK_SLICE) + return (ENODEV); + if (!(flags & FWRITE)) + return (EBADF); + set_ds_wlabel(ssp, slice, *(int *)data != 0); + return (0); + + default: + return (ENOIOCTL); + } +} + +static void +dsiodone(bp) + struct buf *bp; +{ + struct iodone_chain *ic; + char *msg; + + ic = bp->b_iodone_chain; + bp->b_flags = (ic->ic_prev_flags & B_CALL) + | (bp->b_flags & ~(B_CALL | B_DONE)); + bp->b_iodone = ic->ic_prev_iodone; + bp->b_iodone_chain = ic->ic_prev_iodone_chain; + if (!(bp->b_flags & B_READ) + || (!(bp->b_flags & B_ERROR) && bp->b_error == 0)) { + msg = fixlabel((char *)NULL, ic->ic_args[1].ia_ptr, + (struct disklabel *) + (bp->b_data + ic->ic_args[0].ia_long), + FALSE); + if (msg != NULL) + printf("%s\n", msg); + } + free(ic, M_DEVBUF); + biodone(bp); +} + +int +dsisopen(ssp) + struct diskslices *ssp; +{ + int slice; + + if (ssp == NULL) + return (0); + for (slice = 0; slice < ssp->dss_nslices; slice++) + if (ssp->dss_slices[slice].ds_openmask) + return (1); + return (0); +} + +/* + * Allocate a slices "struct" and initialize it to contain only an empty + * compatibility slice (pointing to itself), a whole disk slice (covering + * the disk as described by the label), and (nslices - BASE_SLICES) empty + * slices beginning at BASE_SLICE. 
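+ * Typical callers are dsinit(), which asks for MAX_SLICES entries before
+ * probing the DOS partition table, and dsopen(), which starts with a
+ * minimal BASE_SLICE-entry struct that dsinit() may later replace.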
+ */ +struct diskslices * +dsmakeslicestruct(nslices, lp) + int nslices; + struct disklabel *lp; +{ + struct diskslice *sp; + struct diskslices *ssp; + + ssp = malloc(offsetof(struct diskslices, dss_slices) + + nslices * sizeof *sp, M_DEVBUF, M_WAITOK); + ssp->dss_cdevsw = NULL; + ssp->dss_first_bsd_slice = COMPATIBILITY_SLICE; + ssp->dss_nslices = nslices; + ssp->dss_oflags = 0; + ssp->dss_secmult = lp->d_secsize / DEV_BSIZE; + if (ssp->dss_secmult & (ssp->dss_secmult - 1)) + ssp->dss_secshift = -1; + else + ssp->dss_secshift = ffs(ssp->dss_secmult) - 1; + ssp->dss_secsize = lp->d_secsize; + sp = &ssp->dss_slices[0]; + bzero(sp, nslices * sizeof *sp); + sp[WHOLE_DISK_SLICE].ds_size = lp->d_secperunit; + return (ssp); +} + +char * +dsname(dname, unit, slice, part, partname) + char *dname; + int unit; + int slice; + int part; + char *partname; +{ + static char name[32]; + + if (strlen(dname) > 16) + dname = "nametoolong"; + snprintf(name, sizeof(name), "%s%d", dname, unit); + partname[0] = '\0'; + if (slice != WHOLE_DISK_SLICE || part != RAW_PART) { + partname[0] = 'a' + part; + partname[1] = '\0'; + if (slice != COMPATIBILITY_SLICE) + snprintf(name + strlen(name), + sizeof(name) - strlen(name), "s%d", slice - 1); + } + return (name); +} + +/* + * This should only be called when the unit is inactive and the strategy + * routine should not allow it to become active unless we call it. Our + * strategy routine must be special to allow activity. + */ +int +dsopen(dname, dev, mode, flags, sspp, lp, strat, setgeom, cdevsw) + char *dname; + dev_t dev; + int mode; + u_int flags; + struct diskslices **sspp; + struct disklabel *lp; + d_strategy_t *strat; + ds_setgeom_t *setgeom; + struct cdevsw *cdevsw; +{ + struct dkbad *btp; + dev_t dev1; + int error; + struct disklabel *lp1; + char *msg; + u_char mask; +#ifdef DEVFS + int mynor; +#endif + bool_t need_init; + int part; + char partname[2]; + int slice; + char *sname; + struct diskslice *sp; + struct diskslices *ssp; + int unit; + + if (lp->d_secsize % DEV_BSIZE) + return (EINVAL); + + /* + * XXX reinitialize the slice table unless there is an open device + * on the unit. This should only be done if the media has changed. + */ + ssp = *sspp; + need_init = !dsisopen(ssp); + if (ssp != NULL && need_init) + dsgone(sspp); + if (need_init) { + /* + * Allocate a minimal slices "struct". This will become + * the final slices "struct" if we don't want real slices + * or if we can't find any real slices. + */ + *sspp = dsmakeslicestruct(BASE_SLICE, lp); + + if (!(flags & DSO_ONESLICE)) { + TRACE(("dsinit\n")); + error = dsinit(dname, dev, strat, lp, sspp); + if (error != 0) { + dsgone(sspp); + return (error); + } + } + ssp = *sspp; + ssp->dss_oflags = flags; +#ifdef DEVFS + ssp->dss_cdevsw = cdevsw; +#endif + + /* + * If there are no real slices, then make the compatiblity + * slice cover the whole disk. + */ + if (ssp->dss_nslices == BASE_SLICE) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_size + = lp->d_secperunit; + + /* Point the compatibility slice at the BSD slice, if any. 
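+ * (the first slice of type DOSPTYP_386BSD found; its offset, size and type
+ * are copied so that the traditional unsliced device names refer to it)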
*/ + for (slice = BASE_SLICE; slice < ssp->dss_nslices; slice++) { + sp = &ssp->dss_slices[slice]; + if (sp->ds_type == DOSPTYP_386BSD /* XXX */) { + ssp->dss_first_bsd_slice = slice; + ssp->dss_slices[COMPATIBILITY_SLICE].ds_offset + = sp->ds_offset; + ssp->dss_slices[COMPATIBILITY_SLICE].ds_size + = sp->ds_size; + ssp->dss_slices[COMPATIBILITY_SLICE].ds_type + = sp->ds_type; + break; + } + } + + ssp->dss_slices[WHOLE_DISK_SLICE].ds_label = clone_label(lp); + ssp->dss_slices[WHOLE_DISK_SLICE].ds_wlabel = TRUE; + if (setgeom != NULL) { + error = setgeom(lp); + if (error != 0) { + dsgone(sspp); + return (error); + } + } + } + + unit = dkunit(dev); + + /* + * Initialize secondary info for all slices. It is needed for more + * than the current slice in the DEVFS case. + */ + for (slice = 0; slice < ssp->dss_nslices; slice++) { + sp = &ssp->dss_slices[slice]; + if (sp->ds_label != NULL) + continue; + dev1 = dkmodslice(dkmodpart(dev, RAW_PART), slice); + sname = dsname(dname, unit, slice, RAW_PART, partname); +#ifdef DEVFS + if (slice != COMPATIBILITY_SLICE && sp->ds_bdev == NULL + && sp->ds_size != 0) { + mynor = minor(dev1); + sp->ds_bdev = + devfs_add_devswf(bdevsw, mynor, DV_BLK, + UID_ROOT, GID_OPERATOR, 0640, + "%s", sname); + sp->ds_cdev = + devfs_add_devswf(cdevsw, mynor, DV_CHR, + UID_ROOT, GID_OPERATOR, 0640, + "r%s", sname); + } +#endif + /* + * XXX this should probably only be done for the need_init + * case, but there may be a problem with DIOCSYNCSLICEINFO. + */ + set_ds_wlabel(ssp, slice, TRUE); /* XXX invert */ + lp1 = clone_label(lp); + TRACE(("readdisklabel\n")); + if (flags & DSO_NOLABELS) + msg = NULL; + else + msg = readdisklabel(dev1, strat, lp1); +#if 0 /* XXX */ + if (msg == NULL && setgeom != NULL && setgeom(lp1) != 0) + msg = "setgeom failed"; +#endif + if (msg == NULL) + msg = fixlabel(sname, sp, lp1, FALSE); + if (msg == NULL && lp1->d_secsize != ssp->dss_secsize) + msg = "inconsistent sector size"; + if (msg != NULL) { + free(lp1, M_DEVBUF); + if (sp->ds_type == DOSPTYP_386BSD /* XXX */) + log(LOG_WARNING, "%s: cannot find label (%s)\n", + sname, msg); + continue; + } + if (lp1->d_flags & D_BADSECT) { + btp = malloc(sizeof *btp, M_DEVBUF, M_WAITOK); + TRACE(("readbad144\n")); + msg = readbad144(dev1, strat, lp1, btp); + if (msg != NULL) { + log(LOG_WARNING, + "%s: cannot find bad sector table (%s)\n", + sname, msg); + free(btp, M_DEVBUF); + free(lp1, M_DEVBUF); + continue; + } + set_ds_bad(ssp, slice, internbad144(btp, lp1)); + free(btp, M_DEVBUF); + if (sp->ds_bad == NULL) { + free(lp1, M_DEVBUF); + continue; + } + } + set_ds_label(ssp, slice, lp1); +#ifdef DEVFS + set_ds_labeldevs(dname, dev1, ssp); +#endif + set_ds_wlabel(ssp, slice, FALSE); + } + + slice = dkslice(dev); + if (slice >= ssp->dss_nslices) + return (ENXIO); + sp = &ssp->dss_slices[slice]; + part = dkpart(dev); + if (part != RAW_PART + && (sp->ds_label == NULL || part >= sp->ds_label->d_npartitions)) + return (EINVAL); /* XXX needs translation */ + mask = 1 << part; + switch (mode) { + case S_IFBLK: + sp->ds_bopenmask |= mask; + break; + case S_IFCHR: + sp->ds_copenmask |= mask; + break; + } + sp->ds_openmask = sp->ds_bopenmask | sp->ds_copenmask; + return (0); +} + +int +dssize(dev, sspp, dopen, dclose) + dev_t dev; + struct diskslices **sspp; + d_open_t dopen; + d_close_t dclose; +{ + struct disklabel *lp; + int part; + int slice; + struct diskslices *ssp; + + slice = dkslice(dev); + part = dkpart(dev); + ssp = *sspp; + if (ssp == NULL || slice >= ssp->dss_nslices + || 
!(ssp->dss_slices[slice].ds_bopenmask & (1 << part))) { + if (dopen(dev, FREAD, S_IFBLK, (struct proc *)NULL) != 0) + return (-1); + dclose(dev, FREAD, S_IFBLK, (struct proc *)NULL); + ssp = *sspp; + } + lp = ssp->dss_slices[slice].ds_label; + if (lp == NULL) + return (-1); + return ((int)lp->d_partitions[part].p_size); +} + +static void +free_ds_label(ssp, slice) + struct diskslices *ssp; + int slice; +{ + struct disklabel *lp; + struct diskslice *sp; + + sp = &ssp->dss_slices[slice]; + lp = sp->ds_label; + if (lp == NULL) + return; +#ifdef DEVFS + free_ds_labeldevs(ssp, slice); + if (slice == COMPATIBILITY_SLICE) + free_ds_labeldevs(ssp, ssp->dss_first_bsd_slice); + else if (slice == ssp->dss_first_bsd_slice) + free_ds_labeldevs(ssp, COMPATIBILITY_SLICE); +#endif + free(lp, M_DEVBUF); + set_ds_label(ssp, slice, (struct disklabel *)NULL); +} + +#ifdef DEVFS +static void +free_ds_labeldevs(ssp, slice) + struct diskslices *ssp; + int slice; +{ + struct disklabel *lp; + int part; + struct diskslice *sp; + + sp = &ssp->dss_slices[slice]; + lp = sp->ds_label; + if (lp == NULL) + return; + for (part = 0; part < lp->d_npartitions; part++) { + if (sp->ds_bdevs[part] != NULL) { + devfs_remove_dev(sp->ds_bdevs[part]); + sp->ds_bdevs[part] = NULL; + } + if (sp->ds_cdevs[part] != NULL) { + devfs_remove_dev(sp->ds_cdevs[part]); + sp->ds_cdevs[part] = NULL; + } + } +} +#endif + +static char * +fixlabel(sname, sp, lp, writeflag) + char *sname; + struct diskslice *sp; + struct disklabel *lp; + int writeflag; +{ + u_long end; + u_long offset; + int part; + struct partition *pp; + u_long start; + bool_t warned; + + /* These errors "can't happen" so don't bother reporting details. */ + if (lp->d_magic != DISKMAGIC || lp->d_magic2 != DISKMAGIC) + return ("fixlabel: invalid magic"); + if (dkcksum(lp) != 0) + return ("fixlabel: invalid checksum"); + + pp = &lp->d_partitions[RAW_PART]; + if (writeflag) { + start = 0; + offset = sp->ds_offset; + } else { + start = sp->ds_offset; + offset = -sp->ds_offset; + } + if (pp->p_offset != start) { + if (sname != NULL) { + printf( +"%s: rejecting BSD label: raw partition offset != slice offset\n", + sname); + slice_info(sname, sp); + partition_info(sname, RAW_PART, pp); + } + return ("fixlabel: raw partition offset != slice offset"); + } + if (pp->p_size != sp->ds_size) { + if (sname != NULL) { + printf("%s: raw partition size != slice size\n", sname); + slice_info(sname, sp); + partition_info(sname, RAW_PART, pp); + } + if (pp->p_size > sp->ds_size) { + if (sname == NULL) + return ("fixlabel: raw partition size > slice size"); + printf("%s: truncating raw partition\n", sname); + pp->p_size = sp->ds_size; + } + } + end = start + sp->ds_size; + if (start > end) + return ("fixlabel: slice wraps"); + if (lp->d_secpercyl <= 0) + return ("fixlabel: d_secpercyl <= 0"); + pp -= RAW_PART; + warned = FALSE; + for (part = 0; part < lp->d_npartitions; part++, pp++) { + if (pp->p_offset != 0 || pp->p_size != 0) { + if (pp->p_offset < start + || pp->p_offset + pp->p_size > end + || pp->p_offset + pp->p_size < pp->p_offset) { + if (sname != NULL) { + printf( +"%s: rejecting partition in BSD label: it isn't entirely within the slice\n", + sname); + if (!warned) { + slice_info(sname, sp); + warned = TRUE; + } + partition_info(sname, part, pp); + } + /* XXX else silently discard junk. 
*/ + bzero(pp, sizeof *pp); + } else + pp->p_offset += offset; + } + } + lp->d_ncylinders = sp->ds_size / lp->d_secpercyl; + lp->d_secperunit = sp->ds_size; + lp->d_checksum = 0; + lp->d_checksum = dkcksum(lp); + return (NULL); +} + +static void +partition_info(sname, part, pp) + char *sname; + int part; + struct partition *pp; +{ + printf("%s%c: start %lu, end %lu, size %lu\n", sname, 'a' + part, + (u_long)pp->p_offset, (u_long)(pp->p_offset + pp->p_size - 1), + (u_long)pp->p_size); +} + +static void +slice_info(sname, sp) + char *sname; + struct diskslice *sp; +{ + printf("%s: start %lu, end %lu, size %lu\n", sname, + sp->ds_offset, sp->ds_offset + sp->ds_size - 1, sp->ds_size); +} + +/* + * Most changes to ds_bad, ds_label and ds_wlabel are made using the + * following functions to ensure coherency of the compatibility slice + * with the first BSD slice. The openmask fields are _not_ shared and + * the other fields (ds_offset and ds_size) aren't changed after they + * are initialized. + */ +static void +set_ds_bad(ssp, slice, btp) + struct diskslices *ssp; + int slice; + struct dkbad_intern *btp; +{ + ssp->dss_slices[slice].ds_bad = btp; + if (slice == COMPATIBILITY_SLICE) + ssp->dss_slices[ssp->dss_first_bsd_slice].ds_bad = btp; + else if (slice == ssp->dss_first_bsd_slice) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_bad = btp; +} + +static void +set_ds_label(ssp, slice, lp) + struct diskslices *ssp; + int slice; + struct disklabel *lp; +{ + ssp->dss_slices[slice].ds_label = lp; + if (slice == COMPATIBILITY_SLICE) + ssp->dss_slices[ssp->dss_first_bsd_slice].ds_label = lp; + else if (slice == ssp->dss_first_bsd_slice) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_label = lp; +} + +#ifdef DEVFS +static void +set_ds_labeldevs(dname, dev, ssp) + char *dname; + dev_t dev; + struct diskslices *ssp; +{ + int slice; + + set_ds_labeldevs_unaliased(dname, dev, ssp); + if (ssp->dss_first_bsd_slice == COMPATIBILITY_SLICE) + return; + slice = dkslice(dev); + if (slice == COMPATIBILITY_SLICE) + set_ds_labeldevs_unaliased(dname, + dkmodslice(dev, ssp->dss_first_bsd_slice), ssp); + else if (slice == ssp->dss_first_bsd_slice) + set_ds_labeldevs_unaliased(dname, + dkmodslice(dev, COMPATIBILITY_SLICE), ssp); +} + +static void +set_ds_labeldevs_unaliased(dname, dev, ssp) + char *dname; + dev_t dev; + struct diskslices *ssp; +{ + struct disklabel *lp; + int mynor; + int part; + char partname[2]; + struct partition *pp; + int slice; + char *sname; + struct diskslice *sp; + + slice = dkslice(dev); + sp = &ssp->dss_slices[slice]; + if (sp->ds_size == 0) + return; + lp = sp->ds_label; + for (part = 0; part < lp->d_npartitions; part++) { + pp = &lp->d_partitions[part]; + if (pp->p_size == 0) + continue; + sname = dsname(dname, dkunit(dev), slice, part, partname); + if (part == RAW_PART && sp->ds_bdev != NULL) { + sp->ds_bdevs[part] = + devfs_makelink(sp->ds_bdev, + "%s%s", sname, partname); + sp->ds_cdevs[part] = + devfs_makelink(sp->ds_cdev, + "r%s%s", sname, partname); + } else { + mynor = minor(dkmodpart(dev, part)); + sp->ds_bdevs[part] = + devfs_add_devswf(ssp->dss_cdevsw, mynor, DV_BLK, + UID_ROOT, GID_OPERATOR, 0640, + "%s%s", sname, partname); + sp->ds_cdevs[part] = + devfs_add_devswf(ssp->dss_cdevsw, mynor, DV_CHR, + UID_ROOT, GID_OPERATOR, 0640, + "r%s%s", sname, partname); + } + } +} +#endif /* DEVFS */ + +static void +set_ds_wlabel(ssp, slice, wlabel) + struct diskslices *ssp; + int slice; + int wlabel; +{ + ssp->dss_slices[slice].ds_wlabel = wlabel; + if (slice == COMPATIBILITY_SLICE) + 
ssp->dss_slices[ssp->dss_first_bsd_slice].ds_wlabel = wlabel; + else if (slice == ssp->dss_first_bsd_slice) + ssp->dss_slices[COMPATIBILITY_SLICE].ds_wlabel = wlabel; +} diff --git a/sys/kern/subr_dkbad.c b/sys/kern/subr_dkbad.c new file mode 100644 index 0000000..4686a17 --- /dev/null +++ b/sys/kern/subr_dkbad.c @@ -0,0 +1,160 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. + * + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)wd.c 7.2 (Berkeley) 5/9/91 + * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $ + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $Id: subr_dkbad.c,v 1.7 1997/11/24 04:14:21 dyson Exp $ + */ + +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/disklabel.h> +#include <sys/dkbad.h> +#include <sys/malloc.h> + +/* + * Internalize the bad sector table. + * TODO: + * o Fix types. + * Type long should be daddr_t since we compare with blkno's. + * Sentinel -1 should be ((daddr_t)-1). + * o Can remove explicit test for sentinel if it is a positive + * (unsigned or not) value larger than all possible blkno's. + * o Check that the table is sorted. + * o Use faster searches. + * o Use the internal table in wddump(). + * o Don't duplicate so much code. + * o Do all bad block handing in a driver-independent file. + * o Remove limit of 126 spare sectors. 
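+ * Each entry's cylinder and track/sector fields are converted to an
+ * absolute sector number (cyl * secpercyl + track * nsectors + sector);
+ * transbad144() later remaps the i'th such sector to spare sector
+ * bi_maxspare - i.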
+ */ +struct dkbad_intern * +internbad144(btp, lp) + struct dkbad *btp; + struct disklabel *lp; +{ + struct dkbad_intern *bip; + int i; + + bip = malloc(sizeof *bip, M_DEVBUF, M_WAITOK); + /* + * Spare sectors are allocated beginning with the last sector of + * the second last track of the disk (the last track is used for + * the bad sector list). + */ + bip->bi_maxspare = lp->d_secperunit - lp->d_nsectors - 1; + bip->bi_nbad = DKBAD_MAXBAD; + i = 0; + for (; i < DKBAD_MAXBAD && btp->bt_bad[i].bt_cyl != DKBAD_NOCYL; i++) + bip->bi_bad[i] = btp->bt_bad[i].bt_cyl * lp->d_secpercyl + + (btp->bt_bad[i].bt_trksec >> 8) + * lp->d_nsectors + + (btp->bt_bad[i].bt_trksec & 0x00ff); + bip->bi_bad[i] = -1; + return (bip); +} + +char * +readbad144(dev, strat, lp, bdp) + dev_t dev; + d_strategy_t *strat; + struct disklabel *lp; + struct dkbad *bdp; +{ + struct buf *bp; + struct dkbad *db; + int i; + char *msg; + + bp = geteblk((int)lp->d_secsize); + i = 0; + do { + /* Read a bad sector table. */ + bp->b_dev = dev; + bp->b_blkno = lp->d_secperunit - lp->d_nsectors + i; + if (lp->d_secsize > DEV_BSIZE) + bp->b_blkno *= lp->d_secsize / DEV_BSIZE; + else + bp->b_blkno /= DEV_BSIZE / lp->d_secsize; + bp->b_bcount = lp->d_secsize; + bp->b_flags |= B_BUSY | B_READ; + bp->b_flags &= ~B_ERROR; + (*strat)(bp); + + /* If successful, validate, otherwise try another. */ + if (biowait(bp) == 0) { + db = (struct dkbad *)(bp->b_data); + if (db->bt_mbz == 0 && db->bt_flag == DKBAD_MAGIC) { + msg = NULL; + *bdp = *db; + break; + } + msg = "bad sector table corrupted"; + } else + msg = "bad sector table I/O error"; + } while ((bp->b_flags & B_ERROR) && (i += 2) < 10 && + i < lp->d_nsectors); + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (msg); +} + +daddr_t +transbad144(bip, blkno) + struct dkbad_intern *bip; + daddr_t blkno; +{ + int i; + + /* + * List is sorted, so the search can terminate when it is past our + * sector. + */ + for (i = 0; bip->bi_bad[i] != -1 && bip->bi_bad[i] <= blkno; i++) + if (bip->bi_bad[i] == blkno) + /* + * Spare sectors are allocated in decreasing order. + */ + return (bip->bi_maxspare - i); + return (blkno); +} diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c new file mode 100644 index 0000000..1204376 --- /dev/null +++ b/sys/kern/subr_log.c @@ -0,0 +1,274 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_log.c 8.1 (Berkeley) 6/10/93 + * $Id: subr_log.c,v 1.32 1998/11/11 10:55:56 truckman Exp $ + */ + +/* + * Error log buffer for kernel printf's. + */ + +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/msgbuf.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/poll.h> +#include <sys/filedesc.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +#define LOG_RDPRI (PZERO + 1) + +#define LOG_ASYNC 0x04 +#define LOG_RDWAIT 0x08 + +static d_open_t logopen; +static d_close_t logclose; +static d_read_t logread; +static d_ioctl_t logioctl; +static d_poll_t logpoll; + +#define CDEV_MAJOR 7 +static struct cdevsw log_cdevsw = + { logopen, logclose, logread, nowrite, /*7*/ + logioctl, nostop, nullreset, nodevtotty,/* klog */ + logpoll, nommap, NULL, "log", NULL, -1 }; + +static struct logsoftc { + int sc_state; /* see above for possibilities */ + struct selinfo sc_selp; /* process waiting on select call */ + struct sigio *sc_sigio; /* information for async I/O */ +} logsoftc; + +int log_open; /* also used in log() */ + +/*ARGSUSED*/ +static int +logopen(dev, flags, mode, p) + dev_t dev; + int flags, mode; + struct proc *p; +{ + if (log_open) + return (EBUSY); + log_open = 1; + fsetown(p->p_pid, &logsoftc.sc_sigio); /* signal process only */ + return (0); +} + +/*ARGSUSED*/ +static int +logclose(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + + log_open = 0; + logsoftc.sc_state = 0; + funsetown(logsoftc.sc_sigio); + return (0); +} + +/*ARGSUSED*/ +static int +logread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct msgbuf *mbp = msgbufp; + register long l; + register int s; + int error = 0; + + s = splhigh(); + while (mbp->msg_bufr == mbp->msg_bufx) { + if (flag & IO_NDELAY) { + splx(s); + return (EWOULDBLOCK); + } + logsoftc.sc_state |= LOG_RDWAIT; + if ((error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH, + "klog", 0))) { + splx(s); + return (error); + } + } + splx(s); + logsoftc.sc_state &= ~LOG_RDWAIT; + + while (uio->uio_resid > 0) { + l = mbp->msg_bufx - mbp->msg_bufr; + if (l < 0) + l = mbp->msg_size - mbp->msg_bufr; + l = min(l, uio->uio_resid); + if (l == 0) + break; + error = uiomove((caddr_t)msgbufp->msg_ptr + mbp->msg_bufr, + (int)l, uio); + if (error) + break; + mbp->msg_bufr += l; + if (mbp->msg_bufr >= mbp->msg_size) + mbp->msg_bufr = 0; + } + return (error); +} + +/*ARGSUSED*/ +static int +logpoll(dev, events, p) + dev_t dev; + int events; + struct proc *p; +{ + int s; + int revents = 0; + + s = splhigh(); + + if (events & (POLLIN | POLLRDNORM)) + if 
(msgbufp->msg_bufr != msgbufp->msg_bufx) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(p, &logsoftc.sc_selp); + + splx(s); + return (revents); +} + +void +logwakeup() +{ + if (!log_open) + return; + selwakeup(&logsoftc.sc_selp); + if ((logsoftc.sc_state & LOG_ASYNC) && logsoftc.sc_sigio != NULL) + pgsigio(logsoftc.sc_sigio, SIGIO, 0); + if (logsoftc.sc_state & LOG_RDWAIT) { + wakeup((caddr_t)msgbufp); + logsoftc.sc_state &= ~LOG_RDWAIT; + } +} + +/*ARGSUSED*/ +static int +logioctl(dev, com, data, flag, p) + dev_t dev; + u_long com; + caddr_t data; + int flag; + struct proc *p; +{ + long l; + int s; + + switch (com) { + + /* return number of characters immediately available */ + case FIONREAD: + s = splhigh(); + l = msgbufp->msg_bufx - msgbufp->msg_bufr; + splx(s); + if (l < 0) + l += msgbufp->msg_size; + *(int *)data = l; + break; + + case FIONBIO: + break; + + case FIOASYNC: + if (*(int *)data) + logsoftc.sc_state |= LOG_ASYNC; + else + logsoftc.sc_state &= ~LOG_ASYNC; + break; + + case FIOSETOWN: + return (fsetown(*(int *)data, &logsoftc.sc_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(logsoftc.sc_sigio); + break; + + /* This is deprecated, FIOSETOWN should be used instead. */ + case TIOCSPGRP: + return (fsetown(-(*(int *)data), &logsoftc.sc_sigio)); + + /* This is deprecated, FIOGETOWN should be used instead */ + case TIOCGPGRP: + *(int *)data = -fgetown(logsoftc.sc_sigio); + break; + + default: + return (ENOTTY); + } + return (0); +} + +static int log_devsw_installed; +#ifdef DEVFS +static void *log_devfs_token; +#endif + +static void log_drvinit __P((void *unused)); +static void +log_drvinit(unused) + void *unused; +{ + dev_t dev; + + if( ! log_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&log_cdevsw,NULL); + log_devsw_installed = 1; +#ifdef DEVFS + log_devfs_token = devfs_add_devswf(&log_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0600, + "klog"); +#endif + } +} + +SYSINIT(logdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,log_drvinit,NULL) diff --git a/sys/kern/subr_module.c b/sys/kern/subr_module.c new file mode 100644 index 0000000..7eb635a --- /dev/null +++ b/sys/kern/subr_module.c @@ -0,0 +1,267 @@ +/*- + * Copyright (c) 1998 Michael Smith + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: subr_module.c,v 1.3 1998/10/12 09:03:48 peter Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/linker.h> + +/* + * Preloaded module support + */ + +caddr_t preload_metadata; + +/* + * Search for the preloaded module (name) + */ +caddr_t +preload_search_by_name(const char *name) +{ + caddr_t curp; + u_int32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Search for a MODINFO_NAME field */ + if ((hdr[0] == MODINFO_NAME) && + !strcmp(name, curp + sizeof(u_int32_t) * 2)) + return(curp); + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +/* + * Search for the first preloaded module of (type) + */ +caddr_t +preload_search_by_type(const char *type) +{ + caddr_t curp, lname; + u_int32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + lname = NULL; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* remember the start of each record */ + if (hdr[0] == MODINFO_NAME) + lname = curp; + + /* Search for a MODINFO_TYPE field */ + if ((hdr[0] == MODINFO_TYPE) && + !strcmp(type, curp + sizeof(u_int32_t) * 2)) + return(lname); + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +/* + * Walk through the preloaded module list + */ +caddr_t +preload_search_next_name(caddr_t base) +{ + caddr_t curp; + u_int32_t *hdr; + int next; + + if (preload_metadata != NULL) { + + /* Pick up where we left off last time */ + if (base) { + /* skip to next field */ + curp = base; + hdr = (u_int32_t *)curp; + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } else + curp = preload_metadata; + + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Found a new record? */ + if (hdr[0] == MODINFO_NAME) + return curp; + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } + return(NULL); +} + +/* + * Given a preloaded module handle (mod), return a pointer + * to the data for the attribute (inf). + */ +caddr_t +preload_search_info(caddr_t mod, int inf) +{ + caddr_t curp; + u_int32_t *hdr; + u_int32_t type = 0; + int next; + + curp = mod; + for (;;) { + hdr = (u_int32_t *)curp; + /* end of module data? */ + if (hdr[0] == 0 && hdr[1] == 0) + break; + /* + * We give up once we've looped back to what we were looking at + * first - this should normally be a MODINFO_NAME field. + */ + if (type == 0) { + type = hdr[0]; + } else { + if (hdr[0] == type) + break; + } + + /* + * Attribute match? Return pointer to data. + * Consumer may safely assume that size value preceeds + * data. 
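A consumer that has located a preloaded file can then fetch individual attributes from its metadata. A minimal sketch, assuming something was preloaded under the hypothetical name "kernel" and asking for its load address:

	caddr_t mod, attr;
	vm_offset_t addr;

	mod = preload_search_by_name("kernel");
	if (mod != NULL) {
		attr = preload_search_info(mod, MODINFO_ADDR);
		if (attr != NULL)
			addr = *(vm_offset_t *)attr;
	}

The size word stored just ahead of the returned pointer gives the length of the attribute data.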
+ */ + if (hdr[0] == inf) + return(curp + (sizeof(u_int32_t) * 2)); + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + return(NULL); +} + +/* + * Delete a preload record by name. + */ +void +preload_delete_name(const char *name) +{ + caddr_t curp; + u_int32_t *hdr; + int next; + int clearing; + + if (preload_metadata != NULL) { + + clearing = 0; + curp = preload_metadata; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Search for a MODINFO_NAME field */ + if (hdr[0] == MODINFO_NAME) { + if (!strcmp(name, curp + sizeof(u_int32_t) * 2)) + clearing = 1; /* got it, start clearing */ + else if (clearing) + clearing = 0; /* at next one now.. better stop */ + } + if (clearing) + hdr[0] = MODINFO_EMPTY; + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } +} + +/* Called from locore on i386. Convert physical pointers to kvm. Sigh. */ +void +preload_bootstrap_relocate(vm_offset_t offset) +{ + caddr_t curp; + u_int32_t *hdr; + vm_offset_t *ptr; + int next; + + if (preload_metadata != NULL) { + + curp = preload_metadata; + for (;;) { + hdr = (u_int32_t *)curp; + if (hdr[0] == 0 && hdr[1] == 0) + break; + + /* Deal with the ones that we know we have to fix */ + switch (hdr[0]) { + case MODINFO_ADDR: + case MODINFO_METADATA|MODINFOMD_SSYM: + case MODINFO_METADATA|MODINFOMD_ESYM: + ptr = (vm_offset_t *)(curp + (sizeof(u_int32_t) * 2)); + *ptr += offset; + break; + } + /* The rest is beyond us for now */ + + /* skip to next field */ + next = sizeof(u_int32_t) * 2 + hdr[1]; + next = roundup(next, sizeof(u_long)); + curp += next; + } + } +} diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c new file mode 100644 index 0000000..ef98c59 --- /dev/null +++ b/sys/kern/subr_param.c @@ -0,0 +1,189 @@ +/* + * Copyright (c) 1980, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)param.c 8.3 (Berkeley) 8/20/94 + * $Id: param.c,v 1.31 1998/11/05 14:28:17 dg Exp $ + */ + +#include <stddef.h> + +#include "opt_sysvipc.h" +#include "opt_param.h" + +#include <sys/param.h> + +#ifdef SYSVSHM +#include <machine/vmparam.h> +#include <sys/shm.h> +#endif +#ifdef SYSVSEM +#include <sys/sem.h> +#endif +#ifdef SYSVMSG +#include <sys/msg.h> +#endif + +/* + * System parameter formulae. + * + * This file is copied into each directory where we compile + * the kernel; it should be modified there to suit local taste + * if necessary. + * + * Compiled with -DMAXUSERS=xx + */ + +#ifndef HZ +#define HZ 100 +#endif +int hz = HZ; +int tick = 1000000 / HZ; +int tickadj = howmany(30000, 60 * HZ); /* can adjust 30ms in 60s */ +#define NPROC (20 + 16 * MAXUSERS) +#define MAXFILES (NPROC*2) +int maxproc = NPROC; /* maximum # of processes */ +int maxprocperuid = NPROC-1; /* maximum # of processes per user */ +int maxfiles = MAXFILES; /* system wide open files limit */ +int maxfilesperproc = MAXFILES; /* per-process open files limit */ +int ncallout = 16 + NPROC + MAXFILES; /* maximum # of timer events */ + +/* maximum # of mbuf clusters */ +#ifndef NMBCLUSTERS +#define NMBCLUSTERS (512 + MAXUSERS * 16) +#endif +int nmbclusters = NMBCLUSTERS; + +#if MAXFILES > NMBCLUSTERS +#define MAXSOCKETS MAXFILES +#else +#define MAXSOCKETS NMBCLUSTERS +#endif +int maxsockets = MAXSOCKETS; + +/* allocate 1/4th amount of virtual address space for mbufs XXX */ +int nmbufs = NMBCLUSTERS * 4; + +/* maximum # of sf_bufs (sendfile(2) zero-copy virtual buffers) */ +#ifndef NSFBUFS +#define NSFBUFS (512 + MAXUSERS * 16) +#endif +int nsfbufs = NSFBUFS; + +/* + * Values in support of System V compatible shared memory. XXX + */ +#ifdef SYSVSHM +#ifndef SHMMAX +#define SHMMAX (SHMMAXPGS*PAGE_SIZE) +#endif +#ifndef SHMMIN +#define SHMMIN 1 +#endif +#ifndef SHMMNI +#define SHMMNI 32 /* <= SHMMMNI in shm.h */ +#endif +#ifndef SHMSEG +#define SHMSEG 8 +#endif +#ifndef SHMALL +#define SHMALL (SHMMAXPGS) +#endif + +struct shminfo shminfo = { + SHMMAX, + SHMMIN, + SHMMNI, + SHMSEG, + SHMALL +}; +#endif + +/* + * Values in support of System V compatible semaphores. + */ + +#ifdef SYSVSEM + +struct seminfo seminfo = { + SEMMAP, /* # of entries in semaphore map */ + SEMMNI, /* # of semaphore identifiers */ + SEMMNS, /* # of semaphores in system */ + SEMMNU, /* # of undo structures in system */ + SEMMSL, /* max # of semaphores per id */ + SEMOPM, /* max # of operations per semop call */ + SEMUME, /* max # of undo entries per process */ + SEMUSZ, /* size in bytes of undo structure */ + SEMVMX, /* semaphore maximum value */ + SEMAEM /* adjust on exit max value */ +}; +#endif + +/* + * Values in support of System V compatible messages. 
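The MAXUSERS-derived formulae earlier in this file scale several kernel tables together. As a worked example (an arbitrary value, not a recommendation), a kernel built with -DMAXUSERS=32 gets:

	NPROC       = 20 + 16 * 32      = 532	(maxproc)
	MAXFILES    = 532 * 2           = 1064	(maxfiles, maxfilesperproc)
	ncallout    = 16 + 532 + 1064   = 1612
	NMBCLUSTERS = 512 + 32 * 16     = 1024	(unless overridden)
	NSFBUFS     = 512 + 32 * 16     = 1024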
+ */ + +#ifdef SYSVMSG + +struct msginfo msginfo = { + MSGMAX, /* max chars in a message */ + MSGMNI, /* # of message queue identifiers */ + MSGMNB, /* max chars in a queue */ + MSGTQL, /* max messages in system */ + MSGSSZ, /* size of a message segment */ + /* (must be small power of 2 greater than 4) */ + MSGSEG /* number of message segments */ +}; +#endif + +/* + * These may be set to nonzero here or by patching. + * If they are nonzero at bootstrap time then they are + * initialized to values dependent on the memory size. + */ +#ifdef NBUF +int nbuf = NBUF; +#else +int nbuf = 0; +#endif +int nswbuf = 0; + +/* + * These have to be allocated somewhere; allocating + * them here forces loader errors if this file is omitted + * (if they've been externed everywhere else; hah!). + */ +struct buf *swbuf; diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c new file mode 100644 index 0000000..424ac9f --- /dev/null +++ b/sys/kern/subr_prf.c @@ -0,0 +1,716 @@ +/*- + * Copyright (c) 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94 + * $Id: subr_prf.c,v 1.50 1998/09/06 06:25:04 ache Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/msgbuf.h> +#include <sys/malloc.h> +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/tprintf.h> +#include <sys/syslog.h> +#include <machine/cons.h> + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. + */ +#include <machine/stdarg.h> + +#define TOCONS 0x01 +#define TOTTY 0x02 +#define TOLOG 0x04 + +struct tty *constty; /* pointer to console "window" tty */ + +struct putchar_arg { + int flags; + struct tty *tty; +}; + +struct snprintf_arg { + char *str; + size_t remain; +}; + +static void (*v_putc)(int) = cnputc; /* routine to putc on virtual console */ +static void logpri __P((int level)); +static void msglogchar(int c, void *dummyarg); +static void putchar __P((int ch, void *arg)); +static char *ksprintn __P((u_long num, int base, int *len)); +static void snprintf_func __P((int ch, void *arg)); + +static int consintr = 1; /* Ok to handle console interrupts? */ +static int msgbufmapped; /* Set when safe to use msgbuf */ + +/* + * Warn that a system table is full. + */ +void +tablefull(tab) + const char *tab; +{ + + log(LOG_ERR, "%s: table is full\n", tab); +} + +/* + * Uprintf prints to the controlling terminal for the current process. + * It may block if the tty queue is overfull. No message is printed if + * the queue does not clear in a reasonable time. + */ +void +uprintf(const char *fmt, ...) +{ + struct proc *p = curproc; + va_list ap; + struct putchar_arg pca; + + if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { + va_start(ap, fmt); + pca.tty = p->p_session->s_ttyp; + pca.flags = TOTTY; + kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + } +} + +tpr_t +tprintf_open(p) + register struct proc *p; +{ + + if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { + SESSHOLD(p->p_session); + return ((tpr_t) p->p_session); + } + return ((tpr_t) NULL); +} + +void +tprintf_close(sess) + tpr_t sess; +{ + + if (sess) + SESSRELE((struct session *) sess); +} + +/* + * tprintf prints on the controlling terminal associated + * with the given session. + */ +void +tprintf(tpr_t tpr, const char *fmt, ...) +{ + register struct session *sess = (struct session *)tpr; + struct tty *tp = NULL; + int flags = TOLOG; + va_list ap; + struct putchar_arg pca; + + logpri(LOG_INFO); + if (sess && sess->s_ttyvp && ttycheckoutq(sess->s_ttyp, 0)) { + flags |= TOTTY; + tp = sess->s_ttyp; + } + va_start(ap, fmt); + pca.tty = tp; + pca.flags = flags; + kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + logwakeup(); +} + +/* + * Ttyprintf displays a message on a tty; it should be used only by + * the tty driver, or anything that knows the underlying tty will not + * be revoke(2)'d away. Other callers should use tprintf. + */ +void +ttyprintf(struct tty *tp, const char *fmt, ...) +{ + va_list ap; + struct putchar_arg pca; + va_start(ap, fmt); + pca.tty = tp; + pca.flags = TOTTY; + kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); +} + +extern int log_open; + +/* + * Log writes to the log buffer, and guarantees not to sleep (so can be + * called by interrupt routines). If there is no process reading the + * log yet, it writes to the console also. + */ +void +log(int level, const char *fmt, ...) 
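A minimal sketch of a typical caller, with a made-up device name and message:

	log(LOG_ERR, "foo0: hypothetical command timeout, resetting\n");

The text is appended to the kernel message buffer through msglogchar(); when no process has the log device open, it is also echoed to the console.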
+{ + register int s; + va_list ap; + + s = splhigh(); + logpri(level); + va_start(ap, fmt); + + kvprintf(fmt, msglogchar, NULL, 10, ap); + va_end(ap); + + splx(s); + if (!log_open) { + struct putchar_arg pca; + va_start(ap, fmt); + pca.tty = NULL; + pca.flags = TOCONS; + kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + } + logwakeup(); +} + +static void +logpri(level) + int level; +{ + register char *p; + + msglogchar('<', NULL); + for (p = ksprintn((u_long)level, 10, NULL); *p;) + msglogchar(*p--, NULL); + msglogchar('>', NULL); +} + +int +addlog(const char *fmt, ...) +{ + register int s; + va_list ap; + int retval; + + s = splhigh(); + va_start(ap, fmt); + retval = kvprintf(fmt, msglogchar, NULL, 10, ap); + splx(s); + va_end(ap); + if (!log_open) { + struct putchar_arg pca; + va_start(ap, fmt); + pca.tty = NULL; + pca.flags = TOCONS; + kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + } + logwakeup(); + return (retval); +} + +int +printf(const char *fmt, ...) +{ + va_list ap; + register int savintr; + struct putchar_arg pca; + int retval; + + savintr = consintr; /* disable interrupts */ + consintr = 0; + va_start(ap, fmt); + pca.tty = NULL; + pca.flags = TOCONS | TOLOG; + retval = kvprintf(fmt, putchar, &pca, 10, ap); + va_end(ap); + if (!panicstr) + logwakeup(); + consintr = savintr; /* reenable interrupts */ + return retval; +} + +void +vprintf(const char *fmt, va_list ap) +{ + register int savintr; + struct putchar_arg pca; + + savintr = consintr; /* disable interrupts */ + consintr = 0; + pca.tty = NULL; + pca.flags = TOCONS | TOLOG; + kvprintf(fmt, putchar, &pca, 10, ap); + if (!panicstr) + logwakeup(); + consintr = savintr; /* reenable interrupts */ +} + +/* + * Print a character on console or users terminal. If destination is + * the console then the last bunch of characters are saved in msgbuf for + * inspection later. + */ +static void +putchar(int c, void *arg) +{ + struct putchar_arg *ap = (struct putchar_arg*) arg; + int flags = ap->flags; + struct tty *tp = ap->tty; + if (panicstr) + constty = NULL; + if ((flags & TOCONS) && tp == NULL && constty) { + tp = constty; + flags |= TOTTY; + } + if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 && + (flags & TOCONS) && tp == constty) + constty = NULL; + if ((flags & TOLOG)) + msglogchar(c, NULL); + if ((flags & TOCONS) && constty == NULL && c != '\0') + (*v_putc)(c); +} + +/* + * Scaled down version of sprintf(3). + */ +int +sprintf(char *buf, const char *cfmt, ...) +{ + int retval; + va_list ap; + + va_start(ap, cfmt); + retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); + buf[retval] = '\0'; + va_end(ap); + return retval; +} + +/* + * Scaled down version of vsprintf(3). + */ +int +vsprintf(char *buf, const char *cfmt, va_list ap) +{ + int retval; + + retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); + buf[retval] = '\0'; + return retval; +} + +/* + * Scaled down version of snprintf(3). + */ +int +snprintf(char *str, size_t size, const char *format, ...) +{ + int retval; + va_list ap; + + va_start(ap, format); + retval = vsnprintf(str, size, format, ap); + va_end(ap); + return(retval); +} + +/* + * Scaled down version of vsnprintf(3). 
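The bounded variants truncate rather than overrun the destination, and with a size of at least one they always NUL-terminate. A sketch with a made-up buffer and unit number:

	char label[8];

	snprintf(label, sizeof(label), "sd%d", 2);	/* label = "sd2" */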
+ */ +int +vsnprintf(char *str, size_t size, const char *format, va_list ap) +{ + struct snprintf_arg info; + int retval; + + info.str = str; + info.remain = size; + retval = kvprintf(format, snprintf_func, &info, 10, ap); + if (info.remain >= 1) + *info.str++ = '\0'; + return retval; +} + +static void +snprintf_func(int ch, void *arg) +{ + struct snprintf_arg *const info = arg; + + if (info->remain >= 2) { + *info->str++ = ch; + info->remain--; + } +} + +/* + * Put a number (base <= 16) in a buffer in reverse order; return an + * optional length and a pointer to the NULL terminated (preceded?) + * buffer. + */ +static char * +ksprintn(ul, base, lenp) + register u_long ul; + register int base, *lenp; +{ /* A long in base 8, plus NULL. */ + static char buf[sizeof(long) * NBBY / 3 + 2]; + register char *p; + + p = buf; + do { + *++p = hex2ascii(ul % base); + } while (ul /= base); + if (lenp) + *lenp = p - buf; + return (p); +} + +/* + * Scaled down version of printf(3). + * + * Two additional formats: + * + * The format %b is supported to decode error registers. + * Its usage is: + * + * printf("reg=%b\n", regval, "<base><arg>*"); + * + * where <base> is the output base expressed as a control character, e.g. + * \10 gives octal; \20 gives hex. Each arg is a sequence of characters, + * the first of which gives the bit number to be inspected (origin 1), and + * the next characters (up to a control character, i.e. a character <= 32), + * give the name of the register. Thus: + * + * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n"); + * + * would produce output: + * + * reg=3<BITTWO,BITONE> + * + * XXX: %D -- Hexdump, takes pointer and separator string: + * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX + * ("%*D", len, ptr, " " -> XX XX XX XX ... + */ +int +kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap) +{ +#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; } + char *p, *q, *d; + u_char *up; + int ch, n; + u_long ul; + int base, lflag, tmp, width, ladjust, sharpflag, neg, sign, dot; + int dwidth; + char padc; + int retval = 0; + + if (!func) + d = (char *) arg; + else + d = NULL; + + if (fmt == NULL) + fmt = "(fmt null)\n"; + + if (radix < 2 || radix > 36) + radix = 10; + + for (;;) { + padc = ' '; + width = 0; + while ((ch = (u_char)*fmt++) != '%') { + if (ch == '\0') + return retval; + PCHAR(ch); + } + lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; + sign = 0; dot = 0; dwidth = 0; +reswitch: switch (ch = (u_char)*fmt++) { + case '.': + dot = 1; + goto reswitch; + case '#': + sharpflag = 1; + goto reswitch; + case '+': + sign = 1; + goto reswitch; + case '-': + ladjust = 1; + goto reswitch; + case '%': + PCHAR(ch); + break; + case '*': + if (!dot) { + width = va_arg(ap, int); + if (width < 0) { + ladjust = !ladjust; + width = -width; + } + } else { + dwidth = va_arg(ap, int); + } + goto reswitch; + case '0': + if (!dot) { + padc = '0'; + goto reswitch; + } + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + for (n = 0;; ++fmt) { + n = n * 10 + ch - '0'; + ch = *fmt; + if (ch < '0' || ch > '9') + break; + } + if (dot) + dwidth = n; + else + width = n; + goto reswitch; + case 'b': + ul = va_arg(ap, int); + p = va_arg(ap, char *); + for (q = ksprintn(ul, *p++, NULL); *q;) + PCHAR(*q--); + + if (!ul) + break; + + for (tmp = 0; *p;) { + n = *p++; + if (ul & (1 << (n - 1))) { + PCHAR(tmp ? 
',' : '<'); + for (; (n = *p) > ' '; ++p) + PCHAR(n); + tmp = 1; + } else + for (; *p > ' '; ++p) + continue; + } + if (tmp) + PCHAR('>'); + break; + case 'c': + PCHAR(va_arg(ap, int)); + break; + case 'D': + up = va_arg(ap, u_char *); + p = va_arg(ap, char *); + if (!width) + width = 16; + while(width--) { + PCHAR(hex2ascii(*up >> 4)); + PCHAR(hex2ascii(*up & 0x0f)); + up++; + if (width) + for (q=p;*q;q++) + PCHAR(*q); + } + break; + case 'd': + ul = lflag ? va_arg(ap, long) : va_arg(ap, int); + sign = 1; + base = 10; + goto number; + case 'l': + lflag = 1; + goto reswitch; + case 'o': + ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); + base = 8; + goto nosign; + case 'p': + ul = (uintptr_t)va_arg(ap, void *); + base = 16; + sharpflag = (width == 0); + goto nosign; + case 'n': + case 'r': + ul = lflag ? va_arg(ap, u_long) : + sign ? (u_long)va_arg(ap, int) : va_arg(ap, u_int); + base = radix; + goto number; + case 's': + p = va_arg(ap, char *); + if (p == NULL) + p = "(null)"; + if (!dot) + n = strlen (p); + else + for (n = 0; n < dwidth && p[n]; n++) + continue; + + width -= n; + + if (!ladjust && width > 0) + while (width--) + PCHAR(padc); + while (n--) + PCHAR(*p++); + if (ladjust && width > 0) + while (width--) + PCHAR(padc); + break; + case 'u': + ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); + base = 10; + goto nosign; + case 'x': + ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); + base = 16; + goto nosign; + case 'z': + ul = lflag ? va_arg(ap, u_long) : + sign ? (u_long)va_arg(ap, int) : va_arg(ap, u_int); + base = 16; + goto number; +nosign: sign = 0; +number: if (sign && (long)ul < 0L) { + neg = 1; + ul = -(long)ul; + } + p = ksprintn(ul, base, &tmp); + if (sharpflag && ul != 0) { + if (base == 8) + tmp++; + else if (base == 16) + tmp += 2; + } + if (neg) + tmp++; + + if (!ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + if (neg) + PCHAR('-'); + if (sharpflag && ul != 0) { + if (base == 8) { + PCHAR('0'); + } else if (base == 16) { + PCHAR('0'); + PCHAR('x'); + } + } + + while (*p) + PCHAR(*p--); + + if (ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + + break; + default: + PCHAR('%'); + if (lflag) + PCHAR('l'); + PCHAR(ch); + break; + } + } +#undef PCHAR +} + +/* + * Put character in log buffer. + */ +static void +msglogchar(int c, void *dummyarg) +{ + struct msgbuf *mbp; + + if (c != '\0' && c != '\r' && c != 0177 && msgbufmapped) { + mbp = msgbufp; + mbp->msg_ptr[mbp->msg_bufx++] = c; + if (mbp->msg_bufx >= mbp->msg_size) + mbp->msg_bufx = 0; + /* If the buffer is full, keep the most recent data. 
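The message buffer is a simple ring indexed by msg_bufr (reader) and msg_bufx (writer); a reader computes the number of pending bytes the same way logioctl()'s FIONREAD case in subr_log.c does:

	long avail = mbp->msg_bufx - mbp->msg_bufr;
	if (avail < 0)
		avail += mbp->msg_size;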
*/ + if (mbp->msg_bufr == mbp->msg_bufx) { + if (++mbp->msg_bufr >= mbp->msg_size) + mbp->msg_bufr = 0; + } + } +} + +void +msgbufinit(void *ptr, size_t size) +{ + char *cp; + + cp = (char *)ptr; + msgbufp = (struct msgbuf *) (cp + size - sizeof(*msgbufp)); + if (msgbufp->msg_magic != MSG_MAGIC || msgbufp->msg_ptr != cp) { + bzero(cp, size); + msgbufp->msg_magic = MSG_MAGIC; + msgbufp->msg_size = (char *)msgbufp - cp; + msgbufp->msg_ptr = cp; + } + msgbufmapped = 1; +} + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(msgbuf, db_show_msgbuf) +{ + int i, j; + + if (!msgbufmapped) { + db_printf("msgbuf not mapped yet\n"); + return; + } + db_printf("msgbufp = %p\n", msgbufp); + db_printf("magic = %x, size = %d, r= %d, w = %d, ptr = %p\n", + msgbufp->msg_magic, msgbufp->msg_size, msgbufp->msg_bufr, + msgbufp->msg_bufx, msgbufp->msg_ptr); + for (i = 0; i < msgbufp->msg_size; i++) { + j = (i + msgbufp->msg_bufr) % msgbufp->msg_size; + db_printf("%c", msgbufp->msg_ptr[j]); + } + db_printf("\n"); +} + +#endif /* DDB */ diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c new file mode 100644 index 0000000..d0ecad7 --- /dev/null +++ b/sys/kern/subr_prof.c @@ -0,0 +1,457 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93 + * $Id: subr_prof.c,v 1.27 1998/07/14 05:09:46 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sysctl.h> + +#include <machine/cpu.h> + +#ifdef GPROF +#include <sys/malloc.h> +#include <sys/gmon.h> + +static MALLOC_DEFINE(M_GPROF, "gprof", "kernel profiling buffer"); + +static void kmstartup __P((void *)); +SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL) + +struct gmonparam _gmonparam = { GMON_PROF_OFF }; + +#ifdef GUPROF +void +nullfunc_loop_profiled() +{ + int i; + + for (i = 0; i < CALIB_SCALE; i++) + nullfunc_profiled(); +} + +#define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */ + +void +nullfunc_profiled() +{ +} +#endif /* GUPROF */ + +static void +kmstartup(dummy) + void *dummy; +{ + char *cp; + struct gmonparam *p = &_gmonparam; +#ifdef GUPROF + int cputime_overhead; + int empty_loop_time; + int i; + int mcount_overhead; + int mexitcount_overhead; + int nullfunc_loop_overhead; + int nullfunc_loop_profiled_time; + uintfptr_t tmp_addr; +#endif + + /* + * Round lowpc and highpc to multiples of the density we're using + * so the rest of the scaling (here and in gprof) stays in ints. + */ + p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER)); + p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER)); + p->textsize = p->highpc - p->lowpc; + printf("Profiling kernel, textsize=%lu [%x..%x]\n", + p->textsize, p->lowpc, p->highpc); + p->kcountsize = p->textsize / HISTFRACTION; + p->hashfraction = HASHFRACTION; + p->fromssize = p->textsize / HASHFRACTION; + p->tolimit = p->textsize * ARCDENSITY / 100; + if (p->tolimit < MINARCS) + p->tolimit = MINARCS; + else if (p->tolimit > MAXARCS) + p->tolimit = MAXARCS; + p->tossize = p->tolimit * sizeof(struct tostruct); + cp = (char *)malloc(p->kcountsize + p->fromssize + p->tossize, + M_GPROF, M_NOWAIT); + if (cp == 0) { + printf("No memory for profiling.\n"); + return; + } + bzero(cp, p->kcountsize + p->tossize + p->fromssize); + p->tos = (struct tostruct *)cp; + cp += p->tossize; + p->kcount = (HISTCOUNTER *)cp; + cp += p->kcountsize; + p->froms = (u_short *)cp; + +#ifdef GUPROF + /* Initialize pointers to overhead counters. */ + p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime)); + p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount)); + p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount)); + + /* + * Disable interrupts to avoid interference while we calibrate + * things. + */ + disable_intr(); + + /* + * Determine overheads. + * XXX this needs to be repeated for each useful timer/counter. + */ + cputime_overhead = 0; + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) + cputime_overhead += cputime(); + + empty_loop(); + startguprof(p); + empty_loop(); + empty_loop_time = cputime(); + + nullfunc_loop_profiled(); + + /* + * Start profiling. There won't be any normal function calls since + * interrupts are disabled, but we will call the profiling routines + * directly to determine their overheads. 
+ */ + p->state = GMON_PROF_HIRES; + + startguprof(p); + nullfunc_loop_profiled(); + + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) +#if defined(__i386__) && __GNUC__ >= 2 + __asm("pushl %0; call __mcount; popl %%ecx" + : + : "i" (profil) + : "ax", "bx", "cx", "dx", "memory"); +#else +#error +#endif + mcount_overhead = KCOUNT(p, PC_TO_I(p, profil)); + + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) +#if defined(__i386__) && __GNUC__ >= 2 + __asm("call mexitcount; 1:" + : : : "ax", "bx", "cx", "dx", "memory"); + __asm("movl $1b,%0" : "=rm" (tmp_addr)); +#else +#error +#endif + mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr)); + + p->state = GMON_PROF_OFF; + stopguprof(p); + + enable_intr(); + + nullfunc_loop_profiled_time = 0; + for (tmp_addr = (uintfptr_t)nullfunc_loop_profiled; + tmp_addr < (uintfptr_t)nullfunc_loop_profiled_end; + tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER)) + nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr)); +#define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE) +#define c2n(count, freq) ((int)((count) * 1000000000LL / freq)) + printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n", + CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)), + CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)), + CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)), + CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)), + CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate))); + cputime_overhead -= empty_loop_time; + mcount_overhead -= empty_loop_time; + mexitcount_overhead -= empty_loop_time; + + /*- + * Profiling overheads are determined by the times between the + * following events: + * MC1: mcount() is called + * MC2: cputime() (called from mcount()) latches the timer + * MC3: mcount() completes + * ME1: mexitcount() is called + * ME2: cputime() (called from mexitcount()) latches the timer + * ME3: mexitcount() completes. + * The times between the events vary slightly depending on instruction + * combination and cache misses, etc. Attempt to determine the + * minimum times. These can be subtracted from the profiling times + * without much risk of reducing the profiling times below what they + * would be when profiling is not configured. Abbreviate: + * ab = minimum time between MC1 and MC3 + * a = minumum time between MC1 and MC2 + * b = minimum time between MC2 and MC3 + * cd = minimum time between ME1 and ME3 + * c = minimum time between ME1 and ME2 + * d = minimum time between ME2 and ME3. + * These satisfy the relations: + * ab <= mcount_overhead (just measured) + * a + b <= ab + * cd <= mexitcount_overhead (just measured) + * c + d <= cd + * a + d <= nullfunc_loop_profiled_time (just measured) + * a >= 0, b >= 0, c >= 0, d >= 0. + * Assume that ab and cd are equal to the minimums. 
+ */ + p->cputime_overhead = CALIB_DOSCALE(cputime_overhead); + p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead); + p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead + - cputime_overhead); + nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time; + p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead + - nullfunc_loop_overhead) + / 4); + p->mexitcount_pre_overhead = p->mexitcount_overhead + + p->cputime_overhead + - p->mexitcount_post_overhead; + p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead) + - p->mexitcount_post_overhead; + p->mcount_post_overhead = p->mcount_overhead + + p->cputime_overhead + - p->mcount_pre_overhead; + printf( +"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n", + c2n(p->cputime_overhead, p->profrate), + c2n(p->mcount_overhead, p->profrate), + c2n(p->mcount_pre_overhead, p->profrate), + c2n(p->mcount_post_overhead, p->profrate), + c2n(p->cputime_overhead, p->profrate), + c2n(p->mexitcount_overhead, p->profrate), + c2n(p->mexitcount_pre_overhead, p->profrate), + c2n(p->mexitcount_post_overhead, p->profrate)); + printf( +"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n", + p->cputime_overhead, p->mcount_overhead, + p->mcount_pre_overhead, p->mcount_post_overhead, + p->cputime_overhead, p->mexitcount_overhead, + p->mexitcount_pre_overhead, p->mexitcount_post_overhead); +#endif /* GUPROF */ +} + +/* + * Return kernel profiling information. + */ +static int +sysctl_kern_prof SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1; + u_int namelen = arg2; + struct gmonparam *gp = &_gmonparam; + int error; + int state; + + /* all sysctl names at this level are terminal */ + if (namelen != 1) + return (ENOTDIR); /* overloaded */ + + switch (name[0]) { + case GPROF_STATE: + state = gp->state; + error = sysctl_handle_int(oidp, &state, 0, req); + if (error) + return (error); + if (!req->newptr) + return (0); + if (state == GMON_PROF_OFF) { + gp->state = state; + stopprofclock(&proc0); + stopguprof(gp); + } else if (state == GMON_PROF_ON) { + gp->state = GMON_PROF_OFF; + stopguprof(gp); + gp->profrate = profhz; + startprofclock(&proc0); + gp->state = state; +#ifdef GUPROF + } else if (state == GMON_PROF_HIRES) { + gp->state = GMON_PROF_OFF; + stopprofclock(&proc0); + startguprof(gp); + gp->state = state; +#endif + } else if (state != gp->state) + return (EINVAL); + return (0); + case GPROF_COUNT: + return (sysctl_handle_opaque(oidp, + gp->kcount, gp->kcountsize, req)); + case GPROF_FROMS: + return (sysctl_handle_opaque(oidp, + gp->froms, gp->fromssize, req)); + case GPROF_TOS: + return (sysctl_handle_opaque(oidp, + gp->tos, gp->tossize, req)); + case GPROF_GMONPARAM: + return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req)); + default: + return (EOPNOTSUPP); + } + /* NOTREACHED */ +} + +SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, ""); +#endif /* GPROF */ + +/* + * Profiling system call. + * + * The scale factor is a fixed point number with 16 bits of fraction, so that + * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling. 
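As a worked example with illustrative numbers: with pr_off = 0x1000 and pr_scale = 0x8000 (that is, 0.5), a pc of 0x1806 maps to

	((0x1806 - 0x1000) * 0x8000) >> 16 = 0x403, masked with ~1 to 0x402

which is exactly what the PC_TO_INDEX() macro below computes; the result is the u_short-aligned byte offset into the caller's sample buffer.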
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct profil_args { + caddr_t samples; + size_t size; + size_t offset; + u_int scale; +}; +#endif +/* ARGSUSED */ +int +profil(p, uap) + struct proc *p; + register struct profil_args *uap; +{ + register struct uprof *upp; + int s; + + if (uap->scale > (1 << 16)) + return (EINVAL); + if (uap->scale == 0) { + stopprofclock(p); + return (0); + } + upp = &p->p_stats->p_prof; + + /* Block profile interrupts while changing state. */ + s = splstatclock(); + upp->pr_off = uap->offset; + upp->pr_scale = uap->scale; + upp->pr_base = uap->samples; + upp->pr_size = uap->size; + startprofclock(p); + splx(s); + + return (0); +} + +/* + * Scale is a fixed-point number with the binary point 16 bits + * into the value, and is <= 1.0. pc is at most 32 bits, so the + * intermediate result is at most 48 bits. + */ +#define PC_TO_INDEX(pc, prof) \ + ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \ + (u_quad_t)((prof)->pr_scale)) >> 16) & ~1) + +/* + * Collect user-level profiling statistics; called on a profiling tick, + * when a process is running in user-mode. This routine may be called + * from an interrupt context. We try to update the user profiling buffers + * cheaply with fuswintr() and suswintr(). If that fails, we revert to + * an AST that will vector us to trap() with a context in which copyin + * and copyout will work. Trap will then call addupc_task(). + * + * Note that we may (rarely) not get around to the AST soon enough, and + * lose profile ticks when the next tick overwrites this one, but in this + * case the system is overloaded and the profile is probably already + * inaccurate. + */ +void +addupc_intr(p, pc, ticks) + register struct proc *p; + register u_long pc; + u_int ticks; +{ + register struct uprof *prof; + register caddr_t addr; + register u_int i; + register int v; + + if (ticks == 0) + return; + prof = &p->p_stats->p_prof; + if (pc < prof->pr_off || + (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) + return; /* out of range; ignore */ + + addr = prof->pr_base + i; + if ((v = fuswintr(addr)) == -1 || suswintr(addr, v + ticks) == -1) { + prof->pr_addr = pc; + prof->pr_ticks = ticks; + need_proftick(p); + } +} + +/* + * Much like before, but we can afford to take faults here. If the + * update fails, we simply turn off profiling. + */ +void +addupc_task(p, pc, ticks) + register struct proc *p; + register u_long pc; + u_int ticks; +{ + register struct uprof *prof; + register caddr_t addr; + register u_int i; + u_short v; + + /* Testing P_PROFIL may be unnecessary, but is certainly safe. */ + if ((p->p_flag & P_PROFIL) == 0 || ticks == 0) + return; + + prof = &p->p_stats->p_prof; + if (pc < prof->pr_off || + (i = PC_TO_INDEX(pc, prof)) >= prof->pr_size) + return; + + addr = prof->pr_base + i; + if (copyin(addr, (caddr_t)&v, sizeof(v)) == 0) { + v += ticks; + if (copyout((caddr_t)&v, addr, sizeof(v)) == 0) + return; + } + stopprofclock(p); +} diff --git a/sys/kern/subr_rlist.c b/sys/kern/subr_rlist.c index 3adf5a8..80a39cf 100644 --- a/sys/kern/subr_rlist.c +++ b/sys/kern/subr_rlist.c @@ -12,25 +12,25 @@ * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This software is a component of "386BSD" developed by - William F. Jolitz, TeleMuse. + * This software is a component of "386BSD" developed by + * William F. Jolitz, TeleMuse. * 4. 
Neither the name of the developer nor the name "386BSD" * may be used to endorse or promote products derived from this software * without specific prior written permission. * - * THIS SOFTWARE IS A COMPONENT OF 386BSD DEVELOPED BY WILLIAM F. JOLITZ - * AND IS INTENDED FOR RESEARCH AND EDUCATIONAL PURPOSES ONLY. THIS - * SOFTWARE SHOULD NOT BE CONSIDERED TO BE A COMMERCIAL PRODUCT. - * THE DEVELOPER URGES THAT USERS WHO REQUIRE A COMMERCIAL PRODUCT + * THIS SOFTWARE IS A COMPONENT OF 386BSD DEVELOPED BY WILLIAM F. JOLITZ + * AND IS INTENDED FOR RESEARCH AND EDUCATIONAL PURPOSES ONLY. THIS + * SOFTWARE SHOULD NOT BE CONSIDERED TO BE A COMMERCIAL PRODUCT. + * THE DEVELOPER URGES THAT USERS WHO REQUIRE A COMMERCIAL PRODUCT * NOT MAKE USE THIS WORK. * * FOR USERS WHO WISH TO UNDERSTAND THE 386BSD SYSTEM DEVELOPED - * BY WILLIAM F. JOLITZ, WE RECOMMEND THE USER STUDY WRITTEN - * REFERENCES SUCH AS THE "PORTING UNIX TO THE 386" SERIES - * (BEGINNING JANUARY 1991 "DR. DOBBS JOURNAL", USA AND BEGINNING - * JUNE 1991 "UNIX MAGAZIN", GERMANY) BY WILLIAM F. JOLITZ AND - * LYNNE GREER JOLITZ, AS WELL AS OTHER BOOKS ON UNIX AND THE - * ON-LINE 386BSD USER MANUAL BEFORE USE. A BOOK DISCUSSING THE INTERNALS + * BY WILLIAM F. JOLITZ, WE RECOMMEND THE USER STUDY WRITTEN + * REFERENCES SUCH AS THE "PORTING UNIX TO THE 386" SERIES + * (BEGINNING JANUARY 1991 "DR. DOBBS JOURNAL", USA AND BEGINNING + * JUNE 1991 "UNIX MAGAZIN", GERMANY) BY WILLIAM F. JOLITZ AND + * LYNNE GREER JOLITZ, AS WELL AS OTHER BOOKS ON UNIX AND THE + * ON-LINE 386BSD USER MANUAL BEFORE USE. A BOOK DISCUSSING THE INTERNALS * OF 386BSD ENTITLED "386BSD FROM THE INSIDE OUT" WILL BE AVAILABLE LATE 1992. * * THIS SOFTWARE IS PROVIDED BY THE DEVELOPER ``AS IS'' AND @@ -46,99 +46,185 @@ * SUCH DAMAGE. * */ -static char rcsid[] = "$Header: /usr/bill/working/sys/kern/RCS/subr_rlist.c,v 1.2 92/01/21 21:29:31 william Exp $"; +/* + * Changes Copyright (C) 1995, David Greenman & John Dyson; This software may + * be used, modified, copied, distributed, and sold, in both source and + * binary form provided that the above copyright and these terms are + * retained. Under no circumstances is the author responsible for the proper + * functioning of this software, nor does the author assume any responsibility + * for damages incurred with its use. + * + * --------- DEPRECIATED --------- + * + * $Id: subr_rlist.c,v 1.30 1999/01/21 08:29:04 dillon Exp $ + */ -#include "sys/param.h" -#include "sys/cdefs.h" -#include "sys/malloc.h" -#include "rlist.h" +#if 0 + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/rlist.h> +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> /* * Resource lists. */ -/* - * Add space to a resource list. Used to either - * initialize a list or return free space to it. 
- */ -rlist_free (rlp, start, end) -register struct rlist **rlp; unsigned start, end; { - struct rlist *head; - - head = *rlp; - -loop: - /* if nothing here, insert (tail of list) */ - if (*rlp == 0) { - *rlp = (struct rlist *)malloc(sizeof(**rlp), M_TEMP, M_NOWAIT); - (*rlp)->rl_start = start; - (*rlp)->rl_end = end; - (*rlp)->rl_next = 0; - return; - } +#define RLIST_MIN 128 +static int rlist_count=0; +static struct rlist *rlfree; - /* if new region overlaps something currently present, panic */ - if (start >= (*rlp)->rl_start && start <= (*rlp)->rl_end) { - printf("Frag %d:%d, ent %d:%d ", start, end, - (*rlp)->rl_start, (*rlp)->rl_end); - panic("overlapping front rlist_free: freed twice?"); +static struct rlist *rlist_malloc __P((void)); +static __inline void rlist_mfree __P((struct rlist *rl)); + +static struct rlist * +rlist_malloc() +{ + struct rlist *rl; + int i; + while( rlist_count < RLIST_MIN) { + int s = splhigh(); + rl = (struct rlist *)kmem_alloc(kernel_map, PAGE_SIZE); + splx(s); + if( !rl) + break; + + for(i=0;i<(PAGE_SIZE/(sizeof *rl));i++) { + rl->rl_next = rlfree; + rlfree = rl; + rlist_count++; + rl++; + } } - if (end >= (*rlp)->rl_start && end <= (*rlp)->rl_end) { - printf("Frag %d:%d, ent %d:%d ", start, end, - (*rlp)->rl_start, (*rlp)->rl_end); - panic("overlapping tail rlist_free: freed twice?"); + + if( (rl = rlfree) == 0 ) + panic("Cannot get an rlist entry"); + + --rlist_count; + rlfree = rl->rl_next; + return rl; +} + +static __inline void +rlist_mfree(rl) + struct rlist *rl; +{ + rl->rl_next = rlfree; + rlfree = rl; + ++rlist_count; +} + +void +rlist_free(rlh, start, end) + struct rlisthdr *rlh; + u_int start, end; +{ + struct rlist **rlp = &rlh->rlh_list; + struct rlist *prev_rlp = NULL, *cur_rlp, *next_rlp = NULL; + int s; + + s = splhigh(); + while (rlh->rlh_lock & RLH_LOCKED) { + rlh->rlh_lock |= RLH_DESIRED; + tsleep(rlh, PSWP, "rlistf", 0); } + rlh->rlh_lock |= RLH_LOCKED; + splx(s); - /* are we adjacent to this element? (in front) */ - if (end+1 == (*rlp)->rl_start) { - /* coalesce */ - (*rlp)->rl_start = start; - goto scan; + /* + * Traverse the list looking for an entry after the one we want + * to insert. + */ + cur_rlp = *rlp; + while (cur_rlp != NULL) { + if (start < cur_rlp->rl_start) + break; + if (prev_rlp) { + KASSERT(prev_rlp->rl_end + 1 != cur_rlp->rl_start, + ("rlist_free: missed coalesce opportunity")); + KASSERT(prev_rlp->rl_end != cur_rlp->rl_start, + ("rlist_free: entries overlap")); + KASSERT(prev_rlp->rl_end <= cur_rlp->rl_start, + ("entries out of order")); + } + prev_rlp = cur_rlp; + cur_rlp = cur_rlp->rl_next; } - /* are we before this element? 
*/ - if (end < (*rlp)->rl_start) { - register struct rlist *nlp; + if (cur_rlp != NULL) { + + if (end >= cur_rlp->rl_start) + panic("rlist_free: free end overlaps already freed area"); - nlp = (struct rlist *)malloc(sizeof(*nlp), M_TEMP, M_NOWAIT); - nlp->rl_start = start; - nlp->rl_end = end; - nlp->rl_next = *rlp; - *rlp = nlp; - return; + if (prev_rlp) { + if (start <= prev_rlp->rl_end) + panic("rlist_free: free start overlaps already freed area"); + /* + * Attempt to append + */ + if (prev_rlp->rl_end + 1 == start) { + prev_rlp->rl_end = end; + /* + * Attempt to prepend and coalesce + */ + if (end + 1 == cur_rlp->rl_start) { + prev_rlp->rl_end = cur_rlp->rl_end; + prev_rlp->rl_next = cur_rlp->rl_next; + rlist_mfree(cur_rlp); + } + goto done; + } + } + /* + * Attempt to prepend + */ + if (end + 1 == cur_rlp->rl_start) { + cur_rlp->rl_start = start; + goto done; + } + } + /* + * Reached the end of the list without finding a larger entry. + * Append to last entry if there is one and it's adjacent. + */ + if (prev_rlp) { + if (start <= prev_rlp->rl_end) + panic("rlist_free: free start overlaps already freed area at list tail"); + /* + * Attempt to append + */ + if (prev_rlp->rl_end + 1 == start) { + prev_rlp->rl_end = end; + goto done; + } } - /* are we adjacent to this element? (at tail) */ - if ((*rlp)->rl_end + 1 == start) { - /* coalesce */ - (*rlp)->rl_end = end; - goto scan; + /* + * Could neither append nor prepend; allocate a new entry. + */ + next_rlp = cur_rlp; + cur_rlp = rlist_malloc(); + cur_rlp->rl_start = start; + cur_rlp->rl_end = end; + cur_rlp->rl_next = next_rlp; + if (prev_rlp) { + prev_rlp->rl_next = cur_rlp; + } else { + /* + * No previous - this entry is the new list head. + */ + *rlp = cur_rlp; } - /* are we after this element */ - if (start > (*rlp)->rl_end) { - rlp = &((*rlp)->rl_next); - goto loop; - } else - panic("rlist_free: can't happen"); - -scan: - /* can we coalesce list now that we've filled a void? */ - { - register struct rlist *lp, *lpn; - - for (lp = head; lp->rl_next ;) { - lpn = lp->rl_next; - - /* coalesce ? */ - if (lp->rl_end + 1 == lpn->rl_start) { - lp->rl_end = lpn->rl_end; - lp->rl_next = lpn->rl_next; - free(lpn, M_TEMP); - } else - lp = lp->rl_next; - } +done: + rlh->rlh_lock &= ~RLH_LOCKED; + if (rlh->rlh_lock & RLH_DESIRED) { + wakeup(rlh); + rlh->rlh_lock &= ~RLH_DESIRED; } + return; } /* @@ -147,10 +233,23 @@ scan: * return a value of 1 and set resource start location with * "*loc". (Note: loc can be zero if we don't wish the value) */ -int rlist_alloc (rlp, size, loc) -struct rlist **rlp; unsigned size, *loc; { +int +rlist_alloc (rlh, size, loc) + struct rlisthdr *rlh; + unsigned size, *loc; +{ + struct rlist **rlp = &rlh->rlh_list; register struct rlist *lp; + int s; + register struct rlist *olp = 0; + s = splhigh(); + while (rlh->rlh_lock & RLH_LOCKED) { + rlh->rlh_lock |= RLH_DESIRED; + tsleep(rlh, PSWP, "rlistf", 0); + } + rlh->rlh_lock |= RLH_LOCKED; + splx(s); /* walk list, allocating first thing that's big enough (first fit) */ for (; *rlp; rlp = &((*rlp)->rl_next)) @@ -163,13 +262,33 @@ struct rlist **rlp; unsigned size, *loc; { /* did we eat this element entirely? */ if ((*rlp)->rl_start > (*rlp)->rl_end) { lp = (*rlp)->rl_next; - free (*rlp, M_TEMP); - *rlp = lp; + rlist_mfree(*rlp); + /* + * if the deleted element was in fromt + * of the list, adjust *rlp, else don't. 
+ */ + if (olp) { + olp->rl_next = lp; + } else { + *rlp = lp; + } } + rlh->rlh_lock &= ~RLH_LOCKED; + if (rlh->rlh_lock & RLH_DESIRED) { + wakeup(rlh); + rlh->rlh_lock &= ~RLH_DESIRED; + } return (1); + } else { + olp = *rlp; } + rlh->rlh_lock &= ~RLH_LOCKED; + if (rlh->rlh_lock & RLH_DESIRED) { + wakeup(rlh); + rlh->rlh_lock &= ~RLH_DESIRED; + } /* nothing in list that's big enough */ return (0); } @@ -178,14 +297,20 @@ struct rlist **rlp; unsigned size, *loc; { * Finished with this resource list, reclaim all space and * mark it as being empty. */ -rlist_destroy (rlp) -struct rlist **rlp; { +void +rlist_destroy (rlh) + struct rlisthdr *rlh; +{ + struct rlist **rlp = &rlh->rlh_list; struct rlist *lp, *nlp; lp = *rlp; *rlp = 0; for (; lp; lp = nlp) { nlp = lp->rl_next; - free (lp, M_TEMP); + rlist_mfree(lp); } } + +#endif + diff --git a/sys/kern/subr_rman.c b/sys/kern/subr_rman.c new file mode 100644 index 0000000..e0526bb --- /dev/null +++ b/sys/kern/subr_rman.c @@ -0,0 +1,591 @@ +/* + * Copyright 1998 Massachusetts Institute of Technology + * + * Permission to use, copy, modify, and distribute this software and + * its documentation for any purpose and without fee is hereby + * granted, provided that both the above copyright notice and this + * permission notice appear in all copies, that both the above + * copyright notice and this permission notice appear in all + * supporting documentation, and that the name of M.I.T. not be used + * in advertising or publicity pertaining to distribution of the + * software without specific, written prior permission. M.I.T. makes + * no representations about the suitability of this software for any + * purpose. It is provided "as is" without express or implied + * warranty. + * + * THIS SOFTWARE IS PROVIDED BY M.I.T. ``AS IS''. M.I.T. DISCLAIMS + * ALL EXPRESS OR IMPLIED WARRANTIES WITH REGARD TO THIS SOFTWARE, + * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. IN NO EVENT + * SHALL M.I.T. BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF + * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND + * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT + * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: subr_rman.c,v 1.3 1998/12/07 21:58:29 archie Exp $ + */ + +/* + * The kernel resource manager. This code is responsible for keeping track + * of hardware resources which are apportioned out to various drivers. + * It does not actually assign those resources, and it is not expected + * that end-device drivers will call into this code directly. Rather, + * the code which implements the buses that those devices are attached to, + * and the code which manages CPU resources, will call this code, and the + * end-device drivers will make upcalls to that code to actually perform + * the allocation. + * + * There are two sorts of resources managed by this code. The first is + * the more familiar array (RMAN_ARRAY) type; resources in this class + * consist of a sequence of individually-allocatable objects which have + * been numbered in some well-defined order. Most of the resources + * are of this type, as it is the most familiar. 
The second type is + * called a gauge (RMAN_GAUGE), and models fungible resources (i.e., + * resources in which each instance is indistinguishable from every + * other instance). The principal anticipated application of gauges + * is in the context of power consumption, where a bus may have a specific + * power budget which all attached devices share. RMAN_GAUGE is not + * implemented yet. + * + * For array resources, we make one simplifying assumption: two clients + * sharing the same resource must use the same range of indices. That + * is to say, sharing of overlapping-but-not-identical regions is not + * permitted. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/rman.h> +#include <sys/bus.h> /* XXX debugging */ + +MALLOC_DEFINE(M_RMAN, "rman", "Resource manager"); + +struct rman_head rman_head; +#ifndef NULL_SIMPLELOCKS +static struct simplelock rman_lock; /* mutex to protect rman_head */ +#endif +static int int_rman_activate_resource(struct rman *rm, struct resource *r, + struct resource **whohas); +static int int_rman_release_resource(struct rman *rm, struct resource *r); + +#define CIRCLEQ_TERMCOND(var, head) (var == (void *)&(head)) + +int +rman_init(struct rman *rm) +{ + static int once; + + if (once == 0) { + once = 1; + TAILQ_INIT(&rman_head); + simple_lock_init(&rman_lock); + } + + if (rm->rm_type == RMAN_UNINIT) + panic("rman_init"); + if (rm->rm_type == RMAN_GAUGE) + panic("implement RMAN_GAUGE"); + + CIRCLEQ_INIT(&rm->rm_list); + rm->rm_slock = malloc(sizeof *rm->rm_slock, M_RMAN, M_NOWAIT); + if (rm->rm_slock == 0) + return ENOMEM; + simple_lock_init(rm->rm_slock); + + simple_lock(&rman_lock); + TAILQ_INSERT_TAIL(&rman_head, rm, rm_link); + simple_unlock(&rman_lock); + return 0; +} + +/* + * NB: this interface is not robust against programming errors which + * add multiple copies of the same region. + */ +int +rman_manage_region(struct rman *rm, u_long start, u_long end) +{ + struct resource *r, *s; + + r = malloc(sizeof *r, M_RMAN, M_NOWAIT); + if (r == 0) + return ENOMEM; + r->r_sharehead = 0; + r->r_start = start; + r->r_end = end; + r->r_flags = 0; + r->r_dev = 0; + r->r_rm = rm; + + simple_lock(rm->rm_slock); + for (s = rm->rm_list.cqh_first; + !CIRCLEQ_TERMCOND(s, rm->rm_list) && s->r_end < r->r_start; + s = s->r_link.cqe_next) + ; + + if (CIRCLEQ_TERMCOND(s, rm->rm_list)) { + CIRCLEQ_INSERT_TAIL(&rm->rm_list, r, r_link); + } else { + CIRCLEQ_INSERT_BEFORE(&rm->rm_list, s, r, r_link); + } + + simple_unlock(rm->rm_slock); + return 0; +} + +int +rman_fini(struct rman *rm) +{ + struct resource *r; + + simple_lock(rm->rm_slock); + for (r = rm->rm_list.cqh_first; !CIRCLEQ_TERMCOND(r, rm->rm_list); + r = r->r_link.cqe_next) { + if (r->r_flags & RF_ALLOCATED) + return EBUSY; + } + + /* + * There really should only be one of these if we are in this + * state and the code is working properly, but it can't hurt. 
+ */ + for (r = rm->rm_list.cqh_first; !CIRCLEQ_TERMCOND(r, rm->rm_list); + r = rm->rm_list.cqh_first) { + CIRCLEQ_REMOVE(&rm->rm_list, r, r_link); + free(r, M_RMAN); + } + simple_unlock(rm->rm_slock); + simple_lock(&rman_lock); + TAILQ_REMOVE(&rman_head, rm, rm_link); + simple_unlock(&rman_lock); + free(rm->rm_slock, M_RMAN); + + return 0; +} + +struct resource * +rman_reserve_resource(struct rman *rm, u_long start, u_long end, u_long count, + u_int flags, struct device *dev) +{ + u_int want_activate; + struct resource *r, *s, *rv; + u_long rstart, rend; + + rv = 0; + +#ifdef RMAN_DEBUG + printf("rman_reserve_resource: <%s> request: [%#lx, %#lx], length " + "%#lx, flags %u, device %s%d\n", rm->rm_descr, start, end, + count, flags, device_get_name(dev), device_get_unit(dev)); +#endif /* RMAN_DEBUG */ + want_activate = (flags & RF_ACTIVE); + flags &= ~RF_ACTIVE; + + simple_lock(rm->rm_slock); + + for (r = rm->rm_list.cqh_first; + !CIRCLEQ_TERMCOND(r, rm->rm_list) && r->r_end < start; + r = r->r_link.cqe_next) + ; + + if (CIRCLEQ_TERMCOND(r, rm->rm_list)) { +#ifdef RMAN_DEBUG + printf("could not find a region\n"); +#endif RMAN_DEBUG + goto out; + } + + /* + * First try to find an acceptable totally-unshared region. + */ + for (s = r; !CIRCLEQ_TERMCOND(s, rm->rm_list); + s = s->r_link.cqe_next) { +#ifdef RMAN_DEBUG + printf("considering [%#lx, %#lx]\n", s->r_start, s->r_end); +#endif /* RMAN_DEBUG */ + if (s->r_start > end) { +#ifdef RMAN_DEBUG + printf("s->r_start (%#lx) > end (%#lx)\n", s->r_start, end); +#endif /* RMAN_DEBUG */ + break; + } + if (s->r_flags & RF_ALLOCATED) { +#ifdef RMAN_DEBUG + printf("region is allocated\n"); +#endif /* RMAN_DEBUG */ + continue; + } + rstart = max(s->r_start, start); + rend = min(s->r_end, max(start + count, end)); +#ifdef RMAN_DEBUG + printf("truncated region: [%#lx, %#lx]; size %#lx (requested %#lx)\n", + rstart, rend, (rend - rstart + 1), count); +#endif /* RMAN_DEBUG */ + + if ((rend - rstart + 1) >= count) { +#ifdef RMAN_DEBUG + printf("candidate region: [%#lx, %#lx], size %#lx\n", + rend, rstart, (rend - rstart + 1)); +#endif /* RMAN_DEBUG */ + if ((s->r_end - s->r_start + 1) == count) { +#ifdef RMAN_DEBUG + printf("candidate region is entire chunk\n"); +#endif /* RMAN_DEBUG */ + rv = s; + rv->r_flags |= RF_ALLOCATED; + rv->r_dev = dev; + goto out; + } + + /* + * If s->r_start < rstart and + * s->r_end > rstart + count - 1, then + * we need to split the region into three pieces + * (the middle one will get returned to the user). + * Otherwise, we are allocating at either the + * beginning or the end of s, so we only need to + * split it in two. The first case requires + * two new allocations; the second requires but one. + */ + rv = malloc(sizeof *r, M_RMAN, M_NOWAIT); + if (rv == 0) + goto out; + rv->r_start = rstart; + rv->r_end = rstart + count - 1; + rv->r_flags = flags | RF_ALLOCATED; + rv->r_dev = dev; + rv->r_sharehead = 0; + + if (s->r_start < rv->r_start && s->r_end > rv->r_end) { +#ifdef RMAN_DEBUG + printf("splitting region in three parts: " + "[%#lx, %#lx]; [%#lx, %#lx]; [%#lx, %#lx]\n", + s->r_start, rv->r_start - 1, + rv->r_start, rv->r_end, + rv->r_end + 1, s->r_end); +#endif /* RMAN_DEBUG */ + /* + * We are allocating in the middle. 
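+ * (A concrete, invented illustration: given a free region [0x100, 0x1ff]
+ * and a request for count 0x10 whose allowed range starts at 0x140, rstart
+ * is 0x140, the existing region shrinks to [0x100, 0x13f], the returned
+ * piece is [0x140, 0x14f], and a new region [0x150, 0x1ff] is inserted
+ * after it.)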
+ */ + r = malloc(sizeof *r, M_RMAN, M_NOWAIT); + if (r == 0) { + free(rv, M_RMAN); + rv = 0; + goto out; + } + r->r_start = rv->r_end + 1; + r->r_end = s->r_end; + r->r_flags = s->r_flags; + r->r_dev = 0; + r->r_sharehead = 0; + s->r_end = rv->r_start - 1; + CIRCLEQ_INSERT_AFTER(&rm->rm_list, s, rv, + r_link); + CIRCLEQ_INSERT_AFTER(&rm->rm_list, rv, r, + r_link); + } else if (s->r_start == rv->r_start) { +#ifdef RMAN_DEBUG + printf("allocating from the beginning\n"); +#endif /* RMAN_DEBUG */ + /* + * We are allocating at the beginning. + */ + s->r_start = rv->r_end + 1; + CIRCLEQ_INSERT_BEFORE(&rm->rm_list, s, rv, + r_link); + } else { +#ifdef RMAN_DEBUG + printf("allocating at the end\n"); +#endif /* RMAN_DEBUG */ + /* + * We are allocating at the end. + */ + s->r_end = rv->r_start - 1; + CIRCLEQ_INSERT_AFTER(&rm->rm_list, s, rv, + r_link); + } + goto out; + } + } + + /* + * Now find an acceptable shared region, if the client's requirements + * allow sharing. By our implementation restriction, a candidate + * region must match exactly by both size and sharing type in order + * to be considered compatible with the client's request. (The + * former restriction could probably be lifted without too much + * additional work, but this does not seem warranted.) + */ +#ifdef RMAN_DEBUG + printf("no unshared regions found\n"); +#endif /* RMAN_DEBUG */ + if ((flags & (RF_SHAREABLE | RF_TIMESHARE)) == 0) + goto out; + + for (s = r; !CIRCLEQ_TERMCOND(s, rm->rm_list); + s = s->r_link.cqe_next) { + if (s->r_start > end) + break; + if ((s->r_flags & flags) != flags) + continue; + rstart = max(s->r_start, start); + rend = min(s->r_end, max(start + count, end)); + if (s->r_start >= start && s->r_end <= end + && (s->r_end - s->r_start + 1) == count) { + rv = malloc(sizeof *rv, M_RMAN, M_NOWAIT); + if (rv == 0) + goto out; + rv->r_start = s->r_start; + rv->r_end = s->r_end; + rv->r_flags = s->r_flags & + (RF_ALLOCATED | RF_SHAREABLE | RF_TIMESHARE); + rv->r_dev = dev; + rv->r_rm = rm; + if (s->r_sharehead == 0) { + s->r_sharehead = malloc(sizeof *s->r_sharehead, + M_RMAN, M_NOWAIT); + if (s->r_sharehead == 0) { + free(rv, M_RMAN); + rv = 0; + goto out; + } + LIST_INIT(s->r_sharehead); + LIST_INSERT_HEAD(s->r_sharehead, s, + r_sharelink); + s->r_flags = RF_FIRSTSHARE; + } + rv->r_sharehead = s->r_sharehead; + LIST_INSERT_HEAD(s->r_sharehead, rv, r_sharelink); + goto out; + } + } + + /* + * We couldn't find anything. + */ +out: + /* + * If the user specified RF_ACTIVE in the initial flags, + * which is reflected in `want_activate', we attempt to atomically + * activate the resource. If this fails, we release the resource + * and indicate overall failure. (This behavior probably doesn't + * make sense for RF_TIMESHARE-type resources.) + */ + if (rv && want_activate) { + struct resource *whohas; + if (int_rman_activate_resource(rm, rv, &whohas)) { + int_rman_release_resource(rm, rv); + rv = 0; + } + } + + simple_unlock(rm->rm_slock); + return (rv); +} + +static int +int_rman_activate_resource(struct rman *rm, struct resource *r, + struct resource **whohas) +{ + struct resource *s; + int ok; + + /* + * If we are not timesharing, then there is nothing much to do. + * If we already have the resource, then there is nothing at all to do. + * If we are not on a sharing list with anybody else, then there is + * little to do. 
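Taken together, the routines above are meant to be driven by bus code roughly as in the sketch below. This is illustrative only, not code from the tree: the rman instance, the description string, the 0..15 range and the printf are invented, and the struct rman fields (rm_type, rm_descr), RMAN_ARRAY, RF_ACTIVE and the r_start member are assumed to come from <sys/rman.h> as used elsewhere in this file.

static struct rman irq_rman;	/* hypothetical: one rman per resource type */

static int
example_bus_attach(void)
{
	struct resource *res;

	irq_rman.rm_type = RMAN_ARRAY;
	irq_rman.rm_descr = "example interrupt lines";
	if (rman_init(&irq_rman) != 0 ||
	    rman_manage_region(&irq_rman, 0, 15) != 0)
		return (ENXIO);

	/* any single line in [0, 15], activated atomically via RF_ACTIVE */
	res = rman_reserve_resource(&irq_rman, 0, 15, 1, RF_ACTIVE, NULL);
	if (res == NULL)
		return (ENXIO);
	printf("reserved line %lu\n", res->r_start);

	/* an RF_ACTIVE resource must be deactivated before release (EBUSY) */
	rman_deactivate_resource(res);
	return (rman_release_resource(res));
}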
+ */ + if ((r->r_flags & RF_TIMESHARE) == 0 + || (r->r_flags & RF_ACTIVE) != 0 + || r->r_sharehead == 0) { + r->r_flags |= RF_ACTIVE; + return 0; + } + + ok = 1; + for (s = r->r_sharehead->lh_first; s && ok; + s = s->r_sharelink.le_next) { + if ((s->r_flags & RF_ACTIVE) != 0) { + ok = 0; + *whohas = s; + } + } + if (ok) { + r->r_flags |= RF_ACTIVE; + return 0; + } + return EBUSY; +} + +int +rman_activate_resource(struct resource *r) +{ + int rv; + struct resource *whohas; + struct rman *rm; + + rm = r->r_rm; + simple_lock(rm->rm_slock); + rv = int_rman_activate_resource(rm, r, &whohas); + simple_unlock(rm->rm_slock); + return rv; +} + +int +rman_await_resource(struct resource *r, int pri, int timo) +{ + int rv, s; + struct resource *whohas; + struct rman *rm; + + rm = r->r_rm; + for (;;) { + simple_lock(rm->rm_slock); + rv = int_rman_activate_resource(rm, r, &whohas); + if (rv != EBUSY) + return (rv); + + if (r->r_sharehead == 0) + panic("rman_await_resource"); + /* + * splhigh hopefully will prevent a race between + * simple_unlock and tsleep where a process + * could conceivably get in and release the resource + * before we have a chance to sleep on it. + */ + s = splhigh(); + whohas->r_flags |= RF_WANTED; + simple_unlock(rm->rm_slock); + rv = tsleep(r->r_sharehead, pri, "rmwait", timo); + if (rv) { + splx(s); + return rv; + } + simple_lock(rm->rm_slock); + splx(s); + } +} + +int +rman_deactivate_resource(struct resource *r) +{ + struct rman *rm; + + rm = r->r_rm; + simple_lock(rm->rm_slock); + r->r_flags &= ~RF_ACTIVE; + if (r->r_flags & RF_WANTED) { + r->r_flags &= ~RF_WANTED; + wakeup(r->r_sharehead); + } + simple_unlock(rm->rm_slock); + return 0; +} + +static int +int_rman_release_resource(struct rman *rm, struct resource *r) +{ + struct resource *s, *t; + + if (r->r_flags & RF_ACTIVE) + return EBUSY; + + /* + * Check for a sharing list first. If there is one, then we don't + * have to think as hard. + */ + if (r->r_sharehead) { + /* + * If a sharing list exists, then we know there are at + * least two sharers. + * + * If we are in the main circleq, appoint someone else. + */ + LIST_REMOVE(r, r_sharelink); + s = r->r_sharehead->lh_first; + if (r->r_flags & RF_FIRSTSHARE) { + s->r_flags |= RF_FIRSTSHARE; + CIRCLEQ_INSERT_BEFORE(&rm->rm_list, r, s, r_link); + CIRCLEQ_REMOVE(&rm->rm_list, r, r_link); + } + + /* + * Make sure that the sharing list goes away completely + * if the resource is no longer being shared at all. + */ + if (s->r_sharelink.le_next == 0) { + free(s->r_sharehead, M_RMAN); + s->r_sharehead = 0; + s->r_flags &= ~RF_FIRSTSHARE; + } + goto out; + } + + /* + * Look at the adjacent resources in the list and see if our + * segment can be merged with any of them. + */ + s = r->r_link.cqe_prev; + t = r->r_link.cqe_next; + + if (s != (void *)&rm->rm_list && (s->r_flags & RF_ALLOCATED) == 0 + && t != (void *)&rm->rm_list && (t->r_flags & RF_ALLOCATED) == 0) { + /* + * Merge all three segments. + */ + s->r_end = t->r_end; + CIRCLEQ_REMOVE(&rm->rm_list, r, r_link); + CIRCLEQ_REMOVE(&rm->rm_list, t, r_link); + free(t, M_RMAN); + } else if (s != (void *)&rm->rm_list + && (s->r_flags & RF_ALLOCATED) == 0) { + /* + * Merge previous segment with ours. + */ + s->r_end = r->r_end; + CIRCLEQ_REMOVE(&rm->rm_list, r, r_link); + } else if (t != (void *)&rm->rm_list + && (t->r_flags & RF_ALLOCATED) == 0) { + /* + * Merge next segment with ours. 
+ */ + t->r_start = r->r_start; + CIRCLEQ_REMOVE(&rm->rm_list, r, r_link); + } else { + /* + * At this point, we know there is nothing we + * can potentially merge with, because on each + * side, there is either nothing there or what is + * there is still allocated. In that case, we don't + * want to remove r from the list; we simply want to + * change it to an unallocated region and return + * without freeing anything. + */ + r->r_flags &= ~RF_ALLOCATED; + return 0; + } + +out: + free(r, M_RMAN); + return 0; +} + +int +rman_release_resource(struct resource *r) +{ + int rv; + struct rman *rm = r->r_rm; + + simple_lock(rm->rm_slock); + rv = int_rman_release_resource(rm, r); + simple_unlock(rm->rm_slock); + return (rv); +} diff --git a/sys/kern/subr_scanf.c b/sys/kern/subr_scanf.c new file mode 100644 index 0000000..24f8846 --- /dev/null +++ b/sys/kern/subr_scanf.c @@ -0,0 +1,793 @@ +/*- + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Chris Torek. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + * From: Id: vfscanf.c,v 1.13 1998/09/25 12:20:27 obrien Exp + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <machine/limits.h> + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. + */ +#include <machine/stdarg.h> + +#define BUF 32 /* Maximum length of numeric string. */ + +/* + * Flags used during conversion. 
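+ * (As an invented example, a format such as "%*lx" picks up SUPPRESS,
+ * LONG and, once the `x' is reached, PFXOK, all before any input
+ * characters are consumed.)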
+ */ +#define LONG 0x01 /* l: long or double */ +#define SHORT 0x04 /* h: short */ +#define SUPPRESS 0x08 /* suppress assignment */ +#define POINTER 0x10 /* weird %p pointer (`fake hex') */ +#define NOSKIP 0x20 /* do not skip blanks */ +#define QUAD 0x400 + +/* + * The following are used in numeric conversions only: + * SIGNOK, NDIGITS, DPTOK, and EXPOK are for floating point; + * SIGNOK, NDIGITS, PFXOK, and NZDIGITS are for integral. + */ +#define SIGNOK 0x40 /* +/- is (still) legal */ +#define NDIGITS 0x80 /* no digits detected */ + +#define DPTOK 0x100 /* (float) decimal point is still legal */ +#define EXPOK 0x200 /* (float) exponent (e+3, etc) still legal */ + +#define PFXOK 0x100 /* 0x prefix is (still) legal */ +#define NZDIGITS 0x200 /* no zero digits detected */ + +/* + * Conversion types. + */ +#define CT_CHAR 0 /* %c conversion */ +#define CT_CCL 1 /* %[...] conversion */ +#define CT_STRING 2 /* %s conversion */ +#define CT_INT 3 /* integer, i.e., strtoq or strtouq */ +typedef u_quad_t (*ccfntype)(const char *, char **, int); + +#define isspace(c) ((c) == ' ' || (c) == '\t' || \ + (c) == '\r' || (c) == '\n') +#define isascii(c) (((c) & ~0x7f) == 0) +#define isupper(c) ((c) >= 'A' && (c) <= 'Z') +#define islower(c) ((c) >= 'a' && (c) <= 'z') +#define isalpha(c) (isupper(c) || (islower(c))) +#define isdigit(c) ((c) >= '0' && (c) <= '9') + +static u_char *__sccl(char *, u_char *); + +int +sscanf(const char *ibuf, const char *fmt, ...) +{ + va_list ap; + int ret; + + va_start(ap, fmt); + ret = vsscanf(ibuf, fmt, ap); + va_end(ap); + return(ret); +} + +int +vsscanf(const char *inp, char const *fmt0, va_list ap) +{ + int inr; + u_char *fmt = (u_char *)fmt0; + int c; /* character from format, or conversion */ + size_t width; /* field width, or 0 */ + char *p; /* points into all kinds of strings */ + int n; /* handy integer */ + int flags; /* flags as defined above */ + char *p0; /* saves original value of p when necessary */ + int nassigned; /* number of fields assigned */ + int nconversions; /* number of conversions */ + int nread; /* number of characters consumed from fp */ + int base; /* base argument to strtoq/strtouq */ + ccfntype ccfn; /* conversion function (strtoq/strtouq) */ + char ccltab[256]; /* character class table for %[...] */ + char buf[BUF]; /* buffer for numeric conversions */ + + /* `basefix' is used to avoid `if' tests in the integer scanner */ + static short basefix[17] = + { 10, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16 }; + + inr = strlen(inp); + + nassigned = 0; + nconversions = 0; + nread = 0; + base = 0; /* XXX just to keep gcc happy */ + ccfn = NULL; /* XXX just to keep gcc happy */ + for (;;) { + c = *fmt++; + if (c == 0) + return (nassigned); + if (isspace(c)) { + while (inr > 0 && isspace(*inp)) + nread++, inr--, inp++; + continue; + } + if (c != '%') + goto literal; + width = 0; + flags = 0; + /* + * switch on the format. continue if done; + * break once format type is derived. + */ +again: c = *fmt++; + switch (c) { + case '%': +literal: + if (inr <= 0) + goto input_failure; + if (*inp != c) + goto match_failure; + inr--, inp++; + nread++; + continue; + + case '*': + flags |= SUPPRESS; + goto again; + case 'l': + flags |= LONG; + goto again; + case 'q': + flags |= QUAD; + goto again; + case 'h': + flags |= SHORT; + goto again; + + case '0': case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + width = width * 10 + c - '0'; + goto again; + + /* + * Conversions. 
+ * + */ + case 'd': + c = CT_INT; + ccfn = (ccfntype)strtoq; + base = 10; + break; + + case 'i': + c = CT_INT; + ccfn = (ccfntype)strtoq; + base = 0; + break; + + case 'o': + c = CT_INT; + ccfn = strtouq; + base = 8; + break; + + case 'u': + c = CT_INT; + ccfn = strtouq; + base = 10; + break; + + case 'x': + flags |= PFXOK; /* enable 0x prefixing */ + c = CT_INT; + ccfn = strtouq; + base = 16; + break; + + case 's': + c = CT_STRING; + break; + + case '[': + fmt = __sccl(ccltab, fmt); + flags |= NOSKIP; + c = CT_CCL; + break; + + case 'c': + flags |= NOSKIP; + c = CT_CHAR; + break; + + case 'p': /* pointer format is like hex */ + flags |= POINTER | PFXOK; + c = CT_INT; + ccfn = strtouq; + base = 16; + break; + + case 'n': + nconversions++; + if (flags & SUPPRESS) /* ??? */ + continue; + if (flags & SHORT) + *va_arg(ap, short *) = nread; + else if (flags & LONG) + *va_arg(ap, long *) = nread; + else if (flags & QUAD) + *va_arg(ap, quad_t *) = nread; + else + *va_arg(ap, int *) = nread; + continue; + } + + /* + * We have a conversion that requires input. + */ + if (inr <= 0) + goto input_failure; + + /* + * Consume leading white space, except for formats + * that suppress this. + */ + if ((flags & NOSKIP) == 0) { + while (isspace(*inp)) { + nread++; + if (--inr > 0) + inp++; + else + goto input_failure; + } + /* + * Note that there is at least one character in + * the buffer, so conversions that do not set NOSKIP + * can no longer result in an input failure. + */ + } + + /* + * Do the conversion. + */ + switch (c) { + + case CT_CHAR: + /* scan arbitrary characters (sets NOSKIP) */ + if (width == 0) + width = 1; + if (flags & SUPPRESS) { + size_t sum = 0; + for (;;) { + if ((n = inr) < width) { + sum += n; + width -= n; + inp += n; + if (sum == 0) + goto input_failure; + break; + } else { + sum += width; + inr -= width; + inp += width; + break; + } + } + nread += sum; + } else { + bcopy(inp, va_arg(ap, char *), width); + inr -= width; + inp += width; + nread += width; + nassigned++; + } + nconversions++; + break; + + case CT_CCL: + /* scan a (nonempty) character class (sets NOSKIP) */ + if (width == 0) + width = (size_t)~0; /* `infinity' */ + /* take only those things in the class */ + if (flags & SUPPRESS) { + n = 0; + while (ccltab[*inp]) { + n++, inr--, inp++; + if (--width == 0) + break; + if (inr <= 0) { + if (n == 0) + goto input_failure; + break; + } + } + if (n == 0) + goto match_failure; + } else { + p0 = p = va_arg(ap, char *); + while (ccltab[*inp]) { + inr--; + *p++ = *inp++; + if (--width == 0) + break; + if (inr <= 0) { + if (p == p0) + goto input_failure; + break; + } + } + n = p - p0; + if (n == 0) + goto match_failure; + *p = 0; + nassigned++; + } + nread += n; + nconversions++; + break; + + case CT_STRING: + /* like CCL, but zero-length string OK, & no NOSKIP */ + if (width == 0) + width = (size_t)~0; + if (flags & SUPPRESS) { + n = 0; + while (!isspace(*inp)) { + n++, inr--, inp++; + if (--width == 0) + break; + if (inr <= 0) + break; + } + nread += n; + } else { + p0 = p = va_arg(ap, char *); + while (!isspace(*inp)) { + inr--; + *p++ = *inp++; + if (--width == 0) + break; + if (inr <= 0) + break; + } + *p = 0; + nread += p - p0; + nassigned++; + } + nconversions++; + continue; + + case CT_INT: + /* scan an integer as if by strtoq/strtouq */ +#ifdef hardway + if (width == 0 || width > sizeof(buf) - 1) + width = sizeof(buf) - 1; +#else + /* size_t is unsigned, hence this optimisation */ + if (--width > sizeof(buf) - 2) + width = sizeof(buf) - 2; + width++; +#endif + flags 
|= SIGNOK | NDIGITS | NZDIGITS; + for (p = buf; width; width--) { + c = *inp; + /* + * Switch on the character; `goto ok' + * if we accept it as a part of number. + */ + switch (c) { + + /* + * The digit 0 is always legal, but is + * special. For %i conversions, if no + * digits (zero or nonzero) have been + * scanned (only signs), we will have + * base==0. In that case, we should set + * it to 8 and enable 0x prefixing. + * Also, if we have not scanned zero digits + * before this, do not turn off prefixing + * (someone else will turn it off if we + * have scanned any nonzero digits). + */ + case '0': + if (base == 0) { + base = 8; + flags |= PFXOK; + } + if (flags & NZDIGITS) + flags &= ~(SIGNOK|NZDIGITS|NDIGITS); + else + flags &= ~(SIGNOK|PFXOK|NDIGITS); + goto ok; + + /* 1 through 7 always legal */ + case '1': case '2': case '3': + case '4': case '5': case '6': case '7': + base = basefix[base]; + flags &= ~(SIGNOK | PFXOK | NDIGITS); + goto ok; + + /* digits 8 and 9 ok iff decimal or hex */ + case '8': case '9': + base = basefix[base]; + if (base <= 8) + break; /* not legal here */ + flags &= ~(SIGNOK | PFXOK | NDIGITS); + goto ok; + + /* letters ok iff hex */ + case 'A': case 'B': case 'C': + case 'D': case 'E': case 'F': + case 'a': case 'b': case 'c': + case 'd': case 'e': case 'f': + /* no need to fix base here */ + if (base <= 10) + break; /* not legal here */ + flags &= ~(SIGNOK | PFXOK | NDIGITS); + goto ok; + + /* sign ok only as first character */ + case '+': case '-': + if (flags & SIGNOK) { + flags &= ~SIGNOK; + goto ok; + } + break; + + /* x ok iff flag still set & 2nd char */ + case 'x': case 'X': + if (flags & PFXOK && p == buf + 1) { + base = 16; /* if %i */ + flags &= ~PFXOK; + goto ok; + } + break; + } + + /* + * If we got here, c is not a legal character + * for a number. Stop accumulating digits. + */ + break; + ok: + /* + * c is legal: store it and look at the next. + */ + *p++ = c; + if (--inr > 0) + inp++; + else + break; /* end of input */ + } + /* + * If we had only a sign, it is no good; push + * back the sign. If the number ends in `x', + * it was [sign] '0' 'x', so push back the x + * and treat it as [sign] '0'. + */ + if (flags & NDIGITS) { + if (p > buf) { + inp--; + inr++; + } + goto match_failure; + } + c = ((u_char *)p)[-1]; + if (c == 'x' || c == 'X') { + --p; + inp--; + inr++; + } + if ((flags & SUPPRESS) == 0) { + u_quad_t res; + + *p = 0; + res = (*ccfn)(buf, (char **)NULL, base); + if (flags & POINTER) + *va_arg(ap, void **) = + (void *)(u_long)res; + else if (flags & SHORT) + *va_arg(ap, short *) = res; + else if (flags & LONG) + *va_arg(ap, long *) = res; + else if (flags & QUAD) + *va_arg(ap, quad_t *) = res; + else + *va_arg(ap, int *) = res; + nassigned++; + } + nread += p - buf; + nconversions++; + break; + + } + } +input_failure: + return (nconversions != 0 ? nassigned : -1); +match_failure: + return (nassigned); +} + +/* + * Fill in the given table from the scanset at the given format + * (just after `['). Return a pointer to the character past the + * closing `]'. The table has a 1 wherever characters should be + * considered part of the scanset. 
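The table built here is what makes a conversion such as "%[a-z]" work in vsscanf() above. A minimal usage sketch, illustrative only (the input string, variables and printf are invented):

static void
example_sscanf(void)
{
	char dname[16];
	int unit;

	/* "%[a-z]" collects the alphabetic prefix, "%d" the unit number */
	if (sscanf("wd1", "%[a-z]%d", dname, &unit) == 2)
		printf("driver %s, unit %d\n", dname, unit);
}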
+ */ +static u_char * +__sccl(char *tab, u_char *fmt) +{ + int c, n, v; + + /* first `clear' the whole table */ + c = *fmt++; /* first char hat => negated scanset */ + if (c == '^') { + v = 1; /* default => accept */ + c = *fmt++; /* get new first char */ + } else + v = 0; /* default => reject */ + + /* XXX: Will not work if sizeof(tab*) > sizeof(char) */ + for (n = 0; n < 256; n++) + tab[n] = v; /* memset(tab, v, 256) */ + + if (c == 0) + return (fmt - 1);/* format ended before closing ] */ + + /* + * Now set the entries corresponding to the actual scanset + * to the opposite of the above. + * + * The first character may be ']' (or '-') without being special; + * the last character may be '-'. + */ + v = 1 - v; + for (;;) { + tab[c] = v; /* take character c */ +doswitch: + n = *fmt++; /* and examine the next */ + switch (n) { + + case 0: /* format ended too soon */ + return (fmt - 1); + + case '-': + /* + * A scanset of the form + * [01+-] + * is defined as `the digit 0, the digit 1, + * the character +, the character -', but + * the effect of a scanset such as + * [a-zA-Z0-9] + * is implementation defined. The V7 Unix + * scanf treats `a-z' as `the letters a through + * z', but treats `a-a' as `the letter a, the + * character -, and the letter a'. + * + * For compatibility, the `-' is not considerd + * to define a range if the character following + * it is either a close bracket (required by ANSI) + * or is not numerically greater than the character + * we just stored in the table (c). + */ + n = *fmt; + if (n == ']' || n < c) { + c = '-'; + break; /* resume the for(;;) */ + } + fmt++; + /* fill in the range */ + do { + tab[++c] = v; + } while (c < n); + c = n; + /* + * Alas, the V7 Unix scanf also treats formats + * such as [a-c-e] as `the letters a through e'. + * This too is permitted by the standard.... + */ + goto doswitch; + break; + + case ']': /* end of scanset */ + return (fmt); + + default: /* just another character */ + c = n; + break; + } + } + /* NOTREACHED */ +} + +/* + * Convert a string to an unsigned quad integer. + * + * Ignores `locale' stuff. Assumes that the upper and lower case + * alphabets and digits are each contiguous. + */ +u_quad_t +strtouq(const char *nptr, char **endptr, int base) +{ + const char *s = nptr; + u_quad_t acc; + unsigned char c; + u_quad_t qbase, cutoff; + int neg, any, cutlim; + + /* + * See strtoq for comments as to the logic used. + */ + s = nptr; + do { + c = *s++; + } while (isspace(c)); + if (c == '-') { + neg = 1; + c = *s++; + } else { + neg = 0; + if (c == '+') + c = *s++; + } + if ((base == 0 || base == 16) && + c == '0' && (*s == 'x' || *s == 'X')) { + c = s[1]; + s += 2; + base = 16; + } + if (base == 0) + base = c == '0' ? 8 : 10; + qbase = (unsigned)base; + cutoff = (u_quad_t)UQUAD_MAX / qbase; + cutlim = (u_quad_t)UQUAD_MAX % qbase; + for (acc = 0, any = 0;; c = *s++) { + if (!isascii(c)) + break; + if (isdigit(c)) + c -= '0'; + else if (isalpha(c)) + c -= isupper(c) ? 'A' - 10 : 'a' - 10; + else + break; + if (c >= base) + break; + if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim)) + any = -1; + else { + any = 1; + acc *= qbase; + acc += c; + } + } + if (any < 0) { + acc = UQUAD_MAX; + } else if (neg) + acc = -acc; + if (endptr != 0) + *endptr = (char *)(any ? s - 1 : nptr); + return (acc); +} + +/* + * Convert a string to a quad integer. + * + * Ignores `locale' stuff. Assumes that the upper and lower case + * alphabets and digits are each contiguous. 
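The prefix handling in strtouq() above means that base 0 auto-detects the radix and that endptr is left pointing at the first unconverted character. A small sketch, illustrative only (input string and printf are invented):

static void
example_strtouq(void)
{
	char *ep;
	u_quad_t v;

	v = strtouq("0x1fff-rest", &ep, 0);
	/* v is now 0x1fff (8191) and ep points at the "-rest" suffix */
	printf("value %lu, stopped at \"%s\"\n", (u_long)v, ep);
}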
+ */ +quad_t +strtoq(const char *nptr, char **endptr, int base) +{ + const char *s; + u_quad_t acc; + unsigned char c; + u_quad_t qbase, cutoff; + int neg, any, cutlim; + + /* + * Skip white space and pick up leading +/- sign if any. + * If base is 0, allow 0x for hex and 0 for octal, else + * assume decimal; if base is already 16, allow 0x. + */ + s = nptr; + do { + c = *s++; + } while (isspace(c)); + if (c == '-') { + neg = 1; + c = *s++; + } else { + neg = 0; + if (c == '+') + c = *s++; + } + if ((base == 0 || base == 16) && + c == '0' && (*s == 'x' || *s == 'X')) { + c = s[1]; + s += 2; + base = 16; + } + if (base == 0) + base = c == '0' ? 8 : 10; + + /* + * Compute the cutoff value between legal numbers and illegal + * numbers. That is the largest legal value, divided by the + * base. An input number that is greater than this value, if + * followed by a legal input character, is too big. One that + * is equal to this value may be valid or not; the limit + * between valid and invalid numbers is then based on the last + * digit. For instance, if the range for quads is + * [-9223372036854775808..9223372036854775807] and the input base + * is 10, cutoff will be set to 922337203685477580 and cutlim to + * either 7 (neg==0) or 8 (neg==1), meaning that if we have + * accumulated a value > 922337203685477580, or equal but the + * next digit is > 7 (or 8), the number is too big, and we will + * return a range error. + * + * Set any if any `digits' consumed; make it negative to indicate + * overflow. + */ + qbase = (unsigned)base; + cutoff = neg ? (u_quad_t)-(QUAD_MIN + QUAD_MAX) + QUAD_MAX : QUAD_MAX; + cutlim = cutoff % qbase; + cutoff /= qbase; + for (acc = 0, any = 0;; c = *s++) { + if (!isascii(c)) + break; + if (isdigit(c)) + c -= '0'; + else if (isalpha(c)) + c -= isupper(c) ? 'A' - 10 : 'a' - 10; + else + break; + if (c >= base) + break; + if (any < 0 || acc > cutoff || (acc == cutoff && c > cutlim)) + any = -1; + else { + any = 1; + acc *= qbase; + acc += c; + } + } + if (any < 0) { + acc = neg ? QUAD_MIN : QUAD_MAX; + } else if (neg) + acc = -acc; + if (endptr != 0) + *endptr = (char *)(any ? s - 1 : nptr); + return (acc); +} diff --git a/sys/kern/subr_smp.c b/sys/kern/subr_smp.c new file mode 100644 index 0000000..569f04b --- /dev/null +++ b/sys/kern/subr_smp.c @@ -0,0 +1,2663 @@ +/* + * Copyright (c) 1996, by Steve Passe + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. The name of the developer may NOT be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: mp_machdep.c,v 1.87 1999/01/12 00:19:31 eivind Exp $ + */ + +#include "opt_smp.h" +#include "opt_vm86.h" +#include "opt_cpu.h" +#include "opt_user_ldt.h" + +#ifdef SMP +#include <machine/smptests.h> +#else +#error +#endif + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/sysctl.h> +#ifdef BETTER_CLOCK +#include <sys/dkstat.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#ifdef BETTER_CLOCK +#include <sys/lock.h> +#include <vm/vm_map.h> +#include <sys/user.h> +#ifdef GPROF +#include <sys/gmon.h> +#endif +#endif + +#include <machine/smp.h> +#include <machine/apic.h> +#include <machine/mpapic.h> +#include <machine/segments.h> +#include <machine/smptests.h> /** TEST_DEFAULT_CONFIG, TEST_TEST1 */ +#include <machine/tss.h> +#include <machine/specialreg.h> +#include <machine/cputypes.h> +#include <machine/globaldata.h> + +#include <i386/i386/cons.h> /* cngetc() */ + +#if defined(APIC_IO) +#include <machine/md_var.h> /* setidt() */ +#include <i386/isa/icu.h> /* IPIs */ +#include <i386/isa/intr_machdep.h> /* IPIs */ +#endif /* APIC_IO */ + +#if defined(TEST_DEFAULT_CONFIG) +#define MPFPS_MPFB1 TEST_DEFAULT_CONFIG +#else +#define MPFPS_MPFB1 mpfps->mpfb1 +#endif /* TEST_DEFAULT_CONFIG */ + +#define WARMBOOT_TARGET 0 +#define WARMBOOT_OFF (KERNBASE + 0x0467) +#define WARMBOOT_SEG (KERNBASE + 0x0469) + +#ifdef PC98 +#define BIOS_BASE (0xe8000) +#define BIOS_SIZE (0x18000) +#else +#define BIOS_BASE (0xf0000) +#define BIOS_SIZE (0x10000) +#endif +#define BIOS_COUNT (BIOS_SIZE/4) + +#define CMOS_REG (0x70) +#define CMOS_DATA (0x71) +#define BIOS_RESET (0x0f) +#define BIOS_WARM (0x0a) + +#define PROCENTRY_FLAG_EN 0x01 +#define PROCENTRY_FLAG_BP 0x02 +#define IOAPICENTRY_FLAG_EN 0x01 + + +/* MP Floating Pointer Structure */ +typedef struct MPFPS { + char signature[4]; + void *pap; + u_char length; + u_char spec_rev; + u_char checksum; + u_char mpfb1; + u_char mpfb2; + u_char mpfb3; + u_char mpfb4; + u_char mpfb5; +} *mpfps_t; + +/* MP Configuration Table Header */ +typedef struct MPCTH { + char signature[4]; + u_short base_table_length; + u_char spec_rev; + u_char checksum; + u_char oem_id[8]; + u_char product_id[12]; + void *oem_table_pointer; + u_short oem_table_size; + u_short entry_count; + void *apic_address; + u_short extended_table_length; + u_char extended_table_checksum; + u_char reserved; +} *mpcth_t; + + +typedef struct PROCENTRY { + u_char type; + u_char apic_id; + u_char apic_version; + u_char cpu_flags; + u_long cpu_signature; + u_long feature_flags; + u_long reserved1; + u_long reserved2; +} *proc_entry_ptr; + +typedef struct BUSENTRY { + u_char type; + u_char bus_id; + char bus_type[6]; +} *bus_entry_ptr; + +typedef struct IOAPICENTRY { + u_char type; + u_char apic_id; + u_char apic_version; + u_char apic_flags; + void *apic_address; +} *io_apic_entry_ptr; + +typedef struct INTENTRY { + u_char type; + u_char 
int_type; + u_short int_flags; + u_char src_bus_id; + u_char src_bus_irq; + u_char dst_apic_id; + u_char dst_apic_int; +} *int_entry_ptr; + +/* descriptions of MP basetable entries */ +typedef struct BASETABLE_ENTRY { + u_char type; + u_char length; + char name[16]; +} basetable_entry; + +/* + * this code MUST be enabled here and in mpboot.s. + * it follows the very early stages of AP boot by placing values in CMOS ram. + * it NORMALLY will never be needed and thus the primitive method for enabling. + * +#define CHECK_POINTS + */ + +#if defined(CHECK_POINTS) && !defined(PC98) +#define CHECK_READ(A) (outb(CMOS_REG, (A)), inb(CMOS_DATA)) +#define CHECK_WRITE(A,D) (outb(CMOS_REG, (A)), outb(CMOS_DATA, (D))) + +#define CHECK_INIT(D); \ + CHECK_WRITE(0x34, (D)); \ + CHECK_WRITE(0x35, (D)); \ + CHECK_WRITE(0x36, (D)); \ + CHECK_WRITE(0x37, (D)); \ + CHECK_WRITE(0x38, (D)); \ + CHECK_WRITE(0x39, (D)); + +#define CHECK_PRINT(S); \ + printf("%s: %d, %d, %d, %d, %d, %d\n", \ + (S), \ + CHECK_READ(0x34), \ + CHECK_READ(0x35), \ + CHECK_READ(0x36), \ + CHECK_READ(0x37), \ + CHECK_READ(0x38), \ + CHECK_READ(0x39)); + +#else /* CHECK_POINTS */ + +#define CHECK_INIT(D) +#define CHECK_PRINT(S) + +#endif /* CHECK_POINTS */ + +/* + * Values to send to the POST hardware. + */ +#define MP_BOOTADDRESS_POST 0x10 +#define MP_PROBE_POST 0x11 +#define MPTABLE_PASS1_POST 0x12 + +#define MP_START_POST 0x13 +#define MP_ENABLE_POST 0x14 +#define MPTABLE_PASS2_POST 0x15 + +#define START_ALL_APS_POST 0x16 +#define INSTALL_AP_TRAMP_POST 0x17 +#define START_AP_POST 0x18 + +#define MP_ANNOUNCE_POST 0x19 + + +/** XXX FIXME: where does this really belong, isa.h/isa.c perhaps? */ +int current_postcode; + +/** XXX FIXME: what system files declare these??? */ +extern struct region_descriptor r_gdt, r_idt; + +int bsp_apic_ready = 0; /* flags useability of BSP apic */ +int mp_ncpus; /* # of CPUs, including BSP */ +int mp_naps; /* # of Applications processors */ +int mp_nbusses; /* # of busses */ +int mp_napics; /* # of IO APICs */ +int boot_cpu_id; /* designated BSP */ +vm_offset_t cpu_apic_address; +vm_offset_t io_apic_address[NAPICID]; /* NAPICID is more than enough */ +extern int nkpt; + +u_int32_t cpu_apic_versions[NCPU]; +u_int32_t io_apic_versions[NAPIC]; + +#ifdef APIC_INTR_DIAGNOSTIC +int apic_itrace_enter[32]; +int apic_itrace_tryisrlock[32]; +int apic_itrace_gotisrlock[32]; +int apic_itrace_active[32]; +int apic_itrace_masked[32]; +int apic_itrace_noisrlock[32]; +int apic_itrace_masked2[32]; +int apic_itrace_unmask[32]; +int apic_itrace_noforward[32]; +int apic_itrace_leave[32]; +int apic_itrace_enter2[32]; +int apic_itrace_doreti[32]; +int apic_itrace_splz[32]; +int apic_itrace_eoi[32]; +#ifdef APIC_INTR_DIAGNOSTIC_IRQ +unsigned short apic_itrace_debugbuffer[32768]; +int apic_itrace_debugbuffer_idx; +struct simplelock apic_itrace_debuglock; +#endif +#endif + +#ifdef APIC_INTR_REORDER +struct { + volatile int *location; + int bit; +} apic_isrbit_location[32]; +#endif + +struct apic_intmapinfo int_to_apicintpin[APIC_INTMAPSIZE]; + +/* + * APIC ID logical/physical mapping structures. + * We oversize these to simplify boot-time config. + */ +int cpu_num_to_apic_id[NAPICID]; +int io_num_to_apic_id[NAPICID]; +int apic_id_to_logical[NAPICID]; + + +/* Bitmap of all available CPUs */ +u_int all_cpus; + +/* AP uses this PTD during bootstrap. Do not staticize. 
*/ +pd_entry_t *bootPTD; + +/* Hotwire a 0->4MB V==P mapping */ +extern pt_entry_t *KPTphys; + +/* Virtual address of per-cpu common_tss */ +extern struct i386tss common_tss; +#ifdef VM86 +extern struct segment_descriptor common_tssd; +extern u_int private_tss; /* flag indicating private tss */ +extern u_int my_tr; +#endif /* VM86 */ + +/* IdlePTD per cpu */ +pd_entry_t *IdlePTDS[NCPU]; + +/* "my" private page table page, for BSP init */ +extern pt_entry_t SMP_prvpt[]; + +/* Private page pointer to curcpu's PTD, used during BSP init */ +extern pd_entry_t *my_idlePTD; + +struct pcb stoppcbs[NCPU]; + +int smp_started; /* has the system started? */ + +/* + * Local data and functions. + */ + +static int mp_capable; +static u_int boot_address; +static u_int base_memory; + +static int picmode; /* 0: virtual wire mode, 1: PIC mode */ +static mpfps_t mpfps; +static int search_for_sig(u_int32_t target, int count); +static void mp_enable(u_int boot_addr); + +static int mptable_pass1(void); +static int mptable_pass2(void); +static void default_mp_table(int type); +static void fix_mp_table(void); +static void setup_apic_irq_mapping(void); +static void init_locks(void); +static int start_all_aps(u_int boot_addr); +static void install_ap_tramp(u_int boot_addr); +static int start_ap(int logicalCpu, u_int boot_addr); + +/* + * Calculate usable address in base memory for AP trampoline code. + */ +u_int +mp_bootaddress(u_int basemem) +{ + POSTCODE(MP_BOOTADDRESS_POST); + + base_memory = basemem * 1024; /* convert to bytes */ + + boot_address = base_memory & ~0xfff; /* round down to 4k boundary */ + if ((base_memory - boot_address) < bootMP_size) + boot_address -= 4096; /* not enough, lower by 4k */ + + return boot_address; +} + + +/* + * Look for an Intel MP spec table (ie, SMP capable hardware). + */ +int +mp_probe(void) +{ + int x; + u_long segment; + u_int32_t target; + + POSTCODE(MP_PROBE_POST); + + /* see if EBDA exists */ + if (segment = (u_long) * (u_short *) (KERNBASE + 0x40e)) { + /* search first 1K of EBDA */ + target = (u_int32_t) (segment << 4); + if ((x = search_for_sig(target, 1024 / 4)) >= 0) + goto found; + } else { + /* last 1K of base memory, effective 'top of base' passed in */ + target = (u_int32_t) (base_memory - 0x400); + if ((x = search_for_sig(target, 1024 / 4)) >= 0) + goto found; + } + + /* search the BIOS */ + target = (u_int32_t) BIOS_BASE; + if ((x = search_for_sig(target, BIOS_COUNT)) >= 0) + goto found; + + /* nothing found */ + mpfps = (mpfps_t)0; + mp_capable = 0; + return 0; + +found: + /* calculate needed resources */ + mpfps = (mpfps_t)x; + if (mptable_pass1()) + panic("you must reconfigure your kernel"); + + /* flag fact that we are running multiple processors */ + mp_capable = 1; + return 1; +} + + +/* + * Startup the SMP processors. + */ +void +mp_start(void) +{ + POSTCODE(MP_START_POST); + + /* look for MP capable motherboard */ + if (mp_capable) + mp_enable(boot_address); + else + panic("MP hardware not found!"); +} + + +/* + * Print various information about the SMP system hardware and setup. 
+ */ +void +mp_announce(void) +{ + int x; + + POSTCODE(MP_ANNOUNCE_POST); + + printf("FreeBSD/SMP: Multiprocessor motherboard\n"); + printf(" cpu0 (BSP): apic id: %2d", CPU_TO_ID(0)); + printf(", version: 0x%08x", cpu_apic_versions[0]); + printf(", at 0x%08x\n", cpu_apic_address); + for (x = 1; x <= mp_naps; ++x) { + printf(" cpu%d (AP): apic id: %2d", x, CPU_TO_ID(x)); + printf(", version: 0x%08x", cpu_apic_versions[x]); + printf(", at 0x%08x\n", cpu_apic_address); + } + +#if defined(APIC_IO) + for (x = 0; x < mp_napics; ++x) { + printf(" io%d (APIC): apic id: %2d", x, IO_TO_ID(x)); + printf(", version: 0x%08x", io_apic_versions[x]); + printf(", at 0x%08x\n", io_apic_address[x]); + } +#else + printf(" Warning: APIC I/O disabled\n"); +#endif /* APIC_IO */ +} + +/* + * AP cpu's call this to sync up protected mode. + */ +void +init_secondary(void) +{ + int gsel_tss; +#ifndef VM86 + u_int my_tr; +#endif + + r_gdt.rd_limit = sizeof(gdt[0]) * (NGDT + NCPU) - 1; + r_gdt.rd_base = (int) gdt; + lgdt(&r_gdt); /* does magic intra-segment return */ + lidt(&r_idt); + lldt(_default_ldt); +#ifdef USER_LDT + currentldt = _default_ldt; +#endif + + my_tr = NGDT + cpuid; + gsel_tss = GSEL(my_tr, SEL_KPL); + gdt[my_tr].sd.sd_type = SDT_SYS386TSS; + common_tss.tss_esp0 = 0; /* not used until after switch */ + common_tss.tss_ss0 = GSEL(GDATA_SEL, SEL_KPL); + common_tss.tss_ioopt = (sizeof common_tss) << 16; +#ifdef VM86 + common_tssd = gdt[my_tr].sd; + private_tss = 0; +#endif /* VM86 */ + ltr(gsel_tss); + + load_cr0(0x8005003b); /* XXX! */ + + PTD[0] = 0; + pmap_set_opt((unsigned *)PTD); + + putmtrr(); + pmap_setvidram(); + + invltlb(); +} + + +#if defined(APIC_IO) +/* + * Final configuration of the BSP's local APIC: + * - disable 'pic mode'. + * - disable 'virtual wire mode'. + * - enable NMI. 
+ */ +void +bsp_apic_configure(void) +{ + u_char byte; + u_int32_t temp; + + /* leave 'pic mode' if necessary */ + if (picmode) { + outb(0x22, 0x70); /* select IMCR */ + byte = inb(0x23); /* current contents */ + byte |= 0x01; /* mask external INTR */ + outb(0x23, byte); /* disconnect 8259s/NMI */ + } + + /* mask lint0 (the 8259 'virtual wire' connection) */ + temp = lapic.lvt_lint0; + temp |= APIC_LVT_M; /* set the mask */ + lapic.lvt_lint0 = temp; + + /* setup lint1 to handle NMI */ + temp = lapic.lvt_lint1; + temp &= ~APIC_LVT_M; /* clear the mask */ + lapic.lvt_lint1 = temp; + + if (bootverbose) + apic_dump("bsp_apic_configure()"); +} +#endif /* APIC_IO */ + + +/******************************************************************* + * local functions and data + */ + +/* + * start the SMP system + */ +static void +mp_enable(u_int boot_addr) +{ + int x; +#if defined(APIC_IO) + int apic; + u_int ux; +#endif /* APIC_IO */ + + getmtrr(); + pmap_setvidram(); + + POSTCODE(MP_ENABLE_POST); + + /* turn on 4MB of V == P addressing so we can get to MP table */ + *(int *)PTD = PG_V | PG_RW | ((uintptr_t)(void *)KPTphys & PG_FRAME); + invltlb(); + + /* examine the MP table for needed info, uses physical addresses */ + x = mptable_pass2(); + + *(int *)PTD = 0; + invltlb(); + + /* can't process default configs till the CPU APIC is pmapped */ + if (x) + default_mp_table(x); + + /* post scan cleanup */ + fix_mp_table(); + setup_apic_irq_mapping(); + +#if defined(APIC_IO) + + /* fill the LOGICAL io_apic_versions table */ + for (apic = 0; apic < mp_napics; ++apic) { + ux = io_apic_read(apic, IOAPIC_VER); + io_apic_versions[apic] = ux; + } + + /* program each IO APIC in the system */ + for (apic = 0; apic < mp_napics; ++apic) + if (io_apic_setup(apic) < 0) + panic("IO APIC setup failure"); + + /* install a 'Spurious INTerrupt' vector */ + setidt(XSPURIOUSINT_OFFSET, Xspuriousint, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* install an inter-CPU IPI for TLB invalidation */ + setidt(XINVLTLB_OFFSET, Xinvltlb, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + +#ifdef BETTER_CLOCK + /* install an inter-CPU IPI for reading processor state */ + setidt(XCPUCHECKSTATE_OFFSET, Xcpucheckstate, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif + + /* install an inter-CPU IPI for forcing an additional software trap */ + setidt(XCPUAST_OFFSET, Xcpuast, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* install an inter-CPU IPI for interrupt forwarding */ + setidt(XFORWARD_IRQ_OFFSET, Xforward_irq, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + + /* install an inter-CPU IPI for CPU stop/restart */ + setidt(XCPUSTOP_OFFSET, Xcpustop, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); + +#if defined(TEST_TEST1) + /* install a "fake hardware INTerrupt" vector */ + setidt(XTEST1_OFFSET, Xtest1, + SDT_SYS386IGT, SEL_KPL, GSEL(GCODE_SEL, SEL_KPL)); +#endif /** TEST_TEST1 */ + +#endif /* APIC_IO */ + + /* initialize all SMP locks */ + init_locks(); + + /* start each Application Processor */ + start_all_aps(boot_addr); + + /* + * The init process might be started on a different CPU now, + * and the boot CPU might not call prepare_usermode to get + * cr0 correctly configured. Thus we initialize cr0 here. 
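+ * (CR0_WP makes supervisor-mode writes respect page-level write
+ * protection and CR0_AM enables alignment checking, mirroring the cr0
+ * setup that prepare_usermode would normally have performed.)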
+ */ + load_cr0(rcr0() | CR0_WP | CR0_AM); +} + + +/* + * look for the MP spec signature + */ + +/* string defined by the Intel MP Spec as identifying the MP table */ +#define MP_SIG 0x5f504d5f /* _MP_ */ +#define NEXT(X) ((X) += 4) +static int +search_for_sig(u_int32_t target, int count) +{ + int x; + u_int32_t *addr = (u_int32_t *) (KERNBASE + target); + + for (x = 0; x < count; NEXT(x)) + if (addr[x] == MP_SIG) + /* make array index a byte index */ + return (target + (x * sizeof(u_int32_t))); + + return -1; +} + + +static basetable_entry basetable_entry_types[] = +{ + {0, 20, "Processor"}, + {1, 8, "Bus"}, + {2, 8, "I/O APIC"}, + {3, 8, "I/O INT"}, + {4, 8, "Local INT"} +}; + +typedef struct BUSDATA { + u_char bus_id; + enum busTypes bus_type; +} bus_datum; + +typedef struct INTDATA { + u_char int_type; + u_short int_flags; + u_char src_bus_id; + u_char src_bus_irq; + u_char dst_apic_id; + u_char dst_apic_int; + u_char int_vector; +} io_int, local_int; + +typedef struct BUSTYPENAME { + u_char type; + char name[7]; +} bus_type_name; + +static bus_type_name bus_type_table[] = +{ + {CBUS, "CBUS"}, + {CBUSII, "CBUSII"}, + {EISA, "EISA"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {ISA, "ISA"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {PCI, "PCI"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {UNKNOWN_BUSTYPE, "---"}, + {XPRESS, "XPRESS"}, + {UNKNOWN_BUSTYPE, "---"} +}; +/* from MP spec v1.4, table 5-1 */ +static int default_data[7][5] = +{ +/* nbus, id0, type0, id1, type1 */ + {1, 0, ISA, 255, 255}, + {1, 0, EISA, 255, 255}, + {1, 0, EISA, 255, 255}, + {0, 255, 255, 255, 255},/* MCA not supported */ + {2, 0, ISA, 1, PCI}, + {2, 0, EISA, 1, PCI}, + {0, 255, 255, 255, 255} /* MCA not supported */ +}; + + +/* the bus data */ +static bus_datum bus_data[NBUS]; + +/* the IO INT data, one entry per possible APIC INTerrupt */ +static io_int io_apic_ints[NINTR]; + +static int nintrs; + +static int processor_entry __P((proc_entry_ptr entry, int cpu)); +static int bus_entry __P((bus_entry_ptr entry, int bus)); +static int io_apic_entry __P((io_apic_entry_ptr entry, int apic)); +static int int_entry __P((int_entry_ptr entry, int intr)); +static int lookup_bus_type __P((char *name)); + + +/* + * 1st pass on motherboard's Intel MP specification table. 
+ * + * initializes: + * mp_ncpus = 1 + * + * determines: + * cpu_apic_address (common to all CPUs) + * io_apic_address[N] + * mp_naps + * mp_nbusses + * mp_napics + * nintrs + */ +static int +mptable_pass1(void) +{ + int x; + mpcth_t cth; + int totalSize; + void* position; + int count; + int type; + int mustpanic; + + POSTCODE(MPTABLE_PASS1_POST); + + mustpanic = 0; + + /* clear various tables */ + for (x = 0; x < NAPICID; ++x) { + io_apic_address[x] = ~0; /* IO APIC address table */ + } + + /* init everything to empty */ + mp_naps = 0; + mp_nbusses = 0; + mp_napics = 0; + nintrs = 0; + + /* check for use of 'default' configuration */ + if (MPFPS_MPFB1 != 0) { + /* use default addresses */ + cpu_apic_address = DEFAULT_APIC_BASE; + io_apic_address[0] = DEFAULT_IO_APIC_BASE; + + /* fill in with defaults */ + mp_naps = 2; /* includes BSP */ + mp_nbusses = default_data[MPFPS_MPFB1 - 1][0]; +#if defined(APIC_IO) + mp_napics = 1; + nintrs = 16; +#endif /* APIC_IO */ + } + else { + if ((cth = mpfps->pap) == 0) + panic("MP Configuration Table Header MISSING!"); + + cpu_apic_address = (vm_offset_t) cth->apic_address; + + /* walk the table, recording info of interest */ + totalSize = cth->base_table_length - sizeof(struct MPCTH); + position = (u_char *) cth + sizeof(struct MPCTH); + count = cth->entry_count; + + while (count--) { + switch (type = *(u_char *) position) { + case 0: /* processor_entry */ + if (((proc_entry_ptr)position)->cpu_flags + & PROCENTRY_FLAG_EN) + ++mp_naps; + break; + case 1: /* bus_entry */ + ++mp_nbusses; + break; + case 2: /* io_apic_entry */ + if (((io_apic_entry_ptr)position)->apic_flags + & IOAPICENTRY_FLAG_EN) + io_apic_address[mp_napics++] = + (vm_offset_t)((io_apic_entry_ptr) + position)->apic_address; + break; + case 3: /* int_entry */ + ++nintrs; + break; + case 4: /* int_entry */ + break; + default: + panic("mpfps Base Table HOSED!"); + /* NOTREACHED */ + } + + totalSize -= basetable_entry_types[type].length; + (u_char*)position += basetable_entry_types[type].length; + } + } + + /* qualify the numbers */ + if (mp_naps > NCPU) +#if 0 /* XXX FIXME: kern/4255 */ + printf("Warning: only using %d of %d available CPUs!\n", + NCPU, mp_naps); +#else + { + printf("NCPU cannot be different than actual CPU count.\n"); + printf(" add 'options NCPU=%d' to your kernel config file,\n", + mp_naps); + printf(" then rerun config & rebuild your SMP kernel\n"); + mustpanic = 1; + } +#endif /* XXX FIXME: kern/4255 */ + if (mp_nbusses > NBUS) { + printf("found %d busses, increase NBUS\n", mp_nbusses); + mustpanic = 1; + } + if (mp_napics > NAPIC) { + printf("found %d apics, increase NAPIC\n", mp_napics); + mustpanic = 1; + } + if (nintrs > NINTR) { + printf("found %d intrs, increase NINTR\n", nintrs); + mustpanic = 1; + } + + /* + * Count the BSP. + * This is also used as a counter while starting the APs. + */ + mp_ncpus = 1; + + --mp_naps; /* subtract the BSP */ + + return mustpanic; +} + + +/* + * 2nd pass on motherboard's Intel MP specification table. 
+ * + * sets: + * boot_cpu_id + * ID_TO_IO(N), phy APIC ID to log CPU/IO table + * CPU_TO_ID(N), logical CPU to APIC ID table + * IO_TO_ID(N), logical IO to APIC ID table + * bus_data[N] + * io_apic_ints[N] + */ +static int +mptable_pass2(void) +{ + int x; + mpcth_t cth; + int totalSize; + void* position; + int count; + int type; + int apic, bus, cpu, intr; + + POSTCODE(MPTABLE_PASS2_POST); + + /* clear various tables */ + for (x = 0; x < NAPICID; ++x) { + ID_TO_IO(x) = -1; /* phy APIC ID to log CPU/IO table */ + CPU_TO_ID(x) = -1; /* logical CPU to APIC ID table */ + IO_TO_ID(x) = -1; /* logical IO to APIC ID table */ + } + + /* clear bus data table */ + for (x = 0; x < NBUS; ++x) + bus_data[x].bus_id = 0xff; + + /* clear IO APIC INT table */ + for (x = 0; x < NINTR; ++x) { + io_apic_ints[x].int_type = 0xff; + io_apic_ints[x].int_vector = 0xff; + } + + /* setup the cpu/apic mapping arrays */ + boot_cpu_id = -1; + + /* record whether PIC or virtual-wire mode */ + picmode = (mpfps->mpfb2 & 0x80) ? 1 : 0; + + /* check for use of 'default' configuration */ + if (MPFPS_MPFB1 != 0) + return MPFPS_MPFB1; /* return default configuration type */ + + if ((cth = mpfps->pap) == 0) + panic("MP Configuration Table Header MISSING!"); + + /* walk the table, recording info of interest */ + totalSize = cth->base_table_length - sizeof(struct MPCTH); + position = (u_char *) cth + sizeof(struct MPCTH); + count = cth->entry_count; + apic = bus = intr = 0; + cpu = 1; /* pre-count the BSP */ + + while (count--) { + switch (type = *(u_char *) position) { + case 0: + if (processor_entry(position, cpu)) + ++cpu; + break; + case 1: + if (bus_entry(position, bus)) + ++bus; + break; + case 2: + if (io_apic_entry(position, apic)) + ++apic; + break; + case 3: + if (int_entry(position, intr)) + ++intr; + break; + case 4: + /* int_entry(position); */ + break; + default: + panic("mpfps Base Table HOSED!"); + /* NOTREACHED */ + } + + totalSize -= basetable_entry_types[type].length; + (u_char *) position += basetable_entry_types[type].length; + } + + if (boot_cpu_id == -1) + panic("NO BSP found!"); + + /* report fact that its NOT a default configuration */ + return 0; +} + + +static void +assign_apic_irq(int apic, int intpin, int irq) +{ + int x; + + if (int_to_apicintpin[irq].ioapic != -1) + panic("assign_apic_irq: inconsistent table"); + + int_to_apicintpin[irq].ioapic = apic; + int_to_apicintpin[irq].int_pin = intpin; + int_to_apicintpin[irq].apic_address = ioapic[apic]; + int_to_apicintpin[irq].redirindex = IOAPIC_REDTBL + 2 * intpin; + + for (x = 0; x < nintrs; x++) { + if ((io_apic_ints[x].int_type == 0 || + io_apic_ints[x].int_type == 3) && + io_apic_ints[x].int_vector == 0xff && + io_apic_ints[x].dst_apic_id == IO_TO_ID(apic) && + io_apic_ints[x].dst_apic_int == intpin) + io_apic_ints[x].int_vector = irq; + } +} + +/* + * parse an Intel MP specification table + */ +static void +fix_mp_table(void) +{ + int x; + int id; + int bus_0 = 0; /* Stop GCC warning */ + int bus_pci = 0; /* Stop GCC warning */ + int num_pci_bus; + + /* + * Fix mis-numbering of the PCI bus and its INT entries if the BIOS + * did it wrong. The MP spec says that when more than 1 PCI bus + * exists the BIOS must begin with bus entries for the PCI bus and use + * actual PCI bus numbering. This implies that when only 1 PCI bus + * exists the BIOS can choose to ignore this ordering, and indeed many + * MP motherboards do ignore it. This causes a problem when the PCI + * sub-system makes requests of the MP sub-system based on PCI bus + * numbers. 
So here we look for the situation and renumber the + * busses and associated INTs in an effort to "make it right". + */ + + /* find bus 0, PCI bus, count the number of PCI busses */ + for (num_pci_bus = 0, x = 0; x < mp_nbusses; ++x) { + if (bus_data[x].bus_id == 0) { + bus_0 = x; + } + if (bus_data[x].bus_type == PCI) { + ++num_pci_bus; + bus_pci = x; + } + } + /* + * bus_0 == slot of bus with ID of 0 + * bus_pci == slot of last PCI bus encountered + */ + + /* check the 1 PCI bus case for sanity */ + if (num_pci_bus == 1) { + + /* if it is number 0 all is well */ + if (bus_data[bus_pci].bus_id == 0) + return; + + /* mis-numbered, swap with whichever bus uses slot 0 */ + + /* swap the bus entry types */ + bus_data[bus_pci].bus_type = bus_data[bus_0].bus_type; + bus_data[bus_0].bus_type = PCI; + + /* swap each relavant INTerrupt entry */ + id = bus_data[bus_pci].bus_id; + for (x = 0; x < nintrs; ++x) { + if (io_apic_ints[x].src_bus_id == id) { + io_apic_ints[x].src_bus_id = 0; + } + else if (io_apic_ints[x].src_bus_id == 0) { + io_apic_ints[x].src_bus_id = id; + } + } + } + /* sanity check if more than 1 PCI bus */ + else if (num_pci_bus > 1) { + for (x = 0; x < mp_nbusses; ++x) { + if (bus_data[x].bus_type != PCI) + continue; + if (bus_data[x].bus_id >= num_pci_bus) + panic("bad PCI bus numbering"); + } + } +} + + +static void +setup_apic_irq_mapping(void) +{ + int x; + int int_vector; + + /* Assign low level interrupt handlers */ + for (x = 0; x < APIC_INTMAPSIZE; x++) { + int_to_apicintpin[x].ioapic = -1; + int_to_apicintpin[x].int_pin = 0; + int_to_apicintpin[x].apic_address = NULL; + int_to_apicintpin[x].redirindex = 0; + } + for (x = 0; x < nintrs; x++) { + if (io_apic_ints[x].dst_apic_int <= APIC_INTMAPSIZE && + io_apic_ints[x].dst_apic_id == IO_TO_ID(0) && + io_apic_ints[x].int_vector == 0xff && + (io_apic_ints[x].int_type == 0 || + io_apic_ints[x].int_type == 3)) { + assign_apic_irq(0, + io_apic_ints[x].dst_apic_int, + io_apic_ints[x].dst_apic_int); + } + } + int_vector = 0; + while (int_vector < APIC_INTMAPSIZE && + int_to_apicintpin[int_vector].ioapic != -1) + int_vector++; + for (x = 0; x < nintrs && int_vector < APIC_INTMAPSIZE; x++) { + if ((io_apic_ints[x].int_type == 0 || + io_apic_ints[x].int_type == 3) && + io_apic_ints[x].int_vector == 0xff) { + assign_apic_irq(ID_TO_IO(io_apic_ints[x].dst_apic_id), + io_apic_ints[x].dst_apic_int, + int_vector); + int_vector++; + while (int_vector < APIC_INTMAPSIZE && + int_to_apicintpin[int_vector].ioapic != -1) + int_vector++; + } + } +} + + +static int +processor_entry(proc_entry_ptr entry, int cpu) +{ + /* check for usability */ + if ((cpu >= NCPU) || !(entry->cpu_flags & PROCENTRY_FLAG_EN)) + return 0; + + /* check for BSP flag */ + if (entry->cpu_flags & PROCENTRY_FLAG_BP) { + boot_cpu_id = entry->apic_id; + CPU_TO_ID(0) = entry->apic_id; + ID_TO_CPU(entry->apic_id) = 0; + return 0; /* its already been counted */ + } + + /* add another AP to list, if less than max number of CPUs */ + else { + CPU_TO_ID(cpu) = entry->apic_id; + ID_TO_CPU(entry->apic_id) = cpu; + return 1; + } +} + + +static int +bus_entry(bus_entry_ptr entry, int bus) +{ + int x; + char c, name[8]; + + /* encode the name into an index */ + for (x = 0; x < 6; ++x) { + if ((c = entry->bus_type[x]) == ' ') + break; + name[x] = c; + } + name[x] = '\0'; + + if ((x = lookup_bus_type(name)) == UNKNOWN_BUSTYPE) + panic("unknown bus type: '%s'", name); + + bus_data[bus].bus_id = entry->bus_id; + bus_data[bus].bus_type = x; + + return 1; +} + + +static int 
+io_apic_entry(io_apic_entry_ptr entry, int apic) +{ + if (!(entry->apic_flags & IOAPICENTRY_FLAG_EN)) + return 0; + + IO_TO_ID(apic) = entry->apic_id; + ID_TO_IO(entry->apic_id) = apic; + + return 1; +} + + +static int +lookup_bus_type(char *name) +{ + int x; + + for (x = 0; x < MAX_BUSTYPE; ++x) + if (strcmp(bus_type_table[x].name, name) == 0) + return bus_type_table[x].type; + + return UNKNOWN_BUSTYPE; +} + + +static int +int_entry(int_entry_ptr entry, int intr) +{ + int apic; + + io_apic_ints[intr].int_type = entry->int_type; + io_apic_ints[intr].int_flags = entry->int_flags; + io_apic_ints[intr].src_bus_id = entry->src_bus_id; + io_apic_ints[intr].src_bus_irq = entry->src_bus_irq; + if (entry->dst_apic_id == 255) { + /* This signal goes to all IO APICS. Select an IO APIC + with sufficient number of interrupt pins */ + for (apic = 0; apic < mp_napics; apic++) + if (((io_apic_read(apic, IOAPIC_VER) & + IOART_VER_MAXREDIR) >> MAXREDIRSHIFT) >= + entry->dst_apic_int) + break; + if (apic < mp_napics) + io_apic_ints[intr].dst_apic_id = IO_TO_ID(apic); + else + io_apic_ints[intr].dst_apic_id = entry->dst_apic_id; + } else + io_apic_ints[intr].dst_apic_id = entry->dst_apic_id; + io_apic_ints[intr].dst_apic_int = entry->dst_apic_int; + + return 1; +} + + +static int +apic_int_is_bus_type(int intr, int bus_type) +{ + int bus; + + for (bus = 0; bus < mp_nbusses; ++bus) + if ((bus_data[bus].bus_id == io_apic_ints[intr].src_bus_id) + && ((int) bus_data[bus].bus_type == bus_type)) + return 1; + + return 0; +} + + +/* + * Given a traditional ISA INT mask, return an APIC mask. + */ +u_int +isa_apic_mask(u_int isa_mask) +{ + int isa_irq; + int apic_pin; + +#if defined(SKIP_IRQ15_REDIRECT) + if (isa_mask == (1 << 15)) { + printf("skipping ISA IRQ15 redirect\n"); + return isa_mask; + } +#endif /* SKIP_IRQ15_REDIRECT */ + + isa_irq = ffs(isa_mask); /* find its bit position */ + if (isa_irq == 0) /* doesn't exist */ + return 0; + --isa_irq; /* make it zero based */ + + apic_pin = isa_apic_irq(isa_irq); /* look for APIC connection */ + if (apic_pin == -1) + return 0; + + return (1 << apic_pin); /* convert pin# to a mask */ +} + + +/* + * Determine which APIC pin an ISA/EISA INT is attached to. + */ +#define INTTYPE(I) (io_apic_ints[(I)].int_type) +#define INTPIN(I) (io_apic_ints[(I)].dst_apic_int) +#define INTIRQ(I) (io_apic_ints[(I)].int_vector) +#define INTAPIC(I) (ID_TO_IO(io_apic_ints[(I)].dst_apic_id)) + +#define SRCBUSIRQ(I) (io_apic_ints[(I)].src_bus_irq) +int +isa_apic_irq(int isa_irq) +{ + int intr; + + for (intr = 0; intr < nintrs; ++intr) { /* check each record */ + if (INTTYPE(intr) == 0) { /* standard INT */ + if (SRCBUSIRQ(intr) == isa_irq) { + if (apic_int_is_bus_type(intr, ISA) || + apic_int_is_bus_type(intr, EISA)) + return INTIRQ(intr); /* found */ + } + } + } + return -1; /* NOT found */ +} + + +/* + * Determine which APIC pin a PCI INT is attached to. 
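+ * For PCI sources the src_bus_irq field packs the INT line (A#-D#) in
+ * bits 0-1 and the device number in bits 2-6; the SRCBUSLINE() and
+ * SRCBUSDEVICE() macros below extract these.  Note that pci_apic_irq()
+ * expects pciInt as 1-based (INTA# == 1).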
+ */ +#define SRCBUSID(I) (io_apic_ints[(I)].src_bus_id) +#define SRCBUSDEVICE(I) ((io_apic_ints[(I)].src_bus_irq >> 2) & 0x1f) +#define SRCBUSLINE(I) (io_apic_ints[(I)].src_bus_irq & 0x03) +int +pci_apic_irq(int pciBus, int pciDevice, int pciInt) +{ + int intr; + + --pciInt; /* zero based */ + + for (intr = 0; intr < nintrs; ++intr) /* check each record */ + if ((INTTYPE(intr) == 0) /* standard INT */ + && (SRCBUSID(intr) == pciBus) + && (SRCBUSDEVICE(intr) == pciDevice) + && (SRCBUSLINE(intr) == pciInt)) /* a candidate IRQ */ + if (apic_int_is_bus_type(intr, PCI)) + return INTIRQ(intr); /* exact match */ + + return -1; /* NOT found */ +} + +int +next_apic_irq(int irq) +{ + int intr, ointr; + int bus, bustype; + + bus = 0; + bustype = 0; + for (intr = 0; intr < nintrs; intr++) { + if (INTIRQ(intr) != irq || INTTYPE(intr) != 0) + continue; + bus = SRCBUSID(intr); + bustype = apic_bus_type(bus); + if (bustype != ISA && + bustype != EISA && + bustype != PCI) + continue; + break; + } + if (intr >= nintrs) { + return -1; + } + for (ointr = intr + 1; ointr < nintrs; ointr++) { + if (INTTYPE(ointr) != 0) + continue; + if (bus != SRCBUSID(ointr)) + continue; + if (bustype == PCI) { + if (SRCBUSDEVICE(intr) != SRCBUSDEVICE(ointr)) + continue; + if (SRCBUSLINE(intr) != SRCBUSLINE(ointr)) + continue; + } + if (bustype == ISA || bustype == EISA) { + if (SRCBUSIRQ(intr) != SRCBUSIRQ(ointr)) + continue; + } + if (INTPIN(intr) == INTPIN(ointr)) + continue; + break; + } + if (ointr >= nintrs) { + return -1; + } + return INTIRQ(ointr); +} +#undef SRCBUSLINE +#undef SRCBUSDEVICE +#undef SRCBUSID +#undef SRCBUSIRQ + +#undef INTPIN +#undef INTIRQ +#undef INTAPIC +#undef INTTYPE + + +/* + * Reprogram the MB chipset to NOT redirect an ISA INTerrupt. + * + * XXX FIXME: + * Exactly what this means is unclear at this point. It is a solution + * for motherboards that redirect the MBIRQ0 pin. Generically a motherboard + * could route any of the ISA INTs to upper (>15) IRQ values. But most would + * NOT be redirected via MBIRQ0, thus "undirect()ing" them would NOT be an + * option. 
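+ * Until that is sorted out, the undirect_isa_irq()/undirect_pci_irq()
+ * routines below are stubs that only log the request unless READY is
+ * defined.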
+ */ +int +undirect_isa_irq(int rirq) +{ +#if defined(READY) + if (bootverbose) + printf("Freeing redirected ISA irq %d.\n", rirq); + /** FIXME: tickle the MB redirector chip */ + return ???; +#else + if (bootverbose) + printf("Freeing (NOT implemented) redirected ISA irq %d.\n", rirq); + return 0; +#endif /* READY */ +} + + +/* + * Reprogram the MB chipset to NOT redirect a PCI INTerrupt + */ +int +undirect_pci_irq(int rirq) +{ +#if defined(READY) + if (bootverbose) + printf("Freeing redirected PCI irq %d.\n", rirq); + + /** FIXME: tickle the MB redirector chip */ + return ???; +#else + if (bootverbose) + printf("Freeing (NOT implemented) redirected PCI irq %d.\n", + rirq); + return 0; +#endif /* READY */ +} + + +/* + * given a bus ID, return: + * the bus type if found + * -1 if NOT found + */ +int +apic_bus_type(int id) +{ + int x; + + for (x = 0; x < mp_nbusses; ++x) + if (bus_data[x].bus_id == id) + return bus_data[x].bus_type; + + return -1; +} + + +/* + * given a LOGICAL APIC# and pin#, return: + * the associated src bus ID if found + * -1 if NOT found + */ +int +apic_src_bus_id(int apic, int pin) +{ + int x; + + /* search each of the possible INTerrupt sources */ + for (x = 0; x < nintrs; ++x) + if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && + (pin == io_apic_ints[x].dst_apic_int)) + return (io_apic_ints[x].src_bus_id); + + return -1; /* NOT found */ +} + + +/* + * given a LOGICAL APIC# and pin#, return: + * the associated src bus IRQ if found + * -1 if NOT found + */ +int +apic_src_bus_irq(int apic, int pin) +{ + int x; + + for (x = 0; x < nintrs; x++) + if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && + (pin == io_apic_ints[x].dst_apic_int)) + return (io_apic_ints[x].src_bus_irq); + + return -1; /* NOT found */ +} + + +/* + * given a LOGICAL APIC# and pin#, return: + * the associated INTerrupt type if found + * -1 if NOT found + */ +int +apic_int_type(int apic, int pin) +{ + int x; + + /* search each of the possible INTerrupt sources */ + for (x = 0; x < nintrs; ++x) + if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && + (pin == io_apic_ints[x].dst_apic_int)) + return (io_apic_ints[x].int_type); + + return -1; /* NOT found */ +} + +int +apic_irq(int apic, int pin) +{ + int x; + int res; + + for (x = 0; x < nintrs; ++x) + if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && + (pin == io_apic_ints[x].dst_apic_int)) { + res = io_apic_ints[x].int_vector; + if (res == 0xff) + return -1; + if (apic != int_to_apicintpin[res].ioapic) + panic("apic_irq: inconsistent table"); + if (pin != int_to_apicintpin[res].int_pin) + panic("apic_irq inconsistent table (2)"); + return res; + } + return -1; +} + + +/* + * given a LOGICAL APIC# and pin#, return: + * the associated trigger mode if found + * -1 if NOT found + */ +int +apic_trigger(int apic, int pin) +{ + int x; + + /* search each of the possible INTerrupt sources */ + for (x = 0; x < nintrs; ++x) + if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && + (pin == io_apic_ints[x].dst_apic_int)) + return ((io_apic_ints[x].int_flags >> 2) & 0x03); + + return -1; /* NOT found */ +} + + +/* + * given a LOGICAL APIC# and pin#, return: + * the associated 'active' level if found + * -1 if NOT found + */ +int +apic_polarity(int apic, int pin) +{ + int x; + + /* search each of the possible INTerrupt sources */ + for (x = 0; x < nintrs; ++x) + if ((apic == ID_TO_IO(io_apic_ints[x].dst_apic_id)) && + (pin == io_apic_ints[x].dst_apic_int)) + return (io_apic_ints[x].int_flags & 0x03); + + return -1; /* NOT found */ +} + + +/* + * set 
data according to MP defaults + * FIXME: probably not complete yet... + */ +static void +default_mp_table(int type) +{ + int ap_cpu_id; +#if defined(APIC_IO) + u_int32_t ux; + int io_apic_id; + int pin; +#endif /* APIC_IO */ + +#if 0 + printf(" MP default config type: %d\n", type); + switch (type) { + case 1: + printf(" bus: ISA, APIC: 82489DX\n"); + break; + case 2: + printf(" bus: EISA, APIC: 82489DX\n"); + break; + case 3: + printf(" bus: EISA, APIC: 82489DX\n"); + break; + case 4: + printf(" bus: MCA, APIC: 82489DX\n"); + break; + case 5: + printf(" bus: ISA+PCI, APIC: Integrated\n"); + break; + case 6: + printf(" bus: EISA+PCI, APIC: Integrated\n"); + break; + case 7: + printf(" bus: MCA+PCI, APIC: Integrated\n"); + break; + default: + printf(" future type\n"); + break; + /* NOTREACHED */ + } +#endif /* 0 */ + + boot_cpu_id = (lapic.id & APIC_ID_MASK) >> 24; + ap_cpu_id = (boot_cpu_id == 0) ? 1 : 0; + + /* BSP */ + CPU_TO_ID(0) = boot_cpu_id; + ID_TO_CPU(boot_cpu_id) = 0; + + /* one and only AP */ + CPU_TO_ID(1) = ap_cpu_id; + ID_TO_CPU(ap_cpu_id) = 1; + +#if defined(APIC_IO) + /* one and only IO APIC */ + io_apic_id = (io_apic_read(0, IOAPIC_ID) & APIC_ID_MASK) >> 24; + + /* + * sanity check, refer to MP spec section 3.6.6, last paragraph + * necessary as some hardware isn't properly setting up the IO APIC + */ +#if defined(REALLY_ANAL_IOAPICID_VALUE) + if (io_apic_id != 2) { +#else + if ((io_apic_id == 0) || (io_apic_id == 1) || (io_apic_id == 15)) { +#endif /* REALLY_ANAL_IOAPICID_VALUE */ + ux = io_apic_read(0, IOAPIC_ID); /* get current contents */ + ux &= ~APIC_ID_MASK; /* clear the ID field */ + ux |= 0x02000000; /* set it to '2' */ + io_apic_write(0, IOAPIC_ID, ux); /* write new value */ + ux = io_apic_read(0, IOAPIC_ID); /* re-read && test */ + if ((ux & APIC_ID_MASK) != 0x02000000) + panic("can't control IO APIC ID, reg: 0x%08x", ux); + io_apic_id = 2; + } + IO_TO_ID(0) = io_apic_id; + ID_TO_IO(io_apic_id) = 0; +#endif /* APIC_IO */ + + /* fill out bus entries */ + switch (type) { + case 1: + case 2: + case 3: + case 5: + case 6: + bus_data[0].bus_id = default_data[type - 1][1]; + bus_data[0].bus_type = default_data[type - 1][2]; + bus_data[1].bus_id = default_data[type - 1][3]; + bus_data[1].bus_type = default_data[type - 1][4]; + break; + + /* case 4: case 7: MCA NOT supported */ + default: /* illegal/reserved */ + panic("BAD default MP config: %d", type); + /* NOTREACHED */ + } + +#if defined(APIC_IO) + /* general cases from MP v1.4, table 5-2 */ + for (pin = 0; pin < 16; ++pin) { + io_apic_ints[pin].int_type = 0; + io_apic_ints[pin].int_flags = 0x05; /* edge/active-hi */ + io_apic_ints[pin].src_bus_id = 0; + io_apic_ints[pin].src_bus_irq = pin; /* IRQ2 caught below */ + io_apic_ints[pin].dst_apic_id = io_apic_id; + io_apic_ints[pin].dst_apic_int = pin; /* 1-to-1 */ + } + + /* special cases from MP v1.4, table 5-2 */ + if (type == 2) { + io_apic_ints[2].int_type = 0xff; /* N/C */ + io_apic_ints[13].int_type = 0xff; /* N/C */ +#if !defined(APIC_MIXED_MODE) + /** FIXME: ??? 
*/ + panic("sorry, can't support type 2 default yet"); +#endif /* APIC_MIXED_MODE */ + } + else + io_apic_ints[2].src_bus_irq = 0; /* ISA IRQ0 is on APIC INT 2 */ + + if (type == 7) + io_apic_ints[0].int_type = 0xff; /* N/C */ + else + io_apic_ints[0].int_type = 3; /* vectored 8259 */ +#endif /* APIC_IO */ +} + + +/* + * initialize all the SMP locks + */ + +/* critical region around IO APIC, apic_imen */ +struct simplelock imen_lock; + +/* critical region around splxx(), cpl, cml, cil, ipending */ +struct simplelock cpl_lock; + +/* Make FAST_INTR() routines sequential */ +struct simplelock fast_intr_lock; + +/* critical region around INTR() routines */ +struct simplelock intr_lock; + +/* lock regions protected in UP kernel via cli/sti */ +struct simplelock mpintr_lock; + +/* lock region used by kernel profiling */ +struct simplelock mcount_lock; + +#ifdef USE_COMLOCK +/* locks com (tty) data/hardware accesses: a FASTINTR() */ +struct simplelock com_lock; +#endif /* USE_COMLOCK */ + +#ifdef USE_CLOCKLOCK +/* lock regions around the clock hardware */ +struct simplelock clock_lock; +#endif /* USE_CLOCKLOCK */ + +static void +init_locks(void) +{ + /* + * Get the initial mp_lock with a count of 1 for the BSP. + * This uses a LOGICAL cpu ID, ie BSP == 0. + */ + mp_lock = 0x00000001; + + /* ISR uses its own "giant lock" */ + isr_lock = FREE_LOCK; + +#if defined(APIC_INTR_DIAGNOSTIC) && defined(APIC_INTR_DIAGNOSTIC_IRQ) + s_lock_init((struct simplelock*)&apic_itrace_debuglock); +#endif + + s_lock_init((struct simplelock*)&mpintr_lock); + + s_lock_init((struct simplelock*)&mcount_lock); + + s_lock_init((struct simplelock*)&fast_intr_lock); + s_lock_init((struct simplelock*)&intr_lock); + s_lock_init((struct simplelock*)&imen_lock); + s_lock_init((struct simplelock*)&cpl_lock); + +#ifdef USE_COMLOCK + s_lock_init((struct simplelock*)&com_lock); +#endif /* USE_COMLOCK */ +#ifdef USE_CLOCKLOCK + s_lock_init((struct simplelock*)&clock_lock); +#endif /* USE_CLOCKLOCK */ +} + + +/* + * start each AP in our list + */ +static int +start_all_aps(u_int boot_addr) +{ + int x, i; + u_char mpbiosreason; + u_long mpbioswarmvec; + pd_entry_t *newptd; + pt_entry_t *newpt; + struct globaldata *gd; + char *stack; + pd_entry_t *myPTD; + + POSTCODE(START_ALL_APS_POST); + + /* initialize BSP's local APIC */ + apic_initialize(); + bsp_apic_ready = 1; + + /* install the AP 1st level boot code */ + install_ap_tramp(boot_addr); + + + /* save the current value of the warm-start vector */ + mpbioswarmvec = *((u_long *) WARMBOOT_OFF); +#ifndef PC98 + outb(CMOS_REG, BIOS_RESET); + mpbiosreason = inb(CMOS_DATA); +#endif + + /* record BSP in CPU map */ + all_cpus = 1; + + /* start each AP */ + for (x = 1; x <= mp_naps; ++x) { + + /* This is a bit verbose, it will go away soon. 
*/ + + /* alloc new page table directory */ + newptd = (pd_entry_t *)(kmem_alloc(kernel_map, PAGE_SIZE)); + + /* Store the virtual PTD address for this CPU */ + IdlePTDS[x] = newptd; + + /* clone currently active one (ie: IdlePTD) */ + bcopy(PTD, newptd, PAGE_SIZE); /* inc prv page pde */ + + /* set up 0 -> 4MB P==V mapping for AP boot */ + newptd[0] = (void *)(uintptr_t)(PG_V | PG_RW | + ((uintptr_t)(void *)KPTphys & PG_FRAME)); + + /* store PTD for this AP's boot sequence */ + myPTD = (pd_entry_t *)vtophys(newptd); + + /* alloc new page table page */ + newpt = (pt_entry_t *)(kmem_alloc(kernel_map, PAGE_SIZE)); + + /* set the new PTD's private page to point there */ + newptd[MPPTDI] = (pt_entry_t)(PG_V | PG_RW | vtophys(newpt)); + + /* install self referential entry */ + newptd[PTDPTDI] = (pd_entry_t)(PG_V | PG_RW | vtophys(newptd)); + + /* allocate a new private data page */ + gd = (struct globaldata *)kmem_alloc(kernel_map, PAGE_SIZE); + + /* wire it into the private page table page */ + newpt[0] = (pt_entry_t)(PG_V | PG_RW | vtophys(gd)); + + /* wire the ptp into itself for access */ + newpt[1] = (pt_entry_t)(PG_V | PG_RW | vtophys(newpt)); + + /* copy in the pointer to the local apic */ + newpt[2] = SMP_prvpt[2]; + + /* and the IO apic mapping[s] */ + for (i = 16; i < 32; i++) + newpt[i] = SMP_prvpt[i]; + + /* allocate and set up an idle stack data page */ + stack = (char *)kmem_alloc(kernel_map, UPAGES*PAGE_SIZE); + for (i = 0; i < UPAGES; i++) + newpt[i + 3] = (pt_entry_t)(PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); + + newpt[3 + UPAGES] = 0; /* *prv_CMAP1 */ + newpt[4 + UPAGES] = 0; /* *prv_CMAP2 */ + newpt[5 + UPAGES] = 0; /* *prv_CMAP3 */ + newpt[6 + UPAGES] = 0; /* *prv_PMAP1 */ + + /* prime data page for it to use */ + gd->cpuid = x; + gd->cpu_lockid = x << 24; + gd->my_idlePTD = myPTD; + gd->prv_CMAP1 = &newpt[3 + UPAGES]; + gd->prv_CMAP2 = &newpt[4 + UPAGES]; + gd->prv_CMAP3 = &newpt[5 + UPAGES]; + gd->prv_PMAP1 = &newpt[6 + UPAGES]; + + /* setup a vector to our boot code */ + *((volatile u_short *) WARMBOOT_OFF) = WARMBOOT_TARGET; + *((volatile u_short *) WARMBOOT_SEG) = (boot_addr >> 4); +#ifndef PC98 + outb(CMOS_REG, BIOS_RESET); + outb(CMOS_DATA, BIOS_WARM); /* 'warm-start' */ +#endif + + bootPTD = myPTD; + /* attempt to start the Application Processor */ + CHECK_INIT(99); /* setup checkpoints */ + if (!start_ap(x, boot_addr)) { + printf("AP #%d (PHY# %d) failed!\n", x, CPU_TO_ID(x)); + CHECK_PRINT("trace"); /* show checkpoints */ + /* better panic as the AP may be running loose */ + printf("panic y/n? [y] "); + if (cngetc() != 'n') + panic("bye-bye"); + } + CHECK_PRINT("trace"); /* show checkpoints */ + + /* record its version info */ + cpu_apic_versions[x] = cpu_apic_versions[0]; + + all_cpus |= (1 << x); /* record AP in CPU map */ + } + + /* build our map of 'other' CPUs */ + other_cpus = all_cpus & ~(1 << cpuid); + + /* fill in our (BSP) APIC version */ + cpu_apic_versions[0] = lapic.version; + + /* restore the warmstart vector */ + *(u_long *) WARMBOOT_OFF = mpbioswarmvec; +#ifndef PC98 + outb(CMOS_REG, BIOS_RESET); + outb(CMOS_DATA, mpbiosreason); +#endif + + /* + * Set up the idle context for the BSP. Similar to above except + * that some was done by locore, some by pmap.c and some is implicit + * because the BSP is cpu#0 and the page is initially zero, and also + * because we can refer to variables by name on the BSP.. 
+ */ + newptd = (pd_entry_t *)(kmem_alloc(kernel_map, PAGE_SIZE)); + + bcopy(PTD, newptd, PAGE_SIZE); /* inc prv page pde */ + IdlePTDS[0] = newptd; + + /* Point PTD[] to this page instead of IdlePTD's physical page */ + newptd[PTDPTDI] = (pd_entry_t)(PG_V | PG_RW | vtophys(newptd)); + + my_idlePTD = (pd_entry_t *)vtophys(newptd); + + /* Allocate and setup BSP idle stack */ + stack = (char *)kmem_alloc(kernel_map, UPAGES * PAGE_SIZE); + for (i = 0; i < UPAGES; i++) + SMP_prvpt[i + 3] = (pt_entry_t)(PG_V | PG_RW | vtophys(PAGE_SIZE * i + stack)); + + pmap_set_opt_bsp(); + + for (i = 0; i < mp_ncpus; i++) { + bcopy( (int *) PTD + KPTDI, (int *) IdlePTDS[i] + KPTDI, NKPDE * sizeof (int)); + } + + /* number of APs actually started */ + return mp_ncpus - 1; +} + + +/* + * load the 1st level AP boot code into base memory. + */ + +/* targets for relocation */ +extern void bigJump(void); +extern void bootCodeSeg(void); +extern void bootDataSeg(void); +extern void MPentry(void); +extern u_int MP_GDT; +extern u_int mp_gdtbase; + +static void +install_ap_tramp(u_int boot_addr) +{ + int x; + int size = *(int *) ((u_long) & bootMP_size); + u_char *src = (u_char *) ((u_long) bootMP); + u_char *dst = (u_char *) boot_addr + KERNBASE; + u_int boot_base = (u_int) bootMP; + u_int8_t *dst8; + u_int16_t *dst16; + u_int32_t *dst32; + + POSTCODE(INSTALL_AP_TRAMP_POST); + + for (x = 0; x < size; ++x) + *dst++ = *src++; + + /* + * modify addresses in code we just moved to basemem. unfortunately we + * need fairly detailed info about mpboot.s for this to work. changes + * to mpboot.s might require changes here. + */ + + /* boot code is located in KERNEL space */ + dst = (u_char *) boot_addr + KERNBASE; + + /* modify the lgdt arg */ + dst32 = (u_int32_t *) (dst + ((u_int) & mp_gdtbase - boot_base)); + *dst32 = boot_addr + ((u_int) & MP_GDT - boot_base); + + /* modify the ljmp target for MPentry() */ + dst32 = (u_int32_t *) (dst + ((u_int) bigJump - boot_base) + 1); + *dst32 = ((u_int) MPentry - KERNBASE); + + /* modify the target for boot code segment */ + dst16 = (u_int16_t *) (dst + ((u_int) bootCodeSeg - boot_base)); + dst8 = (u_int8_t *) (dst16 + 1); + *dst16 = (u_int) boot_addr & 0xffff; + *dst8 = ((u_int) boot_addr >> 16) & 0xff; + + /* modify the target for boot data segment */ + dst16 = (u_int16_t *) (dst + ((u_int) bootDataSeg - boot_base)); + dst8 = (u_int8_t *) (dst16 + 1); + *dst16 = (u_int) boot_addr & 0xffff; + *dst8 = ((u_int) boot_addr >> 16) & 0xff; +} + + +/* + * this function starts the AP (application processor) identified + * by the APIC ID 'physicalCpu'. It does quite a "song and dance" + * to accomplish this. This is necessary because of the nuances + * of the different hardware we might encounter. It ain't pretty, + * but it seems to work. + */ +static int +start_ap(int logical_cpu, u_int boot_addr) +{ + int physical_cpu; + int vector; + int cpus; + u_long icr_lo, icr_hi; + + POSTCODE(START_AP_POST); + + /* get the PHYSICAL APIC ID# */ + physical_cpu = CPU_TO_ID(logical_cpu); + + /* calculate the vector */ + vector = (boot_addr >> 12) & 0xff; + + /* used as a watchpoint to signal AP startup */ + cpus = mp_ncpus; + + /* + * first we do an INIT/RESET IPI this INIT IPI might be run, reseting + * and running the target CPU. OR this INIT IPI might be latched (P5 + * bug), CPU waiting for STARTUP IPI. OR this INIT IPI might be + * ignored. 
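+	 *
+	 * (The ICR writes below encode this sequence: 0x0000c500 asserts a
+	 * level-triggered INIT, 0x00008500 de-asserts it, and
+	 * 0x00000600 | vector is a STARTUP IPI whose vector is the 4K page
+	 * number of the trampoline at boot_addr.)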
+ */ + + /* setup the address for the target AP */ + icr_hi = lapic.icr_hi & ~APIC_ID_MASK; + icr_hi |= (physical_cpu << 24); + lapic.icr_hi = icr_hi; + + /* do an INIT IPI: assert RESET */ + icr_lo = lapic.icr_lo & 0xfff00000; + lapic.icr_lo = icr_lo | 0x0000c500; + + /* wait for pending status end */ + while (lapic.icr_lo & APIC_DELSTAT_MASK) + /* spin */ ; + + /* do an INIT IPI: deassert RESET */ + lapic.icr_lo = icr_lo | 0x00008500; + + /* wait for pending status end */ + u_sleep(10000); /* wait ~10mS */ + while (lapic.icr_lo & APIC_DELSTAT_MASK) + /* spin */ ; + + /* + * next we do a STARTUP IPI: the previous INIT IPI might still be + * latched, (P5 bug) this 1st STARTUP would then terminate + * immediately, and the previously started INIT IPI would continue. OR + * the previous INIT IPI has already run. and this STARTUP IPI will + * run. OR the previous INIT IPI was ignored. and this STARTUP IPI + * will run. + */ + + /* do a STARTUP IPI */ + lapic.icr_lo = icr_lo | 0x00000600 | vector; + while (lapic.icr_lo & APIC_DELSTAT_MASK) + /* spin */ ; + u_sleep(200); /* wait ~200uS */ + + /* + * finally we do a 2nd STARTUP IPI: this 2nd STARTUP IPI should run IF + * the previous STARTUP IPI was cancelled by a latched INIT IPI. OR + * this STARTUP IPI will be ignored, as only ONE STARTUP IPI is + * recognized after hardware RESET or INIT IPI. + */ + + lapic.icr_lo = icr_lo | 0x00000600 | vector; + while (lapic.icr_lo & APIC_DELSTAT_MASK) + /* spin */ ; + u_sleep(200); /* wait ~200uS */ + + /* wait for it to start */ + set_apic_timer(5000000);/* == 5 seconds */ + while (read_apic_timer()) + if (mp_ncpus > cpus) + return 1; /* return SUCCESS */ + + return 0; /* return FAILURE */ +} + + +/* + * Flush the TLB on all other CPU's + * + * XXX: Needs to handshake and wait for completion before proceding. + */ +void +smp_invltlb(void) +{ +#if defined(APIC_IO) + if (smp_started && invltlb_ok) + all_but_self_ipi(XINVLTLB_OFFSET); +#endif /* APIC_IO */ +} + +void +invlpg(u_int addr) +{ + __asm __volatile("invlpg (%0)"::"r"(addr):"memory"); + + /* send a message to the other CPUs */ + smp_invltlb(); +} + +void +invltlb(void) +{ + u_long temp; + + /* + * This should be implemented as load_cr3(rcr3()) when load_cr3() is + * inlined. + */ + __asm __volatile("movl %%cr3, %0; movl %0, %%cr3":"=r"(temp) :: "memory"); + + /* send a message to the other CPUs */ + smp_invltlb(); +} + + +/* + * When called the executing CPU will send an IPI to all other CPUs + * requesting that they halt execution. + * + * Usually (but not necessarily) called with 'other_cpus' as its arg. + * + * - Signals all CPUs in map to stop. + * - Waits for each to stop. + * + * Returns: + * -1: error + * 0: NA + * 1: ok + * + * XXX FIXME: this is not MP-safe, needs a lock to prevent multiple CPUs + * from executing at same time. + */ +int +stop_cpus(u_int map) +{ + if (!smp_started) + return 0; + + /* send the Xcpustop IPI to all CPUs in map */ + selected_apic_ipi(map, XCPUSTOP_OFFSET, APIC_DELMODE_FIXED); + + while ((stopped_cpus & map) != map) + /* spin */ ; + + return 1; +} + + +/* + * Called by a CPU to restart stopped CPUs. + * + * Usually (but not necessarily) called with 'stopped_cpus' as its arg. + * + * - Signals all CPUs in map to restart. + * - Waits for each to restart. 
+ * + * Returns: + * -1: error + * 0: NA + * 1: ok + */ +int +restart_cpus(u_int map) +{ + if (!smp_started) + return 0; + + started_cpus = map; /* signal other cpus to restart */ + + while ((stopped_cpus & map) != 0) /* wait for each to clear its bit */ + /* spin */ ; + + return 1; +} + +int smp_active = 0; /* are the APs allowed to run? */ +SYSCTL_INT(_machdep, OID_AUTO, smp_active, CTLFLAG_RW, &smp_active, 0, ""); + +/* XXX maybe should be hw.ncpu */ +static int smp_cpus = 1; /* how many cpu's running */ +SYSCTL_INT(_machdep, OID_AUTO, smp_cpus, CTLFLAG_RD, &smp_cpus, 0, ""); + +int invltlb_ok = 0; /* throttle smp_invltlb() till safe */ +SYSCTL_INT(_machdep, OID_AUTO, invltlb_ok, CTLFLAG_RW, &invltlb_ok, 0, ""); + +/* Warning: Do not staticize. Used from swtch.s */ +int do_page_zero_idle = 1; /* bzero pages for fun and profit in idleloop */ +SYSCTL_INT(_machdep, OID_AUTO, do_page_zero_idle, CTLFLAG_RW, + &do_page_zero_idle, 0, ""); + +/* Is forwarding of a interrupt to the CPU holding the ISR lock enabled ? */ +int forward_irq_enabled = 1; +SYSCTL_INT(_machdep, OID_AUTO, forward_irq_enabled, CTLFLAG_RW, + &forward_irq_enabled, 0, ""); + +/* Enable forwarding of a signal to a process running on a different CPU */ +static int forward_signal_enabled = 1; +SYSCTL_INT(_machdep, OID_AUTO, forward_signal_enabled, CTLFLAG_RW, + &forward_signal_enabled, 0, ""); + +/* Enable forwarding of roundrobin to all other cpus */ +static int forward_roundrobin_enabled = 1; +SYSCTL_INT(_machdep, OID_AUTO, forward_roundrobin_enabled, CTLFLAG_RW, + &forward_roundrobin_enabled, 0, ""); + +/* + * This is called once the rest of the system is up and running and we're + * ready to let the AP's out of the pen. + */ +void ap_init(void); + +void +ap_init() +{ + u_int apic_id; + + smp_cpus++; + +#if defined(I586_CPU) && !defined(NO_F00F_HACK) + lidt(&r_idt); +#endif + + /* Build our map of 'other' CPUs. */ + other_cpus = all_cpus & ~(1 << cpuid); + + printf("SMP: AP CPU #%d Launched!\n", cpuid); + + /* XXX FIXME: i386 specific, and redundant: Setup the FPU. */ + load_cr0((rcr0() & ~CR0_EM) | CR0_MP | CR0_NE | CR0_TS); + + /* A quick check from sanity claus */ + apic_id = (apic_id_to_logical[(lapic.id & 0x0f000000) >> 24]); + if (cpuid != apic_id) { + printf("SMP: cpuid = %d\n", cpuid); + printf("SMP: apic_id = %d\n", apic_id); + printf("PTD[MPPTDI] = %p\n", (void *)PTD[MPPTDI]); + panic("cpuid mismatch! boom!!"); + } + + getmtrr(); + + /* Init local apic for irq's */ + apic_initialize(); + + /* + * Activate smp_invltlb, although strictly speaking, this isn't + * quite correct yet. We should have a bitfield for cpus willing + * to accept TLB flush IPI's or something and sync them. + */ + if (smp_cpus == mp_ncpus) { + invltlb_ok = 1; + smp_started = 1; /* enable IPI's, tlb shootdown, freezes etc */ + smp_active = 1; /* historic */ + } + + curproc = NULL; /* make sure */ +} + +#ifdef BETTER_CLOCK + +#define CHECKSTATE_USER 0 +#define CHECKSTATE_SYS 1 +#define CHECKSTATE_INTR 2 + +/* Do not staticize. 
Used from apic_vector.s */ +struct proc* checkstate_curproc[NCPU]; +int checkstate_cpustate[NCPU]; +u_long checkstate_pc[NCPU]; + +extern long cp_time[CPUSTATES]; + +#define PC_TO_INDEX(pc, prof) \ + ((int)(((u_quad_t)((pc) - (prof)->pr_off) * \ + (u_quad_t)((prof)->pr_scale)) >> 16) & ~1) + +static void +addupc_intr_forwarded(struct proc *p, int id, int *astmap) +{ + int i; + struct uprof *prof; + u_long pc; + + pc = checkstate_pc[id]; + prof = &p->p_stats->p_prof; + if (pc >= prof->pr_off && + (i = PC_TO_INDEX(pc, prof)) < prof->pr_size) { + if ((p->p_flag & P_OWEUPC) == 0) { + prof->pr_addr = pc; + prof->pr_ticks = 1; + p->p_flag |= P_OWEUPC; + } + *astmap |= (1 << id); + } +} + +static void +forwarded_statclock(int id, int pscnt, int *astmap) +{ + struct pstats *pstats; + long rss; + struct rusage *ru; + struct vmspace *vm; + int cpustate; + struct proc *p; +#ifdef GPROF + register struct gmonparam *g; + int i; +#endif + + p = checkstate_curproc[id]; + cpustate = checkstate_cpustate[id]; + + switch (cpustate) { + case CHECKSTATE_USER: + if (p->p_flag & P_PROFIL) + addupc_intr_forwarded(p, id, astmap); + if (pscnt > 1) + return; + p->p_uticks++; + if (p->p_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + break; + case CHECKSTATE_SYS: +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = checkstate_pc[id] - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif + if (pscnt > 1) + return; + + if (!p) + cp_time[CP_IDLE]++; + else { + p->p_sticks++; + cp_time[CP_SYS]++; + } + break; + case CHECKSTATE_INTR: + default: +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = checkstate_pc[id] - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif + if (pscnt > 1) + return; + if (p) + p->p_iticks++; + cp_time[CP_INTR]++; + } + if (p != NULL) { + p->p_cpticks++; + if (++p->p_estcpu == 0) + p->p_estcpu--; + if ((p->p_estcpu & 3) == 0) { + resetpriority(p); + if (p->p_priority >= PUSER) + p->p_priority = p->p_usrpri; + } + + /* Update resource usage integrals and maximums. */ + if ((pstats = p->p_stats) != NULL && + (ru = &pstats->p_ru) != NULL && + (vm = p->p_vmspace) != NULL) { + ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024; + ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024; + ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024; + rss = vm->vm_pmap.pm_stats.resident_count * + PAGE_SIZE / 1024; + if (ru->ru_maxrss < rss) + ru->ru_maxrss = rss; + } + } +} + +void +forward_statclock(int pscnt) +{ + int map; + int id; + int i; + + /* Kludge. We don't yet have separate locks for the interrupts + * and the kernel. This means that we cannot let the other processors + * handle complex interrupts while inhibiting them from entering + * the kernel in a non-interrupt context. + * + * What we can do, without changing the locking mechanisms yet, + * is letting the other processors handle a very simple interrupt + * (wich determines the processor states), and do the main + * work ourself. 
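+	 *
+	 * The simple interrupt is the XCPUCHECKSTATE IPI sent in step 1
+	 * below; the other cpus report their state through
+	 * checkstate_curproc[], checkstate_cpustate[] and checkstate_pc[]
+	 * and acknowledge via checkstate_probed_cpus, which we poll before
+	 * doing the accounting locally in step 2.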
+ */ + + if (!smp_started || !invltlb_ok || cold || panicstr) + return; + + /* Step 1: Probe state (user, cpu, interrupt, spinlock, idle ) */ + + map = other_cpus & ~stopped_cpus ; + checkstate_probed_cpus = 0; + if (map != 0) + selected_apic_ipi(map, + XCPUCHECKSTATE_OFFSET, APIC_DELMODE_FIXED); + + i = 0; + while (checkstate_probed_cpus != map) { + /* spin */ + i++; + if (i == 100000) { +#ifdef BETTER_CLOCK_DIAGNOSTIC + printf("forward_statclock: checkstate %x\n", + checkstate_probed_cpus); +#endif + break; + } + } + + /* + * Step 2: walk through other processors processes, update ticks and + * profiling info. + */ + + map = 0; + for (id = 0; id < mp_ncpus; id++) { + if (id == cpuid) + continue; + if (((1 << id) & checkstate_probed_cpus) == 0) + continue; + forwarded_statclock(id, pscnt, &map); + } + if (map != 0) { + checkstate_need_ast |= map; + selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED); + i = 0; + while ((checkstate_need_ast & map) != 0) { + /* spin */ + i++; + if (i > 100000) { +#ifdef BETTER_CLOCK_DIAGNOSTIC + printf("forward_statclock: dropped ast 0x%x\n", + checkstate_need_ast & map); +#endif + break; + } + } + } +} + +void +forward_hardclock(int pscnt) +{ + int map; + int id; + struct proc *p; + struct pstats *pstats; + int i; + + /* Kludge. We don't yet have separate locks for the interrupts + * and the kernel. This means that we cannot let the other processors + * handle complex interrupts while inhibiting them from entering + * the kernel in a non-interrupt context. + * + * What we can do, without changing the locking mechanisms yet, + * is letting the other processors handle a very simple interrupt + * (wich determines the processor states), and do the main + * work ourself. + */ + + if (!smp_started || !invltlb_ok || cold || panicstr) + return; + + /* Step 1: Probe state (user, cpu, interrupt, spinlock, idle) */ + + map = other_cpus & ~stopped_cpus ; + checkstate_probed_cpus = 0; + if (map != 0) + selected_apic_ipi(map, + XCPUCHECKSTATE_OFFSET, APIC_DELMODE_FIXED); + + i = 0; + while (checkstate_probed_cpus != map) { + /* spin */ + i++; + if (i == 100000) { +#ifdef BETTER_CLOCK_DIAGNOSTIC + printf("forward_hardclock: checkstate %x\n", + checkstate_probed_cpus); +#endif + break; + } + } + + /* + * Step 2: walk through other processors processes, update virtual + * timer and profiling timer. If stathz == 0, also update ticks and + * profiling info. 
+ */ + + map = 0; + for (id = 0; id < mp_ncpus; id++) { + if (id == cpuid) + continue; + if (((1 << id) & checkstate_probed_cpus) == 0) + continue; + p = checkstate_curproc[id]; + if (p) { + pstats = p->p_stats; + if (checkstate_cpustate[id] == CHECKSTATE_USER && + timevalisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) { + psignal(p, SIGVTALRM); + map |= (1 << id); + } + if (timevalisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) { + psignal(p, SIGPROF); + map |= (1 << id); + } + } + if (stathz == 0) { + forwarded_statclock( id, pscnt, &map); + } + } + if (map != 0) { + checkstate_need_ast |= map; + selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED); + i = 0; + while ((checkstate_need_ast & map) != 0) { + /* spin */ + i++; + if (i > 100000) { +#ifdef BETTER_CLOCK_DIAGNOSTIC + printf("forward_hardclock: dropped ast 0x%x\n", + checkstate_need_ast & map); +#endif + break; + } + } + } +} + +#endif /* BETTER_CLOCK */ + +void +forward_signal(struct proc *p) +{ + int map; + int id; + int i; + + /* Kludge. We don't yet have separate locks for the interrupts + * and the kernel. This means that we cannot let the other processors + * handle complex interrupts while inhibiting them from entering + * the kernel in a non-interrupt context. + * + * What we can do, without changing the locking mechanisms yet, + * is letting the other processors handle a very simple interrupt + * (wich determines the processor states), and do the main + * work ourself. + */ + + if (!smp_started || !invltlb_ok || cold || panicstr) + return; + if (!forward_signal_enabled) + return; + while (1) { + if (p->p_stat != SRUN) + return; + id = (u_char) p->p_oncpu; + if (id == 0xff) + return; + map = (1<<id); + checkstate_need_ast |= map; + selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED); + i = 0; + while ((checkstate_need_ast & map) != 0) { + /* spin */ + i++; + if (i > 100000) { +#if 0 + printf("forward_signal: dropped ast 0x%x\n", + checkstate_need_ast & map); +#endif + break; + } + } + if (id == (u_char) p->p_oncpu) + return; + } +} + +void +forward_roundrobin(void) +{ + u_int map; + int i; + + if (!smp_started || !invltlb_ok || cold || panicstr) + return; + if (!forward_roundrobin_enabled) + return; + resched_cpus |= other_cpus; + map = other_cpus & ~stopped_cpus ; +#if 1 + selected_apic_ipi(map, XCPUAST_OFFSET, APIC_DELMODE_FIXED); +#else + (void) all_but_self_ipi(XCPUAST_OFFSET); +#endif + i = 0; + while ((checkstate_need_ast & map) != 0) { + /* spin */ + i++; + if (i > 100000) { +#if 0 + printf("forward_roundrobin: dropped ast 0x%x\n", + checkstate_need_ast & map); +#endif + break; + } + } +} + + +#ifdef APIC_INTR_REORDER +/* + * Maintain mapping from softintr vector to isr bit in local apic. + */ +void +set_lapic_isrloc(int intr, int vector) +{ + if (intr < 0 || intr > 32) + panic("set_apic_isrloc: bad intr argument: %d",intr); + if (vector < ICU_OFFSET || vector > 255) + panic("set_apic_isrloc: bad vector argument: %d",vector); + apic_isrbit_location[intr].location = &lapic.isr0 + ((vector>>5)<<2); + apic_isrbit_location[intr].bit = (1<<(vector & 31)); +} +#endif diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c index 57195f3..42b0c85 100644 --- a/sys/kern/subr_trap.c +++ b/sys/kern/subr_trap.c @@ -1,6 +1,7 @@ /*- - * Copyright (c) 1990 The Regents of the University of California. - * All rights reserved. 
+ * Copyright (C) 1994, David Greenman + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. * * This code is derived from software contributed to Berkeley by * the University of Utah, and William Jolitz. @@ -33,515 +34,1145 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)trap.c 7.4 (Berkeley) 5/13/91 - * - * PATCHES MAGIC LEVEL PATCH THAT GOT US HERE - * -------------------- ----- ---------------------- - * CURRENT PATCH LEVEL: 1 00137 - * -------------------- ----- ---------------------- - * - * 08 Apr 93 Bruce Evans Several VM system fixes - * Paul Kranenburg Add counter for vmstat + * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 + * $Id: trap.c,v 1.132 1998/12/28 23:02:56 msmith Exp $ */ -static char rcsid[] = "$Header: /usr/bill/working/sys/i386/i386/RCS/trap.c,v 1.2 92/01/21 14:22:13 william Exp $"; /* - * 386 Trap and System call handleing + * 386 Trap and System call handling */ -#include "machine/cpu.h" -#include "machine/psl.h" -#include "machine/reg.h" +#include "opt_cpu.h" +#include "opt_ddb.h" +#include "opt_ktrace.h" +#include "opt_trap.h" +#include "opt_vm86.h" -#include "param.h" -#include "systm.h" -#include "proc.h" -#include "user.h" -#include "acct.h" -#include "kernel.h" +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/pioctl.h> +#include <sys/kernel.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/syscall.h> +#include <sys/sysent.h> +#include <sys/uio.h> +#include <sys/vmmeter.h> #ifdef KTRACE -#include "ktrace.h" +#include <sys/ktrace.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_extern.h> + +#include <machine/cpu.h> +#include <machine/ipl.h> +#include <machine/md_var.h> +#include <machine/pcb.h> +#ifdef SMP +#include <machine/smp.h> #endif +#include <machine/tss.h> + +#include <i386/isa/intr_machdep.h> + +#ifdef POWERFAIL_NMI +#include <sys/syslog.h> +#include <machine/clock.h> +#endif + +#ifdef VM86 +#include <machine/vm86.h> +#endif + +#ifdef DDB + extern int in_Debugger, debugger_on_panic; +#endif + +#include "isa.h" +#include "npx.h" + +extern struct i386tss common_tss; + +int (*pmath_emulate) __P((struct trapframe *)); + +extern void trap __P((struct trapframe frame)); +extern int trapwrite __P((unsigned addr)); +extern void syscall __P((struct trapframe frame)); + +static int trap_pfault __P((struct trapframe *, int, vm_offset_t)); +static void trap_fatal __P((struct trapframe *, vm_offset_t)); +void dblfault_handler __P((void)); + +extern inthand_t IDTVEC(syscall); + +#define MAX_TRAP_MSG 28 +static char *trap_msg[] = { + "", /* 0 unused */ + "privileged instruction fault", /* 1 T_PRIVINFLT */ + "", /* 2 unused */ + "breakpoint instruction fault", /* 3 T_BPTFLT */ + "", /* 4 unused */ + "", /* 5 unused */ + "arithmetic trap", /* 6 T_ARITHTRAP */ + "system forced exception", /* 7 T_ASTFLT */ + "", /* 8 unused */ + "general protection fault", /* 9 T_PROTFLT */ + "trace trap", /* 10 T_TRCTRAP */ + "", /* 11 unused */ + "page fault", /* 12 T_PAGEFLT */ + "", /* 13 unused */ + "alignment fault", /* 14 T_ALIGNFLT */ + "", /* 15 unused */ + "", /* 16 unused */ + "", /* 17 unused */ + "integer divide fault", /* 18 T_DIVIDE */ + "non-maskable interrupt trap", /* 19 T_NMI */ + "overflow trap", /* 20 T_OFLOW */ + "FPU bounds check fault", /* 21 
T_BOUND */ + "FPU device not available", /* 22 T_DNA */ + "double fault", /* 23 T_DOUBLEFLT */ + "FPU operand fetch fault", /* 24 T_FPOPFLT */ + "invalid TSS fault", /* 25 T_TSSFLT */ + "segment not present fault", /* 26 T_SEGNPFLT */ + "stack fault", /* 27 T_STKFLT */ + "machine check trap", /* 28 T_MCHK */ +}; + +static __inline void userret __P((struct proc *p, struct trapframe *frame, + u_quad_t oticks)); -#include "vm/vm_param.h" -#include "vm/pmap.h" -#include "vm/vm_map.h" -#include "sys/vmmeter.h" +#if defined(I586_CPU) && !defined(NO_F00F_HACK) +extern struct gate_descriptor *t_idt; +extern int has_f00f_bug; +#endif -#include "machine/trap.h" +static __inline void +userret(p, frame, oticks) + struct proc *p; + struct trapframe *frame; + u_quad_t oticks; +{ + int sig, s; + while ((sig = CURSIG(p)) != 0) + postsig(sig); -struct sysent sysent[]; -int nsysent; -int dostacklimits; -unsigned rcr2(); -extern short cpl; +#if 0 + if (!want_resched && + (p->p_priority <= p->p_usrpri) && + (p->p_rtprio.type == RTP_PRIO_NORMAL)) { + int newpriority; + p->p_estcpu += 1; + newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice; + newpriority = min(newpriority, MAXPRI); + p->p_usrpri = newpriority; + } +#endif + + p->p_priority = p->p_usrpri; + if (want_resched) { + /* + * Since we are curproc, clock will normally just change + * our priority without moving us from one queue to another + * (since the running process is not on a queue.) + * If that happened after we setrunqueue ourselves but before we + * mi_switch()'ed, we might not be on the queue indicated by + * our priority. + */ + s = splhigh(); + setrunqueue(p); + p->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + splx(s); + while ((sig = CURSIG(p)) != 0) + postsig(sig); + } + /* + * Charge system time if profiling. + */ + if (p->p_flag & P_PROFIL) + addupc_task(p, frame->tf_eip, + (u_int)(p->p_sticks - oticks) * psratio); + curpriority = p->p_priority; +} /* - * trap(frame): - * Exception, fault, and trap interface to BSD kernel. This - * common code is called from assembly language IDT gate entry + * Exception, fault, and trap interface to the FreeBSD kernel. + * This common code is called from assembly language IDT gate entry * routines that prepare a suitable stack frame, and restore this - * frame after the exception has been processed. Note that the - * effect is as if the arguments were passed call by reference. + * frame after the exception has been processed. */ -/*ARGSUSED*/ +void trap(frame) struct trapframe frame; { - register int i; - register struct proc *p = curproc; - struct timeval syst; - int ucode, type, code, eva; + struct proc *p = curproc; + u_quad_t sticks = 0; + int i = 0, ucode = 0, type, code; + vm_offset_t eva; - frame.tf_eflags &= ~PSL_NT; /* clear nested trap XXX */ - type = frame.tf_trapno; -#include "ddb.h" -#if NDDB > 0 - if (curpcb && curpcb->pcb_onfault) { - if (frame.tf_trapno == T_BPTFLT - || frame.tf_trapno == T_TRCTRAP) - if (kdb_trap (type, 0, &frame)) - return; - } -#endif - -/*pg("trap type %d code = %x eip = %x cs = %x eva = %x esp %x", - frame.tf_trapno, frame.tf_err, frame.tf_eip, - frame.tf_cs, rcr2(), frame.tf_esp);*/ -if(curpcb == 0 || curproc == 0) goto we_re_toast; - if (curpcb->pcb_onfault && frame.tf_trapno != 0xc) { -copyfault: - frame.tf_eip = (int)curpcb->pcb_onfault; - return; + if (!(frame.tf_eflags & PSL_I)) { + /* + * Buggy application or kernel code has disabled interrupts + * and then trapped. 
Enabling interrupts now is wrong, but + * it is better than running with interrupts disabled until + * they are accidentally enabled later. + */ + type = frame.tf_trapno; + if (ISPL(frame.tf_cs) == SEL_UPL || (frame.tf_eflags & PSL_VM)) + printf( + "pid %ld (%s): trap %d with interrupts disabled\n", + (long)curproc->p_pid, curproc->p_comm, type); + else if (type != T_BPTFLT && type != T_TRCTRAP) + /* + * XXX not quite right, since this may be for a + * multiple fault in user mode. + */ + printf("kernel trap %d with interrupts disabled\n", + type); + enable_intr(); } - syst = p->p_stime; - if (ISPL(frame.tf_cs) == SEL_UPL) { - type |= T_USER; - p->p_regs = (int *)&frame; - curpcb->pcb_flags |= FM_TRAP; /* used by sendsig */ + eva = 0; + if (frame.tf_trapno == T_PAGEFLT) { + /* + * For some Cyrix CPUs, %cr2 is clobbered by interrupts. + * This problem is worked around by using an interrupt + * gate for the pagefault handler. We are finally ready + * to read %cr2 and then must reenable interrupts. + * + * XXX this should be in the switch statement, but the + * NO_FOOF_HACK and VM86 goto and ifdefs obfuscate the + * flow of control too much for this to be obviously + * correct. + */ + eva = rcr2(); + enable_intr(); } - ucode=0; - eva = rcr2(); +#if defined(I586_CPU) && !defined(NO_F00F_HACK) +restart: +#endif + type = frame.tf_trapno; code = frame.tf_err; - switch (type) { - default: - we_re_toast: -#ifdef KDB - if (kdb_trap(&psl)) +#ifdef VM86 + if (in_vm86call) { + if (frame.tf_eflags & PSL_VM && + (type == T_PROTFLT || type == T_STKFLT)) { + i = vm86_emulate((struct vm86frame *)&frame); + if (i != 0) + /* + * returns to original process + */ + vm86_trap((struct vm86frame *)&frame); return; -#endif -#if NDDB > 0 - if (kdb_trap (type, 0, &frame)) + } + switch (type) { + /* + * these traps want either a process context, or + * assume a normal userspace trap. 
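+		 * (In practice: a protection or segment-not-present fault
+		 * taken during a vm86 call is fatal, and a trace trap is
+		 * handed to the kernel breakpoint handling below.)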
+ */ + case T_PROTFLT: + case T_SEGNPFLT: + trap_fatal(&frame, eva); return; + case T_TRCTRAP: + type = T_BPTFLT; /* kernel breakpoint */ + /* FALL THROUGH */ + } + goto kernel_trap; /* normal kernel trap handling */ + } #endif - printf("trap type %d code = %x eip = %x cs = %x eflags = %x ", - frame.tf_trapno, frame.tf_err, frame.tf_eip, - frame.tf_cs, frame.tf_eflags); - eva = rcr2(); - printf("cr2 %x cpl %x\n", eva, cpl); - /* type &= ~T_USER; */ /* XXX what the hell is this */ - panic("trap"); - /*NOTREACHED*/ - - case T_SEGNPFLT|T_USER: - case T_STKFLT|T_USER: - case T_PROTFLT|T_USER: /* protection fault */ - ucode = code + BUS_SEGM_FAULT ; - i = SIGBUS; - break; + if ((ISPL(frame.tf_cs) == SEL_UPL) || (frame.tf_eflags & PSL_VM)) { + /* user trap */ - case T_PRIVINFLT|T_USER: /* privileged instruction fault */ - case T_RESADFLT|T_USER: /* reserved addressing fault */ - case T_RESOPFLT|T_USER: /* reserved operand fault */ - case T_FPOPFLT|T_USER: /* coprocessor operand fault */ - ucode = type &~ T_USER; - i = SIGILL; - break; + sticks = p->p_sticks; + p->p_md.md_regs = &frame; + + switch (type) { + case T_PRIVINFLT: /* privileged instruction fault */ + ucode = type; + i = SIGILL; + break; + + case T_BPTFLT: /* bpt instruction fault */ + case T_TRCTRAP: /* trace trap */ + frame.tf_eflags &= ~PSL_T; + i = SIGTRAP; + break; + + case T_ARITHTRAP: /* arithmetic trap */ + ucode = code; + i = SIGFPE; + break; + + case T_ASTFLT: /* Allow process switch */ + astoff(); + cnt.v_soft++; + if (p->p_flag & P_OWEUPC) { + p->p_flag &= ~P_OWEUPC; + addupc_task(p, p->p_stats->p_prof.pr_addr, + p->p_stats->p_prof.pr_ticks); + } + goto out; + + /* + * The following two traps can happen in + * vm86 mode, and, if so, we want to handle + * them specially. + */ + case T_PROTFLT: /* general protection fault */ + case T_STKFLT: /* stack fault */ +#ifdef VM86 + if (frame.tf_eflags & PSL_VM) { + i = vm86_emulate((struct vm86frame *)&frame); + if (i == 0) + goto out; + break; + } +#endif /* VM86 */ + /* FALL THROUGH */ + + case T_SEGNPFLT: /* segment not present fault */ + case T_TSSFLT: /* invalid TSS fault */ + case T_DOUBLEFLT: /* double fault */ + default: + ucode = code + BUS_SEGM_FAULT ; + i = SIGBUS; + break; + + case T_PAGEFLT: /* page fault */ + i = trap_pfault(&frame, TRUE, eva); + if (i == -1) + return; +#if defined(I586_CPU) && !defined(NO_F00F_HACK) + if (i == -2) + goto restart; +#endif + if (i == 0) + goto out; + + ucode = T_PAGEFLT; + break; + + case T_DIVIDE: /* integer divide fault */ + ucode = FPE_INTDIV_TRAP; + i = SIGFPE; + break; + +#if NISA > 0 + case T_NMI: +#ifdef POWERFAIL_NMI + goto handle_powerfail; +#else /* !POWERFAIL_NMI */ +#ifdef DDB + /* NMI can be hooked up to a pushbutton for debugging */ + printf ("NMI ... 
going to debugger\n"); + if (kdb_trap (type, 0, &frame)) + return; +#endif /* DDB */ + /* machine/parity/power fail/"kitchen sink" faults */ + if (isa_nmi(code) == 0) return; + panic("NMI indicates hardware failure"); +#endif /* POWERFAIL_NMI */ +#endif /* NISA > 0 */ + + case T_OFLOW: /* integer overflow fault */ + ucode = FPE_INTOVF_TRAP; + i = SIGFPE; + break; - case T_ASTFLT|T_USER: /* Allow process switch */ - astoff(); - cnt.v_soft++; - if ((p->p_flag & SOWEUPC) && p->p_stats->p_prof.pr_scale) { - addupc(frame.tf_eip, &p->p_stats->p_prof, 1); - p->p_flag &= ~SOWEUPC; + case T_BOUND: /* bounds check fault */ + ucode = FPE_SUBRNG_TRAP; + i = SIGFPE; + break; + + case T_DNA: +#if NNPX > 0 + /* if a transparent fault (due to context switch "late") */ + if (npxdna()) + return; +#endif + if (!pmath_emulate) { + i = SIGFPE; + ucode = FPE_FPU_NP_TRAP; + break; + } + i = (*pmath_emulate)(&frame); + if (i == 0) { + if (!(frame.tf_eflags & PSL_T)) + return; + frame.tf_eflags &= ~PSL_T; + i = SIGTRAP; + } + /* else ucode = emulator_only_knows() XXX */ + break; + + case T_FPOPFLT: /* FPU operand fetch fault */ + ucode = T_FPOPFLT; + i = SIGILL; + break; } - goto out; + } else { +#ifdef VM86 +kernel_trap: +#endif + /* kernel trap */ + + switch (type) { + case T_PAGEFLT: /* page fault */ + (void) trap_pfault(&frame, FALSE, eva); + return; - case T_DNA|T_USER: -#ifdef NPX - /* if a transparent fault (due to context switch "late") */ - if (npxdna()) return; + case T_DNA: +#if NNPX > 0 + /* + * The kernel is apparently using npx for copying. + * XXX this should be fatal unless the kernel has + * registered such use. + */ + if (npxdna()) + return; #endif - i = math_emulate(&frame); - if (i == 0) return; - ucode = FPE_FPU_NP_TRAP; - break; + break; - case T_BOUND|T_USER: - ucode = FPE_SUBRNG_TRAP; - i = SIGFPE; - break; + case T_PROTFLT: /* general protection fault */ + case T_SEGNPFLT: /* segment not present fault */ + /* + * Invalid segment selectors and out of bounds + * %eip's and %esp's can be set up in user mode. + * This causes a fault in kernel mode when the + * kernel tries to return to user mode. We want + * to get this fault so that we can fix the + * problem here and not have to check all the + * selectors and pointers when the user changes + * them. + */ +#define MAYBE_DORETI_FAULT(where, whereto) \ + do { \ + if (frame.tf_eip == (int)where) { \ + frame.tf_eip = (int)whereto; \ + return; \ + } \ + } while (0) - case T_OFLOW|T_USER: - ucode = FPE_INTOVF_TRAP; - i = SIGFPE; - break; + if (intr_nesting_level == 0) { + /* + * Invalid %fs's and %gs's can be created using + * procfs or PT_SETREGS or by invalidating the + * underlying LDT entry. This causes a fault + * in kernel mode when the kernel attempts to + * switch contexts. Lose the bad context + * (XXX) so that we can continue, and generate + * a signal. 
+ */ + if (frame.tf_eip == (int)cpu_switch_load_fs) { + curpcb->pcb_fs = 0; + psignal(p, SIGBUS); + return; + } + if (frame.tf_eip == (int)cpu_switch_load_gs) { + curpcb->pcb_gs = 0; + psignal(p, SIGBUS); + return; + } + MAYBE_DORETI_FAULT(doreti_iret, + doreti_iret_fault); + MAYBE_DORETI_FAULT(doreti_popl_ds, + doreti_popl_ds_fault); + MAYBE_DORETI_FAULT(doreti_popl_es, + doreti_popl_es_fault); + if (curpcb && curpcb->pcb_onfault) { + frame.tf_eip = (int)curpcb->pcb_onfault; + return; + } + } + break; - case T_DIVIDE|T_USER: - ucode = FPE_INTDIV_TRAP; - i = SIGFPE; - break; + case T_TSSFLT: + /* + * PSL_NT can be set in user mode and isn't cleared + * automatically when the kernel is entered. This + * causes a TSS fault when the kernel attempts to + * `iret' because the TSS link is uninitialized. We + * want to get this fault so that we can fix the + * problem here and not every time the kernel is + * entered. + */ + if (frame.tf_eflags & PSL_NT) { + frame.tf_eflags &= ~PSL_NT; + return; + } + break; - case T_ARITHTRAP|T_USER: - ucode = code; - i = SIGFPE; - break; + case T_TRCTRAP: /* trace trap */ + if (frame.tf_eip == (int)IDTVEC(syscall)) { + /* + * We've just entered system mode via the + * syscall lcall. Continue single stepping + * silently until the syscall handler has + * saved the flags. + */ + return; + } + if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { + /* + * The syscall handler has now saved the + * flags. Stop single stepping it. + */ + frame.tf_eflags &= ~PSL_T; + return; + } + /* + * Fall through. + */ + case T_BPTFLT: + /* + * If DDB is enabled, let it handle the debugger trap. + * Otherwise, debugger traps "can't happen". + */ +#ifdef DDB + if (kdb_trap (type, 0, &frame)) + return; +#endif + break; - case T_PAGEFLT: /* allow page faults in kernel mode */ -#if 0 - /* XXX - check only applies to 386's and 486's with WP off */ - if (code & PGEX_P) goto we_re_toast; +#if NISA > 0 + case T_NMI: +#ifdef POWERFAIL_NMI +#ifndef TIMER_FREQ +# define TIMER_FREQ 1193182 #endif + handle_powerfail: + { + static unsigned lastalert = 0; + + if(time_second - lastalert > 10) + { + log(LOG_WARNING, "NMI: power fail\n"); + sysbeep(TIMER_FREQ/880, hz); + lastalert = time_second; + } + return; + } +#else /* !POWERFAIL_NMI */ +#ifdef DDB + /* NMI can be hooked up to a pushbutton for debugging */ + printf ("NMI ... going to debugger\n"); + if (kdb_trap (type, 0, &frame)) + return; +#endif /* DDB */ + /* machine/parity/power fail/"kitchen sink" faults */ + if (isa_nmi(code) == 0) return; + /* FALL THROUGH */ +#endif /* POWERFAIL_NMI */ +#endif /* NISA > 0 */ + } + + trap_fatal(&frame, eva); + return; + } + + /* Translate fault for emulators (e.g. Linux) */ + if (*p->p_sysent->sv_transtrap) + i = (*p->p_sysent->sv_transtrap)(i, type); + + trapsignal(p, i, ucode); + +#ifdef DEBUG + if (type <= MAX_TRAP_MSG) { + uprintf("fatal process exception: %s", + trap_msg[type]); + if ((type == T_PAGEFLT) || (type == T_PROTFLT)) + uprintf(", fault VA = 0x%lx", (u_long)eva); + uprintf("\n"); + } +#endif + +out: + userret(p, &frame, sticks); +} + +#ifdef notyet +/* + * This version doesn't allow a page fault to user space while + * in the kernel. The rest of the kernel needs to be made "safe" + * before this can be used. I think the only things remaining + * to be made safe are the iBCS2 code and the process tracing/ + * debugging code. 
+ */ +static int +trap_pfault(frame, usermode, eva) + struct trapframe *frame; + int usermode; + vm_offset_t eva; +{ + vm_offset_t va; + struct vmspace *vm = NULL; + vm_map_t map = 0; + int rv = 0; + vm_prot_t ftype; + struct proc *p = curproc; + + if (frame->tf_err & PGEX_W) + ftype = VM_PROT_READ | VM_PROT_WRITE; + else + ftype = VM_PROT_READ; + + va = trunc_page(eva); + if (va < VM_MIN_KERNEL_ADDRESS) { + vm_offset_t v; + vm_page_t mpte; + + if (p == NULL || + (!usermode && va < VM_MAXUSER_ADDRESS && + (intr_nesting_level != 0 || curpcb == NULL || + curpcb->pcb_onfault == NULL))) { + trap_fatal(frame, eva); + return (-1); + } - /* fall into */ - case T_PAGEFLT|T_USER: /* page fault */ - { - register vm_offset_t va; - register struct vmspace *vm = p->p_vmspace; - register vm_map_t map; - int rv; - vm_prot_t ftype; - extern vm_map_t kernel_map; - unsigned nss,v; - - va = trunc_page((vm_offset_t)eva); /* - * Avoid even looking at pde_v(va) for high va's. va's - * above VM_MAX_KERNEL_ADDRESS don't correspond to normal - * PDE's (half of them correspond to APDEpde and half to - * an unmapped kernel PDE). va's betweeen 0xFEC00000 and - * VM_MAX_KERNEL_ADDRESS correspond to unmapped kernel PDE's - * (XXX - why are only 3 initialized when 6 are required to - * reach VM_MAX_KERNEL_ADDRESS?). Faulting in an unmapped - * kernel page table would give inconsistent PTD's. - * - * XXX - faulting in unmapped page tables wastes a page if - * va turns out to be invalid. - * - * XXX - should "kernel address space" cover the kernel page - * tables? Might have same problem with PDEpde as with - * APDEpde (or there may be no problem with APDEpde). + * This is a fault on non-kernel virtual memory. + * vm is initialized above to NULL. If curproc is NULL + * or curproc->p_vmspace is NULL the fault is fatal. */ - if (va > 0xFEBFF000) { - rv = KERN_FAILURE; /* becomes SIGBUS */ + vm = p->p_vmspace; + if (vm == NULL) goto nogo; - } + + map = &vm->vm_map; + /* - * It is only a kernel address space fault iff: - * 1. (type & T_USER) == 0 and - * 2. pcb_onfault not set or - * 3. pcb_onfault set but supervisor space fault - * The last can occur during an exec() copyin where the - * argument space is lazy-allocated. + * Keep swapout from messing with us during this + * critical time. 
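+		 * Bumping p_lock marks the process as busy to the VM
+		 * system, so the swapper leaves its pages alone until
+		 * the count is dropped again once the fault has been
+		 * resolved.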
*/ - if (type == T_PAGEFLT && va >= KERNBASE) - map = kernel_map; - else - map = &vm->vm_map; - if (code & PGEX_W) - ftype = VM_PROT_READ | VM_PROT_WRITE; - else - ftype = VM_PROT_READ; - -#ifdef DEBUG - if (map == kernel_map && va == 0) { - printf("trap: bad kernel access at %x\n", va); - goto we_re_toast; - } -#endif + ++p->p_lock; /* - * XXX: rude hack to make stack limits "work" + * Grow the stack if necessary */ - nss = 0; - if ((caddr_t)va >= vm->vm_maxsaddr && map != kernel_map - && dostacklimits) { - nss = clrnd(btoc((unsigned)vm->vm_maxsaddr - + MAXSSIZ - (unsigned)va)); - if (nss > btoc(p->p_rlimit[RLIMIT_STACK].rlim_cur)) { -/*pg("trap rlimit %d, maxsaddr %x va %x ", nss, vm->vm_maxsaddr, va);*/ +#ifndef VM_STACK + if ((caddr_t)va > vm->vm_maxsaddr && va < USRSTACK) { + if (!grow(p, va)) { rv = KERN_FAILURE; + --p->p_lock; goto nogo; } } - /* check if page table is mapped, if not, fault it first */ -#define pde_v(v) (PTD[((v)>>PD_SHIFT)&1023].pd_v) - if (!pde_v(va)) { - v = trunc_page(vtopte(va)); - rv = vm_fault(map, v, ftype, FALSE); - if (rv != KERN_SUCCESS) goto nogo; - /* check if page table fault, increment wiring */ - vm_map_pageable(map, v, round_page(v+1), FALSE); - } else v=0; - rv = vm_fault(map, va, ftype, FALSE); - if (rv == KERN_SUCCESS) { - /* - * XXX: continuation of rude stack hack - */ - if (nss > vm->vm_ssize) - vm->vm_ssize = nss; - va = trunc_page(vtopte(va)); - /* for page table, increment wiring - as long as not a page table fault as well */ - if (!v && type != T_PAGEFLT) - vm_map_pageable(map, va, round_page(va+1), FALSE); - if (type == T_PAGEFLT) - return; - goto out; +#else + /* grow_stack returns false only if va falls into + * a growable stack region and the stack growth + * fails. It returns true if va was not within + * a growable stack region, or if the stack + * growth succeeded. + */ + if (!grow_stack (p, va)) { + rv = KERN_FAILURE; + --p->p_lock; + goto nogo; } +#endif + + /* Fault in the user page: */ + rv = vm_fault(map, va, ftype, + (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : 0); + + --p->p_lock; + } else { + /* + * Don't allow user-mode faults in kernel address space. + */ + if (usermode) + goto nogo; + + /* + * Since we know that kernel virtual address addresses + * always have pte pages mapped, we just have to fault + * the page. + */ + rv = vm_fault(kernel_map, va, ftype, FALSE); + } + + if (rv == KERN_SUCCESS) + return (0); nogo: - if (type == T_PAGEFLT) { - if (curpcb->pcb_onfault) - goto copyfault; - printf("vm_fault(%x, %x, %x, 0) -> %x\n", - map, va, ftype, rv); - printf(" type %x, code %x\n", - type, code); - goto we_re_toast; + if (!usermode) { + if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) { + frame->tf_eip = (int)curpcb->pcb_onfault; + return (0); } - i = (rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV; - break; - } + trap_fatal(frame, eva); + return (-1); + } -#if NDDB == 0 - case T_TRCTRAP: /* trace trap -- someone single stepping lcall's */ - frame.tf_eflags &= ~PSL_T; + /* kludge to pass faulting virtual address to sendsig */ + frame->tf_err = eva; - /* Q: how do we turn it on again? */ - return; + return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); +} #endif - - case T_BPTFLT|T_USER: /* bpt instruction fault */ - case T_TRCTRAP|T_USER: /* trace trap */ - frame.tf_eflags &= ~PSL_T; - i = SIGTRAP; - break; -#include "isa.h" -#if NISA > 0 - case T_NMI: - case T_NMI|T_USER: -#if NDDB > 0 - /* NMI can be hooked up to a pushbutton for debugging */ - printf ("NMI ... 
going to debugger\n"); - if (kdb_trap (type, 0, &frame)) - return; -#endif - /* machine/parity/power fail/"kitchen sink" faults */ - if(isa_nmi(code) == 0) return; - else goto we_re_toast; +int +trap_pfault(frame, usermode, eva) + struct trapframe *frame; + int usermode; + vm_offset_t eva; +{ + vm_offset_t va; + struct vmspace *vm = NULL; + vm_map_t map = 0; + int rv = 0; + vm_prot_t ftype; + struct proc *p = curproc; + + va = trunc_page(eva); + if (va >= KERNBASE) { + /* + * Don't allow user-mode faults in kernel address space. + * An exception: if the faulting address is the invalid + * instruction entry in the IDT, then the Intel Pentium + * F00F bug workaround was triggered, and we need to + * treat it is as an illegal instruction, and not a page + * fault. + */ +#if defined(I586_CPU) && !defined(NO_F00F_HACK) + if ((eva == (unsigned int)&t_idt[6]) && has_f00f_bug) { + frame->tf_trapno = T_PRIVINFLT; + return -2; + } #endif - } + if (usermode) + goto nogo; - trapsignal(p, i, ucode); - if ((type & T_USER) == 0) - return; -out: - while (i = CURSIG(p)) - psig(i); - p->p_pri = p->p_usrpri; - if (want_resched) { + map = kernel_map; + } else { /* - * Since we are curproc, clock will normally just change - * our priority without moving us from one queue to another - * (since the running process is not on a queue.) - * If that happened after we setrq ourselves but before we - * swtch()'ed, we might not be on the queue indicated by - * our priority. + * This is a fault on non-kernel virtual memory. + * vm is initialized above to NULL. If curproc is NULL + * or curproc->p_vmspace is NULL the fault is fatal. */ - (void) splclock(); - setrq(p); - p->p_stats->p_ru.ru_nivcsw++; - swtch(); - (void) splnone(); - while (i = CURSIG(p)) - psig(i); + if (p != NULL) + vm = p->p_vmspace; + + if (vm == NULL) + goto nogo; + + map = &vm->vm_map; } - if (p->p_stats->p_prof.pr_scale) { - int ticks; - struct timeval *tv = &p->p_stime; - - ticks = ((tv->tv_sec - syst.tv_sec) * 1000 + - (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000); - if (ticks) { -#ifdef PROFTIMER - extern int profscale; - addupc(frame.tf_eip, &p->p_stats->p_prof, - ticks * profscale); + + if (frame->tf_err & PGEX_W) + ftype = VM_PROT_READ | VM_PROT_WRITE; + else + ftype = VM_PROT_READ; + + if (map != kernel_map) { + /* + * Keep swapout from messing with us during this + * critical time. + */ + ++p->p_lock; + + /* + * Grow the stack if necessary + */ +#ifndef VM_STACK + if ((caddr_t)va > vm->vm_maxsaddr && va < USRSTACK) { + if (!grow(p, va)) { + rv = KERN_FAILURE; + --p->p_lock; + goto nogo; + } + } #else - addupc(frame.tf_eip, &p->p_stats->p_prof, ticks); + /* grow_stack returns false only if va falls into + * a growable stack region and the stack growth + * fails. It returns true if va was not within + * a growable stack region, or if the stack + * growth succeeded. + */ + if (!grow_stack (p, va)) { + rv = KERN_FAILURE; + --p->p_lock; + goto nogo; + } #endif + + /* Fault in the user page: */ + rv = vm_fault(map, va, ftype, + (ftype & VM_PROT_WRITE) ? VM_FAULT_DIRTY : 0); + + --p->p_lock; + } else { + /* + * Don't have to worry about process locking or stacks in the kernel. 
+ */ + rv = vm_fault(map, va, ftype, FALSE); + } + + if (rv == KERN_SUCCESS) + return (0); +nogo: + if (!usermode) { + if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) { + frame->tf_eip = (int)curpcb->pcb_onfault; + return (0); } + trap_fatal(frame, eva); + return (-1); } - curpri = p->p_pri; - curpcb->pcb_flags &= ~FM_TRAP; /* used by sendsig */ + + /* kludge to pass faulting virtual address to sendsig */ + frame->tf_err = eva; + + return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV); +} + +static void +trap_fatal(frame, eva) + struct trapframe *frame; + vm_offset_t eva; +{ + int code, type, ss, esp; + struct soft_segment_descriptor softseg; + + code = frame->tf_err; + type = frame->tf_trapno; + sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); + + if (type <= MAX_TRAP_MSG) + printf("\n\nFatal trap %d: %s while in %s mode\n", + type, trap_msg[type], + frame->tf_eflags & PSL_VM ? "vm86" : + ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); +#ifdef SMP + /* three seperate prints in case of a trap on an unmapped page */ + printf("mp_lock = %08x; ", mp_lock); + printf("cpuid = %d; ", cpuid); + printf("lapic.id = %08x\n", lapic.id); +#endif + if (type == T_PAGEFLT) { + printf("fault virtual address = 0x%x\n", eva); + printf("fault code = %s %s, %s\n", + code & PGEX_U ? "user" : "supervisor", + code & PGEX_W ? "write" : "read", + code & PGEX_P ? "protection violation" : "page not present"); + } + printf("instruction pointer = 0x%x:0x%x\n", + frame->tf_cs & 0xffff, frame->tf_eip); + if ((ISPL(frame->tf_cs) == SEL_UPL) || (frame->tf_eflags & PSL_VM)) { + ss = frame->tf_ss & 0xffff; + esp = frame->tf_esp; + } else { + ss = GSEL(GDATA_SEL, SEL_KPL); + esp = (int)&frame->tf_esp; + } + printf("stack pointer = 0x%x:0x%x\n", ss, esp); + printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); + printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", + softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); + printf(" = DPL %d, pres %d, def32 %d, gran %d\n", + softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, + softseg.ssd_gran); + printf("processor eflags = "); + if (frame->tf_eflags & PSL_T) + printf("trace trap, "); + if (frame->tf_eflags & PSL_I) + printf("interrupt enabled, "); + if (frame->tf_eflags & PSL_NT) + printf("nested task, "); + if (frame->tf_eflags & PSL_RF) + printf("resume, "); + if (frame->tf_eflags & PSL_VM) + printf("vm86, "); + printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); + printf("current process = "); + if (curproc) { + printf("%lu (%s)\n", + (u_long)curproc->p_pid, curproc->p_comm ? + curproc->p_comm : ""); + } else { + printf("Idle\n"); + } + printf("interrupt mask = "); + if ((cpl & net_imask) == net_imask) + printf("net "); + if ((cpl & tty_imask) == tty_imask) + printf("tty "); + if ((cpl & bio_imask) == bio_imask) + printf("bio "); + if ((cpl & cam_imask) == cam_imask) + printf("cam "); + if (cpl == 0) + printf("none"); +#ifdef SMP +/** + * XXX FIXME: + * we probably SHOULD have stopped the other CPUs before now! + * another CPU COULD have been touching cpl at this moment... + */ + printf(" <- SMP: XXX"); +#endif + printf("\n"); + +#ifdef KDB + if (kdb_trap(&psl)) + return; +#endif +#ifdef DDB + if ((debugger_on_panic || in_Debugger) && kdb_trap(type, 0, frame)) + return; +#endif + printf("trap number = %d\n", type); + if (type <= MAX_TRAP_MSG) + panic(trap_msg[type]); + else + panic("unknown/reserved trap"); } /* - * Compensate for 386 brain damage (missing URKR) + * Double fault handler. 
Called when a fault occurs while writing + * a frame for a trap/exception onto the stack. This usually occurs + * when the stack overflows (such is the case with infinite recursion, + * for example). + * + * XXX Note that the current PTD gets replaced by IdlePTD when the + * task switch occurs. This means that the stack that was active at + * the time of the double fault is not available at <kstack> unless + * the machine was idle when the double fault occurred. The downside + * of this is that "trace <ebp>" in ddb won't work. */ -int trapwrite(unsigned addr) { - int rv; +void +dblfault_handler() +{ + printf("\nFatal double fault:\n"); + printf("eip = 0x%x\n", common_tss.tss_eip); + printf("esp = 0x%x\n", common_tss.tss_esp); + printf("ebp = 0x%x\n", common_tss.tss_ebp); +#ifdef SMP + /* three seperate prints in case of a trap on an unmapped page */ + printf("mp_lock = %08x; ", mp_lock); + printf("cpuid = %d; ", cpuid); + printf("lapic.id = %08x\n", lapic.id); +#endif + panic("double fault"); +} + +/* + * Compensate for 386 brain damage (missing URKR). + * This is a little simpler than the pagefault handler in trap() because + * it the page tables have already been faulted in and high addresses + * are thrown out early for other reasons. + */ +int trapwrite(addr) + unsigned addr; +{ + struct proc *p; vm_offset_t va; + struct vmspace *vm; + int rv; va = trunc_page((vm_offset_t)addr); - if (va > VM_MAXUSER_ADDRESS) return(1); - rv = vm_fault(&curproc->p_vmspace->vm_map, va, - VM_PROT_READ | VM_PROT_WRITE, FALSE); - if (rv == KERN_SUCCESS) return(0); - else return(1); + /* + * XXX - MAX is END. Changed > to >= for temp. fix. + */ + if (va >= VM_MAXUSER_ADDRESS) + return (1); + + p = curproc; + vm = p->p_vmspace; + + ++p->p_lock; + +#ifndef VM_STACK + if ((caddr_t)va >= vm->vm_maxsaddr && va < USRSTACK) { + if (!grow(p, va)) { + --p->p_lock; + return (1); + } + } +#else + if (!grow_stack (p, va)) { + --p->p_lock; + return (1); + } +#endif + + /* + * fault the data page + */ + rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, VM_FAULT_DIRTY); + + --p->p_lock; + + if (rv != KERN_SUCCESS) + return 1; + + return (0); } /* - * syscall(frame): - * System call request from POSIX system call gate interface to kernel. + * System call request from POSIX system call gate interface to kernel. * Like trap(), argument is call by reference. */ -/*ARGSUSED*/ +void syscall(frame) - volatile struct syscframe frame; + struct trapframe frame; { - register int *locr0 = ((int *)&frame); - register caddr_t params; - register int i; - register struct sysent *callp; - register struct proc *p = curproc; - struct timeval syst; - int error, opc; - int args[8], rval[2]; - int code; - -#ifdef lint - r0 = 0; r0 = r0; r1 = 0; r1 = r1; -#endif - syst = p->p_stime; - if (ISPL(frame.sf_cs) != SEL_UPL) + caddr_t params; + int i; + struct sysent *callp; + struct proc *p = curproc; + u_quad_t sticks; + int error; + int args[8]; + u_int code; + +#ifdef DIAGNOSTIC + if (ISPL(frame.tf_cs) != SEL_UPL) panic("syscall"); +#endif + sticks = p->p_sticks; + p->p_md.md_regs = &frame; + params = (caddr_t)frame.tf_esp + sizeof(int); + code = frame.tf_eax; + if (p->p_sysent->sv_prepsyscall) { + (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); + } else { + /* + * Need to check if this is a 32 bit or 64 bit syscall. + */ + if (code == SYS_syscall) { + /* + * Code is first argument, followed by actual args. 
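+			 * For example, an indirect call such as
+			 * syscall(SYS_write, fd, buf, nbyte) from user
+			 * mode lands here with code == SYS_syscall; the
+			 * real code (SYS_write) is the first word at
+			 * params and the write() arguments follow it.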
+ */ + code = fuword(params); + params += sizeof(int); + } else if (code == SYS___syscall) { + /* + * Like syscall, but code is a quad, so as to maintain + * quad alignment for the rest of the arguments. + */ + code = fuword(params); + params += sizeof(quad_t); + } + } - code = frame.sf_eax; - curpcb->pcb_flags &= ~FM_TRAP; /* used by sendsig */ - p->p_regs = (int *)&frame; - params = (caddr_t)frame.sf_esp + sizeof (int) ; + if (p->p_sysent->sv_mask) + code &= p->p_sysent->sv_mask; - /* - * Reconstruct pc, assuming lcall $X,y is 7 bytes, as it is always. - */ - opc = frame.sf_eip - 7; - callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; - if (callp == sysent) { - i = fuword(params); - params += sizeof (int); - callp = (code >= nsysent) ? &sysent[63] : &sysent[code]; - } + if (code >= p->p_sysent->sv_size) + callp = &p->p_sysent->sv_table[0]; + else + callp = &p->p_sysent->sv_table[code]; - if ((i = callp->sy_narg * sizeof (int)) && + if (params && (i = callp->sy_narg * sizeof(int)) && (error = copyin(params, (caddr_t)args, (u_int)i))) { - frame.sf_eax = error; - frame.sf_eflags |= PSL_C; /* carry bit */ #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) - ktrsyscall(p->p_tracep, code, callp->sy_narg, &args); + ktrsyscall(p->p_tracep, code, callp->sy_narg, args); #endif - goto done; + goto bad; } #ifdef KTRACE if (KTRPOINT(p, KTR_SYSCALL)) - ktrsyscall(p->p_tracep, code, callp->sy_narg, &args); + ktrsyscall(p->p_tracep, code, callp->sy_narg, args); #endif - rval[0] = 0; - rval[1] = frame.sf_edx; -/*pg("%d. s %d\n", p->p_pid, code);*/ - error = (*callp->sy_call)(p, args, rval); - if (error == ERESTART) - frame.sf_eip = opc; - else if (error != EJUSTRETURN) { - if (error) { -/*pg("error %d", error);*/ - frame.sf_eax = error; - frame.sf_eflags |= PSL_C; /* carry bit */ - } else { - frame.sf_eax = rval[0]; - frame.sf_edx = rval[1]; - frame.sf_eflags &= ~PSL_C; /* carry bit */ - } - } - /* else if (error == EJUSTRETURN) */ - /* nothing to do */ -done: - /* - * Reinitialize proc pointer `p' as it may be different - * if this is a child returning from fork syscall. - */ - p = curproc; - while (i = CURSIG(p)) - psig(i); - p->p_pri = p->p_usrpri; - if (want_resched) { + p->p_retval[0] = 0; + p->p_retval[1] = frame.tf_edx; + + STOPEVENT(p, S_SCE, callp->sy_narg); + + error = (*callp->sy_call)(p, args); + + switch (error) { + + case 0: /* - * Since we are curproc, clock will normally just change - * our priority without moving us from one queue to another - * (since the running process is not on a queue.) - * If that happened after we setrq ourselves but before we - * swtch()'ed, we might not be on the queue indicated by - * our priority. + * Reinitialize proc pointer `p' as it may be different + * if this is a child returning from fork syscall. */ - (void) splclock(); - setrq(p); - p->p_stats->p_ru.ru_nivcsw++; - swtch(); - (void) splnone(); - while (i = CURSIG(p)) - psig(i); + p = curproc; + frame.tf_eax = p->p_retval[0]; + frame.tf_edx = p->p_retval[1]; + frame.tf_eflags &= ~PSL_C; + break; + + case ERESTART: + /* + * Reconstruct pc, assuming lcall $X,y is 7 bytes, + * int 0x80 is 2 bytes. We saved this in tf_err. 
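+		 * Backing tf_eip up by that length makes the saved pc
+		 * point at the trapping instruction again, so the lcall
+		 * or int 0x80 is simply re-issued when the process
+		 * returns to user mode.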
+ */ + frame.tf_eip -= frame.tf_err; + break; + + case EJUSTRETURN: + break; + + default: +bad: + if (p->p_sysent->sv_errsize) + if (error >= p->p_sysent->sv_errsize) + error = -1; /* XXX */ + else + error = p->p_sysent->sv_errtbl[error]; + frame.tf_eax = error; + frame.tf_eflags |= PSL_C; + break; } - if (p->p_stats->p_prof.pr_scale) { - int ticks; - struct timeval *tv = &p->p_stime; - - ticks = ((tv->tv_sec - syst.tv_sec) * 1000 + - (tv->tv_usec - syst.tv_usec) / 1000) / (tick / 1000); - if (ticks) { -#ifdef PROFTIMER - extern int profscale; - addupc(frame.sf_eip, &p->p_stats->p_prof, - ticks * profscale); -#else - addupc(frame.sf_eip, &p->p_stats->p_prof, ticks); -#endif - } + + if ((frame.tf_eflags & PSL_T) && !(frame.tf_eflags & PSL_VM)) { + /* Traced syscall. */ + frame.tf_eflags &= ~PSL_T; + trapsignal(p, SIGTRAP, 0); } - curpri = p->p_pri; + + userret(p, &frame, sticks); + #ifdef KTRACE if (KTRPOINT(p, KTR_SYSRET)) - ktrsysret(p->p_tracep, code, error, rval[0]); + ktrsysret(p->p_tracep, code, error, p->p_retval[0]); #endif -#ifdef DIAGNOSTICx -{ extern int _udatasel, _ucodesel; - if (frame.sf_ss != _udatasel) - printf("ss %x call %d\n", frame.sf_ss, code); - if ((frame.sf_cs&0xffff) != _ucodesel) - printf("cs %x call %d\n", frame.sf_cs, code); - if (frame.sf_eip > VM_MAXUSER_ADDRESS) { - printf("eip %x call %d\n", frame.sf_eip, code); - frame.sf_eip = 0; - } + + /* + * This works because errno is findable through the + * register set. If we ever support an emulation where this + * is not the case, this code will need to be revisited. + */ + STOPEVENT(p, S_SCX, code); + } + +/* + * Simplified back end of syscall(), used when returning from fork() + * directly into user mode. + */ +void +fork_return(p, frame) + struct proc *p; + struct trapframe frame; +{ + frame.tf_eax = 0; /* Child returns zero */ + frame.tf_eflags &= ~PSL_C; /* success */ + frame.tf_edx = 1; + + userret(p, &frame, 0); +#ifdef KTRACE + if (KTRPOINT(p, KTR_SYSRET)) + ktrsysret(p->p_tracep, SYS_fork, 0, 0); #endif } diff --git a/sys/kern/subr_xxx.c b/sys/kern/subr_xxx.c new file mode 100644 index 0000000..7ff3366 --- /dev/null +++ b/sys/kern/subr_xxx.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)subr_xxx.c 8.1 (Berkeley) 6/10/93 + * $Id: subr_xxx.c,v 1.11 1998/08/20 06:10:40 bde Exp $ + */ + +/* + * Miscellaneous trivial functions. + */ +#include <sys/param.h> +#include <sys/systm.h> + +/* + * Return error for operation not supported + * on a specific object or file type. + */ +int +eopnotsupp() +{ + + return (EOPNOTSUPP); +} + +/* + * Return error for an inval operation + * on a specific object or file type. + */ +int +einval() +{ + + return (EINVAL); +} + +/* + * Generic null operation, always returns success. + */ +int +nullop() +{ + + return (0); +} + +#include <sys/conf.h> + +/* + * Unsupported devswitch functions (e.g. for writing to read-only device). + * XXX may belong elsewhere. + */ + +int +noopen(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (ENODEV); +} + +int +noclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (ENODEV); +} + +int +noread(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + return (ENODEV); +} + +int +nowrite(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + return (ENODEV); +} + +int +noioctl(dev, cmd, data, flags, p) + dev_t dev; + u_long cmd; + caddr_t data; + int flags; + struct proc *p; +{ + + return (ENODEV); +} + +void +nostop(tp, rw) + struct tty *tp; + int rw; +{ + +} + +int +noreset(dev) + dev_t dev; +{ + + printf("noreset(0x%x) called\n", dev); + return (ENODEV); +} + +struct tty * +nodevtotty(dev) + dev_t dev; +{ + + return (NULL); +} + +int +nommap(dev, offset, nprot) + dev_t dev; + vm_offset_t offset; + int nprot; +{ + + /* Don't return ENODEV. That would allow mapping address ENODEV! */ + return (-1); +} + +int +nodump(dev) + dev_t dev; +{ + + return (ENODEV); +} + +/* + * Null devswitch functions (for when the operation always succeeds). + * XXX may belong elsewhere. + * XXX not all are here (e.g., seltrue() isn't). + */ + +/* + * XXX this is probably bogus. Any device that uses it isn't checking the + * minor number. + */ +int +nullopen(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (0); +} + +int +nullclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (0); +} diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c new file mode 100644 index 0000000..8d90ee9 --- /dev/null +++ b/sys/kern/sys_generic.c @@ -0,0 +1,872 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 + * $Id: sys_generic.c,v 1.42 1998/11/11 10:03:55 truckman Exp $ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/filedesc.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/signalvar.h> +#include <sys/socketvar.h> +#include <sys/uio.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/poll.h> +#include <sys/sysent.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <machine/limits.h> + +static MALLOC_DEFINE(M_IOCTLOPS, "ioctlops", "ioctl data buffer"); +static MALLOC_DEFINE(M_SELECT, "select", "select() buffer"); +MALLOC_DEFINE(M_IOV, "iov", "large iov's"); + +static int pollscan __P((struct proc *, struct pollfd *, int)); +static int selscan __P((struct proc *, fd_mask **, fd_mask **, int)); + +/* + * Read system call. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct read_args { + int fd; + void *buf; + size_t nbyte; +}; +#endif +/* ARGSUSED */ +int +read(p, uap) + struct proc *p; + register struct read_args *uap; +{ + register struct file *fp; + register struct filedesc *fdp = p->p_fd; + struct uio auio; + struct iovec aiov; + long cnt, error = 0; +#ifdef KTRACE + struct iovec ktriov; +#endif + + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_flag & FREAD) == 0) + return (EBADF); + aiov.iov_base = (caddr_t)uap->buf; + aiov.iov_len = uap->nbyte; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = -1; + if (uap->nbyte > INT_MAX) + return (EINVAL); + auio.uio_resid = uap->nbyte; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(p, KTR_GENIO)) + ktriov = aiov; +#endif + cnt = uap->nbyte; + if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred))) + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + cnt -= auio.uio_resid; +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO) && error == 0) + ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktriov, cnt, error); +#endif + p->p_retval[0] = cnt; + return (error); +} + +/* + * Scatter read system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct readv_args { + int fd; + struct iovec *iovp; + u_int iovcnt; +}; +#endif +int +readv(p, uap) + struct proc *p; + register struct readv_args *uap; +{ + register struct file *fp; + register struct filedesc *fdp = p->p_fd; + struct uio auio; + register struct iovec *iov; + struct iovec *needfree; + struct iovec aiov[UIO_SMALLIOV]; + long i, cnt, error = 0; + u_int iovlen; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_flag & FREAD) == 0) + return (EBADF); + /* note: can't use iovlen until iovcnt is validated */ + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) + return (EINVAL); + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else { + iov = aiov; + needfree = NULL; + } + auio.uio_iov = iov; + auio.uio_iovcnt = uap->iovcnt; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_offset = -1; + if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) + goto done; + auio.uio_resid = 0; + for (i = 0; i < uap->iovcnt; i++) { + if (iov->iov_len > INT_MAX - auio.uio_resid) { + error = EINVAL; + goto done; + } + auio.uio_resid += iov->iov_len; + iov++; + } +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(p, KTR_GENIO)) { + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + cnt = auio.uio_resid; + if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred))) + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + cnt -= auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, uap->fd, UIO_READ, ktriov, + cnt, error); + FREE(ktriov, M_TEMP); + } +#endif + p->p_retval[0] = cnt; +done: + if (needfree) + FREE(needfree, M_IOV); + return (error); +} + +/* + * Write system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct write_args { + int fd; + const void *buf; + size_t nbyte; +}; +#endif +int +write(p, uap) + 
struct proc *p; + register struct write_args *uap; +{ + register struct file *fp; + register struct filedesc *fdp = p->p_fd; + struct uio auio; + struct iovec aiov; + long cnt, error = 0; +#ifdef KTRACE + struct iovec ktriov; +#endif + + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_flag & FWRITE) == 0) + return (EBADF); + aiov.iov_base = (caddr_t)uap->buf; + aiov.iov_len = uap->nbyte; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = -1; + if (uap->nbyte > INT_MAX) + return (EINVAL); + auio.uio_resid = uap->nbyte; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(p, KTR_GENIO)) + ktriov = aiov; +#endif + cnt = uap->nbyte; + if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + if (error == EPIPE) + psignal(p, SIGPIPE); + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO) && error == 0) + ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, + &ktriov, cnt, error); +#endif + p->p_retval[0] = cnt; + return (error); +} + +/* + * Gather write system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct writev_args { + int fd; + struct iovec *iovp; + u_int iovcnt; +}; +#endif +int +writev(p, uap) + struct proc *p; + register struct writev_args *uap; +{ + register struct file *fp; + register struct filedesc *fdp = p->p_fd; + struct uio auio; + register struct iovec *iov; + struct iovec *needfree; + struct iovec aiov[UIO_SMALLIOV]; + long i, cnt, error = 0; + u_int iovlen; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_flag & FWRITE) == 0) + return (EBADF); + /* note: can't use iovlen until iovcnt is validated */ + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) + return (EINVAL); + MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); + needfree = iov; + } else { + iov = aiov; + needfree = NULL; + } + auio.uio_iov = iov; + auio.uio_iovcnt = uap->iovcnt; + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_offset = -1; + if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) + goto done; + auio.uio_resid = 0; + for (i = 0; i < uap->iovcnt; i++) { + if (iov->iov_len > INT_MAX - auio.uio_resid) { + error = EINVAL; + goto done; + } + auio.uio_resid += iov->iov_len; + iov++; + } +#ifdef KTRACE + /* + * if tracing, save a copy of iovec + */ + if (KTRPOINT(p, KTR_GENIO)) { + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + cnt = auio.uio_resid; + if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) { + if (auio.uio_resid != cnt && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + if (error == EPIPE) + psignal(p, SIGPIPE); + } + cnt -= auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, + ktriov, cnt, error); + FREE(ktriov, M_TEMP); + } +#endif + p->p_retval[0] = cnt; +done: + if (needfree) + FREE(needfree, M_IOV); + return (error); +} + +/* + * Ioctl system call + */ +#ifndef _SYS_SYSPROTO_H_ +struct ioctl_args { + int fd; + u_long com; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +ioctl(p, uap) + struct proc 
*p; + register struct ioctl_args *uap; +{ + register struct file *fp; + register struct filedesc *fdp; + register u_long com; + int error; + register u_int size; + caddr_t data, memp; + int tmp; +#define STK_PARAMS 128 + char stkbuf[STK_PARAMS]; + + fdp = p->p_fd; + if ((u_int)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) + return (EBADF); + + if ((fp->f_flag & (FREAD | FWRITE)) == 0) + return (EBADF); + + switch (com = uap->com) { + case FIONCLEX: + fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; + return (0); + case FIOCLEX: + fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; + return (0); + } + + /* + * Interpret high order word to find amount of data to be + * copied to/from the user's address space. + */ + size = IOCPARM_LEN(com); + if (size > IOCPARM_MAX) + return (ENOTTY); + memp = NULL; + if (size > sizeof (stkbuf)) { + memp = (caddr_t)malloc((u_long)size, M_IOCTLOPS, M_WAITOK); + data = memp; + } else + data = stkbuf; + if (com&IOC_IN) { + if (size) { + error = copyin(uap->data, data, (u_int)size); + if (error) { + if (memp) + free(memp, M_IOCTLOPS); + return (error); + } + } else + *(caddr_t *)data = uap->data; + } else if ((com&IOC_OUT) && size) + /* + * Zero the buffer so the user always + * gets back something deterministic. + */ + bzero(data, size); + else if (com&IOC_VOID) + *(caddr_t *)data = uap->data; + + switch (com) { + + case FIONBIO: + if ((tmp = *(int *)data)) + fp->f_flag |= FNONBLOCK; + else + fp->f_flag &= ~FNONBLOCK; + error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p); + break; + + case FIOASYNC: + if ((tmp = *(int *)data)) + fp->f_flag |= FASYNC; + else + fp->f_flag &= ~FASYNC; + error = (*fp->f_ops->fo_ioctl)(fp, FIOASYNC, (caddr_t)&tmp, p); + break; + + default: + error = (*fp->f_ops->fo_ioctl)(fp, com, data, p); + /* + * Copy any data to user, size was + * already set and checked above. + */ + if (error == 0 && (com&IOC_OUT) && size) + error = copyout(data, uap->data, (u_int)size); + break; + } + if (memp) + free(memp, M_IOCTLOPS); + return (error); +} + +static int nselcoll; +int selwait; + +/* + * Select system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct select_args { + int nd; + fd_set *in, *ou, *ex; + struct timeval *tv; +}; +#endif +int +select(p, uap) + register struct proc *p; + register struct select_args *uap; +{ + /* + * The magic 2048 here is chosen to be just enough for FD_SETSIZE + * infds with the new FD_SETSIZE of 1024, and more than enough for + * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE + * of 256. + */ + fd_mask s_selbits[howmany(2048, NFDBITS)]; + fd_mask *ibits[3], *obits[3], *selbits, *sbp; + struct timeval atv, rtv, ttv; + int s, ncoll, error, timo; + u_int nbufbytes, ncpbytes, nfdbits; + + if (uap->nd < 0) + return (EINVAL); + if (uap->nd > p->p_fd->fd_nfiles) + uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */ + + /* + * Allocate just enough bits for the non-null fd_sets. Use the + * preallocated auto buffer if possible. + */ + nfdbits = roundup(uap->nd, NFDBITS); + ncpbytes = nfdbits / NBBY; + nbufbytes = 0; + if (uap->in != NULL) + nbufbytes += 2 * ncpbytes; + if (uap->ou != NULL) + nbufbytes += 2 * ncpbytes; + if (uap->ex != NULL) + nbufbytes += 2 * ncpbytes; + if (nbufbytes <= sizeof s_selbits) + selbits = &s_selbits[0]; + else + selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); + + /* + * Assign pointers into the bit buffers and fetch the input bits. + * Put the output buffers together so that they can be bzeroed + * together. 
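+	 * The resulting layout places one ncpbytes-sized output copy
+	 * per non-null set back to back in the first half of the
+	 * buffer, with the corresponding input copies in the second
+	 * half, so a single bzero of nbufbytes / 2 clears every
+	 * output set at once.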
+ */ + sbp = selbits; +#define getbits(name, x) \ + do { \ + if (uap->name == NULL) \ + ibits[x] = NULL; \ + else { \ + ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ + obits[x] = sbp; \ + sbp += ncpbytes / sizeof *sbp; \ + error = copyin(uap->name, ibits[x], ncpbytes); \ + if (error != 0) \ + goto done; \ + } \ + } while (0) + getbits(in, 0); + getbits(ou, 1); + getbits(ex, 2); +#undef getbits + if (nbufbytes != 0) + bzero(selbits, nbufbytes / 2); + + if (uap->tv) { + error = copyin((caddr_t)uap->tv, (caddr_t)&atv, + sizeof (atv)); + if (error) + goto done; + if (itimerfix(&atv)) { + error = EINVAL; + goto done; + } + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else + atv.tv_sec = 0; + timo = 0; +retry: + ncoll = nselcoll; + p->p_flag |= P_SELECT; + error = selscan(p, ibits, obits, uap->nd); + if (error || p->p_retval[0]) + goto done; + if (atv.tv_sec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + goto done; + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + s = splhigh(); + if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) { + splx(s); + goto retry; + } + p->p_flag &= ~P_SELECT; + error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "select", timo); + splx(s); + if (error == 0) + goto retry; +done: + p->p_flag &= ~P_SELECT; + /* select is not restarted after signals... */ + if (error == ERESTART) + error = EINTR; + if (error == EWOULDBLOCK) + error = 0; +#define putbits(name, x) \ + if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \ + error = error2; + if (error == 0) { + int error2; + + putbits(in, 0); + putbits(ou, 1); + putbits(ex, 2); +#undef putbits + } + if (selbits != &s_selbits[0]) + free(selbits, M_SELECT); + return (error); +} + +static int +selscan(p, ibits, obits, nfd) + struct proc *p; + fd_mask **ibits, **obits; + int nfd; +{ + register struct filedesc *fdp = p->p_fd; + register int msk, i, j, fd; + register fd_mask bits; + struct file *fp; + int n = 0; + /* Note: backend also returns POLLHUP/POLLERR if appropriate. */ + static int flag[3] = { POLLRDNORM, POLLWRNORM, POLLRDBAND }; + + for (msk = 0; msk < 3; msk++) { + if (ibits[msk] == NULL) + continue; + for (i = 0; i < nfd; i += NFDBITS) { + bits = ibits[msk][i/NFDBITS]; + while ((j = ffs(bits)) && (fd = i + --j) < nfd) { + bits &= ~(1 << j); + fp = fdp->fd_ofiles[fd]; + if (fp == NULL) + return (EBADF); + if ((*fp->f_ops->fo_poll)(fp, flag[msk], + fp->f_cred, p)) { + obits[msk][(fd)/NFDBITS] |= + (1 << ((fd) % NFDBITS)); + n++; + } + } + } + } + p->p_retval[0] = n; + return (0); +} + +/* + * Poll system call. 
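+ * Unlike select(), the caller hands in an array of pollfd
+ * structures, so there are no fd_set bitmaps to round up to
+ * NFDBITS or copy back selectively; the revents field of each
+ * entry is simply filled in by pollscan().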
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct poll_args { + struct pollfd *fds; + u_int nfds; + int timeout; +}; +#endif +int +poll(p, uap) + register struct proc *p; + register struct poll_args *uap; +{ + caddr_t bits; + char smallbits[32 * sizeof(struct pollfd)]; + struct timeval atv, rtv, ttv; + int s, ncoll, error = 0, timo; + size_t ni; + + if (SCARG(uap, nfds) > p->p_fd->fd_nfiles) { + /* forgiving; slightly wrong */ + SCARG(uap, nfds) = p->p_fd->fd_nfiles; + } + ni = SCARG(uap, nfds) * sizeof(struct pollfd); + if (ni > sizeof(smallbits)) + bits = malloc(ni, M_TEMP, M_WAITOK); + else + bits = smallbits; + error = copyin(SCARG(uap, fds), bits, ni); + if (error) + goto done; + if (SCARG(uap, timeout) != INFTIM) { + atv.tv_sec = SCARG(uap, timeout) / 1000; + atv.tv_usec = (SCARG(uap, timeout) % 1000) * 1000; + if (itimerfix(&atv)) { + error = EINVAL; + goto done; + } + getmicrouptime(&rtv); + timevaladd(&atv, &rtv); + } else + atv.tv_sec = 0; + timo = 0; +retry: + ncoll = nselcoll; + p->p_flag |= P_SELECT; + error = pollscan(p, (struct pollfd *)bits, SCARG(uap, nfds)); + if (error || p->p_retval[0]) + goto done; + if (atv.tv_sec) { + getmicrouptime(&rtv); + if (timevalcmp(&rtv, &atv, >=)) + goto done; + ttv = atv; + timevalsub(&ttv, &rtv); + timo = ttv.tv_sec > 24 * 60 * 60 ? + 24 * 60 * 60 * hz : tvtohz(&ttv); + } + s = splhigh(); + if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) { + splx(s); + goto retry; + } + p->p_flag &= ~P_SELECT; + error = tsleep((caddr_t)&selwait, PSOCK | PCATCH, "poll", timo); + splx(s); + if (error == 0) + goto retry; +done: + p->p_flag &= ~P_SELECT; + /* poll is not restarted after signals... */ + if (error == ERESTART) + error = EINTR; + if (error == EWOULDBLOCK) + error = 0; + if (error == 0) { + error = copyout(bits, SCARG(uap, fds), ni); + if (error) + goto out; + } +out: + if (ni > sizeof(smallbits)) + free(bits, M_TEMP); + return (error); +} + +static int +pollscan(p, fds, nfd) + struct proc *p; + struct pollfd *fds; + int nfd; +{ + register struct filedesc *fdp = p->p_fd; + int i; + struct file *fp; + int n = 0; + + for (i = 0; i < nfd; i++, fds++) { + if (fds->fd >= fdp->fd_nfiles) { + fds->revents = POLLNVAL; + n++; + } else if (fds->fd < 0) { + fds->revents = 0; + } else { + fp = fdp->fd_ofiles[fds->fd]; + if (fp == 0) { + fds->revents = POLLNVAL; + n++; + } else { + /* + * Note: backend also returns POLLHUP and + * POLLERR if appropriate. + */ + fds->revents = (*fp->f_ops->fo_poll)(fp, + fds->events, fp->f_cred, p); + if (fds->revents != 0) + n++; + } + } + } + p->p_retval[0] = n; + return (0); +} + +/* + * OpenBSD poll system call. + * XXX this isn't quite a true representation.. OpenBSD uses select ops. + */ +#ifndef _SYS_SYSPROTO_H_ +struct openbsd_poll_args { + struct pollfd *fds; + u_int nfds; + int timeout; +}; +#endif +int +openbsd_poll(p, uap) + register struct proc *p; + register struct openbsd_poll_args *uap; +{ + return (poll(p, (struct poll_args *)uap)); +} + +/*ARGSUSED*/ +int +seltrue(dev, events, p) + dev_t dev; + int events; + struct proc *p; +{ + + return (events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); +} + +/* + * Record a select request. + */ +void +selrecord(selector, sip) + struct proc *selector; + struct selinfo *sip; +{ + struct proc *p; + pid_t mypid; + + mypid = selector->p_pid; + if (sip->si_pid == mypid) + return; + if (sip->si_pid && (p = pfind(sip->si_pid)) && + p->p_wchan == (caddr_t)&selwait) + sip->si_flags |= SI_COLL; + else + sip->si_pid = mypid; +} + +/* + * Do a wakeup when a selectable event occurs. 
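+ * If several processes collided on the same selinfo, SI_COLL is
+ * set and everyone sleeping on the shared selwait channel is
+ * woken so that they all rescan; the single process recorded in
+ * si_pid is then made runnable (or unslept) directly.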
+ */ +void +selwakeup(sip) + register struct selinfo *sip; +{ + register struct proc *p; + int s; + + if (sip->si_pid == 0) + return; + if (sip->si_flags & SI_COLL) { + nselcoll++; + sip->si_flags &= ~SI_COLL; + wakeup((caddr_t)&selwait); + } + p = pfind(sip->si_pid); + sip->si_pid = 0; + if (p != NULL) { + s = splhigh(); + if (p->p_wchan == (caddr_t)&selwait) { + if (p->p_stat == SSLEEP) + setrunnable(p); + else + unsleep(p); + } else if (p->p_flag & P_SELECT) + p->p_flag &= ~P_SELECT; + splx(s); + } +} diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c new file mode 100644 index 0000000..29e1e97 --- /dev/null +++ b/sys/kern/sys_pipe.c @@ -0,0 +1,1102 @@ +/* + * Copyright (c) 1996 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. + * + * $Id: sys_pipe.c,v 1.45 1998/11/11 10:03:55 truckman Exp $ + */ + +/* + * This file contains a high-performance replacement for the socket-based + * pipes scheme originally used in FreeBSD/4.4Lite. It does not support + * all features of sockets, but does do everything that pipes normally + * do. + */ + +/* + * This code has two modes of operation, a small write mode and a large + * write mode. The small write mode acts like conventional pipes with + * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the + * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT + * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and + * the receiving process can copy it directly from the pages in the sending + * process. + * + * If the sending process receives a signal, it is possible that it will + * go away, and certainly its address space can change, because control + * is returned back to the user-mode side. In that case, the pipe code + * arranges to copy the buffer supplied by the user process, to a pageable + * kernel buffer, and the receiving process will grab the data from the + * pageable kernel buffer. Since signals don't happen all that often, + * the copy operation is normally eliminated. + * + * The constant PIPE_MINDIRECT is chosen to make sure that buffering will + * happen for small transfers so that the system will not spend all of + * its time context switching. PIPE_SIZE is constrained by the + * amount of kernel virtual memory. 
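+ *
+ * As a concrete illustration (assuming, say, an 8K PIPE_MINDIRECT;
+ * the actual value is set in the pipe header): a 100-byte write is
+ * staged through the kernel buffer like a traditional pipe, whereas
+ * a 12K write has its source pages wired and mapped so the reader
+ * copies them directly.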
+ */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/stat.h> +#include <sys/poll.h> +#include <sys/signalvar.h> +#include <sys/sysproto.h> +#include <sys/pipe.h> +#include <sys/uio.h> + +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_zone.h> + +/* + * Use this define if you want to disable *fancy* VM things. Expect an + * approx 30% decrease in transfer rate. This could be useful for + * NetBSD or OpenBSD. + */ +/* #define PIPE_NODIRECT */ + +/* + * interfaces to the outside world + */ +static int pipe_read __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int pipe_write __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int pipe_close __P((struct file *fp, struct proc *p)); +static int pipe_poll __P((struct file *fp, int events, struct ucred *cred, + struct proc *p)); +static int pipe_ioctl __P((struct file *fp, u_long cmd, caddr_t data, struct proc *p)); + +static struct fileops pipeops = + { pipe_read, pipe_write, pipe_ioctl, pipe_poll, pipe_close }; + +/* + * Default pipe buffer size(s), this can be kind-of large now because pipe + * space is pageable. The pipe code will try to maintain locality of + * reference for performance reasons, so small amounts of outstanding I/O + * will not wipe the cache. + */ +#define MINPIPESIZE (PIPE_SIZE/3) +#define MAXPIPESIZE (2*PIPE_SIZE/3) + +/* + * Maximum amount of kva for pipes -- this is kind-of a soft limit, but + * is there so that on large systems, we don't exhaust it. + */ +#define MAXPIPEKVA (8*1024*1024) + +/* + * Limit for direct transfers, we cannot, of course limit + * the amount of kva for pipes in general though. 
+ */ +#define LIMITPIPEKVA (16*1024*1024) + +/* + * Limit the number of "big" pipes + */ +#define LIMITBIGPIPES 32 +static int nbigpipe; + +static int amountpipekva; + +static void pipeclose __P((struct pipe *cpipe)); +static void pipeinit __P((struct pipe *cpipe)); +static __inline int pipelock __P((struct pipe *cpipe, int catch)); +static __inline void pipeunlock __P((struct pipe *cpipe)); +static __inline void pipeselwakeup __P((struct pipe *cpipe)); +#ifndef PIPE_NODIRECT +static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio)); +static void pipe_destroy_write_buffer __P((struct pipe *wpipe)); +static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio)); +static void pipe_clone_write_buffer __P((struct pipe *wpipe)); +#endif +static void pipespace __P((struct pipe *cpipe)); + +static vm_zone_t pipe_zone; + +/* + * The pipe system call for the DTYPE_PIPE type of pipes + */ + +/* ARGSUSED */ +int +pipe(p, uap) + struct proc *p; + struct pipe_args /* { + int dummy; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + struct file *rf, *wf; + struct pipe *rpipe, *wpipe; + int fd, error; + + if (pipe_zone == NULL) + pipe_zone = zinit("PIPE", sizeof (struct pipe), 0, 0, 4); + + rpipe = zalloc( pipe_zone); + pipeinit(rpipe); + rpipe->pipe_state |= PIPE_DIRECTOK; + wpipe = zalloc( pipe_zone); + pipeinit(wpipe); + wpipe->pipe_state |= PIPE_DIRECTOK; + + error = falloc(p, &rf, &fd); + if (error) + goto free2; + p->p_retval[0] = fd; + rf->f_flag = FREAD | FWRITE; + rf->f_type = DTYPE_PIPE; + rf->f_ops = &pipeops; + rf->f_data = (caddr_t)rpipe; + error = falloc(p, &wf, &fd); + if (error) + goto free3; + wf->f_flag = FREAD | FWRITE; + wf->f_type = DTYPE_PIPE; + wf->f_ops = &pipeops; + wf->f_data = (caddr_t)wpipe; + p->p_retval[1] = fd; + + rpipe->pipe_peer = wpipe; + wpipe->pipe_peer = rpipe; + + return (0); +free3: + ffree(rf); + fdp->fd_ofiles[p->p_retval[0]] = 0; +free2: + (void)pipeclose(wpipe); + (void)pipeclose(rpipe); + return (error); +} + +/* + * Allocate kva for pipe circular buffer, the space is pageable + */ +static void +pipespace(cpipe) + struct pipe *cpipe; +{ + int npages, error; + + npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE; + /* + * Create an object, I don't like the idea of paging to/from + * kernel_object. + * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. + */ + cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages); + cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map); + + /* + * Insert the object into the kernel map, and allocate kva for it. + * The map entry is, by default, pageable. + * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. 
+ */ + error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0, + (vm_offset_t *) &cpipe->pipe_buffer.buffer, + cpipe->pipe_buffer.size, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + + if (error != KERN_SUCCESS) + panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error); + amountpipekva += cpipe->pipe_buffer.size; +} + +/* + * initialize and allocate VM and memory for pipe + */ +static void +pipeinit(cpipe) + struct pipe *cpipe; +{ + + cpipe->pipe_buffer.in = 0; + cpipe->pipe_buffer.out = 0; + cpipe->pipe_buffer.cnt = 0; + cpipe->pipe_buffer.size = PIPE_SIZE; + + /* Buffer kva gets dynamically allocated */ + cpipe->pipe_buffer.buffer = NULL; + /* cpipe->pipe_buffer.object = invalid */ + + cpipe->pipe_state = 0; + cpipe->pipe_peer = NULL; + cpipe->pipe_busy = 0; + getnanotime(&cpipe->pipe_ctime); + cpipe->pipe_atime = cpipe->pipe_ctime; + cpipe->pipe_mtime = cpipe->pipe_ctime; + bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel); + +#ifndef PIPE_NODIRECT + /* + * pipe data structure initializations to support direct pipe I/O + */ + cpipe->pipe_map.cnt = 0; + cpipe->pipe_map.kva = 0; + cpipe->pipe_map.pos = 0; + cpipe->pipe_map.npages = 0; + /* cpipe->pipe_map.ms[] = invalid */ +#endif +} + + +/* + * lock a pipe for I/O, blocking other access + */ +static __inline int +pipelock(cpipe, catch) + struct pipe *cpipe; + int catch; +{ + int error; + while (cpipe->pipe_state & PIPE_LOCK) { + cpipe->pipe_state |= PIPE_LWANT; + if (error = tsleep( cpipe, + catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) { + return error; + } + } + cpipe->pipe_state |= PIPE_LOCK; + return 0; +} + +/* + * unlock a pipe I/O lock + */ +static __inline void +pipeunlock(cpipe) + struct pipe *cpipe; +{ + cpipe->pipe_state &= ~PIPE_LOCK; + if (cpipe->pipe_state & PIPE_LWANT) { + cpipe->pipe_state &= ~PIPE_LWANT; + wakeup(cpipe); + } +} + +static __inline void +pipeselwakeup(cpipe) + struct pipe *cpipe; +{ + if (cpipe->pipe_state & PIPE_SEL) { + cpipe->pipe_state &= ~PIPE_SEL; + selwakeup(&cpipe->pipe_sel); + } + if ((cpipe->pipe_state & PIPE_ASYNC) && cpipe->pipe_sigio) + pgsigio(cpipe->pipe_sigio, SIGIO, 0); +} + +/* ARGSUSED */ +static int +pipe_read(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + + struct pipe *rpipe = (struct pipe *) fp->f_data; + int error = 0; + int nread = 0; + u_int size; + + ++rpipe->pipe_busy; + while (uio->uio_resid) { + /* + * normal pipe buffer receive + */ + if (rpipe->pipe_buffer.cnt > 0) { + size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; + if (size > rpipe->pipe_buffer.cnt) + size = rpipe->pipe_buffer.cnt; + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + if ((error = pipelock(rpipe,1)) == 0) { + error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], + size, uio); + pipeunlock(rpipe); + } + if (error) { + break; + } + rpipe->pipe_buffer.out += size; + if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) + rpipe->pipe_buffer.out = 0; + + rpipe->pipe_buffer.cnt -= size; + nread += size; +#ifndef PIPE_NODIRECT + /* + * Direct copy, bypassing a kernel buffer. 
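+		 * The reader copies straight out of the writer's wired
+		 * pages through the pipe_map kva window that the write
+		 * side set up, and wakes the writer once pipe_map.cnt
+		 * drains to zero.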
+ */ + } else if ((size = rpipe->pipe_map.cnt) && + (rpipe->pipe_state & PIPE_DIRECTW)) { + caddr_t va; + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + if ((error = pipelock(rpipe,1)) == 0) { + va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos; + error = uiomove(va, size, uio); + pipeunlock(rpipe); + } + if (error) + break; + nread += size; + rpipe->pipe_map.pos += size; + rpipe->pipe_map.cnt -= size; + if (rpipe->pipe_map.cnt == 0) { + rpipe->pipe_state &= ~PIPE_DIRECTW; + wakeup(rpipe); + } +#endif + } else { + /* + * detect EOF condition + */ + if (rpipe->pipe_state & PIPE_EOF) { + /* XXX error = ? */ + break; + } + /* + * If the "write-side" has been blocked, wake it up now. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + if (nread > 0) + break; + + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + break; + } + + /* + * If there is no more to read in the pipe, reset + * its pointers to the beginning. This improves + * cache hit stats. + */ + + if ((error = pipelock(rpipe,1)) == 0) { + if (rpipe->pipe_buffer.cnt == 0) { + rpipe->pipe_buffer.in = 0; + rpipe->pipe_buffer.out = 0; + } + pipeunlock(rpipe); + } else { + break; + } + + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + + rpipe->pipe_state |= PIPE_WANTR; + if (error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) { + break; + } + } + } + + if (error == 0) + getnanotime(&rpipe->pipe_atime); + + --rpipe->pipe_busy; + if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { + rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); + wakeup(rpipe); + } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { + /* + * If there is no more to read in the pipe, reset + * its pointers to the beginning. This improves + * cache hit stats. + */ + if (rpipe->pipe_buffer.cnt == 0) { + if ((error == 0) && (error = pipelock(rpipe,1)) == 0) { + rpipe->pipe_buffer.in = 0; + rpipe->pipe_buffer.out = 0; + pipeunlock(rpipe); + } + } + + /* + * If the "write-side" has been blocked, wake it up now. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + } + + if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) + pipeselwakeup(rpipe); + + return error; +} + +#ifndef PIPE_NODIRECT +/* + * Map the sending processes' buffer into kernel space and wire it. + * This is similar to a physical write operation. 
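One detail worth spelling out for the routine that follows: a user buffer is rarely page aligned, so wiring and mapping a transfer of len bytes can touch one page more than len / PAGE_SIZE, which is why the direct-write KVA window is later sized at pipe_buffer.size + PAGE_SIZE. A small arithmetic sketch (hypothetical macros, page size assumed 4 KB for illustration):

#include <stddef.h>
#include <stdint.h>

#define	PG_SIZE		4096UL				/* assumed page size */
#define	trunc_pg(x)	((x) & ~(PG_SIZE - 1))
#define	round_pg(x)	(((x) + PG_SIZE - 1) & ~(PG_SIZE - 1))

/* Number of pages actually spanned by the buffer [base, base + len). */
static size_t
pages_spanned(uintptr_t base, size_t len)
{
	return ((round_pg(base + len) - trunc_pg(base)) / PG_SIZE);
}

/* e.g. a 4096-byte buffer that starts 16 bytes into a page spans 2 pages. */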
+ */ +static int +pipe_build_write_buffer(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + u_int size; + int i; + vm_offset_t addr, endaddr, paddr; + + size = (u_int) uio->uio_iov->iov_len; + if (size > wpipe->pipe_buffer.size) + size = wpipe->pipe_buffer.size; + + endaddr = round_page((vm_offset_t)uio->uio_iov->iov_base + size); + for(i = 0, addr = trunc_page((vm_offset_t)uio->uio_iov->iov_base); + addr < endaddr; + addr += PAGE_SIZE, i+=1) { + + vm_page_t m; + + vm_fault_quick( (caddr_t) addr, VM_PROT_READ); + paddr = pmap_kextract(addr); + if (!paddr) { + int j; + for(j=0;j<i;j++) + vm_page_unwire(wpipe->pipe_map.ms[j], 1); + return EFAULT; + } + + m = PHYS_TO_VM_PAGE(paddr); + vm_page_wire(m); + wpipe->pipe_map.ms[i] = m; + } + +/* + * set up the control block + */ + wpipe->pipe_map.npages = i; + wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; + wpipe->pipe_map.cnt = size; + +/* + * and map the buffer + */ + if (wpipe->pipe_map.kva == 0) { + /* + * We need to allocate space for an extra page because the + * address range might (will) span pages at times. + */ + wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map, + wpipe->pipe_buffer.size + PAGE_SIZE); + amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE; + } + pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, + wpipe->pipe_map.npages); + +/* + * and update the uio data + */ + + uio->uio_iov->iov_len -= size; + uio->uio_iov->iov_base += size; + if (uio->uio_iov->iov_len == 0) + uio->uio_iov++; + uio->uio_resid -= size; + uio->uio_offset += size; + return 0; +} + +/* + * unmap and unwire the process buffer + */ +static void +pipe_destroy_write_buffer(wpipe) +struct pipe *wpipe; +{ + int i; + if (wpipe->pipe_map.kva) { + pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); + + if (amountpipekva > MAXPIPEKVA) { + vm_offset_t kva = wpipe->pipe_map.kva; + wpipe->pipe_map.kva = 0; + kmem_free(kernel_map, kva, + wpipe->pipe_buffer.size + PAGE_SIZE); + amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; + } + } + for (i=0;i<wpipe->pipe_map.npages;i++) + vm_page_unwire(wpipe->pipe_map.ms[i], 1); +} + +/* + * In the case of a signal, the writing process might go away. This + * code copies the data into the circular buffer so that the source + * pages can be freed without loss of data. + */ +static void +pipe_clone_write_buffer(wpipe) +struct pipe *wpipe; +{ + int size; + int pos; + + size = wpipe->pipe_map.cnt; + pos = wpipe->pipe_map.pos; + bcopy((caddr_t) wpipe->pipe_map.kva+pos, + (caddr_t) wpipe->pipe_buffer.buffer, + size); + + wpipe->pipe_buffer.in = size; + wpipe->pipe_buffer.out = 0; + wpipe->pipe_buffer.cnt = size; + wpipe->pipe_state &= ~PIPE_DIRECTW; + + pipe_destroy_write_buffer(wpipe); +} + +/* + * This implements the pipe buffer write mechanism. Note that only + * a direct write OR a normal pipe write can be pending at any given time. + * If there are any characters in the pipe buffer, the direct write will + * be deferred until the receiving process grabs all of the bytes from + * the pipe buffer. Then the direct mapping write is set-up. 
+ */ +static int +pipe_direct_write(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + int error; +retry: + while (wpipe->pipe_state & PIPE_DIRECTW) { + if ( wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + wpipe->pipe_state |= PIPE_WANTW; + error = tsleep(wpipe, + PRIBIO|PCATCH, "pipdww", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + } + wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ + if (wpipe->pipe_buffer.cnt > 0) { + if ( wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + wpipe->pipe_state |= PIPE_WANTW; + error = tsleep(wpipe, + PRIBIO|PCATCH, "pipdwc", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + goto retry; + } + + wpipe->pipe_state |= PIPE_DIRECTW; + + error = pipe_build_write_buffer(wpipe, uio); + if (error) { + wpipe->pipe_state &= ~PIPE_DIRECTW; + goto error1; + } + + error = 0; + while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { + if (wpipe->pipe_state & PIPE_EOF) { + pipelock(wpipe, 0); + pipe_destroy_write_buffer(wpipe); + pipeunlock(wpipe); + pipeselwakeup(wpipe); + error = EPIPE; + goto error1; + } + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + pipeselwakeup(wpipe); + error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0); + } + + pipelock(wpipe,0); + if (wpipe->pipe_state & PIPE_DIRECTW) { + /* + * this bit of trickery substitutes a kernel buffer for + * the process that might be going away. + */ + pipe_clone_write_buffer(wpipe); + } else { + pipe_destroy_write_buffer(wpipe); + } + pipeunlock(wpipe); + return error; + +error1: + wakeup(wpipe); + return error; +} +#endif + +static int +pipe_write(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + int error = 0; + int orig_resid; + + struct pipe *wpipe, *rpipe; + + rpipe = (struct pipe *) fp->f_data; + wpipe = rpipe->pipe_peer; + + /* + * detect loss of pipe read side, issue SIGPIPE if lost. + */ + if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + return EPIPE; + } + + /* + * If it is advantageous to resize the pipe buffer, do + * so. 
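The buffer-resize test at the top of pipe_write() (the block just below) reads as a single predicate: promote to a "big" pipe only when the pending request will not fit the default buffer, the global cap on big pipes has not been reached, no direct write is in flight, and the current default-sized buffer is empty so it can be discarded. Restated as a standalone sketch (hypothetical helper; PIPE_SIZE, PIPE_DIRECTW and LIMITBIGPIPES are the constants already used in this file):

/* Illustrative restatement of the resize condition in pipe_write(). */
static int
should_grow_to_big_pipe(size_t resid, int nbigpipes, int state,
    size_t bufsize, size_t bufcnt)
{
	return (resid > PIPE_SIZE &&		/* request overflows default buffer */
	    nbigpipes < LIMITBIGPIPES &&	/* system-wide cap not yet reached */
	    (state & PIPE_DIRECTW) == 0 &&	/* no direct write in progress */
	    bufsize <= PIPE_SIZE &&		/* not already a big pipe */
	    bufcnt == 0);			/* nothing buffered to preserve */
}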
+ */ + if ((uio->uio_resid > PIPE_SIZE) && + (nbigpipe < LIMITBIGPIPES) && + (wpipe->pipe_state & PIPE_DIRECTW) == 0 && + (wpipe->pipe_buffer.size <= PIPE_SIZE) && + (wpipe->pipe_buffer.cnt == 0)) { + + if (wpipe->pipe_buffer.buffer) { + amountpipekva -= wpipe->pipe_buffer.size; + kmem_free(kernel_map, + (vm_offset_t)wpipe->pipe_buffer.buffer, + wpipe->pipe_buffer.size); + } + +#ifndef PIPE_NODIRECT + if (wpipe->pipe_map.kva) { + amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; + kmem_free(kernel_map, + wpipe->pipe_map.kva, + wpipe->pipe_buffer.size + PAGE_SIZE); + } +#endif + + wpipe->pipe_buffer.in = 0; + wpipe->pipe_buffer.out = 0; + wpipe->pipe_buffer.cnt = 0; + wpipe->pipe_buffer.size = BIG_PIPE_SIZE; + wpipe->pipe_buffer.buffer = NULL; + ++nbigpipe; + +#ifndef PIPE_NODIRECT + wpipe->pipe_map.cnt = 0; + wpipe->pipe_map.kva = 0; + wpipe->pipe_map.pos = 0; + wpipe->pipe_map.npages = 0; +#endif + + } + + + if( wpipe->pipe_buffer.buffer == NULL) { + if ((error = pipelock(wpipe,1)) == 0) { + pipespace(wpipe); + pipeunlock(wpipe); + } else { + return error; + } + } + + ++wpipe->pipe_busy; + orig_resid = uio->uio_resid; + while (uio->uio_resid) { + int space; +#ifndef PIPE_NODIRECT + /* + * If the transfer is large, we can gain performance if + * we do process-to-process copies directly. + * If the write is non-blocking, we don't use the + * direct write mechanism. + */ + if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && + (fp->f_flag & FNONBLOCK) == 0 && + (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) && + (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) { + error = pipe_direct_write( wpipe, uio); + if (error) { + break; + } + continue; + } +#endif + + /* + * Pipe buffered writes cannot be coincidental with + * direct writes. We wait until the currently executing + * direct write is completed before we start filling the + * pipe buffer. + */ + retrywrite: + while (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + error = tsleep(wpipe, + PRIBIO|PCATCH, "pipbww", 0); + if (error) + break; + } + + space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + + /* Writes of size <= PIPE_BUF must be atomic. */ + /* XXX perhaps they need to be contiguous to be atomic? */ + if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) + space = 0; + + if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) { + /* + * This set the maximum transfer as a segment of + * the buffer. + */ + int size = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; + /* + * space is the size left in the buffer + */ + if (size > space) + size = space; + /* + * now limit it to the size of the uio transfer + */ + if (size > uio->uio_resid) + size = uio->uio_resid; + if ((error = pipelock(wpipe,1)) == 0) { + /* + * It is possible for a direct write to + * slip in on us... handle it here... + */ + if (wpipe->pipe_state & PIPE_DIRECTW) { + pipeunlock(wpipe); + goto retrywrite; + } + error = uiomove( &wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], + size, uio); + pipeunlock(wpipe); + } + if (error) + break; + + wpipe->pipe_buffer.in += size; + if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) + wpipe->pipe_buffer.in = 0; + + wpipe->pipe_buffer.cnt += size; + } else { + /* + * If the "read-side" has been blocked, wake it up now. 
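The `space = 0` clamp above is what makes small writes atomic: a request of at most PIPE_BUF bytes either finds room for all of its bytes or sleeps until it does, so records written by concurrent writers never interleave. A userland sketch of code that relies on this guarantee:

#include <limits.h>
#include <unistd.h>

/*
 * One fixed-size record per write(); because sizeof(struct rec) is well
 * under PIPE_BUF, the kernel code above ensures each record is written
 * whole, never interleaved with records from other writers on the pipe.
 */
struct rec {
	int	id;
	char	payload[64];
};

static ssize_t
put_record(int wfd, const struct rec *r)
{
	return (write(wfd, r, sizeof(*r)));
}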
+ */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + /* + * don't block on non-blocking I/O + */ + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + break; + } + + /* + * We have no more space and have something to offer, + * wake up select/poll. + */ + pipeselwakeup(wpipe); + + wpipe->pipe_state |= PIPE_WANTW; + if (error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) { + break; + } + /* + * If read side wants to go away, we just issue a signal + * to ourselves. + */ + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + break; + } + } + } + + --wpipe->pipe_busy; + if ((wpipe->pipe_busy == 0) && + (wpipe->pipe_state & PIPE_WANT)) { + wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR); + wakeup(wpipe); + } else if (wpipe->pipe_buffer.cnt > 0) { + /* + * If we have put any characters in the buffer, we wake up + * the reader. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + } + + /* + * Don't return EPIPE if I/O was successful + */ + if ((wpipe->pipe_buffer.cnt == 0) && + (uio->uio_resid == 0) && + (error == EPIPE)) + error = 0; + + if (error == 0) + getnanotime(&wpipe->pipe_mtime); + + /* + * We have something to offer, + * wake up select/poll. + */ + if (wpipe->pipe_buffer.cnt) + pipeselwakeup(wpipe); + + return error; +} + +/* + * we implement a very minimal set of ioctls for compatibility with sockets. + */ +int +pipe_ioctl(fp, cmd, data, p) + struct file *fp; + u_long cmd; + register caddr_t data; + struct proc *p; +{ + register struct pipe *mpipe = (struct pipe *)fp->f_data; + + switch (cmd) { + + case FIONBIO: + return (0); + + case FIOASYNC: + if (*(int *)data) { + mpipe->pipe_state |= PIPE_ASYNC; + } else { + mpipe->pipe_state &= ~PIPE_ASYNC; + } + return (0); + + case FIONREAD: + if (mpipe->pipe_state & PIPE_DIRECTW) + *(int *)data = mpipe->pipe_map.cnt; + else + *(int *)data = mpipe->pipe_buffer.cnt; + return (0); + + case FIOSETOWN: + return (fsetown(*(int *)data, &mpipe->pipe_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(mpipe->pipe_sigio); + return (0); + + /* This is deprecated, FIOSETOWN should be used instead. */ + case TIOCSPGRP: + return (fsetown(-(*(int *)data), &mpipe->pipe_sigio)); + + /* This is deprecated, FIOGETOWN should be used instead. 
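The ioctl set above deliberately mirrors the socket ioctls, so the usual userland idioms carry over to pipes unchanged; FIONREAD, for instance, reports how many bytes can currently be read (or, during a direct write, how many bytes remain in the mapped source buffer). A minimal userland sketch:

#include <sys/types.h>
#include <sys/ioctl.h>
#include <stdio.h>

/* Report how many bytes are waiting to be read on a pipe descriptor. */
static int
pipe_pending(int pfd)
{
	int nbytes;

	if (ioctl(pfd, FIONREAD, &nbytes) == -1)
		return (-1);
	printf("%d bytes buffered\n", nbytes);
	return (nbytes);
}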
*/ + case TIOCGPGRP: + *(int *)data = -fgetown(mpipe->pipe_sigio); + return (0); + + } + return (ENOTTY); +} + +int +pipe_poll(fp, events, cred, p) + struct file *fp; + int events; + struct ucred *cred; + struct proc *p; +{ + register struct pipe *rpipe = (struct pipe *)fp->f_data; + struct pipe *wpipe; + int revents = 0; + + wpipe = rpipe->pipe_peer; + if (events & (POLLIN | POLLRDNORM)) + if ((rpipe->pipe_state & PIPE_DIRECTW) || + (rpipe->pipe_buffer.cnt > 0) || + (rpipe->pipe_state & PIPE_EOF)) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (wpipe == NULL || (wpipe->pipe_state & PIPE_EOF) || + ((wpipe->pipe_state & PIPE_DIRECTW) == 0) && + (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF) + revents |= events & (POLLOUT | POLLWRNORM); + + if ((rpipe->pipe_state & PIPE_EOF) || + (wpipe == NULL) || + (wpipe->pipe_state & PIPE_EOF)) + revents |= POLLHUP; + + if (revents == 0) { + if (events & (POLLIN | POLLRDNORM)) { + selrecord(p, &rpipe->pipe_sel); + rpipe->pipe_state |= PIPE_SEL; + } + + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(p, &wpipe->pipe_sel); + wpipe->pipe_state |= PIPE_SEL; + } + } + + return (revents); +} + +int +pipe_stat(pipe, ub) + register struct pipe *pipe; + register struct stat *ub; +{ + bzero((caddr_t)ub, sizeof (*ub)); + ub->st_mode = S_IFIFO; + ub->st_blksize = pipe->pipe_buffer.size; + ub->st_size = pipe->pipe_buffer.cnt; + ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; + ub->st_atimespec = pipe->pipe_atime; + ub->st_mtimespec = pipe->pipe_mtime; + ub->st_ctimespec = pipe->pipe_ctime; + /* + * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev, + * st_flags, st_gen. + * XXX (st_dev, st_ino) should be unique. + */ + return 0; +} + +/* ARGSUSED */ +static int +pipe_close(fp, p) + struct file *fp; + struct proc *p; +{ + struct pipe *cpipe = (struct pipe *)fp->f_data; + + funsetown(cpipe->pipe_sigio); + pipeclose(cpipe); + fp->f_data = NULL; + return 0; +} + +/* + * shutdown the pipe + */ +static void +pipeclose(cpipe) + struct pipe *cpipe; +{ + struct pipe *ppipe; + if (cpipe) { + + pipeselwakeup(cpipe); + + /* + * If the other side is blocked, wake it up saying that + * we want to close it down. + */ + while (cpipe->pipe_busy) { + wakeup(cpipe); + cpipe->pipe_state |= PIPE_WANT|PIPE_EOF; + tsleep(cpipe, PRIBIO, "pipecl", 0); + } + + /* + * Disconnect from peer + */ + if (ppipe = cpipe->pipe_peer) { + pipeselwakeup(ppipe); + + ppipe->pipe_state |= PIPE_EOF; + wakeup(ppipe); + ppipe->pipe_peer = NULL; + } + + /* + * free resources + */ + if (cpipe->pipe_buffer.buffer) { + if (cpipe->pipe_buffer.size > PIPE_SIZE) + --nbigpipe; + amountpipekva -= cpipe->pipe_buffer.size; + kmem_free(kernel_map, + (vm_offset_t)cpipe->pipe_buffer.buffer, + cpipe->pipe_buffer.size); + } +#ifndef PIPE_NODIRECT + if (cpipe->pipe_map.kva) { + amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE; + kmem_free(kernel_map, + cpipe->pipe_map.kva, + cpipe->pipe_buffer.size + PAGE_SIZE); + } +#endif + zfree(pipe_zone, cpipe); + } +} diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c new file mode 100644 index 0000000..4756127 --- /dev/null +++ b/sys/kern/sys_process.c @@ -0,0 +1,534 @@ +/* + * Copyright (c) 1994, Sean Eric Fagan + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Sean Eric Fagan. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: sys_process.c,v 1.40 1998/07/29 18:41:30 dfr Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/ptrace.h> + +#include <machine/reg.h> +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_extern.h> + +#include <sys/user.h> +#include <miscfs/procfs/procfs.h> + +/* use the equivalent procfs code */ +#if 0 +static int +pread (struct proc *procp, unsigned int addr, unsigned int *retval) { + int rv; + vm_map_t map, tmap; + vm_object_t object; + vm_offset_t kva = 0; + int page_offset; /* offset into page */ + vm_offset_t pageno; /* page number */ + vm_map_entry_t out_entry; + vm_prot_t out_prot; + boolean_t wired; + vm_pindex_t pindex; + + /* Map page into kernel space */ + + map = &procp->p_vmspace->vm_map; + + page_offset = addr - trunc_page(addr); + pageno = trunc_page(addr); + + tmap = map; + rv = vm_map_lookup (&tmap, pageno, VM_PROT_READ, &out_entry, + &object, &pindex, &out_prot, &wired); + + if (rv != KERN_SUCCESS) + return EINVAL; + + vm_map_lookup_done (tmap, out_entry); + + /* Find space in kernel_map for the page we're interested in */ + rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), + &kva, PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0); + + if (!rv) { + vm_object_reference (object); + + rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0); + if (!rv) { + *retval = 0; + bcopy ((caddr_t)kva + page_offset, + retval, sizeof *retval); + } + vm_map_remove (kernel_map, kva, kva + PAGE_SIZE); + } + + return rv; +} + +static int +pwrite (struct proc *procp, unsigned int addr, unsigned int datum) { + int rv; + vm_map_t map, tmap; + vm_object_t object; + vm_offset_t kva = 0; + int page_offset; /* offset into page */ + vm_offset_t pageno; /* page number */ + vm_map_entry_t out_entry; + vm_prot_t out_prot; + boolean_t wired; + vm_pindex_t pindex; + boolean_t fix_prot = 0; + + /* Map page into kernel space */ + + map = 
&procp->p_vmspace->vm_map; + + page_offset = addr - trunc_page(addr); + pageno = trunc_page(addr); + + /* + * Check the permissions for the area we're interested in. + */ + + if (vm_map_check_protection (map, pageno, pageno + PAGE_SIZE, + VM_PROT_WRITE) == FALSE) { + /* + * If the page was not writable, we make it so. + * XXX It is possible a page may *not* be read/executable, + * if a process changes that! + */ + fix_prot = 1; + /* The page isn't writable, so let's try making it so... */ + if ((rv = vm_map_protect (map, pageno, pageno + PAGE_SIZE, + VM_PROT_ALL, 0)) != KERN_SUCCESS) + return EFAULT; /* I guess... */ + } + + /* + * Now we need to get the page. out_entry, out_prot, wired, and + * single_use aren't used. One would think the vm code would be + * a *bit* nicer... We use tmap because vm_map_lookup() can + * change the map argument. + */ + + tmap = map; + rv = vm_map_lookup (&tmap, pageno, VM_PROT_WRITE, &out_entry, + &object, &pindex, &out_prot, &wired); + if (rv != KERN_SUCCESS) { + return EINVAL; + } + + /* + * Okay, we've got the page. Let's release tmap. + */ + + vm_map_lookup_done (tmap, out_entry); + + /* + * Fault the page in... + */ + + rv = vm_fault(map, pageno, VM_PROT_WRITE|VM_PROT_READ, FALSE); + if (rv != KERN_SUCCESS) + return EFAULT; + + /* Find space in kernel_map for the page we're interested in */ + rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), + &kva, PAGE_SIZE, 0, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (!rv) { + vm_object_reference (object); + + rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0); + if (!rv) { + bcopy (&datum, (caddr_t)kva + page_offset, sizeof datum); + } + vm_map_remove (kernel_map, kva, kva + PAGE_SIZE); + } + + if (fix_prot) + vm_map_protect (map, pageno, pageno + PAGE_SIZE, + VM_PROT_READ|VM_PROT_EXECUTE, 0); + return rv; +} +#endif + +/* + * Process debugging system call. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ptrace_args { + int req; + pid_t pid; + caddr_t addr; + int data; +}; +#endif + +int +ptrace(curp, uap) + struct proc *curp; + struct ptrace_args *uap; +{ + struct proc *p; + struct iovec iov; + struct uio uio; + int error = 0; + int write; + int s; + + if (uap->req == PT_TRACE_ME) + p = curp; + else { + if ((p = pfind(uap->pid)) == NULL) + return ESRCH; + } + + /* + * Permissions check + */ + switch (uap->req) { + case PT_TRACE_ME: + /* Always legal. */ + break; + + case PT_ATTACH: + /* Self */ + if (p->p_pid == curp->p_pid) + return EINVAL; + + /* Already traced */ + if (p->p_flag & P_TRACED) + return EBUSY; + + /* not owned by you, has done setuid (unless you're root) */ + if ((p->p_cred->p_ruid != curp->p_cred->p_ruid) || + (p->p_flag & P_SUGID)) { + if (error = suser(curp->p_ucred, &curp->p_acflag)) + return error; + } + + /* can't trace init when securelevel > 0 */ + if (securelevel > 0 && p->p_pid == 1) + return EPERM; + + /* OK */ + break; + + case PT_READ_I: + case PT_READ_D: + case PT_READ_U: + case PT_WRITE_I: + case PT_WRITE_D: + case PT_WRITE_U: + case PT_CONTINUE: + case PT_KILL: + case PT_STEP: + case PT_DETACH: +#ifdef PT_GETREGS + case PT_GETREGS: +#endif +#ifdef PT_SETREGS + case PT_SETREGS: +#endif +#ifdef PT_GETFPREGS + case PT_GETFPREGS: +#endif +#ifdef PT_SETFPREGS + case PT_SETFPREGS: +#endif + /* not being traced... 
*/ + if ((p->p_flag & P_TRACED) == 0) + return EPERM; + + /* not being traced by YOU */ + if (p->p_pptr != curp) + return EBUSY; + + /* not currently stopped */ + if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0) + return EBUSY; + + /* OK */ + break; + + default: + return EINVAL; + } + +#ifdef FIX_SSTEP + /* + * Single step fixup ala procfs + */ + FIX_SSTEP(p); +#endif + + /* + * Actually do the requests + */ + + write = 0; + curp->p_retval[0] = 0; + + switch (uap->req) { + case PT_TRACE_ME: + /* set my trace flag and "owner" so it can read/write me */ + p->p_flag |= P_TRACED; + p->p_oppid = p->p_pptr->p_pid; + return 0; + + case PT_ATTACH: + /* security check done above */ + p->p_flag |= P_TRACED; + p->p_oppid = p->p_pptr->p_pid; + if (p->p_pptr != curp) + proc_reparent(p, curp); + uap->data = SIGSTOP; + goto sendsig; /* in PT_CONTINUE below */ + + case PT_STEP: + case PT_CONTINUE: + case PT_DETACH: + if ((unsigned)uap->data >= NSIG) + return EINVAL; + + PHOLD(p); + + if (uap->req == PT_STEP) { + if ((error = ptrace_single_step (p))) { + PRELE(p); + return error; + } + } + + if (uap->addr != (caddr_t)1) { + fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); + if ((error = ptrace_set_pc (p, + (u_long)(uintfptr_t)uap->addr))) { + PRELE(p); + return error; + } + } + PRELE(p); + + if (uap->req == PT_DETACH) { + /* reset process parent */ + if (p->p_oppid != p->p_pptr->p_pid) { + struct proc *pp; + + pp = pfind(p->p_oppid); + proc_reparent(p, pp ? pp : initproc); + } + + p->p_flag &= ~(P_TRACED | P_WAITED); + p->p_oppid = 0; + + /* should we send SIGCHLD? */ + + } + + sendsig: + /* deliver or queue signal */ + s = splhigh(); + if (p->p_stat == SSTOP) { + p->p_xstat = uap->data; + setrunnable(p); + } else if (uap->data) { + psignal(p, uap->data); + } + splx(s); + return 0; + + case PT_WRITE_I: + case PT_WRITE_D: + write = 1; + /* fallthrough */ + case PT_READ_I: + case PT_READ_D: + /* write = 0 set above */ + iov.iov_base = write ? (caddr_t)&uap->data : (caddr_t)curp->p_retval; + iov.iov_len = sizeof(int); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = (off_t)(uintptr_t)uap->addr; + uio.uio_resid = sizeof(int); + uio.uio_segflg = UIO_SYSSPACE; /* ie: the uap */ + uio.uio_rw = write ? UIO_WRITE : UIO_READ; + uio.uio_procp = p; + error = procfs_domem(curp, p, NULL, &uio); + if (uio.uio_resid != 0) { + /* + * XXX procfs_domem() doesn't currently return ENOSPC, + * so I think write() can bogusly return 0. + * XXX what happens for short writes? We don't want + * to write partial data. + * XXX procfs_domem() returns EPERM for other invalid + * addresses. Convert this to EINVAL. Does this + * clobber returns of EPERM for other reasons? + */ + if (error == 0 || error == ENOSPC || error == EPERM) + error = EINVAL; /* EOF */ + } + return (error); + + case PT_READ_U: + if ((uintptr_t)uap->addr > UPAGES * PAGE_SIZE - sizeof(int)) { + return EFAULT; + } + if ((uintptr_t)uap->addr & (sizeof(int) - 1)) { + return EFAULT; + } + if (ptrace_read_u_check(p,(vm_offset_t) uap->addr, + sizeof(long)) && + !procfs_kmemaccess(curp)) { + return EFAULT; + } + error = 0; + PHOLD(p); /* user had damn well better be incore! */ + if (p->p_flag & P_INMEM) { + p->p_addr->u_kproc.kp_proc = *p; + fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); + curp->p_retval[0] = *(int *) + ((uintptr_t)p->p_addr + (uintptr_t)uap->addr); + } else { + curp->p_retval[0] = 0; + error = EFAULT; + } + PRELE(p); + return error; + + case PT_WRITE_U: + PHOLD(p); /* user had damn well better be incore! 
*/ + if (p->p_flag & P_INMEM) { + p->p_addr->u_kproc.kp_proc = *p; + fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); + error = ptrace_write_u(p, (vm_offset_t)uap->addr, uap->data); + } else { + error = EFAULT; + } + PRELE(p); + return error; + + case PT_KILL: + uap->data = SIGKILL; + goto sendsig; /* in PT_CONTINUE above */ + +#ifdef PT_SETREGS + case PT_SETREGS: + write = 1; + /* fallthrough */ +#endif /* PT_SETREGS */ +#ifdef PT_GETREGS + case PT_GETREGS: + /* write = 0 above */ +#endif /* PT_SETREGS */ +#if defined(PT_SETREGS) || defined(PT_GETREGS) + if (!procfs_validregs(p)) /* no P_SYSTEM procs please */ + return EINVAL; + else { + iov.iov_base = uap->addr; + iov.iov_len = sizeof(struct reg); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_resid = sizeof(struct reg); + uio.uio_segflg = UIO_USERSPACE; + uio.uio_rw = write ? UIO_WRITE : UIO_READ; + uio.uio_procp = curp; + return (procfs_doregs(curp, p, NULL, &uio)); + } +#endif /* defined(PT_SETREGS) || defined(PT_GETREGS) */ + +#ifdef PT_SETFPREGS + case PT_SETFPREGS: + write = 1; + /* fallthrough */ +#endif /* PT_SETFPREGS */ +#ifdef PT_GETFPREGS + case PT_GETFPREGS: + /* write = 0 above */ +#endif /* PT_SETFPREGS */ +#if defined(PT_SETFPREGS) || defined(PT_GETFPREGS) + if (!procfs_validfpregs(p)) /* no P_SYSTEM procs please */ + return EINVAL; + else { + iov.iov_base = uap->addr; + iov.iov_len = sizeof(struct fpreg); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_resid = sizeof(struct fpreg); + uio.uio_segflg = UIO_USERSPACE; + uio.uio_rw = write ? UIO_WRITE : UIO_READ; + uio.uio_procp = curp; + return (procfs_dofpregs(curp, p, NULL, &uio)); + } +#endif /* defined(PT_SETFPREGS) || defined(PT_GETFPREGS) */ + + default: + break; + } + + return 0; +} + +int +trace_req(p) + struct proc *p; +{ + return 1; +} + +/* + * stopevent() + * Stop a process because of a procfs event; + * stay stopped until p->p_step is cleared + * (cleared by PIOCCONT in procfs). + */ + +void +stopevent(struct proc *p, unsigned int event, unsigned int val) { + p->p_step = 1; + + do { + p->p_xstat = val; + p->p_stype = event; /* Which event caused the stop? */ + wakeup(&p->p_stype); /* Wake up any PIOCWAIT'ing procs */ + tsleep(&p->p_step, PWAIT, "stopevent", 0); + } while (p->p_step); +} diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c new file mode 100644 index 0000000..8cf30cd --- /dev/null +++ b/sys/kern/sys_socket.c @@ -0,0 +1,182 @@ +/* + * Copyright (c) 1982, 1986, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 + * $Id: sys_socket.c,v 1.18 1998/06/07 17:11:40 dfr Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/file.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/filio.h> /* XXX */ +#include <sys/sockio.h> +#include <sys/stat.h> +#include <sys/uio.h> +#include <sys/filedesc.h> + +#include <net/if.h> +#include <net/route.h> + +static int soo_read __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int soo_write __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int soo_close __P((struct file *fp, struct proc *p)); + +struct fileops socketops = + { soo_read, soo_write, soo_ioctl, soo_poll, soo_close }; + +/* ARGSUSED */ +static int +soo_read(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + struct socket *so = (struct socket *)fp->f_data; + return so->so_proto->pr_usrreqs->pru_soreceive(so, 0, uio, 0, 0, 0); +} + +/* ARGSUSED */ +static int +soo_write(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + struct socket *so = (struct socket *)fp->f_data; + return so->so_proto->pr_usrreqs->pru_sosend(so, 0, uio, 0, 0, 0, + uio->uio_procp); +} + +int +soo_ioctl(fp, cmd, data, p) + struct file *fp; + u_long cmd; + register caddr_t data; + struct proc *p; +{ + register struct socket *so = (struct socket *)fp->f_data; + + switch (cmd) { + + case FIONBIO: + if (*(int *)data) + so->so_state |= SS_NBIO; + else + so->so_state &= ~SS_NBIO; + return (0); + + case FIOASYNC: + if (*(int *)data) { + so->so_state |= SS_ASYNC; + so->so_rcv.sb_flags |= SB_ASYNC; + so->so_snd.sb_flags |= SB_ASYNC; + } else { + so->so_state &= ~SS_ASYNC; + so->so_rcv.sb_flags &= ~SB_ASYNC; + so->so_snd.sb_flags &= ~SB_ASYNC; + } + return (0); + + case FIONREAD: + *(int *)data = so->so_rcv.sb_cc; + return (0); + + case FIOSETOWN: + return (fsetown(*(int *)data, &so->so_sigio)); + + case FIOGETOWN: + *(int *)data = fgetown(so->so_sigio); + return (0); + + case SIOCSPGRP: + return (fsetown(-(*(int *)data), &so->so_sigio)); + + case SIOCGPGRP: + *(int *)data = -fgetown(so->so_sigio); + return (0); + + case SIOCATMARK: + *(int *)data = (so->so_state&SS_RCVATMARK) != 0; + return (0); + } + /* + * Interface/routing/protocol specific ioctls: + * interface and routing ioctls should have a + * different entry since a socket's unnecessary + */ + if (IOCGROUP(cmd) == 'i') + return (ifioctl(so, cmd, data, p)); + if (IOCGROUP(cmd) == 'r') + return (rtioctl(cmd, data, p)); + return ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, 0, p)); +} + +int +soo_poll(fp, events, cred, p) + struct file *fp; + int events; + struct ucred *cred; + struct proc *p; +{ + 
struct socket *so = (struct socket *)fp->f_data; + return so->so_proto->pr_usrreqs->pru_sopoll(so, events, cred, p); +} + +int +soo_stat(so, ub) + register struct socket *so; + register struct stat *ub; +{ + + bzero((caddr_t)ub, sizeof (*ub)); + ub->st_mode = S_IFSOCK; + return ((*so->so_proto->pr_usrreqs->pru_sense)(so, ub)); +} + +/* ARGSUSED */ +static int +soo_close(fp, p) + struct file *fp; + struct proc *p; +{ + int error = 0; + + if (fp->f_data) + error = soclose((struct socket *)fp->f_data); + fp->f_data = 0; + return (error); +} diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c new file mode 100644 index 0000000..22e9e8e --- /dev/null +++ b/sys/kern/syscalls.c @@ -0,0 +1,347 @@ +/* + * System call names. + * + * DO NOT EDIT-- this file is automatically generated. + * created from Id: syscalls.master,v 1.55 1998/11/11 12:45:14 peter Exp + */ + +char *syscallnames[] = { + "syscall", /* 0 = syscall */ + "exit", /* 1 = exit */ + "fork", /* 2 = fork */ + "read", /* 3 = read */ + "write", /* 4 = write */ + "open", /* 5 = open */ + "close", /* 6 = close */ + "wait4", /* 7 = wait4 */ + "old.creat", /* 8 = old creat */ + "link", /* 9 = link */ + "unlink", /* 10 = unlink */ + "obs_execv", /* 11 = obsolete execv */ + "chdir", /* 12 = chdir */ + "fchdir", /* 13 = fchdir */ + "mknod", /* 14 = mknod */ + "chmod", /* 15 = chmod */ + "chown", /* 16 = chown */ + "break", /* 17 = break */ + "getfsstat", /* 18 = getfsstat */ + "old.lseek", /* 19 = old lseek */ + "getpid", /* 20 = getpid */ + "mount", /* 21 = mount */ + "unmount", /* 22 = unmount */ + "setuid", /* 23 = setuid */ + "getuid", /* 24 = getuid */ + "geteuid", /* 25 = geteuid */ + "ptrace", /* 26 = ptrace */ + "recvmsg", /* 27 = recvmsg */ + "sendmsg", /* 28 = sendmsg */ + "recvfrom", /* 29 = recvfrom */ + "accept", /* 30 = accept */ + "getpeername", /* 31 = getpeername */ + "getsockname", /* 32 = getsockname */ + "access", /* 33 = access */ + "chflags", /* 34 = chflags */ + "fchflags", /* 35 = fchflags */ + "sync", /* 36 = sync */ + "kill", /* 37 = kill */ + "old.stat", /* 38 = old stat */ + "getppid", /* 39 = getppid */ + "old.lstat", /* 40 = old lstat */ + "dup", /* 41 = dup */ + "pipe", /* 42 = pipe */ + "getegid", /* 43 = getegid */ + "profil", /* 44 = profil */ + "ktrace", /* 45 = ktrace */ + "sigaction", /* 46 = sigaction */ + "getgid", /* 47 = getgid */ + "sigprocmask", /* 48 = sigprocmask */ + "getlogin", /* 49 = getlogin */ + "setlogin", /* 50 = setlogin */ + "acct", /* 51 = acct */ + "sigpending", /* 52 = sigpending */ + "sigaltstack", /* 53 = sigaltstack */ + "ioctl", /* 54 = ioctl */ + "reboot", /* 55 = reboot */ + "revoke", /* 56 = revoke */ + "symlink", /* 57 = symlink */ + "readlink", /* 58 = readlink */ + "execve", /* 59 = execve */ + "umask", /* 60 = umask */ + "chroot", /* 61 = chroot */ + "old.fstat", /* 62 = old fstat */ + "old.getkerninfo", /* 63 = old getkerninfo */ + "old.getpagesize", /* 64 = old getpagesize */ + "msync", /* 65 = msync */ + "vfork", /* 66 = vfork */ + "obs_vread", /* 67 = obsolete vread */ + "obs_vwrite", /* 68 = obsolete vwrite */ + "sbrk", /* 69 = sbrk */ + "sstk", /* 70 = sstk */ + "old.mmap", /* 71 = old mmap */ + "vadvise", /* 72 = vadvise */ + "munmap", /* 73 = munmap */ + "mprotect", /* 74 = mprotect */ + "madvise", /* 75 = madvise */ + "obs_vhangup", /* 76 = obsolete vhangup */ + "obs_vlimit", /* 77 = obsolete vlimit */ + "mincore", /* 78 = mincore */ + "getgroups", /* 79 = getgroups */ + "setgroups", /* 80 = setgroups */ + "getpgrp", /* 81 = getpgrp */ + "setpgid", /* 82 = setpgid */ 
+ "setitimer", /* 83 = setitimer */ + "old.wait", /* 84 = old wait */ + "swapon", /* 85 = swapon */ + "getitimer", /* 86 = getitimer */ + "old.gethostname", /* 87 = old gethostname */ + "old.sethostname", /* 88 = old sethostname */ + "getdtablesize", /* 89 = getdtablesize */ + "dup2", /* 90 = dup2 */ + "#91", /* 91 = getdopt */ + "fcntl", /* 92 = fcntl */ + "select", /* 93 = select */ + "#94", /* 94 = setdopt */ + "fsync", /* 95 = fsync */ + "setpriority", /* 96 = setpriority */ + "socket", /* 97 = socket */ + "connect", /* 98 = connect */ + "old.accept", /* 99 = old accept */ + "getpriority", /* 100 = getpriority */ + "old.send", /* 101 = old send */ + "old.recv", /* 102 = old recv */ + "sigreturn", /* 103 = sigreturn */ + "bind", /* 104 = bind */ + "setsockopt", /* 105 = setsockopt */ + "listen", /* 106 = listen */ + "obs_vtimes", /* 107 = obsolete vtimes */ + "old.sigvec", /* 108 = old sigvec */ + "old.sigblock", /* 109 = old sigblock */ + "old.sigsetmask", /* 110 = old sigsetmask */ + "sigsuspend", /* 111 = sigsuspend */ + "old.sigstack", /* 112 = old sigstack */ + "old.recvmsg", /* 113 = old recvmsg */ + "old.sendmsg", /* 114 = old sendmsg */ + "obs_vtrace", /* 115 = obsolete vtrace */ + "gettimeofday", /* 116 = gettimeofday */ + "getrusage", /* 117 = getrusage */ + "getsockopt", /* 118 = getsockopt */ + "#119", /* 119 = resuba */ + "readv", /* 120 = readv */ + "writev", /* 121 = writev */ + "settimeofday", /* 122 = settimeofday */ + "fchown", /* 123 = fchown */ + "fchmod", /* 124 = fchmod */ + "old.recvfrom", /* 125 = old recvfrom */ + "setreuid", /* 126 = setreuid */ + "setregid", /* 127 = setregid */ + "rename", /* 128 = rename */ + "old.truncate", /* 129 = old truncate */ + "old.ftruncate", /* 130 = old ftruncate */ + "flock", /* 131 = flock */ + "mkfifo", /* 132 = mkfifo */ + "sendto", /* 133 = sendto */ + "shutdown", /* 134 = shutdown */ + "socketpair", /* 135 = socketpair */ + "mkdir", /* 136 = mkdir */ + "rmdir", /* 137 = rmdir */ + "utimes", /* 138 = utimes */ + "obs_4.2", /* 139 = obsolete 4.2 sigreturn */ + "adjtime", /* 140 = adjtime */ + "old.getpeername", /* 141 = old getpeername */ + "old.gethostid", /* 142 = old gethostid */ + "old.sethostid", /* 143 = old sethostid */ + "old.getrlimit", /* 144 = old getrlimit */ + "old.setrlimit", /* 145 = old setrlimit */ + "old.killpg", /* 146 = old killpg */ + "setsid", /* 147 = setsid */ + "quotactl", /* 148 = quotactl */ + "old.quota", /* 149 = old quota */ + "old.getsockname", /* 150 = old getsockname */ + "#151", /* 151 = sem_lock */ + "#152", /* 152 = sem_wakeup */ + "#153", /* 153 = asyncdaemon */ + "#154", /* 154 = nosys */ + "nfssvc", /* 155 = nfssvc */ + "old.getdirentries", /* 156 = old getdirentries */ + "statfs", /* 157 = statfs */ + "fstatfs", /* 158 = fstatfs */ + "#159", /* 159 = nosys */ + "#160", /* 160 = nosys */ + "getfh", /* 161 = getfh */ + "getdomainname", /* 162 = getdomainname */ + "setdomainname", /* 163 = setdomainname */ + "uname", /* 164 = uname */ + "sysarch", /* 165 = sysarch */ + "rtprio", /* 166 = rtprio */ + "#167", /* 167 = nosys */ + "#168", /* 168 = nosys */ + "semsys", /* 169 = semsys */ + "msgsys", /* 170 = msgsys */ + "shmsys", /* 171 = shmsys */ + "#172", /* 172 = nosys */ + "#173", /* 173 = nosys */ + "#174", /* 174 = nosys */ + "#175", /* 175 = nosys */ + "ntp_adjtime", /* 176 = ntp_adjtime */ + "#177", /* 177 = sfork */ + "#178", /* 178 = getdescriptor */ + "#179", /* 179 = setdescriptor */ + "#180", /* 180 = nosys */ + "setgid", /* 181 = setgid */ + "setegid", /* 182 = setegid */ + 
"seteuid", /* 183 = seteuid */ + "#184", /* 184 = lfs_bmapv */ + "#185", /* 185 = lfs_markv */ + "#186", /* 186 = lfs_segclean */ + "#187", /* 187 = lfs_segwait */ + "stat", /* 188 = stat */ + "fstat", /* 189 = fstat */ + "lstat", /* 190 = lstat */ + "pathconf", /* 191 = pathconf */ + "fpathconf", /* 192 = fpathconf */ + "#193", /* 193 = nosys */ + "getrlimit", /* 194 = getrlimit */ + "setrlimit", /* 195 = setrlimit */ + "getdirentries", /* 196 = getdirentries */ + "mmap", /* 197 = mmap */ + "__syscall", /* 198 = __syscall */ + "lseek", /* 199 = lseek */ + "truncate", /* 200 = truncate */ + "ftruncate", /* 201 = ftruncate */ + "__sysctl", /* 202 = __sysctl */ + "mlock", /* 203 = mlock */ + "munlock", /* 204 = munlock */ + "undelete", /* 205 = undelete */ + "futimes", /* 206 = futimes */ + "getpgid", /* 207 = getpgid */ + "#208", /* 208 = newreboot */ + "poll", /* 209 = poll */ + "lkmnosys", /* 210 = lkmnosys */ + "lkmnosys", /* 211 = lkmnosys */ + "lkmnosys", /* 212 = lkmnosys */ + "lkmnosys", /* 213 = lkmnosys */ + "lkmnosys", /* 214 = lkmnosys */ + "lkmnosys", /* 215 = lkmnosys */ + "lkmnosys", /* 216 = lkmnosys */ + "lkmnosys", /* 217 = lkmnosys */ + "lkmnosys", /* 218 = lkmnosys */ + "lkmnosys", /* 219 = lkmnosys */ + "__semctl", /* 220 = __semctl */ + "semget", /* 221 = semget */ + "semop", /* 222 = semop */ + "semconfig", /* 223 = semconfig */ + "msgctl", /* 224 = msgctl */ + "msgget", /* 225 = msgget */ + "msgsnd", /* 226 = msgsnd */ + "msgrcv", /* 227 = msgrcv */ + "shmat", /* 228 = shmat */ + "shmctl", /* 229 = shmctl */ + "shmdt", /* 230 = shmdt */ + "shmget", /* 231 = shmget */ + "clock_gettime", /* 232 = clock_gettime */ + "clock_settime", /* 233 = clock_settime */ + "clock_getres", /* 234 = clock_getres */ + "#235", /* 235 = timer_create */ + "#236", /* 236 = timer_delete */ + "#237", /* 237 = timer_settime */ + "#238", /* 238 = timer_gettime */ + "#239", /* 239 = timer_getoverrun */ + "nanosleep", /* 240 = nanosleep */ + "#241", /* 241 = nosys */ + "#242", /* 242 = nosys */ + "#243", /* 243 = nosys */ + "#244", /* 244 = nosys */ + "#245", /* 245 = nosys */ + "#246", /* 246 = nosys */ + "#247", /* 247 = nosys */ + "#248", /* 248 = nosys */ + "#249", /* 249 = nosys */ + "minherit", /* 250 = minherit */ + "rfork", /* 251 = rfork */ + "openbsd_poll", /* 252 = openbsd_poll */ + "issetugid", /* 253 = issetugid */ + "lchown", /* 254 = lchown */ + "#255", /* 255 = nosys */ + "#256", /* 256 = nosys */ + "#257", /* 257 = nosys */ + "#258", /* 258 = nosys */ + "#259", /* 259 = nosys */ + "#260", /* 260 = nosys */ + "#261", /* 261 = nosys */ + "#262", /* 262 = nosys */ + "#263", /* 263 = nosys */ + "#264", /* 264 = nosys */ + "#265", /* 265 = nosys */ + "#266", /* 266 = nosys */ + "#267", /* 267 = nosys */ + "#268", /* 268 = nosys */ + "#269", /* 269 = nosys */ + "#270", /* 270 = nosys */ + "#271", /* 271 = nosys */ + "getdents", /* 272 = getdents */ + "#273", /* 273 = nosys */ + "lchmod", /* 274 = lchmod */ + "netbsd_lchown", /* 275 = netbsd_lchown */ + "lutimes", /* 276 = lutimes */ + "netbsd_msync", /* 277 = netbsd_msync */ + "nstat", /* 278 = nstat */ + "nfstat", /* 279 = nfstat */ + "nlstat", /* 280 = nlstat */ + "#281", /* 281 = nosys */ + "#282", /* 282 = nosys */ + "#283", /* 283 = nosys */ + "#284", /* 284 = nosys */ + "#285", /* 285 = nosys */ + "#286", /* 286 = nosys */ + "#287", /* 287 = nosys */ + "#288", /* 288 = nosys */ + "#289", /* 289 = nosys */ + "#290", /* 290 = nosys */ + "#291", /* 291 = nosys */ + "#292", /* 292 = nosys */ + "#293", /* 293 = nosys */ + "#294", /* 
294 = nosys */ + "#295", /* 295 = nosys */ + "#296", /* 296 = nosys */ + "#297", /* 297 = nosys */ + "#298", /* 298 = nosys */ + "#299", /* 299 = nosys */ + "modnext", /* 300 = modnext */ + "modstat", /* 301 = modstat */ + "modfnext", /* 302 = modfnext */ + "modfind", /* 303 = modfind */ + "kldload", /* 304 = kldload */ + "kldunload", /* 305 = kldunload */ + "kldfind", /* 306 = kldfind */ + "kldnext", /* 307 = kldnext */ + "kldstat", /* 308 = kldstat */ + "kldfirstmod", /* 309 = kldfirstmod */ + "getsid", /* 310 = getsid */ + "#311", /* 311 = setresuid */ + "#312", /* 312 = setresgid */ + "obs_signanosleep", /* 313 = obsolete signanosleep */ + "aio_return", /* 314 = aio_return */ + "aio_suspend", /* 315 = aio_suspend */ + "aio_cancel", /* 316 = aio_cancel */ + "aio_error", /* 317 = aio_error */ + "aio_read", /* 318 = aio_read */ + "aio_write", /* 319 = aio_write */ + "lio_listio", /* 320 = lio_listio */ + "yield", /* 321 = yield */ + "thr_sleep", /* 322 = thr_sleep */ + "thr_wakeup", /* 323 = thr_wakeup */ + "mlockall", /* 324 = mlockall */ + "munlockall", /* 325 = munlockall */ + "__getcwd", /* 326 = __getcwd */ + "sched_setparam", /* 327 = sched_setparam */ + "sched_getparam", /* 328 = sched_getparam */ + "sched_setscheduler", /* 329 = sched_setscheduler */ + "sched_getscheduler", /* 330 = sched_getscheduler */ + "sched_yield", /* 331 = sched_yield */ + "sched_get_priority_max", /* 332 = sched_get_priority_max */ + "sched_get_priority_min", /* 333 = sched_get_priority_min */ + "sched_rr_get_interval", /* 334 = sched_rr_get_interval */ + "utrace", /* 335 = utrace */ + "sendfile", /* 336 = sendfile */ + "kldsym", /* 337 = kldsym */ +}; diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master new file mode 100644 index 0000000..6772363 --- /dev/null +++ b/sys/kern/syscalls.master @@ -0,0 +1,473 @@ + $Id: syscalls.master,v 1.54 1998/11/05 14:28:24 dg Exp $ +; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94 +; +; System call name/number master file. +; Processed to created init_sysent.c, syscalls.c and syscall.h. + +; Columns: number type nargs namespc name alt{name,tag,rtyp}/comments +; number system call number, must be in order +; type one of STD, OBSOL, UNIMPL, COMPAT +; namespc one of POSIX, BSD, NOHIDE +; name psuedo-prototype of syscall routine +; If one of the following alts is different, then all appear: +; altname name of system call if different +; alttag name of args struct tag if different from [o]`name'"_args" +; altrtyp return type if not int (bogus - syscalls always return int) +; for UNIMPL/OBSOL, name continues with comments + +; types: +; STD always included +; COMPAT included on COMPAT #ifdef +; LIBCOMPAT included on COMPAT #ifdef, and placed in syscall.h +; OBSOL obsolete, not included in system, only specifies name +; UNIMPL not implemented, placeholder only + +; #ifdef's, etc. may be included, and are copied to the output files. + +#include <sys/param.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> + +; Reserved/unimplemented system calls in the range 0-150 inclusive +; are reserved for use in future Berkeley releases. +; Additional system calls implemented in vendor and other +; redistributions should be placed in the reserved range at the end +; of the current calls. 
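Each line of this table drives several generated files at once (the Makefile rule near the top of sys/kern runs makesyscalls.sh over it): the number and name become a SYS_ constant in ../sys/syscall.h, the name string goes into syscallnames[] in syscalls.c, the pseudo-prototype becomes an args structure in ../sys/sysproto.h, and init_sysent.c gets the dispatch slot. A rough illustration for entry 3 (output heavily abbreviated; exact generated layout omitted):

/* Illustrative expansion of one STD entry:
 *   3  STD  POSIX  { ssize_t read(int fd, void *buf, size_t nbyte); }
 */

#define	SYS_read	3			/* emitted into ../sys/syscall.h */

/* ../sys/sysproto.h (alignment padding members omitted here): */
struct read_args {
	int	fd;
	void	*buf;
	size_t	nbyte;
};

/* syscalls.c gains the name string "read" at index 3, and init_sysent.c
 * gains the sysent slot that dispatches syscall number 3 to read(). */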
+ +0 STD NOHIDE { int nosys(void); } syscall nosys_args int +1 STD NOHIDE { void exit(int rval); } exit rexit_args void +2 STD POSIX { int fork(void); } +3 STD POSIX { ssize_t read(int fd, void *buf, size_t nbyte); } +4 STD POSIX { ssize_t write(int fd, const void *buf, size_t nbyte); } +5 STD POSIX { int open(char *path, int flags, int mode); } +; XXX should be { int open(const char *path, int flags, ...); } +; but we're not ready for `const' or varargs. +; XXX man page says `mode_t mode'. +6 STD POSIX { int close(int fd); } +7 STD BSD { int wait4(int pid, int *status, int options, \ + struct rusage *rusage); } wait4 wait_args int +8 COMPAT BSD { int creat(char *path, int mode); } +9 STD POSIX { int link(char *path, char *link); } +10 STD POSIX { int unlink(char *path); } +11 OBSOL NOHIDE execv +12 STD POSIX { int chdir(char *path); } +13 STD BSD { int fchdir(int fd); } +14 STD POSIX { int mknod(char *path, int mode, int dev); } +15 STD POSIX { int chmod(char *path, int mode); } +16 STD POSIX { int chown(char *path, int uid, int gid); } +17 STD BSD { int obreak(char *nsize); } break obreak_args int +18 STD BSD { int getfsstat(struct statfs *buf, long bufsize, \ + int flags); } +19 COMPAT POSIX { long lseek(int fd, long offset, int whence); } +20 STD POSIX { pid_t getpid(void); } +21 STD BSD { int mount(char *type, char *path, int flags, \ + caddr_t data); } +; XXX 4.4lite2 uses `char *type' but we're not ready for that. +; XXX `path' should have type `const char *' but we're not ready for that. +22 STD BSD { int unmount(char *path, int flags); } +23 STD POSIX { int setuid(uid_t uid); } +24 STD POSIX { uid_t getuid(void); } +25 STD POSIX { uid_t geteuid(void); } +26 STD BSD { int ptrace(int req, pid_t pid, caddr_t addr, \ + int data); } +27 STD BSD { int recvmsg(int s, struct msghdr *msg, int flags); } +28 STD BSD { int sendmsg(int s, caddr_t msg, int flags); } +29 STD BSD { int recvfrom(int s, caddr_t buf, size_t len, \ + int flags, caddr_t from, int *fromlenaddr); } +30 STD BSD { int accept(int s, caddr_t name, int *anamelen); } +31 STD BSD { int getpeername(int fdes, caddr_t asa, int *alen); } +32 STD BSD { int getsockname(int fdes, caddr_t asa, int *alen); } +33 STD POSIX { int access(char *path, int flags); } +34 STD BSD { int chflags(char *path, int flags); } +35 STD BSD { int fchflags(int fd, int flags); } +36 STD BSD { int sync(void); } +37 STD POSIX { int kill(int pid, int signum); } +38 COMPAT POSIX { int stat(char *path, struct ostat *ub); } +39 STD POSIX { pid_t getppid(void); } +40 COMPAT POSIX { int lstat(char *path, struct ostat *ub); } +41 STD POSIX { int dup(u_int fd); } +42 STD POSIX { int pipe(void); } +43 STD POSIX { gid_t getegid(void); } +44 STD BSD { int profil(caddr_t samples, size_t size, \ + size_t offset, u_int scale); } +45 STD BSD { int ktrace(char *fname, int ops, int facs, \ + int pid); } +46 STD POSIX { int sigaction(int signum, struct sigaction *nsa, \ + struct sigaction *osa); } +47 STD POSIX { gid_t getgid(void); } +48 STD POSIX { int sigprocmask(int how, sigset_t mask); } +; XXX note nonstandard (bogus) calling convention - the libc stub passes +; us the mask, not a pointer to it, and we return the old mask as the +; (int) return value. 
+49 STD BSD { int getlogin(char *namebuf, u_int namelen); } +50 STD BSD { int setlogin(char *namebuf); } +51 STD BSD { int acct(char *path); } +52 STD POSIX { int sigpending(void); } +53 STD BSD { int sigaltstack(struct sigaltstack *nss, \ + struct sigaltstack *oss); } +54 STD POSIX { int ioctl(int fd, u_long com, caddr_t data); } +55 STD BSD { int reboot(int opt); } +56 STD POSIX { int revoke(char *path); } +57 STD POSIX { int symlink(char *path, char *link); } +58 STD POSIX { int readlink(char *path, char *buf, int count); } +59 STD POSIX { int execve(char *fname, char **argv, char **envv); } +60 STD POSIX { int umask(int newmask); } umask umask_args int +61 STD BSD { int chroot(char *path); } +62 COMPAT POSIX { int fstat(int fd, struct ostat *sb); } +63 COMPAT BSD { int getkerninfo(int op, char *where, size_t *size, \ + int arg); } getkerninfo getkerninfo_args int +64 COMPAT BSD { int getpagesize(void); } \ + getpagesize getpagesize_args int +65 STD BSD { int msync(void *addr, size_t len, int flags); } +66 STD BSD { int vfork(void); } +67 OBSOL NOHIDE vread +68 OBSOL NOHIDE vwrite +69 STD BSD { int sbrk(int incr); } +70 STD BSD { int sstk(int incr); } +71 COMPAT BSD { int mmap(void *addr, int len, int prot, \ + int flags, int fd, long pos); } +72 STD BSD { int ovadvise(int anom); } vadvise ovadvise_args int +73 STD BSD { int munmap(void *addr, size_t len); } +74 STD BSD { int mprotect(const void *addr, size_t len, int prot); } +75 STD BSD { int madvise(void *addr, size_t len, int behav); } +76 OBSOL NOHIDE vhangup +77 OBSOL NOHIDE vlimit +78 STD BSD { int mincore(const void *addr, size_t len, \ + char *vec); } +79 STD POSIX { int getgroups(u_int gidsetsize, gid_t *gidset); } +80 STD POSIX { int setgroups(u_int gidsetsize, gid_t *gidset); } +81 STD POSIX { int getpgrp(void); } +82 STD POSIX { int setpgid(int pid, int pgid); } +83 STD BSD { int setitimer(u_int which, struct itimerval *itv, \ + struct itimerval *oitv); } +84 COMPAT BSD { int wait(void); } +85 STD BSD { int swapon(char *name); } +86 STD BSD { int getitimer(u_int which, struct itimerval *itv); } +87 COMPAT BSD { int gethostname(char *hostname, u_int len); } \ + gethostname gethostname_args int +88 COMPAT BSD { int sethostname(char *hostname, u_int len); } \ + sethostname sethostname_args int +89 STD BSD { int getdtablesize(void); } +90 STD POSIX { int dup2(u_int from, u_int to); } +91 UNIMPL BSD getdopt +92 STD POSIX { int fcntl(int fd, int cmd, long arg); } +; XXX should be { int fcntl(int fd, int cmd, ...); } +; but we're not ready for varargs. +; XXX man page says `int arg' too. 
+93 STD BSD { int select(int nd, fd_set *in, fd_set *ou, \ + fd_set *ex, struct timeval *tv); } +94 UNIMPL BSD setdopt +95 STD POSIX { int fsync(int fd); } +96 STD BSD { int setpriority(int which, int who, int prio); } +97 STD BSD { int socket(int domain, int type, int protocol); } +98 STD BSD { int connect(int s, caddr_t name, int namelen); } +99 CPT_NOA BSD { int accept(int s, caddr_t name, int *anamelen); } \ + accept accept_args int +100 STD BSD { int getpriority(int which, int who); } +101 COMPAT BSD { int send(int s, caddr_t buf, int len, int flags); } +102 COMPAT BSD { int recv(int s, caddr_t buf, int len, int flags); } +103 STD BSD { int sigreturn(struct sigcontext *sigcntxp); } +104 STD BSD { int bind(int s, caddr_t name, int namelen); } +105 STD BSD { int setsockopt(int s, int level, int name, \ + caddr_t val, int valsize); } +106 STD BSD { int listen(int s, int backlog); } +107 OBSOL NOHIDE vtimes +108 COMPAT BSD { int sigvec(int signum, struct sigvec *nsv, \ + struct sigvec *osv); } +109 COMPAT BSD { int sigblock(int mask); } +110 COMPAT BSD { int sigsetmask(int mask); } +111 STD POSIX { int sigsuspend(sigset_t mask); } +; XXX note nonstandard (bogus) calling convention - the libc stub passes +; us the mask, not a pointer to it. +112 COMPAT BSD { int sigstack(struct sigstack *nss, \ + struct sigstack *oss); } +113 COMPAT BSD { int recvmsg(int s, struct omsghdr *msg, int flags); } +114 COMPAT BSD { int sendmsg(int s, caddr_t msg, int flags); } +115 OBSOL NOHIDE vtrace +116 STD BSD { int gettimeofday(struct timeval *tp, \ + struct timezone *tzp); } +117 STD BSD { int getrusage(int who, struct rusage *rusage); } +118 STD BSD { int getsockopt(int s, int level, int name, \ + caddr_t val, int *avalsize); } +119 UNIMPL NOHIDE resuba (BSD/OS 2.x) +120 STD BSD { int readv(int fd, struct iovec *iovp, u_int iovcnt); } +121 STD BSD { int writev(int fd, struct iovec *iovp, \ + u_int iovcnt); } +122 STD BSD { int settimeofday(struct timeval *tv, \ + struct timezone *tzp); } +123 STD BSD { int fchown(int fd, int uid, int gid); } +124 STD BSD { int fchmod(int fd, int mode); } +125 CPT_NOA BSD { int recvfrom(int s, caddr_t buf, size_t len, \ + int flags, caddr_t from, int *fromlenaddr); } \ + recvfrom recvfrom_args int +126 STD BSD { int setreuid(int ruid, int euid); } +127 STD BSD { int setregid(int rgid, int egid); } +128 STD POSIX { int rename(char *from, char *to); } +129 COMPAT BSD { int truncate(char *path, long length); } +130 COMPAT BSD { int ftruncate(int fd, long length); } +131 STD BSD { int flock(int fd, int how); } +132 STD POSIX { int mkfifo(char *path, int mode); } +133 STD BSD { int sendto(int s, caddr_t buf, size_t len, \ + int flags, caddr_t to, int tolen); } +134 STD BSD { int shutdown(int s, int how); } +135 STD BSD { int socketpair(int domain, int type, int protocol, \ + int *rsv); } +136 STD POSIX { int mkdir(char *path, int mode); } +137 STD POSIX { int rmdir(char *path); } +138 STD BSD { int utimes(char *path, struct timeval *tptr); } +139 OBSOL NOHIDE 4.2 sigreturn +140 STD BSD { int adjtime(struct timeval *delta, \ + struct timeval *olddelta); } +141 COMPAT BSD { int getpeername(int fdes, caddr_t asa, int *alen); } +142 COMPAT BSD { long gethostid(void); } +143 COMPAT BSD { int sethostid(long hostid); } +144 COMPAT BSD { int getrlimit(u_int which, struct ogetrlimit *rlp); } +145 COMPAT BSD { int setrlimit(u_int which, struct ogetrlimit *rlp); } +146 COMPAT BSD { int killpg(int pgid, int signum); } +147 STD POSIX { int setsid(void); } +148 STD BSD { int quotactl(char 
*path, int cmd, int uid, \ + caddr_t arg); } +149 COMPAT BSD { int quota(void); } +150 CPT_NOA BSD { int getsockname(int fdec, caddr_t asa, int *alen); }\ + getsockname getsockname_args int + +; Syscalls 151-180 inclusive are reserved for vendor-specific +; system calls. (This includes various calls added for compatibity +; with other Unix variants.) +; Some of these calls are now supported by BSD... +151 UNIMPL NOHIDE sem_lock (BSD/OS 2.x) +152 UNIMPL NOHIDE sem_wakeup (BSD/OS 2.x) +153 UNIMPL NOHIDE asyncdaemon (BSD/OS 2.x) +154 UNIMPL NOHIDE nosys +; 155 is initialized by the NFS code, if present. +155 NOIMPL BSD { int nfssvc(int flag, caddr_t argp); } +156 COMPAT BSD { int getdirentries(int fd, char *buf, u_int count, \ + long *basep); } +157 STD BSD { int statfs(char *path, struct statfs *buf); } +158 STD BSD { int fstatfs(int fd, struct statfs *buf); } +159 UNIMPL NOHIDE nosys +160 UNIMPL NOHIDE nosys +; 161 is initialized by the NFS code, if present. +161 NOIMPL BSD { int getfh(char *fname, struct fhandle *fhp); } +162 STD BSD { int getdomainname(char *domainname, int len); } +163 STD BSD { int setdomainname(char *domainname, int len); } +164 STD BSD { int uname(struct utsname *name); } +165 STD BSD { int sysarch(int op, char *parms); } +166 STD BSD { int rtprio(int function, pid_t pid, \ + struct rtprio *rtp); } +167 UNIMPL NOHIDE nosys +168 UNIMPL NOHIDE nosys +169 STD BSD { int semsys(int which, int a2, int a3, int a4, \ + int a5); } +; XXX should be { int semsys(int which, ...); } +170 STD BSD { int msgsys(int which, int a2, int a3, int a4, \ + int a5, int a6); } +; XXX should be { int msgsys(int which, ...); } +171 STD BSD { int shmsys(int which, int a2, int a3, int a4); } +; XXX should be { int shmsys(int which, ...); } +172 UNIMPL NOHIDE nosys +173 UNIMPL NOHIDE nosys +174 UNIMPL NOHIDE nosys +175 UNIMPL NOHIDE nosys +176 STD BSD { int ntp_adjtime(struct timex *tp); } +177 UNIMPL NOHIDE sfork (BSD/OS 2.x) +178 UNIMPL NOHIDE getdescriptor (BSD/OS 2.x) +179 UNIMPL NOHIDE setdescriptor (BSD/OS 2.x) +180 UNIMPL NOHIDE nosys + +; Syscalls 180-199 are used by/reserved for BSD +181 STD POSIX { int setgid(gid_t gid); } +182 STD BSD { int setegid(gid_t egid); } +183 STD BSD { int seteuid(uid_t euid); } +184 UNIMPL BSD lfs_bmapv +185 UNIMPL BSD lfs_markv +186 UNIMPL BSD lfs_segclean +187 UNIMPL BSD lfs_segwait +188 STD POSIX { int stat(char *path, struct stat *ub); } +189 STD POSIX { int fstat(int fd, struct stat *sb); } +190 STD POSIX { int lstat(char *path, struct stat *ub); } +191 STD POSIX { int pathconf(char *path, int name); } +192 STD POSIX { int fpathconf(int fd, int name); } +193 UNIMPL NOHIDE nosys +194 STD BSD { int getrlimit(u_int which, \ + struct orlimit *rlp); } \ + getrlimit __getrlimit_args int +195 STD BSD { int setrlimit(u_int which, \ + struct orlimit *rlp); } \ + setrlimit __setrlimit_args int +196 STD BSD { int getdirentries(int fd, char *buf, u_int count, \ + long *basep); } +197 STD BSD { caddr_t mmap(caddr_t addr, size_t len, int prot, \ + int flags, int fd, long pad, off_t pos); } +198 STD NOHIDE { int nosys(void); } __syscall __syscall_args int +199 STD POSIX { off_t lseek(int fd, int pad, off_t offset, \ + int whence); } +200 STD BSD { int truncate(char *path, int pad, off_t length); } +201 STD BSD { int ftruncate(int fd, int pad, off_t length); } +202 STD BSD { int __sysctl(int *name, u_int namelen, void *old, \ + size_t *oldlenp, void *new, size_t newlen); } \ + __sysctl sysctl_args int +; properly, __sysctl should be a NOHIDE, but making an exception +; 
here allows to avoid one in libc/sys/Makefile.inc. +203 STD BSD { int mlock(const void *addr, size_t len); } +204 STD BSD { int munlock(const void *addr, size_t len); } +205 STD BSD { int undelete(char *path); } +206 STD BSD { int futimes(int fd, struct timeval *tptr); } +207 STD BSD { int getpgid(pid_t pid); } +208 UNIMPL NOHIDE newreboot (NetBSD) +209 STD BSD { int poll(struct pollfd *fds, u_int nfds, \ + int timeout); } + +; +; The following are reserved for loadable syscalls +; +210 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +211 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +212 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +213 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +214 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +215 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +216 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +217 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +218 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +219 NODEF NOHIDE lkmnosys lkmnosys nosys_args int + +; +; The following were introduced with NetBSD/4.4Lite-2 +; +220 STD BSD { int __semctl(int semid, int semnum, int cmd, \ + union semun *arg); } +221 STD BSD { int semget(key_t key, int nsems, int semflg); } +222 STD BSD { int semop(int semid, struct sembuf *sops, \ + u_int nsops); } +223 STD BSD { int semconfig(int flag); } +224 STD BSD { int msgctl(int msqid, int cmd, \ + struct msqid_ds *buf); } +225 STD BSD { int msgget(key_t key, int msgflg); } +226 STD BSD { int msgsnd(int msqid, void *msgp, size_t msgsz, \ + int msgflg); } +227 STD BSD { int msgrcv(int msqid, void *msgp, size_t msgsz, \ + long msgtyp, int msgflg); } +228 STD BSD { int shmat(int shmid, void *shmaddr, int shmflg); } +229 STD BSD { int shmctl(int shmid, int cmd, \ + struct shmid_ds *buf); } +230 STD BSD { int shmdt(void *shmaddr); } +231 STD BSD { int shmget(key_t key, int size, int shmflg); } +; +232 STD POSIX { int clock_gettime(clockid_t clock_id, \ + struct timespec *tp); } +233 STD POSIX { int clock_settime(clockid_t clock_id, \ + const struct timespec *tp); } +234 STD POSIX { int clock_getres(clockid_t clock_id, \ + struct timespec *tp); } +235 UNIMPL NOHIDE timer_create +236 UNIMPL NOHIDE timer_delete +237 UNIMPL NOHIDE timer_settime +238 UNIMPL NOHIDE timer_gettime +239 UNIMPL NOHIDE timer_getoverrun +240 STD POSIX { int nanosleep(const struct timespec *rqtp, \ + struct timespec *rmtp); } +241 UNIMPL NOHIDE nosys +242 UNIMPL NOHIDE nosys +243 UNIMPL NOHIDE nosys +244 UNIMPL NOHIDE nosys +245 UNIMPL NOHIDE nosys +246 UNIMPL NOHIDE nosys +247 UNIMPL NOHIDE nosys +248 UNIMPL NOHIDE nosys +249 UNIMPL NOHIDE nosys +; syscall numbers initially used in OpenBSD +250 STD BSD { int minherit(void *addr, size_t len, int inherit); } +251 STD BSD { int rfork(int flags); } +252 STD BSD { int openbsd_poll(struct pollfd *fds, u_int nfds, \ + int timeout); } +253 STD BSD { int issetugid(void); } +254 STD BSD { int lchown(char *path, int uid, int gid); } +255 UNIMPL NOHIDE nosys +256 UNIMPL NOHIDE nosys +257 UNIMPL NOHIDE nosys +258 UNIMPL NOHIDE nosys +259 UNIMPL NOHIDE nosys +260 UNIMPL NOHIDE nosys +261 UNIMPL NOHIDE nosys +262 UNIMPL NOHIDE nosys +263 UNIMPL NOHIDE nosys +264 UNIMPL NOHIDE nosys +265 UNIMPL NOHIDE nosys +266 UNIMPL NOHIDE nosys +267 UNIMPL NOHIDE nosys +268 UNIMPL NOHIDE nosys +269 UNIMPL NOHIDE nosys +270 UNIMPL NOHIDE nosys +271 UNIMPL NOHIDE nosys +272 STD BSD { int getdents(int fd, char *buf, size_t count); } +273 UNIMPL NOHIDE nosys +274 STD BSD { int lchmod(char *path, mode_t mode); } +275 NOPROTO BSD { int lchown(char 
*path, uid_t uid, gid_t gid); } netbsd_lchown netbsd_lchown int +276 STD BSD { int lutimes(char *path, struct timeval *tptr); } +277 NOPROTO BSD { int msync(void *addr, size_t len, int flags); } netbsd_msync netbsd_msync int +278 STD BSD { int nstat(char *path, struct nstat *ub); } +279 STD BSD { int nfstat(int fd, struct nstat *sb); } +280 STD BSD { int nlstat(char *path, struct nstat *ub); } +281 UNIMPL NOHIDE nosys +282 UNIMPL NOHIDE nosys +283 UNIMPL NOHIDE nosys +284 UNIMPL NOHIDE nosys +285 UNIMPL NOHIDE nosys +286 UNIMPL NOHIDE nosys +287 UNIMPL NOHIDE nosys +288 UNIMPL NOHIDE nosys +289 UNIMPL NOHIDE nosys +290 UNIMPL NOHIDE nosys +291 UNIMPL NOHIDE nosys +292 UNIMPL NOHIDE nosys +293 UNIMPL NOHIDE nosys +294 UNIMPL NOHIDE nosys +295 UNIMPL NOHIDE nosys +296 UNIMPL NOHIDE nosys +297 UNIMPL NOHIDE nosys +298 UNIMPL NOHIDE nosys +299 UNIMPL NOHIDE nosys +; syscall numbers for FreeBSD +300 STD BSD { int modnext(int modid); } +301 STD BSD { int modstat(int modid, struct module_stat* stat); } +302 STD BSD { int modfnext(int modid); } +303 STD BSD { int modfind(char *name); } +304 STD BSD { int kldload(const char *file); } +305 STD BSD { int kldunload(int fileid); } +306 STD BSD { int kldfind(const char *file); } +307 STD BSD { int kldnext(int fileid); } +308 STD BSD { int kldstat(int fileid, struct kld_file_stat* stat); } +309 STD BSD { int kldfirstmod(int fileid); } +310 STD BSD { int getsid(pid_t pid); } +311 UNIMPL NOHIDE setresuid +312 UNIMPL NOHIDE setresgid +313 OBSOL NOHIDE signanosleep +314 STD BSD { int aio_return(struct aiocb *aiocbp); } +315 STD BSD { int aio_suspend(struct aiocb * const * aiocbp, int nent, const struct timespec *timeout); } +316 STD BSD { int aio_cancel(int fd, struct aiocb *aiocbp); } +317 STD BSD { int aio_error(struct aiocb *aiocbp); } +318 STD BSD { int aio_read(struct aiocb *aiocbp); } +319 STD BSD { int aio_write(struct aiocb *aiocbp); } +320 STD BSD { int lio_listio(int mode, struct aiocb * const *acb_list, int nent, struct sigevent *sig); } +321 STD BSD { int yield(void); } +322 STD BSD { int thr_sleep(const struct timespec *timeout); } +323 STD BSD { int thr_wakeup(pid_t pid); } +324 STD BSD { int mlockall(int how); } +325 STD BSD { int munlockall(void); } +326 STD BSD { int __getcwd(u_char *buf, u_int buflen); } + +327 STD POSIX { int sched_setparam (pid_t pid, const struct sched_param *param); } +328 STD POSIX { int sched_getparam (pid_t pid, struct sched_param *param); } + +329 STD POSIX { int sched_setscheduler (pid_t pid, int policy, const struct sched_param *param); } +330 STD POSIX { int sched_getscheduler (pid_t pid); } + +331 STD POSIX { int sched_yield (void); } +332 STD POSIX { int sched_get_priority_max (int policy); } +333 STD POSIX { int sched_get_priority_min (int policy); } +334 STD POSIX { int sched_rr_get_interval (pid_t pid, struct timespec *interval); } +335 STD BSD { int utrace(caddr_t addr, size_t len); } +336 STD BSD { int sendfile(int fd, int s, off_t offset, size_t nbytes, \ + struct sf_hdtr *hdtr, off_t *sbytes, int flags); } +337 STD BSD { int kldsym(int fileid, int cmd, void *data); } diff --git a/sys/kern/sysv_ipc.c b/sys/kern/sysv_ipc.c new file mode 100644 index 0000000..553c213 --- /dev/null +++ b/sys/kern/sysv_ipc.c @@ -0,0 +1,283 @@ +/* $Id: sysv_ipc.c,v 1.7 1997/11/06 19:29:22 phk Exp $ */ +/* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */ + +/* + * Copyright (c) 1994 Herb Peyerl <hpeyerl@novatel.ca> + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Herb Peyerl. + * 4. The name of Herb Peyerl may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/ipc.h> +#include <sys/ucred.h> + +#if defined(SYSVSEM) || defined(SYSVSHM) || defined(SYSVMSG) + +/* + * Check for ipc permission + */ + +int +ipcperm(cred, perm, mode) + struct ucred *cred; + struct ipc_perm *perm; + int mode; +{ + + if (cred->cr_uid == 0) + return (0); + + /* Check for user match. */ + if (cred->cr_uid != perm->cuid && cred->cr_uid != perm->uid) { + if (mode & IPC_M) + return (EPERM); + /* Check for group match. */ + mode >>= 3; + if (!groupmember(perm->gid, cred) && + !groupmember(perm->cgid, cred)) + /* Check for `other' match. */ + mode >>= 3; + } + + if (mode & IPC_M) + return (0); + return ((mode & perm->mode) == mode ? 
0 : EACCES); +} + +#endif /* defined(SYSVSEM) || defined(SYSVSHM) || defined(SYSVMSG) */ + + +#if !defined(SYSVSEM) || !defined(SYSVSHM) || !defined(SYSVMSG) + +#include <sys/proc.h> +#include <sys/sem.h> +#include <sys/shm.h> +#include <sys/syslog.h> +#include <sys/sysproto.h> +#include <sys/systm.h> + +static void sysv_nosys __P((struct proc *p, char *s)); + +static void +sysv_nosys(p, s) + struct proc *p; + char *s; +{ + log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n", + p->p_comm, p->p_pid, s); +} + +#if !defined(SYSVSEM) + +/* + * SYSVSEM stubs + */ + +int +semsys(p, uap) + struct proc *p; + struct semsys_args *uap; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +semconfig(p, uap) + struct proc *p; + struct semconfig_args *uap; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +__semctl(p, uap) + struct proc *p; + register struct __semctl_args *uap; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +semget(p, uap) + struct proc *p; + register struct semget_args *uap; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +semop(p, uap) + struct proc *p; + register struct semop_args *uap; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap); +}; + +/* called from kern_exit.c */ +void +semexit(p) + struct proc *p; +{ + return; +} + +#endif /* !defined(SYSVSEM) */ + + +#if !defined(SYSVMSG) + +/* + * SYSVMSG stubs + */ + +int +msgsys(p, uap) + struct proc *p; + /* XXX actually varargs. */ + struct msgsys_args *uap; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +msgctl(p, uap) + struct proc *p; + register struct msgctl_args *uap; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +msgget(p, uap) + struct proc *p; + register struct msgget_args *uap; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +msgsnd(p, uap) + struct proc *p; + register struct msgsnd_args *uap; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +msgrcv(p, uap) + struct proc *p; + register struct msgrcv_args *uap; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap); +}; + +#endif /* !defined(SYSVMSG) */ + + +#if !defined(SYSVSHM) + +/* + * SYSVSHM stubs + */ + +int +shmdt(p, uap) + struct proc *p; + struct shmdt_args *uap; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +shmat(p, uap) + struct proc *p; + struct shmat_args *uap; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +shmctl(p, uap) + struct proc *p; + struct shmctl_args *uap; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +shmget(p, uap) + struct proc *p; + struct shmget_args *uap; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap); +}; + +int +shmsys(p, uap) + struct proc *p; + /* XXX actually varargs. 
*/ + struct shmsys_args *uap; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap); +}; + +/* called from kern_fork.c */ +void +shmfork(p1, p2) + struct proc *p1, *p2; +{ + return; +} + +/* called from kern_exit.c */ +void +shmexit(p) + struct proc *p; +{ + return; +} + +#endif /* !defined(SYSVSHM) */ + +#endif /* !defined(SYSVSEM) || !defined(SYSVSHM) || !defined(SYSVMSG) */ diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c new file mode 100644 index 0000000..d3b8a98 --- /dev/null +++ b/sys/kern/sysv_msg.c @@ -0,0 +1,1027 @@ +/* $Id: sysv_msg.c,v 1.17 1997/11/06 19:29:24 phk Exp $ */ + +/* + * Implementation of SVID messages + * + * Author: Daniel Boulet + * + * Copyright 1993 Daniel Boulet and RTMX Inc. + * + * This system call was implemented by Daniel Boulet under contract from RTMX. + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/msg.h> +#include <sys/sysent.h> + +static void msginit __P((void *)); +SYSINIT(sysv_msg, SI_SUB_SYSV_MSG, SI_ORDER_FIRST, msginit, NULL) + +#define MSG_DEBUG +#undef MSG_DEBUG_OK + +#ifndef _SYS_SYSPROTO_H_ +struct msgctl_args; +int msgctl __P((struct proc *p, struct msgctl_args *uap)); +struct msgget_args; +int msgget __P((struct proc *p, struct msgget_args *uap)); +struct msgsnd_args; +int msgsnd __P((struct proc *p, struct msgsnd_args *uap)); +struct msgrcv_args; +int msgrcv __P((struct proc *p, struct msgrcv_args *uap)); +#endif +static void msg_freehdr __P((struct msg *msghdr)); + +/* XXX casting to (sy_call_t *) is bogus, as usual. */ +static sy_call_t *msgcalls[] = { + (sy_call_t *)msgctl, (sy_call_t *)msgget, + (sy_call_t *)msgsnd, (sy_call_t *)msgrcv +}; + +static int nfree_msgmaps; /* # of free map entries */ +static short free_msgmaps; /* head of linked list of free map entries */ +static struct msg *free_msghdrs; /* list of free msg headers */ +char *msgpool; /* MSGMAX byte long msg buffer pool */ +struct msgmap *msgmaps; /* MSGSEG msgmap structures */ +struct msg *msghdrs; /* MSGTQL msg headers */ +struct msqid_ds *msqids; /* MSGMNI msqid_ds struct's */ + +void +msginit(dummy) + void *dummy; +{ + register int i; + + /* + * msginfo.msgssz should be a power of two for efficiency reasons. + * It is also pretty silly if msginfo.msgssz is less than 8 + * or greater than about 256 so ... 
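For context on why the segment size matters: msgsnd() below carves each message body into msgssz-byte chunks taken from msgpool and chained through msgmaps[].next, so the pool holds msgssz * msgseg bytes in total. A worked example, assuming the traditional defaults MSGSSZ = 8 and MSGSEG = 2048 from <sys/msg.h> (those values are an assumption, they are not shown in this diff):

	/* Segment bookkeeping for a 20-byte message, assuming msgssz = 8. */
	int	msgssz = 8;	/* bytes per pool segment */
	size_t	msgsz = 20;	/* message body length    */
	int	segs_needed = (msgsz + msgssz - 1) / msgssz;	/* = 3, same formula as msgsnd() */
	/* Total pool: msgssz * msgseg = 8 * 2048 = 16384 bytes (MSGMAX). */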
+ */ + + i = 8; + while (i < 1024 && i != msginfo.msgssz) + i <<= 1; + if (i != msginfo.msgssz) { + printf("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, + msginfo.msgssz); + panic("msginfo.msgssz not a small power of 2"); + } + + if (msginfo.msgseg > 32767) { + printf("msginfo.msgseg=%d\n", msginfo.msgseg); + panic("msginfo.msgseg > 32767"); + } + + if (msgmaps == NULL) + panic("msgmaps is NULL"); + + for (i = 0; i < msginfo.msgseg; i++) { + if (i > 0) + msgmaps[i-1].next = i; + msgmaps[i].next = -1; /* implies entry is available */ + } + free_msgmaps = 0; + nfree_msgmaps = msginfo.msgseg; + + if (msghdrs == NULL) + panic("msghdrs is NULL"); + + for (i = 0; i < msginfo.msgtql; i++) { + msghdrs[i].msg_type = 0; + if (i > 0) + msghdrs[i-1].msg_next = &msghdrs[i]; + msghdrs[i].msg_next = NULL; + } + free_msghdrs = &msghdrs[0]; + + if (msqids == NULL) + panic("msqids is NULL"); + + for (i = 0; i < msginfo.msgmni; i++) { + msqids[i].msg_qbytes = 0; /* implies entry is available */ + msqids[i].msg_perm.seq = 0; /* reset to a known value */ + } +} + +/* + * Entry point for all MSG calls + */ +int +msgsys(p, uap) + struct proc *p; + /* XXX actually varargs. */ + struct msgsys_args /* { + u_int which; + int a2; + int a3; + int a4; + int a5; + int a6; + } */ *uap; +{ + + if (uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0])) + return (EINVAL); + return ((*msgcalls[uap->which])(p, &uap->a2)); +} + +static void +msg_freehdr(msghdr) + struct msg *msghdr; +{ + while (msghdr->msg_ts > 0) { + short next; + if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) + panic("msghdr->msg_spot out of range"); + next = msgmaps[msghdr->msg_spot].next; + msgmaps[msghdr->msg_spot].next = free_msgmaps; + free_msgmaps = msghdr->msg_spot; + nfree_msgmaps++; + msghdr->msg_spot = next; + if (msghdr->msg_ts >= msginfo.msgssz) + msghdr->msg_ts -= msginfo.msgssz; + else + msghdr->msg_ts = 0; + } + if (msghdr->msg_spot != -1) + panic("msghdr->msg_spot != -1"); + msghdr->msg_next = free_msghdrs; + free_msghdrs = msghdr; +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgctl_args { + int msqid; + int cmd; + struct msqid_ds *buf; +}; +#endif + +int +msgctl(p, uap) + struct proc *p; + register struct msgctl_args *uap; +{ + int msqid = uap->msqid; + int cmd = uap->cmd; + struct msqid_ds *user_msqptr = uap->buf; + struct ucred *cred = p->p_ucred; + int rval, eval; + struct msqid_ds msqbuf; + register struct msqid_ds *msqptr; + +#ifdef MSG_DEBUG_OK + printf("call to msgctl(%d, %d, 0x%x)\n", msqid, cmd, user_msqptr); +#endif + + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + return(EINVAL); + } + + msqptr = &msqids[msqid]; + + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such msqid\n"); +#endif + return(EINVAL); + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + return(EINVAL); + } + + eval = 0; + rval = 0; + + switch (cmd) { + + case IPC_RMID: + { + struct msg *msghdr; + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M))) + return(eval); + /* Free the message headers */ + msghdr = msqptr->msg_first; + while (msghdr != NULL) { + struct msg *msghdr_tmp; + + /* Free the segments of each message */ + msqptr->msg_cbytes -= msghdr->msg_ts; + msqptr->msg_qnum--; + msghdr_tmp = msghdr; + msghdr = msghdr->msg_next; + msg_freehdr(msghdr_tmp); + } + + if (msqptr->msg_cbytes != 0) + panic("msg_cbytes is 
screwed up"); + if (msqptr->msg_qnum != 0) + panic("msg_qnum is screwed up"); + + msqptr->msg_qbytes = 0; /* Mark it as free */ + + wakeup((caddr_t)msqptr); + } + + break; + + case IPC_SET: + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M))) + return(eval); + if ((eval = copyin(user_msqptr, &msqbuf, sizeof(msqbuf))) != 0) + return(eval); + if (msqbuf.msg_qbytes > msqptr->msg_qbytes && cred->cr_uid != 0) + return(EPERM); + if (msqbuf.msg_qbytes > msginfo.msgmnb) { +#ifdef MSG_DEBUG_OK + printf("can't increase msg_qbytes beyond %d (truncating)\n", + msginfo.msgmnb); +#endif + msqbuf.msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ + } + if (msqbuf.msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("can't reduce msg_qbytes to 0\n"); +#endif + return(EINVAL); /* non-standard errno! */ + } + msqptr->msg_perm.uid = msqbuf.msg_perm.uid; /* change the owner */ + msqptr->msg_perm.gid = msqbuf.msg_perm.gid; /* change the owner */ + msqptr->msg_perm.mode = (msqptr->msg_perm.mode & ~0777) | + (msqbuf.msg_perm.mode & 0777); + msqptr->msg_qbytes = msqbuf.msg_qbytes; + msqptr->msg_ctime = time_second; + break; + + case IPC_STAT: + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_R))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have read access\n"); +#endif + return(eval); + } + eval = copyout((caddr_t)msqptr, user_msqptr, + sizeof(struct msqid_ds)); + break; + + default: +#ifdef MSG_DEBUG_OK + printf("invalid command %d\n", cmd); +#endif + return(EINVAL); + } + + if (eval == 0) + p->p_retval[0] = rval; + return(eval); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgget_args { + key_t key; + int msgflg; +}; +#endif + +int +msgget(p, uap) + struct proc *p; + register struct msgget_args *uap; +{ + int msqid, eval; + int key = uap->key; + int msgflg = uap->msgflg; + struct ucred *cred = p->p_ucred; + register struct msqid_ds *msqptr = NULL; + +#ifdef MSG_DEBUG_OK + printf("msgget(0x%x, 0%o)\n", key, msgflg); +#endif + + if (key != IPC_PRIVATE) { + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes != 0 && + msqptr->msg_perm.key == key) + break; + } + if (msqid < msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("found public key\n"); +#endif + if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { +#ifdef MSG_DEBUG_OK + printf("not exclusive\n"); +#endif + return(EEXIST); + } + if ((eval = ipcperm(cred, &msqptr->msg_perm, msgflg & 0700 ))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have 0%o access\n", + msgflg & 0700); +#endif + return(eval); + } + goto found; + } + } + +#ifdef MSG_DEBUG_OK + printf("need to allocate the msqid_ds\n"); +#endif + if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + /* + * Look for an unallocated and unlocked msqid_ds. + * msqid_ds's can be locked by msgsnd or msgrcv while + * they are copying the message in/out. We can't + * re-use the entry until they release it. 
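The "wrong sequence number" rejections above, and the msg_perm.seq bump a little further down, both rely on how SysV IPC identifiers are encoded: the value returned to userland packs the array index together with a per-slot generation count, so an identifier that survived an IPC_RMID no longer matches the recycled slot. A sketch of the conventional macro shapes from <sys/ipc.h>; the exact bit layout is an assumption here, not something this diff shows:

/* Conventional (illustrative) shape of the id <-> slot macros used here. */
#define IPCID_TO_IX(id)		((id) & 0xffff)		/* index into msqids[]     */
#define IPCID_TO_SEQ(id)	(((id) >> 16) & 0xffff)	/* generation of that slot */
#define IXSEQ_TO_IPCID(ix, perm) (((perm).seq << 16) | ((ix) & 0xffff))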
+ */ + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0 && + (msqptr->msg_perm.mode & MSG_LOCKED) == 0) + break; + } + if (msqid == msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("no more msqid_ds's available\n"); +#endif + return(ENOSPC); + } +#ifdef MSG_DEBUG_OK + printf("msqid %d is available\n", msqid); +#endif + msqptr->msg_perm.key = key; + msqptr->msg_perm.cuid = cred->cr_uid; + msqptr->msg_perm.uid = cred->cr_uid; + msqptr->msg_perm.cgid = cred->cr_gid; + msqptr->msg_perm.gid = cred->cr_gid; + msqptr->msg_perm.mode = (msgflg & 0777); + /* Make sure that the returned msqid is unique */ + msqptr->msg_perm.seq++; + msqptr->msg_first = NULL; + msqptr->msg_last = NULL; + msqptr->msg_cbytes = 0; + msqptr->msg_qnum = 0; + msqptr->msg_qbytes = msginfo.msgmnb; + msqptr->msg_lspid = 0; + msqptr->msg_lrpid = 0; + msqptr->msg_stime = 0; + msqptr->msg_rtime = 0; + msqptr->msg_ctime = time_second; + } else { +#ifdef MSG_DEBUG_OK + printf("didn't find it and wasn't asked to create it\n"); +#endif + return(ENOENT); + } + +found: + /* Construct the unique msqid */ + p->p_retval[0] = IXSEQ_TO_IPCID(msqid, msqptr->msg_perm); + return(0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgsnd_args { + int msqid; + void *msgp; + size_t msgsz; + int msgflg; +}; +#endif + +int +msgsnd(p, uap) + struct proc *p; + register struct msgsnd_args *uap; +{ + int msqid = uap->msqid; + void *user_msgp = uap->msgp; + size_t msgsz = uap->msgsz; + int msgflg = uap->msgflg; + int segs_needed, eval; + struct ucred *cred = p->p_ucred; + register struct msqid_ds *msqptr; + register struct msg *msghdr; + short next; + +#ifdef MSG_DEBUG_OK + printf("call to msgsnd(%d, 0x%x, %d, %d)\n", msqid, user_msgp, msgsz, + msgflg); +#endif + + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + return(EINVAL); + } + + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such message queue id\n"); +#endif + return(EINVAL); + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + return(EINVAL); + } + + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_W))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have write access\n"); +#endif + return(eval); + } + + segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; +#ifdef MSG_DEBUG_OK + printf("msgsz=%d, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, + segs_needed); +#endif + for (;;) { + int need_more_resources = 0; + + /* + * check msgsz + * (inside this loop in case msg_qbytes changes while we sleep) + */ + + if (msgsz > msqptr->msg_qbytes) { +#ifdef MSG_DEBUG_OK + printf("msgsz > msqptr->msg_qbytes\n"); +#endif + return(EINVAL); + } + + if (msqptr->msg_perm.mode & MSG_LOCKED) { +#ifdef MSG_DEBUG_OK + printf("msqid is locked\n"); +#endif + need_more_resources = 1; + } + if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) { +#ifdef MSG_DEBUG_OK + printf("msgsz + msg_cbytes > msg_qbytes\n"); +#endif + need_more_resources = 1; + } + if (segs_needed > nfree_msgmaps) { +#ifdef MSG_DEBUG_OK + printf("segs_needed > nfree_msgmaps\n"); +#endif + need_more_resources = 1; + } + if (free_msghdrs == NULL) { +#ifdef MSG_DEBUG_OK + printf("no more msghdrs\n"); +#endif + need_more_resources = 1; + } + + if (need_more_resources) { + int we_own_it; + + if ((msgflg & IPC_NOWAIT) != 0) { +#ifdef MSG_DEBUG_OK + printf("need more resources but caller 
doesn't want to wait\n"); +#endif + return(EAGAIN); + } + + if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) { +#ifdef MSG_DEBUG_OK + printf("we don't own the msqid_ds\n"); +#endif + we_own_it = 0; + } else { + /* Force later arrivals to wait for our + request */ +#ifdef MSG_DEBUG_OK + printf("we own the msqid_ds\n"); +#endif + msqptr->msg_perm.mode |= MSG_LOCKED; + we_own_it = 1; + } +#ifdef MSG_DEBUG_OK + printf("goodnight\n"); +#endif + eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, + "msgwait", 0); +#ifdef MSG_DEBUG_OK + printf("good morning, eval=%d\n", eval); +#endif + if (we_own_it) + msqptr->msg_perm.mode &= ~MSG_LOCKED; + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("msgsnd: interrupted system call\n"); +#endif + return(EINTR); + } + + /* + * Make sure that the msq queue still exists + */ + + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("msqid deleted\n"); +#endif + /* The SVID says to return EIDRM. */ +#ifdef EIDRM + return(EIDRM); +#else + /* Unfortunately, BSD doesn't define that code + yet! */ + return(EINVAL); +#endif + } + + } else { +#ifdef MSG_DEBUG_OK + printf("got all the resources that we need\n"); +#endif + break; + } + } + + /* + * We have the resources that we need. + * Make sure! + */ + + if (msqptr->msg_perm.mode & MSG_LOCKED) + panic("msg_perm.mode & MSG_LOCKED"); + if (segs_needed > nfree_msgmaps) + panic("segs_needed > nfree_msgmaps"); + if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) + panic("msgsz + msg_cbytes > msg_qbytes"); + if (free_msghdrs == NULL) + panic("no more msghdrs"); + + /* + * Re-lock the msqid_ds in case we page-fault when copying in the + * message + */ + + if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) + panic("msqid_ds is already locked"); + msqptr->msg_perm.mode |= MSG_LOCKED; + + /* + * Allocate a message header + */ + + msghdr = free_msghdrs; + free_msghdrs = msghdr->msg_next; + msghdr->msg_spot = -1; + msghdr->msg_ts = msgsz; + + /* + * Allocate space for the message + */ + + while (segs_needed > 0) { + if (nfree_msgmaps <= 0) + panic("not enough msgmaps"); + if (free_msgmaps == -1) + panic("nil free_msgmaps"); + next = free_msgmaps; + if (next <= -1) + panic("next too low #1"); + if (next >= msginfo.msgseg) + panic("next out of range #1"); +#ifdef MSG_DEBUG_OK + printf("allocating segment %d to message\n", next); +#endif + free_msgmaps = msgmaps[next].next; + nfree_msgmaps--; + msgmaps[next].next = msghdr->msg_spot; + msghdr->msg_spot = next; + segs_needed--; + } + + /* + * Copy in the message type + */ + + if ((eval = copyin(user_msgp, &msghdr->msg_type, + sizeof(msghdr->msg_type))) != 0) { +#ifdef MSG_DEBUG_OK + printf("error %d copying the message type\n", eval); +#endif + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); + return(eval); + } + user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + + /* + * Validate the message type + */ + + if (msghdr->msg_type < 1) { + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); +#ifdef MSG_DEBUG_OK + printf("mtype (%d) < 1\n", msghdr->msg_type); +#endif + return(EINVAL); + } + + /* + * Copy in the message body + */ + + next = msghdr->msg_spot; + while (msgsz > 0) { + size_t tlen; + if (msgsz > msginfo.msgssz) + tlen = msginfo.msgssz; + else + tlen = msgsz; + if (next <= -1) + panic("next too low #2"); + if (next >= msginfo.msgseg) + panic("next out of range #2"); + if ((eval = copyin(user_msgp, &msgpool[next * msginfo.msgssz], + tlen)) != 0) { +#ifdef MSG_DEBUG_OK + printf("error %d 
copying in message segment\n", eval); +#endif + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); + return(eval); + } + msgsz -= tlen; + user_msgp = (char *)user_msgp + tlen; + next = msgmaps[next].next; + } + if (next != -1) + panic("didn't use all the msg segments"); + + /* + * We've got the message. Unlock the msqid_ds. + */ + + msqptr->msg_perm.mode &= ~MSG_LOCKED; + + /* + * Make sure that the msqid_ds is still allocated. + */ + + if (msqptr->msg_qbytes == 0) { + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + /* The SVID says to return EIDRM. */ +#ifdef EIDRM + return(EIDRM); +#else + /* Unfortunately, BSD doesn't define that code yet! */ + return(EINVAL); +#endif + } + + /* + * Put the message into the queue + */ + + if (msqptr->msg_first == NULL) { + msqptr->msg_first = msghdr; + msqptr->msg_last = msghdr; + } else { + msqptr->msg_last->msg_next = msghdr; + msqptr->msg_last = msghdr; + } + msqptr->msg_last->msg_next = NULL; + + msqptr->msg_cbytes += msghdr->msg_ts; + msqptr->msg_qnum++; + msqptr->msg_lspid = p->p_pid; + msqptr->msg_stime = time_second; + + wakeup((caddr_t)msqptr); + p->p_retval[0] = 0; + return(0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgrcv_args { + int msqid; + void *msgp; + size_t msgsz; + long msgtyp; + int msgflg; +}; +#endif + +int +msgrcv(p, uap) + struct proc *p; + register struct msgrcv_args *uap; +{ + int msqid = uap->msqid; + void *user_msgp = uap->msgp; + size_t msgsz = uap->msgsz; + long msgtyp = uap->msgtyp; + int msgflg = uap->msgflg; + size_t len; + struct ucred *cred = p->p_ucred; + register struct msqid_ds *msqptr; + register struct msg *msghdr; + int eval; + short next; + +#ifdef MSG_DEBUG_OK + printf("call to msgrcv(%d, 0x%x, %d, %ld, %d)\n", msqid, user_msgp, + msgsz, msgtyp, msgflg); +#endif + + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + return(EINVAL); + } + + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such message queue id\n"); +#endif + return(EINVAL); + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + return(EINVAL); + } + + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_R))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have read access\n"); +#endif + return(eval); + } + + msghdr = NULL; + while (msghdr == NULL) { + if (msgtyp == 0) { + msghdr = msqptr->msg_first; + if (msghdr != NULL) { + if (msgsz < msghdr->msg_ts && + (msgflg & MSG_NOERROR) == 0) { +#ifdef MSG_DEBUG_OK + printf("first message on the queue is too big (want %d, got %d)\n", + msgsz, msghdr->msg_ts); +#endif + return(E2BIG); + } + if (msqptr->msg_first == msqptr->msg_last) { + msqptr->msg_first = NULL; + msqptr->msg_last = NULL; + } else { + msqptr->msg_first = msghdr->msg_next; + if (msqptr->msg_first == NULL) + panic("msg_first/last screwed up #1"); + } + } + } else { + struct msg *previous; + struct msg **prev; + + previous = NULL; + prev = &(msqptr->msg_first); + while ((msghdr = *prev) != NULL) { + /* + * Is this message's type an exact match or is + * this message's type less than or equal to + * the absolute value of a negative msgtyp? + * Note that the second half of this test can + * NEVER be true if msgtyp is positive since + * msg_type is always positive! 
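Seen from userland, the selection rules this loop implements are: msgtyp == 0 takes the first message queued, a positive msgtyp demands an exact type match, and a negative msgtyp accepts the first message whose type is less than or equal to |msgtyp|. The caller's buffer always begins with the long type word that the kernel copies separately from the body. A minimal usage sketch (error handling omitted; the struct and function names are invented for illustration, they are not part of this code):

#include <sys/types.h>
#include <sys/ipc.h>
#include <sys/msg.h>
#include <string.h>

struct mymsg {
	long	mtype;		/* must be >= 1 on send; copied out first on receive */
	char	mtext[64];	/* body, carved into msgssz segments by the kernel   */
};

static int
mymsg_roundtrip(void)
{
	struct mymsg m;
	int qid = msgget(IPC_PRIVATE, IPC_CREAT | 0600);

	m.mtype = 2;
	strcpy(m.mtext, "hello");
	msgsnd(qid, &m, sizeof(m.mtext), 0);

	/* Negative msgtyp: accept any message whose type is <= 3. */
	msgrcv(qid, &m, sizeof(m.mtext), -3, MSG_NOERROR);
	return (0);
}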
+ */ + + if (msgtyp == msghdr->msg_type || + msghdr->msg_type <= -msgtyp) { +#ifdef MSG_DEBUG_OK + printf("found message type %d, requested %d\n", + msghdr->msg_type, msgtyp); +#endif + if (msgsz < msghdr->msg_ts && + (msgflg & MSG_NOERROR) == 0) { +#ifdef MSG_DEBUG_OK + printf("requested message on the queue is too big (want %d, got %d)\n", + msgsz, msghdr->msg_ts); +#endif + return(E2BIG); + } + *prev = msghdr->msg_next; + if (msghdr == msqptr->msg_last) { + if (previous == NULL) { + if (prev != + &msqptr->msg_first) + panic("msg_first/last screwed up #2"); + msqptr->msg_first = + NULL; + msqptr->msg_last = + NULL; + } else { + if (prev == + &msqptr->msg_first) + panic("msg_first/last screwed up #3"); + msqptr->msg_last = + previous; + } + } + break; + } + previous = msghdr; + prev = &(msghdr->msg_next); + } + } + + /* + * We've either extracted the msghdr for the appropriate + * message or there isn't one. + * If there is one then bail out of this loop. + */ + + if (msghdr != NULL) + break; + + /* + * Hmph! No message found. Does the user want to wait? + */ + + if ((msgflg & IPC_NOWAIT) != 0) { +#ifdef MSG_DEBUG_OK + printf("no appropriate message found (msgtyp=%d)\n", + msgtyp); +#endif + /* The SVID says to return ENOMSG. */ +#ifdef ENOMSG + return(ENOMSG); +#else + /* Unfortunately, BSD doesn't define that code yet! */ + return(EAGAIN); +#endif + } + + /* + * Wait for something to happen + */ + +#ifdef MSG_DEBUG_OK + printf("msgrcv: goodnight\n"); +#endif + eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, "msgwait", + 0); +#ifdef MSG_DEBUG_OK + printf("msgrcv: good morning (eval=%d)\n", eval); +#endif + + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("msgsnd: interrupted system call\n"); +#endif + return(EINTR); + } + + /* + * Make sure that the msq queue still exists + */ + + if (msqptr->msg_qbytes == 0 || + msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("msqid deleted\n"); +#endif + /* The SVID says to return EIDRM. */ +#ifdef EIDRM + return(EIDRM); +#else + /* Unfortunately, BSD doesn't define that code yet! */ + return(EINVAL); +#endif + } + } + + /* + * Return the message to the user. + * + * First, do the bookkeeping (before we risk being interrupted). + */ + + msqptr->msg_cbytes -= msghdr->msg_ts; + msqptr->msg_qnum--; + msqptr->msg_lrpid = p->p_pid; + msqptr->msg_rtime = time_second; + + /* + * Make msgsz the actual amount that we'll be returning. + * Note that this effectively truncates the message if it is too long + * (since msgsz is never increased). + */ + +#ifdef MSG_DEBUG_OK + printf("found a message, msgsz=%d, msg_ts=%d\n", msgsz, + msghdr->msg_ts); +#endif + if (msgsz > msghdr->msg_ts) + msgsz = msghdr->msg_ts; + + /* + * Return the type to the user. 
+ */ + + eval = copyout((caddr_t)&(msghdr->msg_type), user_msgp, + sizeof(msghdr->msg_type)); + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("error (%d) copying out message type\n", eval); +#endif + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + return(eval); + } + user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + + /* + * Return the segments to the user + */ + + next = msghdr->msg_spot; + for (len = 0; len < msgsz; len += msginfo.msgssz) { + size_t tlen; + + if (msgsz > msginfo.msgssz) + tlen = msginfo.msgssz; + else + tlen = msgsz; + if (next <= -1) + panic("next too low #3"); + if (next >= msginfo.msgseg) + panic("next out of range #3"); + eval = copyout((caddr_t)&msgpool[next * msginfo.msgssz], + user_msgp, tlen); + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("error (%d) copying out message segment\n", + eval); +#endif + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + return(eval); + } + user_msgp = (char *)user_msgp + tlen; + next = msgmaps[next].next; + } + + /* + * Done, return the actual number of bytes copied out. + */ + + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + p->p_retval[0] = msgsz; + return(0); +} diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c new file mode 100644 index 0000000..fb04c42 --- /dev/null +++ b/sys/kern/sysv_sem.c @@ -0,0 +1,977 @@ +/* $Id: sysv_sem.c,v 1.21 1998/03/30 09:50:41 phk Exp $ */ + +/* + * Implementation of SVID semaphores + * + * Author: Daniel Boulet + * + * This software is provided ``AS IS'' without any warranties of any kind. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/sem.h> +#include <sys/sysent.h> + +static void seminit __P((void *)); +SYSINIT(sysv_sem, SI_SUB_SYSV_SEM, SI_ORDER_FIRST, seminit, NULL) + +#ifndef _SYS_SYSPROTO_H_ +struct __semctl_args; +int __semctl __P((struct proc *p, struct __semctl_args *uap)); +struct semget_args; +int semget __P((struct proc *p, struct semget_args *uap)); +struct semop_args; +int semop __P((struct proc *p, struct semop_args *uap)); +struct semconfig_args; +int semconfig __P((struct proc *p, struct semconfig_args *uap)); +#endif + +static struct sem_undo *semu_alloc __P((struct proc *p)); +static int semundo_adjust __P((struct proc *p, struct sem_undo **supptr, + int semid, int semnum, int adjval)); +static void semundo_clear __P((int semid, int semnum)); + +/* XXX casting to (sy_call_t *) is bogus, as usual. */ +static sy_call_t *semcalls[] = { + (sy_call_t *)__semctl, (sy_call_t *)semget, + (sy_call_t *)semop, (sy_call_t *)semconfig +}; + +static int semtot = 0; +struct semid_ds *sema; /* semaphore id pool */ +struct sem *sem; /* semaphore pool */ +static struct sem_undo *semu_list; /* list of active undo structures */ +int *semu; /* undo structure pool */ + +static struct proc *semlock_holder = NULL; + +void +seminit(dummy) + void *dummy; +{ + register int i; + + if (sema == NULL) + panic("sema is NULL"); + if (semu == NULL) + panic("semu is NULL"); + + for (i = 0; i < seminfo.semmni; i++) { + sema[i].sem_base = 0; + sema[i].sem_perm.mode = 0; + } + for (i = 0; i < seminfo.semmnu; i++) { + register struct sem_undo *suptr = SEMU(i); + suptr->un_proc = NULL; + } + semu_list = NULL; +} + +/* + * Entry point for all SEM calls + */ +int +semsys(p, uap) + struct proc *p; + /* XXX actually varargs. 
*/ + struct semsys_args /* { + u_int which; + int a2; + int a3; + int a4; + int a5; + } */ *uap; +{ + + while (semlock_holder != NULL && semlock_holder != p) + (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "semsys", 0); + + if (uap->which >= sizeof(semcalls)/sizeof(semcalls[0])) + return (EINVAL); + return ((*semcalls[uap->which])(p, &uap->a2)); +} + +/* + * Lock or unlock the entire semaphore facility. + * + * This will probably eventually evolve into a general purpose semaphore + * facility status enquiry mechanism (I don't like the "read /dev/kmem" + * approach currently taken by ipcs and the amount of info that we want + * to be able to extract for ipcs is probably beyond what the capability + * of the getkerninfo facility. + * + * At the time that the current version of semconfig was written, ipcs is + * the only user of the semconfig facility. It uses it to ensure that the + * semaphore facility data structures remain static while it fishes around + * in /dev/kmem. + */ + +#ifndef _SYS_SYSPROTO_H_ +struct semconfig_args { + semconfig_ctl_t flag; +}; +#endif + +int +semconfig(p, uap) + struct proc *p; + struct semconfig_args *uap; +{ + int eval = 0; + + switch (uap->flag) { + case SEM_CONFIG_FREEZE: + semlock_holder = p; + break; + + case SEM_CONFIG_THAW: + semlock_holder = NULL; + wakeup((caddr_t)&semlock_holder); + break; + + default: + printf("semconfig: unknown flag parameter value (%d) - ignored\n", + uap->flag); + eval = EINVAL; + break; + } + + p->p_retval[0] = 0; + return(eval); +} + +/* + * Allocate a new sem_undo structure for a process + * (returns ptr to structure or NULL if no more room) + */ + +static struct sem_undo * +semu_alloc(p) + struct proc *p; +{ + register int i; + register struct sem_undo *suptr; + register struct sem_undo **supptr; + int attempt; + + /* + * Try twice to allocate something. + * (we'll purge any empty structures after the first pass so + * two passes are always enough) + */ + + for (attempt = 0; attempt < 2; attempt++) { + /* + * Look for a free structure. + * Fill it in and return it if we find one. + */ + + for (i = 0; i < seminfo.semmnu; i++) { + suptr = SEMU(i); + if (suptr->un_proc == NULL) { + suptr->un_next = semu_list; + semu_list = suptr; + suptr->un_cnt = 0; + suptr->un_proc = p; + return(suptr); + } + } + + /* + * We didn't find a free one, if this is the first attempt + * then try to free some structures. + */ + + if (attempt == 0) { + /* All the structures are in use - try to free some */ + int did_something = 0; + + supptr = &semu_list; + while ((suptr = *supptr) != NULL) { + if (suptr->un_cnt == 0) { + suptr->un_proc = NULL; + *supptr = suptr->un_next; + did_something = 1; + } else + supptr = &(suptr->un_next); + } + + /* If we didn't free anything then just give-up */ + if (!did_something) + return(NULL); + } else { + /* + * The second pass failed even though we freed + * something after the first pass! + * This is IMPOSSIBLE! 
+ */ + panic("semu_alloc - second attempt failed"); + } + } + return (NULL); +} + +/* + * Adjust a particular entry for a particular proc + */ + +static int +semundo_adjust(p, supptr, semid, semnum, adjval) + register struct proc *p; + struct sem_undo **supptr; + int semid, semnum; + int adjval; +{ + register struct sem_undo *suptr; + register struct undo *sunptr; + int i; + + /* Look for and remember the sem_undo if the caller doesn't provide + it */ + + suptr = *supptr; + if (suptr == NULL) { + for (suptr = semu_list; suptr != NULL; + suptr = suptr->un_next) { + if (suptr->un_proc == p) { + *supptr = suptr; + break; + } + } + if (suptr == NULL) { + if (adjval == 0) + return(0); + suptr = semu_alloc(p); + if (suptr == NULL) + return(ENOSPC); + *supptr = suptr; + } + } + + /* + * Look for the requested entry and adjust it (delete if adjval becomes + * 0). + */ + sunptr = &suptr->un_ent[0]; + for (i = 0; i < suptr->un_cnt; i++, sunptr++) { + if (sunptr->un_id != semid || sunptr->un_num != semnum) + continue; + if (adjval == 0) + sunptr->un_adjval = 0; + else + sunptr->un_adjval += adjval; + if (sunptr->un_adjval == 0) { + suptr->un_cnt--; + if (i < suptr->un_cnt) + suptr->un_ent[i] = + suptr->un_ent[suptr->un_cnt]; + } + return(0); + } + + /* Didn't find the right entry - create it */ + if (adjval == 0) + return(0); + if (suptr->un_cnt != seminfo.semume) { + sunptr = &suptr->un_ent[suptr->un_cnt]; + suptr->un_cnt++; + sunptr->un_adjval = adjval; + sunptr->un_id = semid; sunptr->un_num = semnum; + } else + return(EINVAL); + return(0); +} + +static void +semundo_clear(semid, semnum) + int semid, semnum; +{ + register struct sem_undo *suptr; + + for (suptr = semu_list; suptr != NULL; suptr = suptr->un_next) { + register struct undo *sunptr = &suptr->un_ent[0]; + register int i = 0; + + while (i < suptr->un_cnt) { + if (sunptr->un_id == semid) { + if (semnum == -1 || sunptr->un_num == semnum) { + suptr->un_cnt--; + if (i < suptr->un_cnt) { + suptr->un_ent[i] = + suptr->un_ent[suptr->un_cnt]; + continue; + } + } + if (semnum != -1) + break; + } + i++, sunptr++; + } + } +} + +/* + * Note that the user-mode half of this passes a union, not a pointer + */ +#ifndef _SYS_SYSPROTO_H_ +struct __semctl_args { + int semid; + int semnum; + int cmd; + union semun *arg; +}; +#endif + +int +__semctl(p, uap) + struct proc *p; + register struct __semctl_args *uap; +{ + int semid = uap->semid; + int semnum = uap->semnum; + int cmd = uap->cmd; + union semun *arg = uap->arg; + union semun real_arg; + struct ucred *cred = p->p_ucred; + int i, rval, eval; + struct semid_ds sbuf; + register struct semid_ds *semaptr; + +#ifdef SEM_DEBUG + printf("call to semctl(%d, %d, %d, 0x%x)\n", semid, semnum, cmd, arg); +#endif + + semid = IPCID_TO_IX(semid); + if (semid < 0 || semid >= seminfo.semmsl) + return(EINVAL); + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || + semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) + return(EINVAL); + + eval = 0; + rval = 0; + + switch (cmd) { + case IPC_RMID: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M))) + return(eval); + semaptr->sem_perm.cuid = cred->cr_uid; + semaptr->sem_perm.uid = cred->cr_uid; + semtot -= semaptr->sem_nsems; + for (i = semaptr->sem_base - sem; i < semtot; i++) + sem[i] = sem[i + semaptr->sem_nsems]; + for (i = 0; i < seminfo.semmni; i++) { + if ((sema[i].sem_perm.mode & SEM_ALLOC) && + sema[i].sem_base > semaptr->sem_base) + sema[i].sem_base -= semaptr->sem_nsems; + } + semaptr->sem_perm.mode = 0; + semundo_clear(semid, -1); + 
wakeup((caddr_t)semaptr); + break; + + case IPC_SET: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + if ((eval = copyin(real_arg.buf, (caddr_t)&sbuf, + sizeof(sbuf))) != 0) + return(eval); + semaptr->sem_perm.uid = sbuf.sem_perm.uid; + semaptr->sem_perm.gid = sbuf.sem_perm.gid; + semaptr->sem_perm.mode = (semaptr->sem_perm.mode & ~0777) | + (sbuf.sem_perm.mode & 0777); + semaptr->sem_ctime = time_second; + break; + + case IPC_STAT: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + eval = copyout((caddr_t)semaptr, real_arg.buf, + sizeof(struct semid_ds)); + break; + + case GETNCNT: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].semncnt; + break; + + case GETPID: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].sempid; + break; + + case GETVAL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].semval; + break; + + case GETALL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + for (i = 0; i < semaptr->sem_nsems; i++) { + eval = copyout((caddr_t)&semaptr->sem_base[i].semval, + &real_arg.array[i], sizeof(real_arg.array[0])); + if (eval != 0) + break; + } + break; + + case GETZCNT: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].semzcnt; + break; + + case SETVAL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + semaptr->sem_base[semnum].semval = real_arg.val; + semundo_clear(semid, semnum); + wakeup((caddr_t)semaptr); + break; + + case SETALL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + for (i = 0; i < semaptr->sem_nsems; i++) { + eval = copyin(&real_arg.array[i], + (caddr_t)&semaptr->sem_base[i].semval, + sizeof(real_arg.array[0])); + if (eval != 0) + break; + } + semundo_clear(semid, -1); + wakeup((caddr_t)semaptr); + break; + + default: + return(EINVAL); + } + + if (eval == 0) + p->p_retval[0] = rval; + return(eval); +} + +#ifndef _SYS_SYSPROTO_H_ +struct semget_args { + key_t key; + int nsems; + int semflg; +}; +#endif + +int +semget(p, uap) + struct proc *p; + register struct semget_args *uap; +{ + int semid, eval; + int key = uap->key; + int nsems = uap->nsems; + int semflg = uap->semflg; + struct ucred *cred = p->p_ucred; + +#ifdef SEM_DEBUG + printf("semget(0x%x, %d, 0%o)\n", key, nsems, semflg); +#endif + + if (key != IPC_PRIVATE) { + for (semid = 0; semid < seminfo.semmni; semid++) { + if ((sema[semid].sem_perm.mode & SEM_ALLOC) && + sema[semid].sem_perm.key == key) + break; + } + if (semid < seminfo.semmni) { +#ifdef SEM_DEBUG + printf("found public key\n"); +#endif + if ((eval = ipcperm(cred, 
&sema[semid].sem_perm, + semflg & 0700))) + return(eval); + if (nsems > 0 && sema[semid].sem_nsems < nsems) { +#ifdef SEM_DEBUG + printf("too small\n"); +#endif + return(EINVAL); + } + if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { +#ifdef SEM_DEBUG + printf("not exclusive\n"); +#endif + return(EEXIST); + } + goto found; + } + } + +#ifdef SEM_DEBUG + printf("need to allocate the semid_ds\n"); +#endif + if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { + if (nsems <= 0 || nsems > seminfo.semmsl) { +#ifdef SEM_DEBUG + printf("nsems out of range (0<%d<=%d)\n", nsems, + seminfo.semmsl); +#endif + return(EINVAL); + } + if (nsems > seminfo.semmns - semtot) { +#ifdef SEM_DEBUG + printf("not enough semaphores left (need %d, got %d)\n", + nsems, seminfo.semmns - semtot); +#endif + return(ENOSPC); + } + for (semid = 0; semid < seminfo.semmni; semid++) { + if ((sema[semid].sem_perm.mode & SEM_ALLOC) == 0) + break; + } + if (semid == seminfo.semmni) { +#ifdef SEM_DEBUG + printf("no more semid_ds's available\n"); +#endif + return(ENOSPC); + } +#ifdef SEM_DEBUG + printf("semid %d is available\n", semid); +#endif + sema[semid].sem_perm.key = key; + sema[semid].sem_perm.cuid = cred->cr_uid; + sema[semid].sem_perm.uid = cred->cr_uid; + sema[semid].sem_perm.cgid = cred->cr_gid; + sema[semid].sem_perm.gid = cred->cr_gid; + sema[semid].sem_perm.mode = (semflg & 0777) | SEM_ALLOC; + sema[semid].sem_perm.seq = + (sema[semid].sem_perm.seq + 1) & 0x7fff; + sema[semid].sem_nsems = nsems; + sema[semid].sem_otime = 0; + sema[semid].sem_ctime = time_second; + sema[semid].sem_base = &sem[semtot]; + semtot += nsems; + bzero(sema[semid].sem_base, + sizeof(sema[semid].sem_base[0])*nsems); +#ifdef SEM_DEBUG + printf("sembase = 0x%x, next = 0x%x\n", sema[semid].sem_base, + &sem[semtot]); +#endif + } else { +#ifdef SEM_DEBUG + printf("didn't find it and wasn't asked to create it\n"); +#endif + return(ENOENT); + } + +found: + p->p_retval[0] = IXSEQ_TO_IPCID(semid, sema[semid].sem_perm); + return(0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct semop_args { + int semid; + struct sembuf *sops; + int nsops; +}; +#endif + +int +semop(p, uap) + struct proc *p; + register struct semop_args *uap; +{ + int semid = uap->semid; + int nsops = uap->nsops; + struct sembuf sops[MAX_SOPS]; + register struct semid_ds *semaptr; + register struct sembuf *sopptr; + register struct sem *semptr; + struct sem_undo *suptr = NULL; + struct ucred *cred = p->p_ucred; + int i, j, eval; + int do_wakeup, do_undos; + +#ifdef SEM_DEBUG + printf("call to semop(%d, 0x%x, %d)\n", semid, sops, nsops); +#endif + + semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ + + if (semid < 0 || semid >= seminfo.semmsl) + return(EINVAL); + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) + return(EINVAL); + if (semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) + return(EINVAL); + + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) { +#ifdef SEM_DEBUG + printf("eval = %d from ipaccess\n", eval); +#endif + return(eval); + } + + if (nsops > MAX_SOPS) { +#ifdef SEM_DEBUG + printf("too many sops (max=%d, nsops=%d)\n", MAX_SOPS, nsops); +#endif + return(E2BIG); + } + + if ((eval = copyin(uap->sops, &sops, nsops * sizeof(sops[0]))) != 0) { +#ifdef SEM_DEBUG + printf("eval = %d from copyin(%08x, %08x, %d)\n", eval, + uap->sops, &sops, nsops * sizeof(sops[0])); +#endif + return(eval); + } + + /* + * Loop trying to satisfy the vector of requests. 
+ * If we reach a point where we must wait, any requests already + * performed are rolled back and we go to sleep until some other + * process wakes us up. At this point, we start all over again. + * + * This ensures that from the perspective of other tasks, a set + * of requests is atomic (never partially satisfied). + */ + do_undos = 0; + + for (;;) { + do_wakeup = 0; + + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + + if (sopptr->sem_num >= semaptr->sem_nsems) + return(EFBIG); + + semptr = &semaptr->sem_base[sopptr->sem_num]; + +#ifdef SEM_DEBUG + printf("semop: semaptr=%x, sem_base=%x, semptr=%x, sem[%d]=%d : op=%d, flag=%s\n", + semaptr, semaptr->sem_base, semptr, + sopptr->sem_num, semptr->semval, sopptr->sem_op, + (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait"); +#endif + + if (sopptr->sem_op < 0) { + if (semptr->semval + sopptr->sem_op < 0) { +#ifdef SEM_DEBUG + printf("semop: can't do it now\n"); +#endif + break; + } else { + semptr->semval += sopptr->sem_op; + if (semptr->semval == 0 && + semptr->semzcnt > 0) + do_wakeup = 1; + } + if (sopptr->sem_flg & SEM_UNDO) + do_undos = 1; + } else if (sopptr->sem_op == 0) { + if (semptr->semval > 0) { +#ifdef SEM_DEBUG + printf("semop: not zero now\n"); +#endif + break; + } + } else { + if (semptr->semncnt > 0) + do_wakeup = 1; + semptr->semval += sopptr->sem_op; + if (sopptr->sem_flg & SEM_UNDO) + do_undos = 1; + } + } + + /* + * Did we get through the entire vector? + */ + if (i >= nsops) + goto done; + + /* + * No ... rollback anything that we've already done + */ +#ifdef SEM_DEBUG + printf("semop: rollback 0 through %d\n", i-1); +#endif + for (j = 0; j < i; j++) + semaptr->sem_base[sops[j].sem_num].semval -= + sops[j].sem_op; + + /* + * If the request that we couldn't satisfy has the + * NOWAIT flag set then return with EAGAIN. + */ + if (sopptr->sem_flg & IPC_NOWAIT) + return(EAGAIN); + + if (sopptr->sem_op == 0) + semptr->semzcnt++; + else + semptr->semncnt++; + +#ifdef SEM_DEBUG + printf("semop: good night!\n"); +#endif + eval = tsleep((caddr_t)semaptr, (PZERO - 4) | PCATCH, + "semwait", 0); +#ifdef SEM_DEBUG + printf("semop: good morning (eval=%d)!\n", eval); +#endif + + suptr = NULL; /* sem_undo may have been reallocated */ + + if (eval != 0) + return(EINTR); +#ifdef SEM_DEBUG + printf("semop: good morning!\n"); +#endif + + /* + * Make sure that the semaphore still exists + */ + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || + semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { + /* The man page says to return EIDRM. */ + /* Unfortunately, BSD doesn't define that code! */ +#ifdef EIDRM + return(EIDRM); +#else + return(EINVAL); +#endif + } + + /* + * The semaphore is still alive. Readjust the count of + * waiting processes. + */ + if (sopptr->sem_op == 0) + semptr->semzcnt--; + else + semptr->semncnt--; + } + +done: + /* + * Process any SEM_UNDO requests. + */ + if (do_undos) { + for (i = 0; i < nsops; i++) { + /* + * We only need to deal with SEM_UNDO's for non-zero + * op's. + */ + int adjval; + + if ((sops[i].sem_flg & SEM_UNDO) == 0) + continue; + adjval = sops[i].sem_op; + if (adjval == 0) + continue; + eval = semundo_adjust(p, &suptr, semid, + sops[i].sem_num, -adjval); + if (eval == 0) + continue; + + /* + * Oh-Oh! We ran out of either sem_undo's or undo's. + * Rollback the adjustments to this point and then + * rollback the semaphore ups and down so we can return + * with an error with all structures restored. We + * rollback the undo's in the exact reverse order that + * we applied them. 
This guarantees that we won't run + * out of space as we roll things back out. + */ + for (j = i - 1; j >= 0; j--) { + if ((sops[j].sem_flg & SEM_UNDO) == 0) + continue; + adjval = sops[j].sem_op; + if (adjval == 0) + continue; + if (semundo_adjust(p, &suptr, semid, + sops[j].sem_num, adjval) != 0) + panic("semop - can't undo undos"); + } + + for (j = 0; j < nsops; j++) + semaptr->sem_base[sops[j].sem_num].semval -= + sops[j].sem_op; + +#ifdef SEM_DEBUG + printf("eval = %d from semundo_adjust\n", eval); +#endif + return(eval); + } /* loop through the sops */ + } /* if (do_undos) */ + + /* We're definitely done - set the sempid's */ + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + semptr = &semaptr->sem_base[sopptr->sem_num]; + semptr->sempid = p->p_pid; + } + + /* Do a wakeup if any semaphore was up'd. */ + if (do_wakeup) { +#ifdef SEM_DEBUG + printf("semop: doing wakeup\n"); +#ifdef SEM_WAKEUP + sem_wakeup((caddr_t)semaptr); +#else + wakeup((caddr_t)semaptr); +#endif + printf("semop: back from wakeup\n"); +#else + wakeup((caddr_t)semaptr); +#endif + } +#ifdef SEM_DEBUG + printf("semop: done\n"); +#endif + p->p_retval[0] = 0; + return(0); +} + +/* + * Go through the undo structures for this process and apply the adjustments to + * semaphores. + */ +void +semexit(p) + struct proc *p; +{ + register struct sem_undo *suptr; + register struct sem_undo **supptr; + int did_something; + + /* + * If somebody else is holding the global semaphore facility lock + * then sleep until it is released. + */ + while (semlock_holder != NULL && semlock_holder != p) { +#ifdef SEM_DEBUG + printf("semaphore facility locked - sleeping ...\n"); +#endif + (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "semext", 0); + } + + did_something = 0; + + /* + * Go through the chain of undo vectors looking for one + * associated with this process. + */ + + for (supptr = &semu_list; (suptr = *supptr) != NULL; + supptr = &suptr->un_next) { + if (suptr->un_proc == p) + break; + } + + if (suptr == NULL) + goto unlock; + +#ifdef SEM_DEBUG + printf("proc @%08x has undo structure with %d entries\n", p, + suptr->un_cnt); +#endif + + /* + * If there are any active undo elements then process them. + */ + if (suptr->un_cnt > 0) { + int ix; + + for (ix = 0; ix < suptr->un_cnt; ix++) { + int semid = suptr->un_ent[ix].un_id; + int semnum = suptr->un_ent[ix].un_num; + int adjval = suptr->un_ent[ix].un_adjval; + struct semid_ds *semaptr; + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) + panic("semexit - semid not allocated"); + if (semnum >= semaptr->sem_nsems) + panic("semexit - semnum out of range"); + +#ifdef SEM_DEBUG + printf("semexit: %08x id=%d num=%d(adj=%d) ; sem=%d\n", + suptr->un_proc, suptr->un_ent[ix].un_id, + suptr->un_ent[ix].un_num, + suptr->un_ent[ix].un_adjval, + semaptr->sem_base[semnum].semval); +#endif + + if (adjval < 0) { + if (semaptr->sem_base[semnum].semval < -adjval) + semaptr->sem_base[semnum].semval = 0; + else + semaptr->sem_base[semnum].semval += + adjval; + } else + semaptr->sem_base[semnum].semval += adjval; + +#ifdef SEM_WAKEUP + sem_wakeup((caddr_t)semaptr); +#else + wakeup((caddr_t)semaptr); +#endif +#ifdef SEM_DEBUG + printf("semexit: back from wakeup\n"); +#endif + } + } + + /* + * Deallocate the undo vector. + */ +#ifdef SEM_DEBUG + printf("removing vector\n"); +#endif + suptr->un_proc = NULL; + *supptr = suptr->un_next; + +unlock: + /* + * If the exiting process is holding the global semaphore facility + * lock then release it. 
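+ * The wakeup lets any process sleeping on &semlock_holder (for
+ * instance in the loop at the top of this function) continue.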
+ */ + if (semlock_holder == p) { + semlock_holder = NULL; + wakeup((caddr_t)&semlock_holder); + } +} diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c new file mode 100644 index 0000000..a6c2dfe --- /dev/null +++ b/sys/kern/sysv_shm.c @@ -0,0 +1,617 @@ +/* $Id: sysv_shm.c,v 1.39 1998/10/13 08:24:40 dg Exp $ */ +/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */ + +/* + * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Adam Glass and Charles + * Hannum. + * 4. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_compat.h" +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/shm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/sysent.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_object.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vm_inherit.h> + +#ifndef _SYS_SYSPROTO_H_ +struct shmat_args; +extern int shmat __P((struct proc *p, struct shmat_args *uap)); +struct shmctl_args; +extern int shmctl __P((struct proc *p, struct shmctl_args *uap)); +struct shmdt_args; +extern int shmdt __P((struct proc *p, struct shmdt_args *uap)); +struct shmget_args; +extern int shmget __P((struct proc *p, struct shmget_args *uap)); +#endif + +static MALLOC_DEFINE(M_SHM, "shm", "SVID compatible shared memory segments"); + +static void shminit __P((void *)); +SYSINIT(sysv_shm, SI_SUB_SYSV_SHM, SI_ORDER_FIRST, shminit, NULL) + +struct oshmctl_args; +static int oshmctl __P((struct proc *p, struct oshmctl_args *uap)); +static int shmget_allocate_segment __P((struct proc *p, struct shmget_args *uap, int mode)); +static int shmget_existing __P((struct proc *p, struct shmget_args *uap, int mode, int segnum)); + +/* XXX casting to (sy_call_t *) is bogus, as usual. 
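+ * These entries back the single shmsys(2) entry point: shmsys()
+ * at the bottom of this file uses uap->which as an index into
+ * this table to pick the real handler.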
*/ +static sy_call_t *shmcalls[] = { + (sy_call_t *)shmat, (sy_call_t *)oshmctl, + (sy_call_t *)shmdt, (sy_call_t *)shmget, + (sy_call_t *)shmctl +}; + +#define SHMSEG_FREE 0x0200 +#define SHMSEG_REMOVED 0x0400 +#define SHMSEG_ALLOCATED 0x0800 +#define SHMSEG_WANTED 0x1000 + +static int shm_last_free, shm_nused, shm_committed; +struct shmid_ds *shmsegs; + +struct shm_handle { + /* vm_offset_t kva; */ + vm_object_t shm_object; +}; + +struct shmmap_state { + vm_offset_t va; + int shmid; +}; + +static void shm_deallocate_segment __P((struct shmid_ds *)); +static int shm_find_segment_by_key __P((key_t)); +static struct shmid_ds *shm_find_segment_by_shmid __P((int)); +static int shm_delete_mapping __P((struct proc *, struct shmmap_state *)); + +static int +shm_find_segment_by_key(key) + key_t key; +{ + int i; + + for (i = 0; i < shminfo.shmmni; i++) + if ((shmsegs[i].shm_perm.mode & SHMSEG_ALLOCATED) && + shmsegs[i].shm_perm.key == key) + return i; + return -1; +} + +static struct shmid_ds * +shm_find_segment_by_shmid(shmid) + int shmid; +{ + int segnum; + struct shmid_ds *shmseg; + + segnum = IPCID_TO_IX(shmid); + if (segnum < 0 || segnum >= shminfo.shmmni) + return NULL; + shmseg = &shmsegs[segnum]; + if ((shmseg->shm_perm.mode & (SHMSEG_ALLOCATED | SHMSEG_REMOVED)) + != SHMSEG_ALLOCATED || + shmseg->shm_perm.seq != IPCID_TO_SEQ(shmid)) + return NULL; + return shmseg; +} + +static void +shm_deallocate_segment(shmseg) + struct shmid_ds *shmseg; +{ + struct shm_handle *shm_handle; + size_t size; + + shm_handle = shmseg->shm_internal; + vm_object_deallocate(shm_handle->shm_object); + free((caddr_t)shm_handle, M_SHM); + shmseg->shm_internal = NULL; + size = round_page(shmseg->shm_segsz); + shm_committed -= btoc(size); + shm_nused--; + shmseg->shm_perm.mode = SHMSEG_FREE; +} + +static int +shm_delete_mapping(p, shmmap_s) + struct proc *p; + struct shmmap_state *shmmap_s; +{ + struct shmid_ds *shmseg; + int segnum, result; + size_t size; + + segnum = IPCID_TO_IX(shmmap_s->shmid); + shmseg = &shmsegs[segnum]; + size = round_page(shmseg->shm_segsz); + result = vm_map_remove(&p->p_vmspace->vm_map, shmmap_s->va, shmmap_s->va + size); + if (result != KERN_SUCCESS) + return EINVAL; + shmmap_s->shmid = -1; + shmseg->shm_dtime = time_second; + if ((--shmseg->shm_nattch <= 0) && + (shmseg->shm_perm.mode & SHMSEG_REMOVED)) { + shm_deallocate_segment(shmseg); + shm_last_free = segnum; + } + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmdt_args { + void *shmaddr; +}; +#endif + +int +shmdt(p, uap) + struct proc *p; + struct shmdt_args *uap; +{ + struct shmmap_state *shmmap_s; + int i; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + if (shmmap_s == NULL) + return EINVAL; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1 && + shmmap_s->va == (vm_offset_t)uap->shmaddr) + break; + if (i == shminfo.shmseg) + return EINVAL; + return shm_delete_mapping(p, shmmap_s); +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmat_args { + int shmid; + void *shmaddr; + int shmflg; +}; +#endif + +int +shmat(p, uap) + struct proc *p; + struct shmat_args *uap; +{ + int error, i, flags; + struct ucred *cred = p->p_ucred; + struct shmid_ds *shmseg; + struct shmmap_state *shmmap_s = NULL; + struct shm_handle *shm_handle; + vm_offset_t attach_va; + vm_prot_t prot; + vm_size_t size; + int rv; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + if (shmmap_s == NULL) { + size = shminfo.shmseg * sizeof(struct shmmap_state); + shmmap_s = malloc(size, M_SHM, M_WAITOK); + for (i = 0; i < 
shminfo.shmseg; i++) + shmmap_s[i].shmid = -1; + p->p_vmspace->vm_shm = (caddr_t)shmmap_s; + } + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) + return EINVAL; + error = ipcperm(cred, &shmseg->shm_perm, + (uap->shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W); + if (error) + return error; + for (i = 0; i < shminfo.shmseg; i++) { + if (shmmap_s->shmid == -1) + break; + shmmap_s++; + } + if (i >= shminfo.shmseg) + return EMFILE; + size = round_page(shmseg->shm_segsz); + prot = VM_PROT_READ; + if ((uap->shmflg & SHM_RDONLY) == 0) + prot |= VM_PROT_WRITE; + flags = MAP_ANON | MAP_SHARED; + if (uap->shmaddr) { + flags |= MAP_FIXED; + if (uap->shmflg & SHM_RND) + attach_va = (vm_offset_t)uap->shmaddr & ~(SHMLBA-1); + else if (((vm_offset_t)uap->shmaddr & (SHMLBA-1)) == 0) + attach_va = (vm_offset_t)uap->shmaddr; + else + return EINVAL; + } else { + /* This is just a hint to vm_map_find() about where to put it. */ + attach_va = round_page((vm_offset_t)p->p_vmspace->vm_taddr + MAXTSIZ + MAXDSIZ); + } + + shm_handle = shmseg->shm_internal; + vm_object_reference(shm_handle->shm_object); + rv = vm_map_find(&p->p_vmspace->vm_map, shm_handle->shm_object, + 0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0); + if (rv != KERN_SUCCESS) { + return ENOMEM; + } + vm_map_inherit(&p->p_vmspace->vm_map, + attach_va, attach_va + size, VM_INHERIT_SHARE); + + shmmap_s->va = attach_va; + shmmap_s->shmid = uap->shmid; + shmseg->shm_lpid = p->p_pid; + shmseg->shm_atime = time_second; + shmseg->shm_nattch++; + p->p_retval[0] = attach_va; + return 0; +} + +struct oshmid_ds { + struct ipc_perm shm_perm; /* operation perms */ + int shm_segsz; /* size of segment (bytes) */ + ushort shm_cpid; /* pid, creator */ + ushort shm_lpid; /* pid, last operation */ + short shm_nattch; /* no. of current attaches */ + time_t shm_atime; /* last attach time */ + time_t shm_dtime; /* last detach time */ + time_t shm_ctime; /* last change time */ + void *shm_handle; /* internal handle for shm segment */ +}; + +struct oshmctl_args { + int shmid; + int cmd; + struct oshmid_ds *ubuf; +}; + +static int +oshmctl(p, uap) + struct proc *p; + struct oshmctl_args *uap; +{ +#ifdef COMPAT_43 + int error; + struct ucred *cred = p->p_ucred; + struct shmid_ds *shmseg; + struct oshmid_ds outbuf; + + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) + return EINVAL; + switch (uap->cmd) { + case IPC_STAT: + error = ipcperm(cred, &shmseg->shm_perm, IPC_R); + if (error) + return error; + outbuf.shm_perm = shmseg->shm_perm; + outbuf.shm_segsz = shmseg->shm_segsz; + outbuf.shm_cpid = shmseg->shm_cpid; + outbuf.shm_lpid = shmseg->shm_lpid; + outbuf.shm_nattch = shmseg->shm_nattch; + outbuf.shm_atime = shmseg->shm_atime; + outbuf.shm_dtime = shmseg->shm_dtime; + outbuf.shm_ctime = shmseg->shm_ctime; + outbuf.shm_handle = shmseg->shm_internal; + error = copyout((caddr_t)&outbuf, uap->ubuf, sizeof(outbuf)); + if (error) + return error; + break; + default: + /* XXX casting to (sy_call_t *) is bogus, as usual. 
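+ * Commands other than IPC_STAT are passed straight through to the
+ * new-style shmctl() below; the old and new argument structures
+ * start with the same shmid and cmd fields.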
*/ + return ((sy_call_t *)shmctl)(p, uap); + } + return 0; +#else + return EINVAL; +#endif +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmctl_args { + int shmid; + int cmd; + struct shmid_ds *buf; +}; +#endif + +int +shmctl(p, uap) + struct proc *p; + struct shmctl_args *uap; +{ + int error; + struct ucred *cred = p->p_ucred; + struct shmid_ds inbuf; + struct shmid_ds *shmseg; + + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) + return EINVAL; + switch (uap->cmd) { + case IPC_STAT: + error = ipcperm(cred, &shmseg->shm_perm, IPC_R); + if (error) + return error; + error = copyout((caddr_t)shmseg, uap->buf, sizeof(inbuf)); + if (error) + return error; + break; + case IPC_SET: + error = ipcperm(cred, &shmseg->shm_perm, IPC_M); + if (error) + return error; + error = copyin(uap->buf, (caddr_t)&inbuf, sizeof(inbuf)); + if (error) + return error; + shmseg->shm_perm.uid = inbuf.shm_perm.uid; + shmseg->shm_perm.gid = inbuf.shm_perm.gid; + shmseg->shm_perm.mode = + (shmseg->shm_perm.mode & ~ACCESSPERMS) | + (inbuf.shm_perm.mode & ACCESSPERMS); + shmseg->shm_ctime = time_second; + break; + case IPC_RMID: + error = ipcperm(cred, &shmseg->shm_perm, IPC_M); + if (error) + return error; + shmseg->shm_perm.key = IPC_PRIVATE; + shmseg->shm_perm.mode |= SHMSEG_REMOVED; + if (shmseg->shm_nattch <= 0) { + shm_deallocate_segment(shmseg); + shm_last_free = IPCID_TO_IX(uap->shmid); + } + break; +#if 0 + case SHM_LOCK: + case SHM_UNLOCK: +#endif + default: + return EINVAL; + } + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmget_args { + key_t key; + size_t size; + int shmflg; +}; +#endif + +static int +shmget_existing(p, uap, mode, segnum) + struct proc *p; + struct shmget_args *uap; + int mode; + int segnum; +{ + struct shmid_ds *shmseg; + struct ucred *cred = p->p_ucred; + int error; + + shmseg = &shmsegs[segnum]; + if (shmseg->shm_perm.mode & SHMSEG_REMOVED) { + /* + * This segment is in the process of being allocated. Wait + * until it's done, and look the key up again (in case the + * allocation failed or it was freed). + */ + shmseg->shm_perm.mode |= SHMSEG_WANTED; + error = tsleep((caddr_t)shmseg, PLOCK | PCATCH, "shmget", 0); + if (error) + return error; + return EAGAIN; + } + error = ipcperm(cred, &shmseg->shm_perm, mode); + if (error) + return error; + if (uap->size && uap->size > shmseg->shm_segsz) + return EINVAL; + if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) + return EEXIST; + p->p_retval[0] = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); + return 0; +} + +static int +shmget_allocate_segment(p, uap, mode) + struct proc *p; + struct shmget_args *uap; + int mode; +{ + int i, segnum, shmid, size; + struct ucred *cred = p->p_ucred; + struct shmid_ds *shmseg; + struct shm_handle *shm_handle; + + if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax) + return EINVAL; + if (shm_nused >= shminfo.shmmni) /* any shmids left? */ + return ENOSPC; + size = round_page(uap->size); + if (shm_committed + btoc(size) > shminfo.shmall) + return ENOMEM; + if (shm_last_free < 0) { + for (i = 0; i < shminfo.shmmni; i++) + if (shmsegs[i].shm_perm.mode & SHMSEG_FREE) + break; + if (i == shminfo.shmmni) + panic("shmseg free count inconsistent"); + segnum = i; + } else { + segnum = shm_last_free; + shm_last_free = -1; + } + shmseg = &shmsegs[segnum]; + /* + * In case we sleep in malloc(), mark the segment present but deleted + * so that noone else tries to create the same key. 
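+ * A concurrent shmget() on the same key will find this segment,
+ * see SHMSEG_REMOVED in shmget_existing(), and sleep until the
+ * SHMSEG_WANTED wakeup at the end of this function.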
+ */ + shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED; + shmseg->shm_perm.key = uap->key; + shmseg->shm_perm.seq = (shmseg->shm_perm.seq + 1) & 0x7fff; + shm_handle = (struct shm_handle *) + malloc(sizeof(struct shm_handle), M_SHM, M_WAITOK); + shmid = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); + + /* + * We make sure that we have allocated a pager before we need + * to. + */ + shm_handle->shm_object = + vm_pager_allocate(OBJT_SWAP, 0, size, VM_PROT_DEFAULT, 0); + vm_object_clear_flag(shm_handle->shm_object, OBJ_ONEMAPPING); + vm_object_set_flag(shm_handle->shm_object, OBJ_NOSPLIT); + + shmseg->shm_internal = shm_handle; + shmseg->shm_perm.cuid = shmseg->shm_perm.uid = cred->cr_uid; + shmseg->shm_perm.cgid = shmseg->shm_perm.gid = cred->cr_gid; + shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) | + (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; + shmseg->shm_segsz = uap->size; + shmseg->shm_cpid = p->p_pid; + shmseg->shm_lpid = shmseg->shm_nattch = 0; + shmseg->shm_atime = shmseg->shm_dtime = 0; + shmseg->shm_ctime = time_second; + shm_committed += btoc(size); + shm_nused++; + if (shmseg->shm_perm.mode & SHMSEG_WANTED) { + /* + * Somebody else wanted this key while we were asleep. Wake + * them up now. + */ + shmseg->shm_perm.mode &= ~SHMSEG_WANTED; + wakeup((caddr_t)shmseg); + } + p->p_retval[0] = shmid; + return 0; +} + +int +shmget(p, uap) + struct proc *p; + struct shmget_args *uap; +{ + int segnum, mode, error; + + mode = uap->shmflg & ACCESSPERMS; + if (uap->key != IPC_PRIVATE) { + again: + segnum = shm_find_segment_by_key(uap->key); + if (segnum >= 0) { + error = shmget_existing(p, uap, mode, segnum); + if (error == EAGAIN) + goto again; + return error; + } + if ((uap->shmflg & IPC_CREAT) == 0) + return ENOENT; + } + return shmget_allocate_segment(p, uap, mode); +} + +int +shmsys(p, uap) + struct proc *p; + /* XXX actually varargs. */ + struct shmsys_args /* { + u_int which; + int a2; + int a3; + int a4; + } */ *uap; +{ + + if (uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0])) + return EINVAL; + return ((*shmcalls[uap->which])(p, &uap->a2)); +} + +void +shmfork(p1, p2) + struct proc *p1, *p2; +{ + struct shmmap_state *shmmap_s; + size_t size; + int i; + + size = shminfo.shmseg * sizeof(struct shmmap_state); + shmmap_s = malloc(size, M_SHM, M_WAITOK); + bcopy((caddr_t)p1->p_vmspace->vm_shm, (caddr_t)shmmap_s, size); + p2->p_vmspace->vm_shm = (caddr_t)shmmap_s; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shmsegs[IPCID_TO_IX(shmmap_s->shmid)].shm_nattch++; +} + +void +shmexit(p) + struct proc *p; +{ + struct shmmap_state *shmmap_s; + int i; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shm_delete_mapping(p, shmmap_s); + free((caddr_t)p->p_vmspace->vm_shm, M_SHM); + p->p_vmspace->vm_shm = NULL; +} + +void +shminit(dummy) + void *dummy; +{ + int i; + for (i = 0; i < shminfo.shmmni; i++) { + shmsegs[i].shm_perm.mode = SHMSEG_FREE; + shmsegs[i].shm_perm.seq = 0; + } + shm_last_free = 0; + shm_nused = 0; + shm_committed = 0; +} diff --git a/sys/kern/tty.c b/sys/kern/tty.c new file mode 100644 index 0000000..1adf784 --- /dev/null +++ b/sys/kern/tty.c @@ -0,0 +1,2437 @@ +/*- + * Copyright (c) 1982, 1986, 1990, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. 
+ * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty.c 8.8 (Berkeley) 1/21/94 + * $Id: tty.c,v 1.110 1998/12/08 10:22:07 bde Exp $ + */ + +/*- + * TODO: + * o Fix races for sending the start char in ttyflush(). + * o Handle inter-byte timeout for "MIN > 0, TIME > 0" in ttyselect(). + * With luck, there will be MIN chars before select() returns(). + * o Handle CLOCAL consistently for ptys. Perhaps disallow setting it. + * o Don't allow input in TS_ZOMBIE case. It would be visible through + * FIONREAD. + * o Do the new sio locking stuff here and use it to avoid special + * case for EXTPROC? + * o Lock PENDIN too? + * o Move EXTPROC and/or PENDIN to t_state? + * o Wrap most of ttioctl in spltty/splx. + * o Implement TIOCNOTTY or remove it from <sys/ioctl.h>. + * o Send STOP if IXOFF is toggled off while TS_TBLOCK is set. + * o Don't allow certain termios flags to affect disciplines other + * than TTYDISC. Cancel their effects before switch disciplines + * and ignore them if they are set while we are in another + * discipline. + * o Now that historical speed conversions are handled here, don't + * do them in drivers. + * o Check for TS_CARR_ON being set while everything is closed and not + * waiting for carrier. TS_CARR_ON isn't cleared if nothing is open, + * so it would live until the next open even if carrier drops. + * o Restore TS_WOPEN since it is useful in pstat. It must be cleared + * only when _all_ openers leave open(). 
+ */ + +#include "snp.h" +#include "opt_compat.h" +#include "opt_uconsole.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/filio.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif +#include <sys/proc.h> +#define TTYDEFCHARS +#include <sys/tty.h> +#undef TTYDEFCHARS +#include <sys/fcntl.h> +#include <sys/conf.h> +#include <sys/dkstat.h> +#include <sys/poll.h> +#include <sys/kernel.h> +#include <sys/vnode.h> +#include <sys/signalvar.h> +#include <sys/resourcevar.h> +#include <sys/malloc.h> +#include <sys/filedesc.h> +#if NSNP > 0 +#include <sys/snoop.h> +#endif + +#include <vm/vm.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> + +MALLOC_DEFINE(M_TTYS, "ttys", "tty data structures"); + +static int proc_compare __P((struct proc *p1, struct proc *p2)); +static int ttnread __P((struct tty *tp)); +static void ttyecho __P((int c, struct tty *tp)); +static int ttyoutput __P((int c, register struct tty *tp)); +static void ttypend __P((struct tty *tp)); +static void ttyretype __P((struct tty *tp)); +static void ttyrub __P((int c, struct tty *tp)); +static void ttyrubo __P((struct tty *tp, int cnt)); +static void ttyunblock __P((struct tty *tp)); +static int ttywflush __P((struct tty *tp)); + +/* + * Table with character classes and parity. The 8th bit indicates parity, + * the 7th bit indicates the character is an alphameric or underscore (for + * ALTWERASE), and the low 6 bits indicate delay type. If the low 6 bits + * are 0 then the character needs no special processing on output; classes + * other than 0 might be translated or (not currently) require delays. + */ +#define E 0x00 /* Even parity. */ +#define O 0x80 /* Odd parity. */ +#define PARITY(c) (char_type[c] & O) + +#define ALPHA 0x40 /* Alpha or underscore. */ +#define ISALPHA(c) (char_type[(c) & TTY_CHARMASK] & ALPHA) + +#define CCLASSMASK 0x3f +#define CCLASS(c) (char_type[c] & CCLASSMASK) + +#define BS BACKSPACE +#define CC CONTROL +#define CR RETURN +#define NA ORDINARY | ALPHA +#define NL NEWLINE +#define NO ORDINARY +#define TB TAB +#define VT VTAB + +static u_char const char_type[] = { + E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */ + O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */ + O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */ + E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* can - us */ + O|NO, E|NO, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* sp - ' */ + E|NO, O|NO, O|NO, E|NO, O|NO, E|NO, E|NO, O|NO, /* ( - / */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* 0 - 7 */ + O|NA, E|NA, E|NO, O|NO, E|NO, O|NO, O|NO, E|NO, /* 8 - ? */ + O|NO, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* @ - G */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* H - O */ + E|NA, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* P - W */ + O|NA, E|NA, E|NA, O|NO, E|NO, O|NO, O|NO, O|NA, /* X - _ */ + E|NO, O|NA, O|NA, E|NA, O|NA, E|NA, E|NA, O|NA, /* ` - g */ + O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* h - o */ + O|NA, E|NA, E|NA, O|NA, E|NA, O|NA, O|NA, E|NA, /* p - w */ + E|NA, O|NA, O|NA, E|NO, O|NO, E|NO, E|NO, O|CC, /* x - del */ + /* + * Meta chars; should be settable per character set; + * for now, treat them all as normal characters. 
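+ * (Everything from 0x80 through 0xff therefore ends up as
+ * ORDINARY|ALPHA below.)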
+ */ + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, + NA, NA, NA, NA, NA, NA, NA, NA, +}; +#undef BS +#undef CC +#undef CR +#undef NA +#undef NL +#undef NO +#undef TB +#undef VT + +/* Macros to clear/set/test flags. */ +#define SET(t, f) (t) |= (f) +#define CLR(t, f) (t) &= ~(f) +#define ISSET(t, f) ((t) & (f)) + +#undef MAX_INPUT /* XXX wrong in <sys/syslimits.h> */ +#define MAX_INPUT TTYHOG /* XXX limit is usually larger for !ICANON */ + +/* + * Initial open of tty, or (re)entry to standard tty line discipline. + */ +int +ttyopen(device, tp) + dev_t device; + register struct tty *tp; +{ + int s; + + s = spltty(); + tp->t_dev = device; + if (!ISSET(tp->t_state, TS_ISOPEN)) { + SET(tp->t_state, TS_ISOPEN); + if (ISSET(tp->t_cflag, CLOCAL)) + SET(tp->t_state, TS_CONNECTED); + bzero(&tp->t_winsize, sizeof(tp->t_winsize)); + } + ttsetwater(tp); + splx(s); + return (0); +} + +/* + * Handle close() on a tty line: flush and set to initial state, + * bumping generation number so that pending read/write calls + * can detect recycling of the tty. + * XXX our caller should have done `spltty(); l_close(); ttyclose();' + * and l_close() should have flushed, but we repeat the spltty() and + * the flush in case there are buggy callers. + */ +int +ttyclose(tp) + register struct tty *tp; +{ + int s; + + funsetown(tp->t_sigio); + s = spltty(); + if (constty == tp) + constty = NULL; + + ttyflush(tp, FREAD | FWRITE); + clist_free_cblocks(&tp->t_canq); + clist_free_cblocks(&tp->t_outq); + clist_free_cblocks(&tp->t_rawq); + +#if NSNP > 0 + if (ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpdown((struct snoop *)tp->t_sc); +#endif + + tp->t_gen++; + tp->t_line = TTYDISC; + tp->t_pgrp = NULL; + tp->t_session = NULL; + tp->t_state = 0; + splx(s); + return (0); +} + +#define FLUSHQ(q) { \ + if ((q)->c_cc) \ + ndflush(q, (q)->c_cc); \ +} + +/* Is 'c' a line delimiter ("break" character)? */ +#define TTBREAKC(c, lflag) \ + ((c) == '\n' || (((c) == cc[VEOF] || \ + (c) == cc[VEOL] || ((c) == cc[VEOL2] && lflag & IEXTEN)) && \ + (c) != _POSIX_VDISABLE)) + +/* + * Process input of a single character received on a tty. + */ +int +ttyinput(c, tp) + register int c; + register struct tty *tp; +{ + register tcflag_t iflag, lflag; + register cc_t *cc; + int i, err; + + /* + * If input is pending take it first. + */ + lflag = tp->t_lflag; + if (ISSET(lflag, PENDIN)) + ttypend(tp); + /* + * Gather stats. + */ + if (ISSET(lflag, ICANON)) { + ++tk_cancc; + ++tp->t_cancc; + } else { + ++tk_rawcc; + ++tp->t_rawcc; + } + ++tk_nin; + + /* + * Block further input iff: + * current input > threshold AND input is available to user program + * AND input flow control is enabled and not yet invoked. + * The 3 is slop for PARMRK. + */ + iflag = tp->t_iflag; + if (tp->t_rawq.c_cc + tp->t_canq.c_cc > tp->t_ihiwat - 3 && + (!ISSET(lflag, ICANON) || tp->t_canq.c_cc != 0) && + (ISSET(tp->t_cflag, CRTS_IFLOW) || ISSET(iflag, IXOFF)) && + !ISSET(tp->t_state, TS_TBLOCK)) + ttyblock(tp); + + /* Handle exceptional conditions (break, parity, framing). 
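+ * The driver reports these conditions in the high bits of `c'
+ * (TTY_BI, TTY_PE, TTY_FE); they are stripped from the character
+ * below before it is queued.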
*/ + cc = tp->t_cc; + err = (ISSET(c, TTY_ERRORMASK)); + if (err) { + CLR(c, TTY_ERRORMASK); + if (ISSET(err, TTY_BI)) { + if (ISSET(iflag, IGNBRK)) + return (0); + if (ISSET(iflag, BRKINT)) { + ttyflush(tp, FREAD | FWRITE); + pgsignal(tp->t_pgrp, SIGINT, 1); + goto endcase; + } + if (ISSET(iflag, PARMRK)) + goto parmrk; + } else if ((ISSET(err, TTY_PE) && ISSET(iflag, INPCK)) + || ISSET(err, TTY_FE)) { + if (ISSET(iflag, IGNPAR)) + return (0); + else if (ISSET(iflag, PARMRK)) { +parmrk: + if (tp->t_rawq.c_cc + tp->t_canq.c_cc > + MAX_INPUT - 3) + goto input_overflow; + (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); + (void)putc(0 | TTY_QUOTE, &tp->t_rawq); + (void)putc(c | TTY_QUOTE, &tp->t_rawq); + goto endcase; + } else + c = 0; + } + } + + if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP)) + CLR(c, 0x80); + if (!ISSET(lflag, EXTPROC)) { + /* + * Check for literal nexting very first + */ + if (ISSET(tp->t_state, TS_LNCH)) { + SET(c, TTY_QUOTE); + CLR(tp->t_state, TS_LNCH); + } + /* + * Scan for special characters. This code + * is really just a big case statement with + * non-constant cases. The bottom of the + * case statement is labeled ``endcase'', so goto + * it after a case match, or similar. + */ + + /* + * Control chars which aren't controlled + * by ICANON, ISIG, or IXON. + */ + if (ISSET(lflag, IEXTEN)) { + if (CCEQ(cc[VLNEXT], c)) { + if (ISSET(lflag, ECHO)) { + if (ISSET(lflag, ECHOE)) { + (void)ttyoutput('^', tp); + (void)ttyoutput('\b', tp); + } else + ttyecho(c, tp); + } + SET(tp->t_state, TS_LNCH); + goto endcase; + } + if (CCEQ(cc[VDISCARD], c)) { + if (ISSET(lflag, FLUSHO)) + CLR(tp->t_lflag, FLUSHO); + else { + ttyflush(tp, FWRITE); + ttyecho(c, tp); + if (tp->t_rawq.c_cc + tp->t_canq.c_cc) + ttyretype(tp); + SET(tp->t_lflag, FLUSHO); + } + goto startoutput; + } + } + /* + * Signals. + */ + if (ISSET(lflag, ISIG)) { + if (CCEQ(cc[VINTR], c) || CCEQ(cc[VQUIT], c)) { + if (!ISSET(lflag, NOFLSH)) + ttyflush(tp, FREAD | FWRITE); + ttyecho(c, tp); + pgsignal(tp->t_pgrp, + CCEQ(cc[VINTR], c) ? SIGINT : SIGQUIT, 1); + goto endcase; + } + if (CCEQ(cc[VSUSP], c)) { + if (!ISSET(lflag, NOFLSH)) + ttyflush(tp, FREAD); + ttyecho(c, tp); + pgsignal(tp->t_pgrp, SIGTSTP, 1); + goto endcase; + } + } + /* + * Handle start/stop characters. + */ + if (ISSET(iflag, IXON)) { + if (CCEQ(cc[VSTOP], c)) { + if (!ISSET(tp->t_state, TS_TTSTOP)) { + SET(tp->t_state, TS_TTSTOP); +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, 0); +#else + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, + 0); +#endif + return (0); + } + if (!CCEQ(cc[VSTART], c)) + return (0); + /* + * if VSTART == VSTOP then toggle + */ + goto endcase; + } + if (CCEQ(cc[VSTART], c)) + goto restartoutput; + } + /* + * IGNCR, ICRNL, & INLCR + */ + if (c == '\r') { + if (ISSET(iflag, IGNCR)) + return (0); + else if (ISSET(iflag, ICRNL)) + c = '\n'; + } else if (c == '\n' && ISSET(iflag, INLCR)) + c = '\r'; + } + if (!ISSET(tp->t_lflag, EXTPROC) && ISSET(lflag, ICANON)) { + /* + * From here on down canonical mode character + * processing takes place. + */ + /* + * erase (^H / ^?) 
+ */ + if (CCEQ(cc[VERASE], c)) { + if (tp->t_rawq.c_cc) + ttyrub(unputc(&tp->t_rawq), tp); + goto endcase; + } + /* + * kill (^U) + */ + if (CCEQ(cc[VKILL], c)) { + if (ISSET(lflag, ECHOKE) && + tp->t_rawq.c_cc == tp->t_rocount && + !ISSET(lflag, ECHOPRT)) + while (tp->t_rawq.c_cc) + ttyrub(unputc(&tp->t_rawq), tp); + else { + ttyecho(c, tp); + if (ISSET(lflag, ECHOK) || + ISSET(lflag, ECHOKE)) + ttyecho('\n', tp); + FLUSHQ(&tp->t_rawq); + tp->t_rocount = 0; + } + CLR(tp->t_state, TS_LOCAL); + goto endcase; + } + /* + * word erase (^W) + */ + if (CCEQ(cc[VWERASE], c) && ISSET(lflag, IEXTEN)) { + int ctype; + + /* + * erase whitespace + */ + while ((c = unputc(&tp->t_rawq)) == ' ' || c == '\t') + ttyrub(c, tp); + if (c == -1) + goto endcase; + /* + * erase last char of word and remember the + * next chars type (for ALTWERASE) + */ + ttyrub(c, tp); + c = unputc(&tp->t_rawq); + if (c == -1) + goto endcase; + if (c == ' ' || c == '\t') { + (void)putc(c, &tp->t_rawq); + goto endcase; + } + ctype = ISALPHA(c); + /* + * erase rest of word + */ + do { + ttyrub(c, tp); + c = unputc(&tp->t_rawq); + if (c == -1) + goto endcase; + } while (c != ' ' && c != '\t' && + (!ISSET(lflag, ALTWERASE) || ISALPHA(c) == ctype)); + (void)putc(c, &tp->t_rawq); + goto endcase; + } + /* + * reprint line (^R) + */ + if (CCEQ(cc[VREPRINT], c) && ISSET(lflag, IEXTEN)) { + ttyretype(tp); + goto endcase; + } + /* + * ^T - kernel info and generate SIGINFO + */ + if (CCEQ(cc[VSTATUS], c) && ISSET(lflag, IEXTEN)) { + if (ISSET(lflag, ISIG)) + pgsignal(tp->t_pgrp, SIGINFO, 1); + if (!ISSET(lflag, NOKERNINFO)) + ttyinfo(tp); + goto endcase; + } + } + /* + * Check for input buffer overflow + */ + if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= MAX_INPUT) { +input_overflow: + if (ISSET(iflag, IMAXBEL)) { + if (tp->t_outq.c_cc < tp->t_ohiwat) + (void)ttyoutput(CTRL('g'), tp); + } + goto endcase; + } + + if ( c == 0377 && ISSET(iflag, PARMRK) && !ISSET(iflag, ISTRIP) + && ISSET(iflag, IGNBRK|IGNPAR) != (IGNBRK|IGNPAR)) + (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); + + /* + * Put data char in q for user and + * wakeup on seeing a line delimiter. + */ + if (putc(c, &tp->t_rawq) >= 0) { + if (!ISSET(lflag, ICANON)) { + ttwakeup(tp); + ttyecho(c, tp); + goto endcase; + } + if (TTBREAKC(c, lflag)) { + tp->t_rocount = 0; + catq(&tp->t_rawq, &tp->t_canq); + ttwakeup(tp); + } else if (tp->t_rocount++ == 0) + tp->t_rocol = tp->t_column; + if (ISSET(tp->t_state, TS_ERASE)) { + /* + * end of prterase \.../ + */ + CLR(tp->t_state, TS_ERASE); + (void)ttyoutput('/', tp); + } + i = tp->t_column; + ttyecho(c, tp); + if (CCEQ(cc[VEOF], c) && ISSET(lflag, ECHO)) { + /* + * Place the cursor over the '^' of the ^D. + */ + i = imin(2, tp->t_column - i); + while (i > 0) { + (void)ttyoutput('\b', tp); + i--; + } + } + } +endcase: + /* + * IXANY means allow any character to restart output. + */ + if (ISSET(tp->t_state, TS_TTSTOP) && + !ISSET(iflag, IXANY) && cc[VSTART] != cc[VSTOP]) + return (0); +restartoutput: + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_TTSTOP); +startoutput: + return (ttstart(tp)); +} + +/* + * Output a single character on a tty, doing output processing + * as needed (expanding tabs, newline processing, etc.). + * Returns < 0 if succeeds, otherwise returns char to resend. + * Must be recursive. 
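+ * A caller that gets the character back (ttwrite() below, for
+ * example) typically starts output, waits for space, and then
+ * retries the same character.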
+ */ +static int +ttyoutput(c, tp) + register int c; + register struct tty *tp; +{ + register tcflag_t oflag; + register int col, s; + + oflag = tp->t_oflag; + if (!ISSET(oflag, OPOST)) { + if (ISSET(tp->t_lflag, FLUSHO)) + return (-1); + if (putc(c, &tp->t_outq)) + return (c); + tk_nout++; + tp->t_outcc++; + return (-1); + } + /* + * Do tab expansion if OXTABS is set. Special case if we external + * processing, we don't do the tab expansion because we'll probably + * get it wrong. If tab expansion needs to be done, let it happen + * externally. + */ + CLR(c, ~TTY_CHARMASK); + if (c == '\t' && + ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) { + c = 8 - (tp->t_column & 7); + if (!ISSET(tp->t_lflag, FLUSHO)) { + s = spltty(); /* Don't interrupt tabs. */ + c -= b_to_q(" ", c, &tp->t_outq); + tk_nout += c; + tp->t_outcc += c; + splx(s); + } + tp->t_column += c; + return (c ? -1 : '\t'); + } + if (c == CEOT && ISSET(oflag, ONOEOT)) + return (-1); + + /* + * Newline translation: if ONLCR is set, + * translate newline into "\r\n". + */ + if (c == '\n' && ISSET(tp->t_oflag, ONLCR)) { + tk_nout++; + tp->t_outcc++; + if (putc('\r', &tp->t_outq)) + return (c); + } + tk_nout++; + tp->t_outcc++; + if (!ISSET(tp->t_lflag, FLUSHO) && putc(c, &tp->t_outq)) + return (c); + + col = tp->t_column; + switch (CCLASS(c)) { + case BACKSPACE: + if (col > 0) + --col; + break; + case CONTROL: + break; + case NEWLINE: + case RETURN: + col = 0; + break; + case ORDINARY: + ++col; + break; + case TAB: + col = (col + 8) & ~7; + break; + } + tp->t_column = col; + return (-1); +} + +/* + * Ioctls for all tty devices. Called after line-discipline specific ioctl + * has been called to do discipline-specific functions and/or reject any + * of these ioctl commands. + */ +/* ARGSUSED */ +int +ttioctl(tp, cmd, data, flag) + register struct tty *tp; + u_long cmd; + int flag; + void *data; +{ + register struct proc *p; + int s, error; + + p = curproc; /* XXX */ + + /* If the ioctl involves modification, hang if in the background. */ + switch (cmd) { + case TIOCCBRK: + case TIOCCONS: + case TIOCDRAIN: + case TIOCEXCL: + case TIOCFLUSH: +#ifdef TIOCHPCL + case TIOCHPCL: +#endif + case TIOCNXCL: + case TIOCSBRK: + case TIOCSCTTY: + case TIOCSDRAINWAIT: + case TIOCSETA: + case TIOCSETAF: + case TIOCSETAW: + case TIOCSETD: + case TIOCSPGRP: + case TIOCSTART: + case TIOCSTAT: + case TIOCSTI: + case TIOCSTOP: + case TIOCSWINSZ: +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + case TIOCLBIC: + case TIOCLBIS: + case TIOCLSET: + case TIOCSETC: + case OTIOCSETD: + case TIOCSETN: + case TIOCSETP: + case TIOCSLTC: +#endif + while (isbackground(p, tp) && + (p->p_flag & P_PPWAIT) == 0 && + (p->p_sigignore & sigmask(SIGTTOU)) == 0 && + (p->p_sigmask & sigmask(SIGTTOU)) == 0) { + if (p->p_pgrp->pg_jobc == 0) + return (EIO); + pgsignal(p->p_pgrp, SIGTTOU, 1); + error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, "ttybg1", + 0); + if (error) + return (error); + } + break; + } + + switch (cmd) { /* Process the ioctl. */ + case FIOASYNC: /* set/clear async i/o */ + s = spltty(); + if (*(int *)data) + SET(tp->t_state, TS_ASYNC); + else + CLR(tp->t_state, TS_ASYNC); + splx(s); + break; + case FIONBIO: /* set/clear non-blocking i/o */ + break; /* XXX: delete. 
*/ + case FIONREAD: /* get # bytes to read */ + s = spltty(); + *(int *)data = ttnread(tp); + splx(s); + break; + + case FIOSETOWN: + /* + * Policy -- Don't allow FIOSETOWN on someone else's + * controlling tty + */ + if (tp->t_session != NULL && !isctty(p, tp)) + return (ENOTTY); + + error = fsetown(*(int *)data, &tp->t_sigio); + if (error) + return (error); + break; + case FIOGETOWN: + if (tp->t_session != NULL && !isctty(p, tp)) + return (ENOTTY); + *(int *)data = fgetown(tp->t_sigio); + break; + + case TIOCEXCL: /* set exclusive use of tty */ + s = spltty(); + SET(tp->t_state, TS_XCLUDE); + splx(s); + break; + case TIOCFLUSH: { /* flush buffers */ + register int flags = *(int *)data; + + if (flags == 0) + flags = FREAD | FWRITE; + else + flags &= FREAD | FWRITE; + ttyflush(tp, flags); + break; + } + case TIOCCONS: /* become virtual console */ + if (*(int *)data) { + if (constty && constty != tp && + ISSET(constty->t_state, TS_CONNECTED)) + return (EBUSY); +#ifndef UCONSOLE + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); +#endif + constty = tp; + } else if (tp == constty) + constty = NULL; + break; + case TIOCDRAIN: /* wait till output drained */ + error = ttywait(tp); + if (error) + return (error); + break; + case TIOCGETA: { /* get termios struct */ + struct termios *t = (struct termios *)data; + + bcopy(&tp->t_termios, t, sizeof(struct termios)); + break; + } + case TIOCGETD: /* get line discipline */ + *(int *)data = tp->t_line; + break; + case TIOCGWINSZ: /* get window size */ + *(struct winsize *)data = tp->t_winsize; + break; + case TIOCGPGRP: /* get pgrp of tty */ + if (!isctty(p, tp)) + return (ENOTTY); + *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; + break; +#ifdef TIOCHPCL + case TIOCHPCL: /* hang up on last close */ + s = spltty(); + SET(tp->t_cflag, HUPCL); + splx(s); + break; +#endif + case TIOCNXCL: /* reset exclusive use of tty */ + s = spltty(); + CLR(tp->t_state, TS_XCLUDE); + splx(s); + break; + case TIOCOUTQ: /* output queue size */ + *(int *)data = tp->t_outq.c_cc; + break; + case TIOCSETA: /* set termios struct */ + case TIOCSETAW: /* drain output, set */ + case TIOCSETAF: { /* drn out, fls in, set */ + register struct termios *t = (struct termios *)data; + + if (t->c_ispeed == 0) + t->c_ispeed = t->c_ospeed; + if (t->c_ispeed == 0) + t->c_ispeed = tp->t_ospeed; + if (t->c_ispeed == 0) + return (EINVAL); + s = spltty(); + if (cmd == TIOCSETAW || cmd == TIOCSETAF) { + error = ttywait(tp); + if (error) { + splx(s); + return (error); + } + if (cmd == TIOCSETAF) + ttyflush(tp, FREAD); + } + if (!ISSET(t->c_cflag, CIGNORE)) { + /* + * Set device hardware. + */ + if (tp->t_param && (error = (*tp->t_param)(tp, t))) { + splx(s); + return (error); + } + if (ISSET(t->c_cflag, CLOCAL) && + !ISSET(tp->t_cflag, CLOCAL)) { + /* + * XXX disconnections would be too hard to + * get rid of without this kludge. The only + * way to get rid of controlling terminals + * is to exit from the session leader. 
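+ * Turning CLOCAL on here therefore revives a hung-up (TS_ZOMBIE)
+ * tty and wakes anyone waiting for carrier.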
+ */ + CLR(tp->t_state, TS_ZOMBIE); + + wakeup(TSA_CARR_ON(tp)); + ttwakeup(tp); + ttwwakeup(tp); + } + if ((ISSET(tp->t_state, TS_CARR_ON) || + ISSET(t->c_cflag, CLOCAL)) && + !ISSET(tp->t_state, TS_ZOMBIE)) + SET(tp->t_state, TS_CONNECTED); + else + CLR(tp->t_state, TS_CONNECTED); + tp->t_cflag = t->c_cflag; + tp->t_ispeed = t->c_ispeed; + if (t->c_ospeed != 0) + tp->t_ospeed = t->c_ospeed; + ttsetwater(tp); + } + if (ISSET(t->c_lflag, ICANON) != ISSET(tp->t_lflag, ICANON) && + cmd != TIOCSETAF) { + if (ISSET(t->c_lflag, ICANON)) + SET(tp->t_lflag, PENDIN); + else { + /* + * XXX we really shouldn't allow toggling + * ICANON while we're in a non-termios line + * discipline. Now we have to worry about + * panicing for a null queue. + */ + if (tp->t_canq.c_cbreserved > 0 && + tp->t_rawq.c_cbreserved > 0) { + catq(&tp->t_rawq, &tp->t_canq); + /* + * XXX the queue limits may be + * different, so the old queue + * swapping method no longer works. + */ + catq(&tp->t_canq, &tp->t_rawq); + } + CLR(tp->t_lflag, PENDIN); + } + ttwakeup(tp); + } + tp->t_iflag = t->c_iflag; + tp->t_oflag = t->c_oflag; + /* + * Make the EXTPROC bit read only. + */ + if (ISSET(tp->t_lflag, EXTPROC)) + SET(t->c_lflag, EXTPROC); + else + CLR(t->c_lflag, EXTPROC); + tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN); + if (t->c_cc[VMIN] != tp->t_cc[VMIN] || + t->c_cc[VTIME] != tp->t_cc[VTIME]) + ttwakeup(tp); + bcopy(t->c_cc, tp->t_cc, sizeof(t->c_cc)); + splx(s); + break; + } + case TIOCSETD: { /* set line discipline */ + register int t = *(int *)data; + dev_t device = tp->t_dev; + + if ((u_int)t >= nlinesw) + return (ENXIO); + if (t != tp->t_line) { + s = spltty(); + (*linesw[tp->t_line].l_close)(tp, flag); + error = (*linesw[t].l_open)(device, tp); + if (error) { + (void)(*linesw[tp->t_line].l_open)(device, tp); + splx(s); + return (error); + } + tp->t_line = t; + splx(s); + } + break; + } + case TIOCSTART: /* start output, like ^Q */ + s = spltty(); + if (ISSET(tp->t_state, TS_TTSTOP) || + ISSET(tp->t_lflag, FLUSHO)) { + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_TTSTOP); + ttstart(tp); + } + splx(s); + break; + case TIOCSTI: /* simulate terminal input */ + if (p->p_ucred->cr_uid && (flag & FREAD) == 0) + return (EPERM); + if (p->p_ucred->cr_uid && !isctty(p, tp)) + return (EACCES); + s = spltty(); + (*linesw[tp->t_line].l_rint)(*(u_char *)data, tp); + splx(s); + break; + case TIOCSTOP: /* stop output, like ^S */ + s = spltty(); + if (!ISSET(tp->t_state, TS_TTSTOP)) { + SET(tp->t_state, TS_TTSTOP); +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, 0); +#else + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0); +#endif + } + splx(s); + break; + case TIOCSCTTY: /* become controlling tty */ + /* Session ctty vnode pointer set in vnode layer. 
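+ * Only a session leader may acquire a controlling tty here, and
+ * neither the session nor the tty may already be attached
+ * elsewhere.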
*/ + if (!SESS_LEADER(p) || + ((p->p_session->s_ttyvp || tp->t_session) && + (tp->t_session != p->p_session))) + return (EPERM); + tp->t_session = p->p_session; + tp->t_pgrp = p->p_pgrp; + p->p_session->s_ttyp = tp; + p->p_flag |= P_CONTROLT; + break; + case TIOCSPGRP: { /* set pgrp of tty */ + register struct pgrp *pgrp = pgfind(*(int *)data); + + if (!isctty(p, tp)) + return (ENOTTY); + else if (pgrp == NULL || pgrp->pg_session != p->p_session) + return (EPERM); + tp->t_pgrp = pgrp; + break; + } + case TIOCSTAT: /* simulate control-T */ + s = spltty(); + ttyinfo(tp); + splx(s); + break; + case TIOCSWINSZ: /* set window size */ + if (bcmp((caddr_t)&tp->t_winsize, data, + sizeof (struct winsize))) { + tp->t_winsize = *(struct winsize *)data; + pgsignal(tp->t_pgrp, SIGWINCH, 1); + } + break; + case TIOCSDRAINWAIT: + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + tp->t_timeout = *(int *)data * hz; + wakeup(TSA_OCOMPLETE(tp)); + wakeup(TSA_OLOWAT(tp)); + break; + case TIOCGDRAINWAIT: + *(int *)data = tp->t_timeout / hz; + break; + default: +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + return (ttcompat(tp, cmd, data, flag)); +#else + return (ENOIOCTL); +#endif + } + return (0); +} + +int +ttypoll(tp, events, p) + struct tty *tp; + int events; + struct proc *p; +{ + int s; + int revents = 0; + + if (tp == NULL) /* XXX used to return ENXIO, but that means true! */ + return ((events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)) + | POLLHUP); + + s = spltty(); + if (events & (POLLIN | POLLRDNORM)) + if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE)) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(p, &tp->t_rsel); + + if (events & (POLLOUT | POLLWRNORM)) + if ((tp->t_outq.c_cc <= tp->t_olowat && + ISSET(tp->t_state, TS_CONNECTED)) + || ISSET(tp->t_state, TS_ZOMBIE)) + revents |= events & (POLLOUT | POLLWRNORM); + else + selrecord(p, &tp->t_wsel); + splx(s); + return (revents); +} + +/* + * This is a wrapper for compatibility with the select vector used by + * cdevsw. It relies on a proper xxxdevtotty routine. + */ +int +ttpoll(dev, events, p) + dev_t dev; + int events; + struct proc *p; +{ + return ttypoll((*cdevsw[major(dev)]->d_devtotty)(dev), events, p); +} + +/* + * Must be called at spltty(). + */ +static int +ttnread(tp) + struct tty *tp; +{ + int nread; + + if (ISSET(tp->t_lflag, PENDIN)) + ttypend(tp); + nread = tp->t_canq.c_cc; + if (!ISSET(tp->t_lflag, ICANON)) { + nread += tp->t_rawq.c_cc; + if (nread < tp->t_cc[VMIN] && tp->t_cc[VTIME] == 0) + nread = 0; + } + return (nread); +} + +/* + * Wait for output to drain. + */ +int +ttywait(tp) + register struct tty *tp; +{ + int error, s; + + error = 0; + s = spltty(); + while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && + ISSET(tp->t_state, TS_CONNECTED) && tp->t_oproc) { + (*tp->t_oproc)(tp); + if ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && + ISSET(tp->t_state, TS_CONNECTED)) { + SET(tp->t_state, TS_SO_OCOMPLETE); + error = ttysleep(tp, TSA_OCOMPLETE(tp), + TTOPRI | PCATCH, "ttywai", + tp->t_timeout); + if (error) { + if (error == EWOULDBLOCK) + error = EIO; + break; + } + } else + break; + } + if (!error && (tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY))) + error = EIO; + splx(s); + return (error); +} + +/* + * Flush if successfully wait. + */ +static int +ttywflush(tp) + struct tty *tp; +{ + int error; + + if ((error = ttywait(tp)) == 0) + ttyflush(tp, FREAD); + return (error); +} + +/* + * Flush tty read and/or write queues, notifying anyone waiting. 
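+ * `rw' is a mask of FREAD and/or FWRITE selecting which queues are
+ * discarded; ttyflush(tp, FREAD | FWRITE) empties both sides.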
+ */ +void +ttyflush(tp, rw) + register struct tty *tp; + int rw; +{ + register int s; + + s = spltty(); +#if 0 +again: +#endif + if (rw & FWRITE) { + FLUSHQ(&tp->t_outq); + CLR(tp->t_state, TS_TTSTOP); + } +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, rw); +#else + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, rw); +#endif + if (rw & FREAD) { + FLUSHQ(&tp->t_canq); + FLUSHQ(&tp->t_rawq); + CLR(tp->t_lflag, PENDIN); + tp->t_rocount = 0; + tp->t_rocol = 0; + CLR(tp->t_state, TS_LOCAL); + ttwakeup(tp); + if (ISSET(tp->t_state, TS_TBLOCK)) { + if (rw & FWRITE) + FLUSHQ(&tp->t_outq); + ttyunblock(tp); + + /* + * Don't let leave any state that might clobber the + * next line discipline (although we should do more + * to send the START char). Not clearing the state + * may have caused the "putc to a clist with no + * reserved cblocks" panic/printf. + */ + CLR(tp->t_state, TS_TBLOCK); + +#if 0 /* forget it, sleeping isn't always safe and we don't know when it is */ + if (ISSET(tp->t_iflag, IXOFF)) { + /* + * XXX wait a bit in the hope that the stop + * character (if any) will go out. Waiting + * isn't good since it allows races. This + * will be fixed when the stop character is + * put in a special queue. Don't bother with + * the checks in ttywait() since the timeout + * will save us. + */ + SET(tp->t_state, TS_SO_OCOMPLETE); + ttysleep(tp, TSA_OCOMPLETE(tp), TTOPRI, + "ttyfls", hz / 10); + /* + * Don't try sending the stop character again. + */ + CLR(tp->t_state, TS_TBLOCK); + goto again; + } +#endif + } + } + if (rw & FWRITE) { + FLUSHQ(&tp->t_outq); + ttwwakeup(tp); + } + splx(s); +} + +/* + * Copy in the default termios characters. + */ +void +termioschars(t) + struct termios *t; +{ + + bcopy(ttydefchars, t->c_cc, sizeof t->c_cc); +} + +/* + * Old interface. + */ +void +ttychars(tp) + struct tty *tp; +{ + + termioschars(&tp->t_termios); +} + +/* + * Handle input high water. Send stop character for the IXOFF case. Turn + * on our input flow control bit and propagate the changes to the driver. + * XXX the stop character should be put in a special high priority queue. + */ +void +ttyblock(tp) + struct tty *tp; +{ + + SET(tp->t_state, TS_TBLOCK); + if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTOP] != _POSIX_VDISABLE && + putc(tp->t_cc[VSTOP], &tp->t_outq) != 0) + CLR(tp->t_state, TS_TBLOCK); /* try again later */ + ttstart(tp); +} + +/* + * Handle input low water. Send start character for the IXOFF case. Turn + * off our input flow control bit and propagate the changes to the driver. + * XXX the start character should be put in a special high priority queue. + */ +static void +ttyunblock(tp) + struct tty *tp; +{ + + CLR(tp->t_state, TS_TBLOCK); + if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTART] != _POSIX_VDISABLE && + putc(tp->t_cc[VSTART], &tp->t_outq) != 0) + SET(tp->t_state, TS_TBLOCK); /* try again later */ + ttstart(tp); +} + +#ifdef notyet +/* Not used by any current (i386) drivers. */ +/* + * Restart after an inter-char delay. + */ +void +ttrstrt(tp_arg) + void *tp_arg; +{ + struct tty *tp; + int s; + + KASSERT(tp_arg != NULL, ("ttrstrt")); + + tp = tp_arg; + s = spltty(); + + CLR(tp->t_state, TS_TIMEOUT); + ttstart(tp); + + splx(s); +} +#endif + +int +ttstart(tp) + struct tty *tp; +{ + + if (tp->t_oproc != NULL) /* XXX: Kludge for pty. 
*/ + (*tp->t_oproc)(tp); + return (0); +} + +/* + * "close" a line discipline + */ +int +ttylclose(tp, flag) + struct tty *tp; + int flag; +{ + + if (flag & FNONBLOCK || ttywflush(tp)) + ttyflush(tp, FREAD | FWRITE); + return (0); +} + +/* + * Handle modem control transition on a tty. + * Flag indicates new state of carrier. + * Returns 0 if the line should be turned off, otherwise 1. + */ +int +ttymodem(tp, flag) + register struct tty *tp; + int flag; +{ + + if (ISSET(tp->t_state, TS_CARR_ON) && ISSET(tp->t_cflag, MDMBUF)) { + /* + * MDMBUF: do flow control according to carrier flag + * XXX TS_CAR_OFLOW doesn't do anything yet. TS_TTSTOP + * works if IXON and IXANY are clear. + */ + if (flag) { + CLR(tp->t_state, TS_CAR_OFLOW); + CLR(tp->t_state, TS_TTSTOP); + ttstart(tp); + } else if (!ISSET(tp->t_state, TS_CAR_OFLOW)) { + SET(tp->t_state, TS_CAR_OFLOW); + SET(tp->t_state, TS_TTSTOP); +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, 0); +#else + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0); +#endif + } + } else if (flag == 0) { + /* + * Lost carrier. + */ + CLR(tp->t_state, TS_CARR_ON); + if (ISSET(tp->t_state, TS_ISOPEN) && + !ISSET(tp->t_cflag, CLOCAL)) { + SET(tp->t_state, TS_ZOMBIE); + CLR(tp->t_state, TS_CONNECTED); + if (tp->t_session && tp->t_session->s_leader) + psignal(tp->t_session->s_leader, SIGHUP); + ttyflush(tp, FREAD | FWRITE); + return (0); + } + } else { + /* + * Carrier now on. + */ + SET(tp->t_state, TS_CARR_ON); + if (!ISSET(tp->t_state, TS_ZOMBIE)) + SET(tp->t_state, TS_CONNECTED); + wakeup(TSA_CARR_ON(tp)); + ttwakeup(tp); + ttwwakeup(tp); + } + return (1); +} + +/* + * Reinput pending characters after state switch + * call at spltty(). + */ +static void +ttypend(tp) + register struct tty *tp; +{ + struct clist tq; + register int c; + + CLR(tp->t_lflag, PENDIN); + SET(tp->t_state, TS_TYPEN); + /* + * XXX this assumes too much about clist internals. It may even + * fail if the cblock slush pool is empty. We can't allocate more + * cblocks here because we are called from an interrupt handler + * and clist_alloc_cblocks() can wait. + */ + tq = tp->t_rawq; + bzero(&tp->t_rawq, sizeof tp->t_rawq); + tp->t_rawq.c_cbmax = tq.c_cbmax; + tp->t_rawq.c_cbreserved = tq.c_cbreserved; + while ((c = getc(&tq)) >= 0) + ttyinput(c, tp); + CLR(tp->t_state, TS_TYPEN); +} + +/* + * Process a read call on a tty device. + */ +int +ttread(tp, uio, flag) + register struct tty *tp; + struct uio *uio; + int flag; +{ + register struct clist *qp; + register int c; + register tcflag_t lflag; + register cc_t *cc = tp->t_cc; + register struct proc *p = curproc; + int s, first, error = 0; + int has_stime = 0, last_cc = 0; + long slp = 0; /* XXX this should be renamed `timo'. */ + struct timeval stime; + +loop: + s = spltty(); + lflag = tp->t_lflag; + /* + * take pending input first + */ + if (ISSET(lflag, PENDIN)) { + ttypend(tp); + splx(s); /* reduce latency */ + s = spltty(); + lflag = tp->t_lflag; /* XXX ttypend() clobbers it */ + } + + /* + * Hang process if it's in the background. + */ + if (isbackground(p, tp)) { + splx(s); + if ((p->p_sigignore & sigmask(SIGTTIN)) || + (p->p_sigmask & sigmask(SIGTTIN)) || + p->p_flag & P_PPWAIT || p->p_pgrp->pg_jobc == 0) + return (EIO); + pgsignal(p->p_pgrp, SIGTTIN, 1); + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg2", 0); + if (error) + return (error); + goto loop; + } + + if (ISSET(tp->t_state, TS_ZOMBIE)) { + splx(s); + return (0); /* EOF */ + } + + /* + * If canonical, use the canonical queue, + * else use the raw queue. 
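+ * In canonical mode ttyinput() has already moved each completed
+ * line from t_rawq to t_canq, so t_canq holds only whole lines.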
+ * + * (should get rid of clists...) + */ + qp = ISSET(lflag, ICANON) ? &tp->t_canq : &tp->t_rawq; + + if (flag & IO_NDELAY) { + if (qp->c_cc > 0) + goto read; + if (!ISSET(lflag, ICANON) && cc[VMIN] == 0) { + splx(s); + return (0); + } + splx(s); + return (EWOULDBLOCK); + } + if (!ISSET(lflag, ICANON)) { + int m = cc[VMIN]; + long t = cc[VTIME]; + struct timeval timecopy; + + /* + * Check each of the four combinations. + * (m > 0 && t == 0) is the normal read case. + * It should be fairly efficient, so we check that and its + * companion case (m == 0 && t == 0) first. + * For the other two cases, we compute the target sleep time + * into slp. + */ + if (t == 0) { + if (qp->c_cc < m) + goto sleep; + if (qp->c_cc > 0) + goto read; + + /* m, t and qp->c_cc are all 0. 0 is enough input. */ + splx(s); + return (0); + } + t *= 100000; /* time in us */ +#define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 1000000 + \ + ((t1).tv_usec - (t2).tv_usec)) + if (m > 0) { + if (qp->c_cc <= 0) + goto sleep; + if (qp->c_cc >= m) + goto read; + getmicrotime(&timecopy); + if (!has_stime) { + /* first character, start timer */ + has_stime = 1; + stime = timecopy; + slp = t; + } else if (qp->c_cc > last_cc) { + /* got a character, restart timer */ + stime = timecopy; + slp = t; + } else { + /* nothing, check expiration */ + slp = t - diff(timecopy, stime); + if (slp <= 0) + goto read; + } + last_cc = qp->c_cc; + } else { /* m == 0 */ + if (qp->c_cc > 0) + goto read; + getmicrotime(&timecopy); + if (!has_stime) { + has_stime = 1; + stime = timecopy; + slp = t; + } else { + slp = t - diff(timecopy, stime); + if (slp <= 0) { + /* Timed out, but 0 is enough input. */ + splx(s); + return (0); + } + } + } +#undef diff + /* + * Rounding down may make us wake up just short + * of the target, so we round up. + * The formula is ceiling(slp * hz/1000000). + * 32-bit arithmetic is enough for hz < 169. + * XXX see tvtohz() for how to avoid overflow if hz + * is large (divide by `tick' and/or arrange to + * use tvtohz() if hz is large). + */ + slp = (long) (((u_long)slp * hz) + 999999) / 1000000; + goto sleep; + } + if (qp->c_cc <= 0) { +sleep: + /* + * There is no input, or not enough input and we can block. + */ + error = ttysleep(tp, TSA_HUP_OR_INPUT(tp), TTIPRI | PCATCH, + ISSET(tp->t_state, TS_CONNECTED) ? + "ttyin" : "ttyhup", (int)slp); + splx(s); + if (error == EWOULDBLOCK) + error = 0; + else if (error) + return (error); + /* + * XXX what happens if another process eats some input + * while we are asleep (not just here)? It would be + * safest to detect changes and reset our state variables + * (has_stime and last_cc). + */ + slp = 0; + goto loop; + } +read: + splx(s); + /* + * Input present, check for input mapping and processing. + */ + first = 1; + if (ISSET(lflag, ICANON | ISIG)) + goto slowcase; + for (;;) { + char ibuf[IBUFSIZ]; + int icc; + + icc = imin(uio->uio_resid, IBUFSIZ); + icc = q_to_b(qp, ibuf, icc); + if (icc <= 0) { + if (first) + goto loop; + break; + } + error = uiomove(ibuf, icc, uio); + /* + * XXX if there was an error then we should ungetc() the + * unmoved chars and reduce icc here. 
+ */ +#if NSNP > 0 + if (ISSET(tp->t_lflag, ECHO) && + ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpin((struct snoop *)tp->t_sc, ibuf, icc); +#endif + if (error) + break; + if (uio->uio_resid == 0) + break; + first = 0; + } + goto out; +slowcase: + for (;;) { + c = getc(qp); + if (c < 0) { + if (first) + goto loop; + break; + } + /* + * delayed suspend (^Y) + */ + if (CCEQ(cc[VDSUSP], c) && + ISSET(lflag, IEXTEN | ISIG) == (IEXTEN | ISIG)) { + pgsignal(tp->t_pgrp, SIGTSTP, 1); + if (first) { + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, + "ttybg3", 0); + if (error) + break; + goto loop; + } + break; + } + /* + * Interpret EOF only in canonical mode. + */ + if (CCEQ(cc[VEOF], c) && ISSET(lflag, ICANON)) + break; + /* + * Give user character. + */ + error = ureadc(c, uio); + if (error) + /* XXX should ungetc(c, qp). */ + break; +#if NSNP > 0 + /* + * Only snoop directly on input in echo mode. Non-echoed + * input will be snooped later iff the application echoes it. + */ + if (ISSET(tp->t_lflag, ECHO) && + ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpinc((struct snoop *)tp->t_sc, (char)c); +#endif + if (uio->uio_resid == 0) + break; + /* + * In canonical mode check for a "break character" + * marking the end of a "line of input". + */ + if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag)) + break; + first = 0; + } + +out: + /* + * Look to unblock input now that (presumably) + * the input queue has gone down. + */ + s = spltty(); + if (ISSET(tp->t_state, TS_TBLOCK) && + tp->t_rawq.c_cc + tp->t_canq.c_cc <= tp->t_ilowat) + ttyunblock(tp); + splx(s); + + return (error); +} + +/* + * Check the output queue on tp for space for a kernel message (from uprintf + * or tprintf). Allow some space over the normal hiwater mark so we don't + * lose messages due to normal flow control, but don't let the tty run amok. + * Sleeps here are not interruptible, but we return prematurely if new signals + * arrive. + */ +int +ttycheckoutq(tp, wait) + register struct tty *tp; + int wait; +{ + int hiwat, s, oldsig; + + hiwat = tp->t_ohiwat; + s = spltty(); + oldsig = wait ? curproc->p_siglist : 0; + if (tp->t_outq.c_cc > hiwat + OBUFSIZ + 100) + while (tp->t_outq.c_cc > hiwat) { + ttstart(tp); + if (tp->t_outq.c_cc <= hiwat) + break; + if (wait == 0 || curproc->p_siglist != oldsig) { + splx(s); + return (0); + } + SET(tp->t_state, TS_SO_OLOWAT); + tsleep(TSA_OLOWAT(tp), PZERO - 1, "ttoutq", hz); + } + splx(s); + return (1); +} + +/* + * Process a write call on a tty device. + */ +int +ttwrite(tp, uio, flag) + register struct tty *tp; + register struct uio *uio; + int flag; +{ + register char *cp = NULL; + register int cc, ce; + register struct proc *p; + int i, hiwat, cnt, error, s; + char obuf[OBUFSIZ]; + + hiwat = tp->t_ohiwat; + cnt = uio->uio_resid; + error = 0; + cc = 0; +loop: + s = spltty(); + if (ISSET(tp->t_state, TS_ZOMBIE)) { + splx(s); + if (uio->uio_resid == cnt) + error = EIO; + goto out; + } + if (!ISSET(tp->t_state, TS_CONNECTED)) { + if (flag & IO_NDELAY) { + splx(s); + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, + "ttydcd", 0); + splx(s); + if (error) + goto out; + goto loop; + } + splx(s); + /* + * Hang the process if it's in the background. 
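+ * (A background write draws SIGTTOU only if TOSTOP is set and the
+ * signal is neither ignored nor blocked and the writer is not in the
+ * middle of a vfork(); a write from an orphaned process group fails
+ * with EIO instead of stopping.)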
+ */ + p = curproc; + if (isbackground(p, tp) && + ISSET(tp->t_lflag, TOSTOP) && (p->p_flag & P_PPWAIT) == 0 && + (p->p_sigignore & sigmask(SIGTTOU)) == 0 && + (p->p_sigmask & sigmask(SIGTTOU)) == 0) { + if (p->p_pgrp->pg_jobc == 0) { + error = EIO; + goto out; + } + pgsignal(p->p_pgrp, SIGTTOU, 1); + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg4", 0); + if (error) + goto out; + goto loop; + } + /* + * Process the user's data in at most OBUFSIZ chunks. Perform any + * output translation. Keep track of high water mark, sleep on + * overflow awaiting device aid in acquiring new space. + */ + while (uio->uio_resid > 0 || cc > 0) { + if (ISSET(tp->t_lflag, FLUSHO)) { + uio->uio_resid = 0; + return (0); + } + if (tp->t_outq.c_cc > hiwat) + goto ovhiwat; + /* + * Grab a hunk of data from the user, unless we have some + * leftover from last time. + */ + if (cc == 0) { + cc = imin(uio->uio_resid, OBUFSIZ); + cp = obuf; + error = uiomove(cp, cc, uio); + if (error) { + cc = 0; + break; + } +#if NSNP > 0 + if (ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpin((struct snoop *)tp->t_sc, cp, cc); +#endif + } + /* + * If nothing fancy need be done, grab those characters we + * can handle without any of ttyoutput's processing and + * just transfer them to the output q. For those chars + * which require special processing (as indicated by the + * bits in char_type), call ttyoutput. After processing + * a hunk of data, look for FLUSHO so ^O's will take effect + * immediately. + */ + while (cc > 0) { + if (!ISSET(tp->t_oflag, OPOST)) + ce = cc; + else { + ce = cc - scanc((u_int)cc, (u_char *)cp, + char_type, CCLASSMASK); + /* + * If ce is zero, then we're processing + * a special character through ttyoutput. + */ + if (ce == 0) { + tp->t_rocount = 0; + if (ttyoutput(*cp, tp) >= 0) { + /* No Clists, wait a bit. */ + ttstart(tp); + if (flag & IO_NDELAY) { + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, &lbolt, + TTOPRI|PCATCH, + "ttybf1", 0); + if (error) + goto out; + goto loop; + } + cp++; + cc--; + if (ISSET(tp->t_lflag, FLUSHO) || + tp->t_outq.c_cc > hiwat) + goto ovhiwat; + continue; + } + } + /* + * A bunch of normal characters have been found. + * Transfer them en masse to the output queue and + * continue processing at the top of the loop. + * If there are any further characters in this + * <= OBUFSIZ chunk, the first should be a character + * requiring special handling by ttyoutput. + */ + tp->t_rocount = 0; + i = b_to_q(cp, ce, &tp->t_outq); + ce -= i; + tp->t_column += ce; + cp += ce, cc -= ce, tk_nout += ce; + tp->t_outcc += ce; + if (i > 0) { + /* No Clists, wait a bit. */ + ttstart(tp); + if (flag & IO_NDELAY) { + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, + "ttybf2", 0); + if (error) + goto out; + goto loop; + } + if (ISSET(tp->t_lflag, FLUSHO) || + tp->t_outq.c_cc > hiwat) + break; + } + ttstart(tp); + } +out: + /* + * If cc is nonzero, we leave the uio structure inconsistent, as the + * offset and iov pointers have moved forward, but it doesn't matter + * (the call will either return short or restart with a new uio). + */ + uio->uio_resid += cc; + return (error); + +ovhiwat: + ttstart(tp); + s = spltty(); + /* + * This can only occur if FLUSHO is set in t_lflag, + * or if ttstart/oproc is synchronous (or very fast). + */ + if (tp->t_outq.c_cc <= hiwat) { + splx(s); + goto loop; + } + if (flag & IO_NDELAY) { + splx(s); + uio->uio_resid += cc; + return (uio->uio_resid == cnt ? 
EWOULDBLOCK : 0); + } + SET(tp->t_state, TS_SO_OLOWAT); + error = ttysleep(tp, TSA_OLOWAT(tp), TTOPRI | PCATCH, "ttywri", + tp->t_timeout); + splx(s); + if (error == EWOULDBLOCK) + error = EIO; + if (error) + goto out; + goto loop; +} + +/* + * Rubout one character from the rawq of tp + * as cleanly as possible. + */ +static void +ttyrub(c, tp) + register int c; + register struct tty *tp; +{ + register char *cp; + register int savecol; + int tabc, s; + + if (!ISSET(tp->t_lflag, ECHO) || ISSET(tp->t_lflag, EXTPROC)) + return; + CLR(tp->t_lflag, FLUSHO); + if (ISSET(tp->t_lflag, ECHOE)) { + if (tp->t_rocount == 0) { + /* + * Screwed by ttwrite; retype + */ + ttyretype(tp); + return; + } + if (c == ('\t' | TTY_QUOTE) || c == ('\n' | TTY_QUOTE)) + ttyrubo(tp, 2); + else { + CLR(c, ~TTY_CHARMASK); + switch (CCLASS(c)) { + case ORDINARY: + ttyrubo(tp, 1); + break; + case BACKSPACE: + case CONTROL: + case NEWLINE: + case RETURN: + case VTAB: + if (ISSET(tp->t_lflag, ECHOCTL)) + ttyrubo(tp, 2); + break; + case TAB: + if (tp->t_rocount < tp->t_rawq.c_cc) { + ttyretype(tp); + return; + } + s = spltty(); + savecol = tp->t_column; + SET(tp->t_state, TS_CNTTB); + SET(tp->t_lflag, FLUSHO); + tp->t_column = tp->t_rocol; + cp = tp->t_rawq.c_cf; + if (cp) + tabc = *cp; /* XXX FIX NEXTC */ + for (; cp; cp = nextc(&tp->t_rawq, cp, &tabc)) + ttyecho(tabc, tp); + CLR(tp->t_lflag, FLUSHO); + CLR(tp->t_state, TS_CNTTB); + splx(s); + + /* savecol will now be length of the tab. */ + savecol -= tp->t_column; + tp->t_column += savecol; + if (savecol > 8) + savecol = 8; /* overflow screw */ + while (--savecol >= 0) + (void)ttyoutput('\b', tp); + break; + default: /* XXX */ +#define PANICSTR "ttyrub: would panic c = %d, val = %d\n" + (void)printf(PANICSTR, c, CCLASS(c)); +#ifdef notdef + panic(PANICSTR, c, CCLASS(c)); +#endif + } + } + } else if (ISSET(tp->t_lflag, ECHOPRT)) { + if (!ISSET(tp->t_state, TS_ERASE)) { + SET(tp->t_state, TS_ERASE); + (void)ttyoutput('\\', tp); + } + ttyecho(c, tp); + } else + ttyecho(tp->t_cc[VERASE], tp); + --tp->t_rocount; +} + +/* + * Back over cnt characters, erasing them. + */ +static void +ttyrubo(tp, cnt) + register struct tty *tp; + int cnt; +{ + + while (cnt-- > 0) { + (void)ttyoutput('\b', tp); + (void)ttyoutput(' ', tp); + (void)ttyoutput('\b', tp); + } +} + +/* + * ttyretype -- + * Reprint the rawq line. Note, it is assumed that c_cc has already + * been checked. + */ +static void +ttyretype(tp) + register struct tty *tp; +{ + register char *cp; + int s, c; + + /* Echo the reprint character. */ + if (tp->t_cc[VREPRINT] != _POSIX_VDISABLE) + ttyecho(tp->t_cc[VREPRINT], tp); + + (void)ttyoutput('\n', tp); + + /* + * XXX + * FIX: NEXTC IS BROKEN - DOESN'T CHECK QUOTE + * BIT OF FIRST CHAR. + */ + s = spltty(); + for (cp = tp->t_canq.c_cf, c = (cp != NULL ? *cp : 0); + cp != NULL; cp = nextc(&tp->t_canq, cp, &c)) + ttyecho(c, tp); + for (cp = tp->t_rawq.c_cf, c = (cp != NULL ? *cp : 0); + cp != NULL; cp = nextc(&tp->t_rawq, cp, &c)) + ttyecho(c, tp); + CLR(tp->t_state, TS_ERASE); + splx(s); + + tp->t_rocount = tp->t_rawq.c_cc; + tp->t_rocol = 0; +} + +/* + * Echo a typed character to the terminal. 
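+ *
+ * With ECHOCTL set, control characters are echoed in caret notation:
+ * e.g. 0x03 (ETX) comes out as "^C" and 0x7f (DEL) as "^?"; tab and
+ * newline are passed through unchanged.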
+ */ +static void +ttyecho(c, tp) + register int c; + register struct tty *tp; +{ + + if (!ISSET(tp->t_state, TS_CNTTB)) + CLR(tp->t_lflag, FLUSHO); + if ((!ISSET(tp->t_lflag, ECHO) && + (c != '\n' || !ISSET(tp->t_lflag, ECHONL))) || + ISSET(tp->t_lflag, EXTPROC)) + return; + if (ISSET(tp->t_lflag, ECHOCTL) && + ((ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n') || + ISSET(c, TTY_CHARMASK) == 0177)) { + (void)ttyoutput('^', tp); + CLR(c, ~TTY_CHARMASK); + if (c == 0177) + c = '?'; + else + c += 'A' - 1; + } + (void)ttyoutput(c, tp); +} + +/* + * Wake up any readers on a tty. + */ +void +ttwakeup(tp) + register struct tty *tp; +{ + + if (tp->t_rsel.si_pid != 0) + selwakeup(&tp->t_rsel); + if (ISSET(tp->t_state, TS_ASYNC) && tp->t_sigio != NULL) + pgsigio(tp->t_sigio, SIGIO, (tp->t_session != NULL)); + wakeup(TSA_HUP_OR_INPUT(tp)); +} + +/* + * Wake up any writers on a tty. + */ +void +ttwwakeup(tp) + register struct tty *tp; +{ + + if (tp->t_wsel.si_pid != 0 && tp->t_outq.c_cc <= tp->t_olowat) + selwakeup(&tp->t_wsel); + if (ISSET(tp->t_state, TS_BUSY | TS_SO_OCOMPLETE) == + TS_SO_OCOMPLETE && tp->t_outq.c_cc == 0) { + CLR(tp->t_state, TS_SO_OCOMPLETE); + wakeup(TSA_OCOMPLETE(tp)); + } + if (ISSET(tp->t_state, TS_SO_OLOWAT) && + tp->t_outq.c_cc <= tp->t_olowat) { + CLR(tp->t_state, TS_SO_OLOWAT); + wakeup(TSA_OLOWAT(tp)); + } +} + +/* + * Look up a code for a specified speed in a conversion table; + * used by drivers to map software speed values to hardware parameters. + */ +int +ttspeedtab(speed, table) + int speed; + register struct speedtab *table; +{ + + for ( ; table->sp_speed != -1; table++) + if (table->sp_speed == speed) + return (table->sp_code); + return (-1); +} + +/* + * Set input and output watermarks and buffer sizes. For input, the + * high watermark is about one second's worth of input above empty, the + * low watermark is slightly below high water, and the buffer size is a + * driver-dependent amount above high water. For output, the watermarks + * are near the ends of the buffer, with about 1 second's worth of input + * between them. All this only applies to the standard line discipline. + */ +void +ttsetwater(tp) + struct tty *tp; +{ + register int cps, ttmaxhiwat, x; + + /* Input. */ + clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512); + switch (tp->t_ispeedwat) { + case (speed_t)-1: + cps = tp->t_ispeed / 10; + break; + case 0: + /* + * This case is for old drivers that don't know about + * t_ispeedwat. Arrange for them to get the old buffer + * sizes and watermarks. + */ + cps = TTYHOG - 2 * 256; + tp->t_ififosize = 2 * 256; + break; + default: + cps = tp->t_ispeedwat / 10; + break; + } + tp->t_ihiwat = cps; + tp->t_ilowat = 7 * cps / 8; + x = cps + tp->t_ififosize; + clist_alloc_cblocks(&tp->t_rawq, x, x); + + /* Output. */ + switch (tp->t_ospeedwat) { + case (speed_t)-1: + cps = tp->t_ospeed / 10; + ttmaxhiwat = 2 * TTMAXHIWAT; + break; + case 0: + cps = tp->t_ospeed / 10; + ttmaxhiwat = TTMAXHIWAT; + break; + default: + cps = tp->t_ospeedwat / 10; + ttmaxhiwat = 8 * TTMAXHIWAT; + break; + } +#define CLAMP(x, h, l) ((x) > h ? h : ((x) < l) ? l : (x)) + tp->t_olowat = x = CLAMP(cps / 2, TTMAXLOWAT, TTMINLOWAT); + x += cps; + x = CLAMP(x, ttmaxhiwat, TTMINHIWAT); /* XXX clamps are too magic */ + tp->t_ohiwat = roundup(x, CBSIZE); /* XXX for compat */ + x = imax(tp->t_ohiwat, TTMAXHIWAT); /* XXX for compat/safety */ + x += OBUFSIZ + 100; + clist_alloc_cblocks(&tp->t_outq, x, x); +#undef CLAMP +} + +/* + * Report on state of foreground process group. 
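+ *
+ * The status line written to the tty (typically in response to the
+ * VSTATUS character, ^T by default) looks roughly like
+ *
+ *	load: 0.42  cmd: cc 1234 [running] 1.23u 0.45s 3% 212k
+ *
+ * i.e. the load average, then the "interesting" process chosen by
+ * proc_compare() with its state, user/system time, %cpu and resident
+ * set size.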
+ */ +void +ttyinfo(tp) + register struct tty *tp; +{ + register struct proc *p, *pick; + struct timeval utime, stime; + int tmp; + + if (ttycheckoutq(tp,0) == 0) + return; + + /* Print load average. */ + tmp = (averunnable.ldavg[0] * 100 + FSCALE / 2) >> FSHIFT; + ttyprintf(tp, "load: %d.%02d ", tmp / 100, tmp % 100); + + if (tp->t_session == NULL) + ttyprintf(tp, "not a controlling terminal\n"); + else if (tp->t_pgrp == NULL) + ttyprintf(tp, "no foreground process group\n"); + else if ((p = tp->t_pgrp->pg_members.lh_first) == 0) + ttyprintf(tp, "empty foreground process group\n"); + else { + /* Pick interesting process. */ + for (pick = NULL; p != 0; p = p->p_pglist.le_next) + if (proc_compare(pick, p)) + pick = p; + + ttyprintf(tp, " cmd: %s %d [%s] ", pick->p_comm, pick->p_pid, + pick->p_stat == SRUN ? "running" : + pick->p_wmesg ? pick->p_wmesg : "iowait"); + + calcru(pick, &utime, &stime, NULL); + + /* Print user time. */ + ttyprintf(tp, "%ld.%02ldu ", + utime.tv_sec, utime.tv_usec / 10000); + + /* Print system time. */ + ttyprintf(tp, "%ld.%02lds ", + stime.tv_sec, stime.tv_usec / 10000); + +#define pgtok(a) (((a) * PAGE_SIZE) / 1024) + /* Print percentage cpu, resident set size. */ + tmp = (pick->p_pctcpu * 10000 + FSCALE / 2) >> FSHIFT; + ttyprintf(tp, "%d%% %ldk\n", + tmp / 100, + pick->p_stat == SIDL || pick->p_stat == SZOMB ? 0 : +#ifdef pmap_resident_count + (long)pgtok(pmap_resident_count(&pick->p_vmspace->vm_pmap)) +#else + (long)pgtok(pick->p_vmspace->vm_rssize) +#endif + ); + } + tp->t_rocount = 0; /* so pending input will be retyped if BS */ +} + +/* + * Returns 1 if p2 is "better" than p1 + * + * The algorithm for picking the "interesting" process is thus: + * + * 1) Only foreground processes are eligible - implied. + * 2) Runnable processes are favored over anything else. The runner + * with the highest cpu utilization is picked (p_estcpu). Ties are + * broken by picking the highest pid. + * 3) The sleeper with the shortest sleep time is next. With ties, + * we pick out just "short-term" sleepers (P_SINTR == 0). + * 4) Further ties are broken by picking the highest pid. + */ +#define ISRUN(p) (((p)->p_stat == SRUN) || ((p)->p_stat == SIDL)) +#define TESTAB(a, b) ((a)<<1 | (b)) +#define ONLYA 2 +#define ONLYB 1 +#define BOTH 3 + +static int +proc_compare(p1, p2) + register struct proc *p1, *p2; +{ + + if (p1 == NULL) + return (1); + /* + * see if at least one of them is runnable + */ + switch (TESTAB(ISRUN(p1), ISRUN(p2))) { + case ONLYA: + return (0); + case ONLYB: + return (1); + case BOTH: + /* + * tie - favor one with highest recent cpu utilization + */ + if (p2->p_estcpu > p1->p_estcpu) + return (1); + if (p1->p_estcpu > p2->p_estcpu) + return (0); + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + } + /* + * weed out zombies + */ + switch (TESTAB(p1->p_stat == SZOMB, p2->p_stat == SZOMB)) { + case ONLYA: + return (1); + case ONLYB: + return (0); + case BOTH: + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ + } + /* + * pick the one with the smallest sleep time + */ + if (p2->p_slptime > p1->p_slptime) + return (0); + if (p1->p_slptime > p2->p_slptime) + return (1); + /* + * favor one sleeping in a non-interruptible sleep + */ + if (p1->p_flag & P_SINTR && (p2->p_flag & P_SINTR) == 0) + return (1); + if (p2->p_flag & P_SINTR && (p1->p_flag & P_SINTR) == 0) + return (0); + return (p2->p_pid > p1->p_pid); /* tie - return highest pid */ +} + +/* + * Output char to tty; console putchar style. 
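+ *
+ * A '\n' is expanded to "\r\n" on the way out and output is kicked
+ * off immediately; the call fails with -1 if the line has lost
+ * carrier (TS_CONNECTED clear).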
+ */ +int +tputchar(c, tp) + int c; + struct tty *tp; +{ + register int s; + + s = spltty(); + if (!ISSET(tp->t_state, TS_CONNECTED)) { + splx(s); + return (-1); + } + if (c == '\n') + (void)ttyoutput('\r', tp); + (void)ttyoutput(c, tp); + ttstart(tp); + splx(s); + return (0); +} + +/* + * Sleep on chan, returning ERESTART if tty changed while we napped and + * returning any errors (e.g. EINTR/EWOULDBLOCK) reported by tsleep. If + * the tty is revoked, restarting a pending call will redo validation done + * at the start of the call. + */ +int +ttysleep(tp, chan, pri, wmesg, timo) + struct tty *tp; + void *chan; + int pri, timo; + char *wmesg; +{ + int error; + int gen; + + gen = tp->t_gen; + error = tsleep(chan, pri, wmesg, timo); + if (error) + return (error); + return (tp->t_gen == gen ? 0 : ERESTART); +} + +#ifdef notyet +/* + * XXX this is usable not useful or used. Most tty drivers have + * ifdefs for using ttymalloc() but assume a different interface. + */ +/* + * Allocate a tty struct. Clists in the struct will be allocated by + * ttyopen(). + */ +struct tty * +ttymalloc() +{ + struct tty *tp; + + tp = malloc(sizeof *tp, M_TTYS, M_WAITOK); + bzero(tp, sizeof *tp); + return (tp); +} +#endif + +#if 0 /* XXX not yet usable: session leader holds a ref (see kern_exit.c). */ +/* + * Free a tty struct. Clists in the struct should have been freed by + * ttyclose(). + */ +void +ttyfree(tp) + struct tty *tp; +{ + free(tp, M_TTYS); +} +#endif /* 0 */ diff --git a/sys/kern/tty_compat.c b/sys/kern/tty_compat.c new file mode 100644 index 0000000..fa2ae5c --- /dev/null +++ b/sys/kern/tty_compat.c @@ -0,0 +1,490 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tty_compat.c 8.1 (Berkeley) 6/10/93 + * $Id: tty_compat.c,v 1.27 1998/02/25 06:16:37 bde Exp $ + */ + +#include "opt_compat.h" + +/* + * mapping routines for old line discipline (yuck) + */ +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/ioctl_compat.h> +#include <sys/tty.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> + +static int ttcompatgetflags __P((struct tty *tp)); +static void ttcompatsetflags __P((struct tty *tp, struct termios *t)); +static void ttcompatsetlflags __P((struct tty *tp, struct termios *t)); +static int ttcompatspeedtab __P((int speed, struct speedtab *table)); + +static int ttydebug = 0; +SYSCTL_INT(_debug, OID_AUTO, ttydebug, CTLFLAG_RW, &ttydebug, 0, ""); + +static struct speedtab compatspeeds[] = { +#define MAX_SPEED 17 + { 115200, 17 }, + { 57600, 16 }, + { 38400, 15 }, + { 19200, 14 }, + { 9600, 13 }, + { 4800, 12 }, + { 2400, 11 }, + { 1800, 10 }, + { 1200, 9 }, + { 600, 8 }, + { 300, 7 }, + { 200, 6 }, + { 150, 5 }, + { 134, 4 }, + { 110, 3 }, + { 75, 2 }, + { 50, 1 }, + { 0, 0 }, + { -1, -1 }, +}; +static int compatspcodes[] = { + 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, + 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200, +}; + +static int +ttcompatspeedtab(speed, table) + int speed; + register struct speedtab *table; +{ + if (speed == 0) + return (0); /* hangup */ + for ( ; table->sp_speed > 0; table++) + if (table->sp_speed <= speed) /* nearest one, rounded down */ + return (table->sp_code); + return (1); /* 50, min and not hangup */ +} + +int +ttsetcompat(tp, com, data, term) + register struct tty *tp; + u_long *com; + caddr_t data; + struct termios *term; +{ + switch (*com) { + case TIOCSETP: + case TIOCSETN: { + register struct sgttyb *sg = (struct sgttyb *)data; + int speed; + + if ((speed = sg->sg_ispeed) > MAX_SPEED || speed < 0) + return(EINVAL); + else if (speed != ttcompatspeedtab(tp->t_ispeed, compatspeeds)) + term->c_ispeed = compatspcodes[speed]; + else + term->c_ispeed = tp->t_ispeed; + if ((speed = sg->sg_ospeed) > MAX_SPEED || speed < 0) + return(EINVAL); + else if (speed != ttcompatspeedtab(tp->t_ospeed, compatspeeds)) + term->c_ospeed = compatspcodes[speed]; + else + term->c_ospeed = tp->t_ospeed; + term->c_cc[VERASE] = sg->sg_erase; + term->c_cc[VKILL] = sg->sg_kill; + tp->t_flags = (tp->t_flags&0xffff0000) | (sg->sg_flags&0xffff); + ttcompatsetflags(tp, term); + *com = (*com == TIOCSETP) ? 
TIOCSETAF : TIOCSETA; + break; + } + case TIOCSETC: { + struct tchars *tc = (struct tchars *)data; + register cc_t *cc; + + cc = term->c_cc; + cc[VINTR] = tc->t_intrc; + cc[VQUIT] = tc->t_quitc; + cc[VSTART] = tc->t_startc; + cc[VSTOP] = tc->t_stopc; + cc[VEOF] = tc->t_eofc; + cc[VEOL] = tc->t_brkc; + if (tc->t_brkc == -1) + cc[VEOL2] = _POSIX_VDISABLE; + *com = TIOCSETA; + break; + } + case TIOCSLTC: { + struct ltchars *ltc = (struct ltchars *)data; + register cc_t *cc; + + cc = term->c_cc; + cc[VSUSP] = ltc->t_suspc; + cc[VDSUSP] = ltc->t_dsuspc; + cc[VREPRINT] = ltc->t_rprntc; + cc[VDISCARD] = ltc->t_flushc; + cc[VWERASE] = ltc->t_werasc; + cc[VLNEXT] = ltc->t_lnextc; + *com = TIOCSETA; + break; + } + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: + if (*com == TIOCLSET) + tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16; + else { + tp->t_flags = + (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff); + if (*com == TIOCLBIS) + tp->t_flags |= *(int *)data<<16; + else + tp->t_flags &= ~(*(int *)data<<16); + } + ttcompatsetlflags(tp, term); + *com = TIOCSETA; + break; + } + return 0; +} + +/*ARGSUSED*/ +int +ttcompat(tp, com, data, flag) + register struct tty *tp; + u_long com; + caddr_t data; + int flag; +{ + switch (com) { + case TIOCSETP: + case TIOCSETN: + case TIOCSETC: + case TIOCSLTC: + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: { + struct termios term; + int error; + + term = tp->t_termios; + if ((error = ttsetcompat(tp, &com, data, &term)) != 0) + return error; + return ttioctl(tp, com, &term, flag); + } + case TIOCGETP: { + register struct sgttyb *sg = (struct sgttyb *)data; + register cc_t *cc = tp->t_cc; + + sg->sg_ospeed = ttcompatspeedtab(tp->t_ospeed, compatspeeds); + if (tp->t_ispeed == 0) + sg->sg_ispeed = sg->sg_ospeed; + else + sg->sg_ispeed = ttcompatspeedtab(tp->t_ispeed, compatspeeds); + sg->sg_erase = cc[VERASE]; + sg->sg_kill = cc[VKILL]; + sg->sg_flags = tp->t_flags = ttcompatgetflags(tp); + break; + } + case TIOCGETC: { + struct tchars *tc = (struct tchars *)data; + register cc_t *cc = tp->t_cc; + + tc->t_intrc = cc[VINTR]; + tc->t_quitc = cc[VQUIT]; + tc->t_startc = cc[VSTART]; + tc->t_stopc = cc[VSTOP]; + tc->t_eofc = cc[VEOF]; + tc->t_brkc = cc[VEOL]; + break; + } + case TIOCGLTC: { + struct ltchars *ltc = (struct ltchars *)data; + register cc_t *cc = tp->t_cc; + + ltc->t_suspc = cc[VSUSP]; + ltc->t_dsuspc = cc[VDSUSP]; + ltc->t_rprntc = cc[VREPRINT]; + ltc->t_flushc = cc[VDISCARD]; + ltc->t_werasc = cc[VWERASE]; + ltc->t_lnextc = cc[VLNEXT]; + break; + } + case TIOCLGET: + tp->t_flags = + (ttcompatgetflags(tp) & 0xffff0000UL) + | (tp->t_flags & 0xffff); + *(int *)data = tp->t_flags>>16; + if (ttydebug) + printf("CLGET: returning %x\n", *(int *)data); + break; + + case OTIOCGETD: + *(int *)data = tp->t_line ? tp->t_line : 2; + break; + + case OTIOCSETD: { + int ldisczero = 0; + + return (ttioctl(tp, TIOCSETD, + *(int *)data == 2 ? 
(caddr_t)&ldisczero : data, flag)); + } + + case OTIOCCONS: + *(int *)data = 1; + return (ttioctl(tp, TIOCCONS, data, flag)); + + default: + return (ENOIOCTL); + } + return (0); +} + +static int +ttcompatgetflags(tp) + register struct tty *tp; +{ + register tcflag_t iflag = tp->t_iflag; + register tcflag_t lflag = tp->t_lflag; + register tcflag_t oflag = tp->t_oflag; + register tcflag_t cflag = tp->t_cflag; + register int flags = 0; + + if (iflag&IXOFF) + flags |= TANDEM; + if (iflag&ICRNL || oflag&ONLCR) + flags |= CRMOD; + if ((cflag&CSIZE) == CS8) { + flags |= PASS8; + if (iflag&ISTRIP) + flags |= ANYP; + } + else if (cflag&PARENB) { + if (iflag&INPCK) { + if (cflag&PARODD) + flags |= ODDP; + else + flags |= EVENP; + } else + flags |= EVENP | ODDP; + } + + if ((lflag&ICANON) == 0) { + /* fudge */ + if (iflag&(INPCK|ISTRIP|IXON) || lflag&(IEXTEN|ISIG) + || (cflag&(CSIZE|PARENB)) != CS8) + flags |= CBREAK; + else + flags |= RAW; + } + if (!(flags&RAW) && !(oflag&OPOST) && (cflag&(CSIZE|PARENB)) == CS8) + flags |= LITOUT; + if (cflag&MDMBUF) + flags |= MDMBUF; + if ((cflag&HUPCL) == 0) + flags |= NOHANG; + if (oflag&OXTABS) + flags |= XTABS; + if (lflag&ECHOE) + flags |= CRTERA|CRTBS; + if (lflag&ECHOKE) + flags |= CRTKIL|CRTBS; + if (lflag&ECHOPRT) + flags |= PRTERA; + if (lflag&ECHOCTL) + flags |= CTLECH; + if ((iflag&IXANY) == 0) + flags |= DECCTQ; + flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH); + if (ttydebug) + printf("getflags: %x\n", flags); + return (flags); +} + +static void +ttcompatsetflags(tp, t) + register struct tty *tp; + register struct termios *t; +{ + register int flags = tp->t_flags; + register tcflag_t iflag = t->c_iflag; + register tcflag_t oflag = t->c_oflag; + register tcflag_t lflag = t->c_lflag; + register tcflag_t cflag = t->c_cflag; + + if (flags & RAW) { + iflag = IGNBRK; + lflag &= ~(ECHOCTL|ISIG|ICANON|IEXTEN); + } else { + iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR); + iflag |= BRKINT|IXON|IMAXBEL; + lflag |= ISIG|IEXTEN|ECHOCTL; /* XXX was echoctl on ? */ + if (flags & XTABS) + oflag |= OXTABS; + else + oflag &= ~OXTABS; + if (flags & CBREAK) + lflag &= ~ICANON; + else + lflag |= ICANON; + if (flags&CRMOD) { + iflag |= ICRNL; + oflag |= ONLCR; + } else { + iflag &= ~ICRNL; + oflag &= ~ONLCR; + } + } + if (flags&ECHO) + lflag |= ECHO; + else + lflag &= ~ECHO; + + cflag &= ~(CSIZE|PARENB); + if (flags&(RAW|LITOUT|PASS8)) { + cflag |= CS8; + if (!(flags&(RAW|PASS8)) + || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP)) + iflag |= ISTRIP; + else + iflag &= ~ISTRIP; + if (flags&(RAW|LITOUT)) + oflag &= ~OPOST; + else + oflag |= OPOST; + } else { + cflag |= CS7|PARENB; + iflag |= ISTRIP; + oflag |= OPOST; + } + /* XXX don't set INPCK if RAW or PASS8? 
*/ + if ((flags&(EVENP|ODDP)) == EVENP) { + iflag |= INPCK; + cflag &= ~PARODD; + } else if ((flags&(EVENP|ODDP)) == ODDP) { + iflag |= INPCK; + cflag |= PARODD; + } else + iflag &= ~INPCK; + if (flags&TANDEM) + iflag |= IXOFF; + else + iflag &= ~IXOFF; + if ((flags&DECCTQ) == 0) + iflag |= IXANY; + else + iflag &= ~IXANY; + t->c_iflag = iflag; + t->c_oflag = oflag; + t->c_lflag = lflag; + t->c_cflag = cflag; +} + +static void +ttcompatsetlflags(tp, t) + register struct tty *tp; + register struct termios *t; +{ + register int flags = tp->t_flags; + register tcflag_t iflag = t->c_iflag; + register tcflag_t oflag = t->c_oflag; + register tcflag_t lflag = t->c_lflag; + register tcflag_t cflag = t->c_cflag; + + iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR); + if (flags&CRTERA) + lflag |= ECHOE; + else + lflag &= ~ECHOE; + if (flags&CRTKIL) + lflag |= ECHOKE; + else + lflag &= ~ECHOKE; + if (flags&PRTERA) + lflag |= ECHOPRT; + else + lflag &= ~ECHOPRT; + if (flags&CTLECH) + lflag |= ECHOCTL; + else + lflag &= ~ECHOCTL; + if (flags&TANDEM) + iflag |= IXOFF; + else + iflag &= ~IXOFF; + if ((flags&DECCTQ) == 0) + iflag |= IXANY; + else + iflag &= ~IXANY; + if (flags & MDMBUF) + cflag |= MDMBUF; + else + cflag &= ~MDMBUF; + if (flags&NOHANG) + cflag &= ~HUPCL; + else + cflag |= HUPCL; + lflag &= ~(TOSTOP|FLUSHO|PENDIN|NOFLSH); + lflag |= flags&(TOSTOP|FLUSHO|PENDIN|NOFLSH); + + /* + * The next if-else statement is copied from above so don't bother + * checking it separately. We could avoid fiddlling with the + * character size if the mode is already RAW or if neither the + * LITOUT bit or the PASS8 bit is being changed, but the delta of + * the change is not available here and skipping the RAW case would + * make the code different from above. + */ + cflag &= ~(CSIZE|PARENB); + if (flags&(RAW|LITOUT|PASS8)) { + cflag |= CS8; + if (!(flags&(RAW|PASS8)) + || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP)) + iflag |= ISTRIP; + else + iflag &= ~ISTRIP; + if (flags&(RAW|LITOUT)) + oflag &= ~OPOST; + else + oflag |= OPOST; + } else { + cflag |= CS7|PARENB; + iflag |= ISTRIP; + oflag |= OPOST; + } + t->c_iflag = iflag; + t->c_oflag = oflag; + t->c_lflag = lflag; + t->c_cflag = cflag; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ diff --git a/sys/kern/tty_conf.c b/sys/kern/tty_conf.c new file mode 100644 index 0000000..12f26e0 --- /dev/null +++ b/sys/kern/tty_conf.c @@ -0,0 +1,210 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_conf.c 8.4 (Berkeley) 1/21/94 + * $Id: tty_conf.c,v 1.12 1997/12/16 17:40:27 eivind Exp $ + */ + +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/tty.h> +#include <sys/conf.h> + +#ifndef MAXLDISC +#define MAXLDISC 8 +#endif + +static l_open_t l_noopen; +static l_close_t l_noclose; +static l_ioctl_t l_nullioctl; +static l_rint_t l_norint; +static l_start_t l_nostart; + +/* + * XXX it probably doesn't matter what the entries other than the l_open + * entry are here. The l_nullioctl and ttymodem entries still look fishy. + * Reconsider the removal of nullmodem anyway. It was too much like + * ttymodem, but a completely null version might be useful. + */ +#define NODISC(n) \ + { l_noopen, l_noclose, l_noread, l_nowrite, \ + l_nullioctl, l_norint, l_nostart, ttymodem } + +struct linesw linesw[MAXLDISC] = +{ + /* 0- termios */ + { ttyopen, ttylclose, ttread, ttwrite, + l_nullioctl, ttyinput, ttstart, ttymodem }, + NODISC(1), /* 1- defunct */ + /* 2- NTTYDISC */ +#ifdef COMPAT_43 + { ttyopen, ttylclose, ttread, ttwrite, + l_nullioctl, ttyinput, ttstart, ttymodem }, +#else + NODISC(2), +#endif + NODISC(3), /* TABLDISC */ + NODISC(4), /* SLIPDISC */ + NODISC(5), /* PPPDISC */ + NODISC(6), /* loadable */ + NODISC(7), /* loadable */ +}; + +int nlinesw = sizeof (linesw) / sizeof (linesw[0]); + +static struct linesw nodisc = NODISC(0); + +#define LOADABLE_LDISC 6 +/* + * ldisc_register: Register a line discipline. + * + * discipline: Index for discipline to load, or LDISC_LOAD for us to choose. + * linesw_p: Pointer to linesw_p. + * + * Returns: Index used or -1 on failure. + */ +int +ldisc_register(discipline, linesw_p) + int discipline; + struct linesw *linesw_p; +{ + int slot = -1; + + if (discipline == LDISC_LOAD) { + int i; + for (i = LOADABLE_LDISC; i < MAXLDISC; i++) + if (bcmp(linesw + i, &nodisc, sizeof(nodisc)) == 0) { + slot = i; + } + } + else if (discipline >= 0 && discipline < MAXLDISC) { + slot = discipline; + } + + if (slot != -1 && linesw_p) + linesw[slot] = *linesw_p; + + return slot; +} + +/* + * ldisc_deregister: Deregister a line discipline obtained with + * ldisc_register. Can only deregister "loadable" ones now. + * + * discipline: Index for discipline to unload. 
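+ *
+ * A sketch of the intended usage from a loadable line discipline
+ * (the "foo_linesw" table and error handling are hypothetical):
+ *
+ *	int slot;
+ *
+ *	slot = ldisc_register(LDISC_LOAD, &foo_linesw);
+ *	if (slot < 0)
+ *		return (ENXIO);
+ *	...
+ *	ldisc_deregister(slot);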
+ */ +void +ldisc_deregister(discipline) + int discipline; +{ + if (discipline >= LOADABLE_LDISC && discipline < MAXLDISC) { + linesw[discipline] = nodisc; + } +} + +static int +l_noopen(dev, tp) + dev_t dev; + struct tty *tp; +{ + + return (ENODEV); +} + +static int +l_noclose(tp, flag) + struct tty *tp; + int flag; +{ + + return (ENODEV); +} + +int +l_noread(tp, uio, flag) + struct tty *tp; + struct uio *uio; + int flag; +{ + + return (ENODEV); +} + +int +l_nowrite(tp, uio, flag) + struct tty *tp; + struct uio *uio; + int flag; +{ + + return (ENODEV); +} + +static int +l_norint(c, tp) + int c; + struct tty *tp; +{ + + return (ENODEV); +} + +static int +l_nostart(tp) + struct tty *tp; +{ + + return (ENODEV); +} + +/* + * Do nothing specific version of line + * discipline specific ioctl command. + */ +static int +l_nullioctl(tp, cmd, data, flags, p) + struct tty *tp; + u_long cmd; + char *data; + int flags; + struct proc *p; +{ + + return (ENOIOCTL); +} diff --git a/sys/kern/tty_cons.c b/sys/kern/tty_cons.c index 6189d72..581ff3f 100644 --- a/sys/kern/tty_cons.c +++ b/sys/kern/tty_cons.c @@ -35,129 +35,323 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)cons.c 7.2 (Berkeley) 5/9/91 - * - * PATCHES MAGIC LEVEL PATCH THAT GOT US HERE - * -------------------- ----- ---------------------- - * CURRENT PATCH LEVEL: 1 00083 - * -------------------- ----- ---------------------- - * - * 16 Aug 92 Pace Willisson /dev/console redirect (xterm -C, etc.) - * 14 Mar 93 Chris G. Demetriou Moved pg() here from isa/pccons.c + * from: @(#)cons.c 7.2 (Berkeley) 5/9/91 + * $Id: cons.c,v 1.59 1998/08/23 08:26:40 bde Exp $ */ +#include "opt_devfs.h" -#include "sys/param.h" -#include "sys/proc.h" -#include "sys/user.h" -#include "sys/systm.h" -#include "sys/buf.h" -#include "sys/ioctl.h" -#include "sys/tty.h" -#include "sys/file.h" -#include "sys/conf.h" +#include <sys/param.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/reboot.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/uio.h> -#include "cons.h" +#include <machine/cpu.h> +#include <machine/cons.h> -/* XXX - all this could be autoconfig()ed */ -int pccnprobe(), pccninit(), pccngetc(), pccnputc(); -#include "com.h" -#if NCOM > 0 -int comcnprobe(), comcninit(), comcngetc(), comcnputc(); -#endif +static d_open_t cnopen; +static d_close_t cnclose; +static d_read_t cnread; +static d_write_t cnwrite; +static d_ioctl_t cnioctl; +static d_poll_t cnpoll; -struct consdev constab[] = { - { pccnprobe, pccninit, pccngetc, pccnputc }, -#if NCOM > 0 - { comcnprobe, comcninit, comcngetc, comcnputc }, -#endif - { 0 }, +#define CDEV_MAJOR 0 +static struct cdevsw cn_cdevsw = { + cnopen, cnclose, cnread, cnwrite, + cnioctl, nullstop, nullreset, nodevtotty, + cnpoll, nommap, NULL, "console", + NULL, -1, nodump, nopsize, + D_TTY, }; -/* end XXX */ -struct tty *constty = 0; /* virtual console output device */ -struct consdev *cn_tab; /* physical console device info */ -struct tty *cn_tty; /* XXX: console tty struct for tprintf */ +static dev_t cn_dev_t; /* seems to be never really used */ +SYSCTL_OPAQUE(_machdep, CPU_CONSDEV, consdev, CTLFLAG_RD, + &cn_dev_t, sizeof cn_dev_t, "T,dev_t", ""); + +static int cn_mute; +int cons_unavail = 0; /* XXX: + * physical console not available for + * input (i.e., it is in graphics mode) + */ + +static u_char cn_is_open; /* nonzero if logical console is 
open */ +static int openmode, openflag; /* how /dev/console was openned */ +static u_char cn_phys_is_open; /* nonzero if physical device is open */ +static d_close_t *cn_phys_close; /* physical device close function */ +static d_open_t *cn_phys_open; /* physical device open function */ +static struct consdev *cn_tab; /* physical console device info */ +static struct tty *cn_tp; /* physical console tty struct */ +#ifdef DEVFS +static void *cn_devfs_token; /* represents the devfs entry */ +#endif /* DEVFS */ + +CONS_DRIVER(cons, NULL, NULL, NULL, NULL, NULL); + +void cninit() { - register struct consdev *cp; + struct consdev *best_cp, *cp; + struct consdev **list; /* - * Collect information about all possible consoles - * and find the one with highest priority + * Find the first console with the highest priority. */ - for (cp = constab; cp->cn_probe; cp++) { + best_cp = NULL; + list = (struct consdev **)cons_set.ls_items; + while ((cp = *list++) != NULL) { + if (cp->cn_probe == NULL) + continue; (*cp->cn_probe)(cp); if (cp->cn_pri > CN_DEAD && - (cn_tab == NULL || cp->cn_pri > cn_tab->cn_pri)) - cn_tab = cp; + (best_cp == NULL || cp->cn_pri > best_cp->cn_pri)) + best_cp = cp; } + + /* + * Check if we should mute the console (for security reasons perhaps) + * It can be changes dynamically using sysctl kern.consmute + * once we are up and going. + * + */ + cn_mute = ((boothowto & (RB_MUTE + |RB_SINGLE + |RB_VERBOSE + |RB_ASKNAME + |RB_CONFIG)) == RB_MUTE); + + /* + * If no console, give up. + */ + if (best_cp == NULL) { + cn_tab = best_cp; + return; + } + + /* + * Initialize console, then attach to it. This ordering allows + * debugging using the previous console, if any. + * XXX if there was a previous console, then its driver should + * be informed when we forget about it. + */ + (*best_cp->cn_init)(best_cp); + cn_tab = best_cp; +} + +void +cninit_finish() +{ + struct cdevsw *cdp; + + if ((cn_tab == NULL) || cn_mute) + return; + /* - * No console, we can handle it + * Hook the open and close functions. */ - if ((cp = cn_tab) == NULL) + cdp = cdevsw[major(cn_tab->cn_dev)]; + cn_phys_close = cdp->d_close; + cdp->d_close = cnclose; + cn_phys_open = cdp->d_open; + cdp->d_open = cnopen; + cn_tp = (*cdp->d_devtotty)(cn_tab->cn_dev); + cn_dev_t = cn_tp->t_dev; +} + +static void +cnuninit(void) +{ + struct cdevsw *cdp; + + if (cn_tab == NULL) return; + /* - * Turn on console + * Unhook the open and close functions. */ - cn_tty = cp->cn_tp; - (*cp->cn_init)(cp); + cdp = cdevsw[major(cn_tab->cn_dev)]; + cdp->d_close = cn_phys_close; + cn_phys_close = NULL; + cdp->d_open = cn_phys_open; + cn_phys_open = NULL; + cn_tp = NULL; + cn_dev_t = 0; +} + +/* + * User has changed the state of the console muting. + * This may require us to open or close the device in question. + */ +static int +sysctl_kern_consmute SYSCTL_HANDLER_ARGS +{ + int error; + int ocn_mute; + + ocn_mute = cn_mute; + error = sysctl_handle_int(oidp, &cn_mute, 0, req); + if((error == 0) && (cn_tab != NULL) && (req->newptr != NULL)) { + if(ocn_mute && !cn_mute) { + /* + * going from muted to unmuted.. open the physical dev + * if the console has been openned + */ + cninit_finish(); + if(cn_is_open) + /* XXX curproc is not what we want really */ + error = cnopen(cn_dev_t, openflag, + openmode, curproc); + /* if it failed, back it out */ + if ( error != 0) cnuninit(); + } else if (!ocn_mute && cn_mute) { + /* + * going from unmuted to muted.. 
close the physical dev + * if it's only open via /dev/console + */ + if(cn_is_open) + error = cnclose(cn_dev_t, openflag, + openmode, curproc); + if ( error == 0) cnuninit(); + } + if (error != 0) { + /* + * back out the change if there was an error + */ + cn_mute = ocn_mute; + } + } + return (error); } +SYSCTL_PROC(_kern, OID_AUTO, consmute, CTLTYPE_INT|CTLFLAG_RW, + 0, sizeof cn_mute, sysctl_kern_consmute, "I", ""); + +static int cnopen(dev, flag, mode, p) dev_t dev; int flag, mode; struct proc *p; { + dev_t cndev, physdev; + int retval = 0; + if (cn_tab == NULL) return (0); - dev = cn_tab->cn_dev; - return ((*cdevsw[major(dev)].d_open)(dev, flag, mode, p)); + cndev = cn_tab->cn_dev; + physdev = (major(dev) == major(cndev) ? dev : cndev); + /* + * If mute is active, then non console opens don't get here + * so we don't need to check for that. They + * bypass this and go straight to the device. + */ + if(!cn_mute) + retval = (*cn_phys_open)(physdev, flag, mode, p); + if (retval == 0) { + /* + * check if we openned it via /dev/console or + * via the physical entry (e.g. /dev/sio0). + */ + if (dev == cndev) + cn_phys_is_open = 1; + else if (physdev == cndev) { + openmode = mode; + openflag = flag; + cn_is_open = 1; + } + } + return (retval); } - + +static int cnclose(dev, flag, mode, p) dev_t dev; int flag, mode; struct proc *p; { + dev_t cndev; + if (cn_tab == NULL) return (0); - dev = cn_tab->cn_dev; - return ((*cdevsw[major(dev)].d_close)(dev, flag, mode, p)); + cndev = cn_tab->cn_dev; + /* + * act appropriatly depending on whether it's /dev/console + * or the pysical device (e.g. /dev/sio) that's being closed. + * in either case, don't actually close the device unless + * both are closed. + */ + if (dev == cndev) { + /* the physical device is about to be closed */ + cn_phys_is_open = 0; + if (cn_is_open) { + if (cn_tp) { + /* perform a ttyhalfclose() */ + /* reset session and proc group */ + cn_tp->t_pgrp = NULL; + cn_tp->t_session = NULL; + } + return (0); + } + } else if (major(dev) != major(cndev)) { + /* the logical console is about to be closed */ + cn_is_open = 0; + if (cn_phys_is_open) + return (0); + dev = cndev; + } + if(cn_phys_close) + return ((*cn_phys_close)(dev, flag, mode, p)); + return (0); } - + +static int cnread(dev, uio, flag) dev_t dev; struct uio *uio; + int flag; { - if (cn_tab == NULL) + if ((cn_tab == NULL) || cn_mute) return (0); dev = cn_tab->cn_dev; - return ((*cdevsw[major(dev)].d_read)(dev, uio, flag)); + return ((*cdevsw[major(dev)]->d_read)(dev, uio, flag)); } - + +static int cnwrite(dev, uio, flag) dev_t dev; struct uio *uio; + int flag; { - if (cn_tab == NULL) + if ((cn_tab == NULL) || cn_mute) { + uio->uio_resid = 0; /* dump the data */ return (0); - if (constty) /* 16 Aug 92*/ + } + if (constty) dev = constty->t_dev; else dev = cn_tab->cn_dev; - return ((*cdevsw[major(dev)].d_write)(dev, uio, flag)); + return ((*cdevsw[major(dev)]->d_write)(dev, uio, flag)); } - + +static int cnioctl(dev, cmd, data, flag, p) dev_t dev; + u_long cmd; caddr_t data; + int flag; struct proc *p; { int error; - if (cn_tab == NULL) + if ((cn_tab == NULL) || cn_mute) return (0); /* * Superuser can always use this to wrest control of console @@ -171,43 +365,74 @@ cnioctl(dev, cmd, data, flag, p) return (0); } dev = cn_tab->cn_dev; - return ((*cdevsw[major(dev)].d_ioctl)(dev, cmd, data, flag, p)); + return ((*cdevsw[major(dev)]->d_ioctl)(dev, cmd, data, flag, p)); } -/*ARGSUSED*/ -cnselect(dev, rw, p) +static int +cnpoll(dev, events, p) dev_t dev; - int rw; + int events; struct 
proc *p; { - if (cn_tab == NULL) + if ((cn_tab == NULL) || cn_mute) return (1); - return (ttselect(cn_tab->cn_dev, rw, p)); + + dev = cn_tab->cn_dev; + + return ((*cdevsw[major(dev)]->d_poll)(dev, events, p)); } +int cngetc() { - if (cn_tab == NULL) - return (0); - return ((*cn_tab->cn_getc)(cn_tab->cn_dev)); + int c; + if ((cn_tab == NULL) || cn_mute) + return (-1); + c = (*cn_tab->cn_getc)(cn_tab->cn_dev); + if (c == '\r') c = '\n'; /* console input is always ICRNL */ + return (c); } +int +cncheckc() +{ + if ((cn_tab == NULL) || cn_mute) + return (-1); + return ((*cn_tab->cn_checkc)(cn_tab->cn_dev)); +} + +void cnputc(c) register int c; { - if (cn_tab == NULL) + if ((cn_tab == NULL) || cn_mute) return; if (c) { - (*cn_tab->cn_putc)(cn_tab->cn_dev, c); if (c == '\n') (*cn_tab->cn_putc)(cn_tab->cn_dev, '\r'); + (*cn_tab->cn_putc)(cn_tab->cn_dev, c); } } -pg(p,q,r,s,t,u,v,w,x,y,z) char *p; { - printf(p,q,r,s,t,u,v,w,x,y,z); - printf("\n>"); - return(cngetc()); +static cn_devsw_installed = 0; + +static void +cn_drvinit(void *unused) +{ + dev_t dev; + + if( ! cn_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&cn_cdevsw,NULL); + cn_devsw_installed = 1; +#ifdef DEVFS + cn_devfs_token = devfs_add_devswf(&cn_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0600, + "console"); +#endif + } } +SYSINIT(cndev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,cn_drvinit,NULL) + diff --git a/sys/kern/tty_pty.c b/sys/kern/tty_pty.c new file mode 100644 index 0000000..214f103 --- /dev/null +++ b/sys/kern/tty_pty.c @@ -0,0 +1,832 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)tty_pty.c 8.4 (Berkeley) 2/20/95 + * $Id: tty_pty.c,v 1.53 1998/07/15 12:18:30 bde Exp $ + */ + +/* + * Pseudo-teletype Driver + * (Actually two drivers, requiring two entries in 'cdevsw') + */ +#include "pty.h" /* XXX */ +#include "opt_compat.h" +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/poll.h> +#include <sys/kernel.h> +#include <sys/vnode.h> +#include <sys/signalvar.h> + +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +#ifdef notyet +static void ptyattach __P((int n)); +#endif +static void ptsstart __P((struct tty *tp)); +static void ptcwakeup __P((struct tty *tp, int flag)); + +static d_open_t ptsopen; +static d_close_t ptsclose; +static d_read_t ptsread; +static d_write_t ptswrite; +static d_ioctl_t ptyioctl; +static d_stop_t ptsstop; +static d_devtotty_t ptydevtotty; +static d_open_t ptcopen; +static d_close_t ptcclose; +static d_read_t ptcread; +static d_write_t ptcwrite; +static d_poll_t ptcpoll; + +#define CDEV_MAJOR_S 5 +static struct cdevsw pts_cdevsw = { + ptsopen, ptsclose, ptsread, ptswrite, + ptyioctl, ptsstop, nullreset, ptydevtotty, + ttpoll, nommap, NULL, "pts", + NULL, -1, nodump, nopsize, + D_TTY, +}; + +#define CDEV_MAJOR_C 6 +static struct cdevsw ptc_cdevsw = { + ptcopen, ptcclose, ptcread, ptcwrite, + ptyioctl, nullstop, nullreset, ptydevtotty, + ptcpoll, nommap, NULL, "ptc", + NULL, -1, nodump, nopsize, + D_TTY, +}; + +#if NPTY == 1 +#undef NPTY +#define NPTY 32 /* crude XXX */ +#warning You have only one pty defined, redefining to 32. +#endif + +#ifdef DEVFS +#define MAXUNITS (8 * 32) +static void *devfs_token_pts[MAXUNITS]; +static void *devfs_token_ptc[MAXUNITS]; +static const char jnames[] = "pqrsPQRS"; +#if NPTY > MAXUNITS +#undef NPTY +#define NPTY MAXUNITS +#warning Can't have more than 256 pty's with DEVFS defined. +#endif +#endif + +#define BUFSIZ 100 /* Chunk size iomoved to/from user */ + +/* + * pts == /dev/tty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv] + * ptc == /dev/pty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv] + */ +static struct tty pt_tty[NPTY]; /* XXX */ +static struct pt_ioctl { + int pt_flags; + struct selinfo pt_selr, pt_selw; + u_char pt_send; + u_char pt_ucntl; +} pt_ioctl[NPTY]; /* XXX */ +static int npty = NPTY; /* for pstat -t */ + +#define PF_PKT 0x08 /* packet mode */ +#define PF_STOPPED 0x10 /* user told stopped */ +#define PF_REMOTE 0x20 /* remote and flow controlled input */ +#define PF_NOSTOP 0x40 +#define PF_UCNTL 0x80 /* user control mode */ + +#ifdef notyet +/* + * Establish n (or default if n is 1) ptys in the system. + * + * XXX cdevsw & pstat require the array `pty[]' to be an array + */ +static void +ptyattach(n) + int n; +{ + char *mem; + register u_long ntb; +#define DEFAULT_NPTY 32 + + /* maybe should allow 0 => none? 
*/ + if (n <= 1) + n = DEFAULT_NPTY; + ntb = n * sizeof(struct tty); + mem = malloc(ntb + ALIGNBYTES + n * sizeof(struct pt_ioctl), + M_DEVBUF, M_WAITOK); + pt_tty = (struct tty *)mem; + mem = (char *)ALIGN(mem + ntb); + pt_ioctl = (struct pt_ioctl *)mem; + npty = n; +} +#endif + +/*ARGSUSED*/ +static int +ptsopen(dev, flag, devtype, p) + dev_t dev; + int flag, devtype; + struct proc *p; +{ + register struct tty *tp; + int error; + + if (minor(dev) >= npty) + return (ENXIO); + tp = &pt_tty[minor(dev)]; + if ((tp->t_state & TS_ISOPEN) == 0) { + ttychars(tp); /* Set up default chars */ + tp->t_iflag = TTYDEF_IFLAG; + tp->t_oflag = TTYDEF_OFLAG; + tp->t_lflag = TTYDEF_LFLAG; + tp->t_cflag = TTYDEF_CFLAG; + tp->t_ispeed = tp->t_ospeed = TTYDEF_SPEED; + } else if (tp->t_state&TS_XCLUDE && p->p_ucred->cr_uid != 0) + return (EBUSY); + if (tp->t_oproc) /* Ctrlr still around. */ + (void)(*linesw[tp->t_line].l_modem)(tp, 1); + while ((tp->t_state & TS_CARR_ON) == 0) { + if (flag&FNONBLOCK) + break; + error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, + "ptsopn", 0); + if (error) + return (error); + } + error = (*linesw[tp->t_line].l_open)(dev, tp); + if (error == 0) + ptcwakeup(tp, FREAD|FWRITE); + return (error); +} + +static int +ptsclose(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + register struct tty *tp; + int err; + + tp = &pt_tty[minor(dev)]; + err = (*linesw[tp->t_line].l_close)(tp, flag); + ptsstop(tp, FREAD|FWRITE); + (void) ttyclose(tp); + return (err); +} + +static int +ptsread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct proc *p = curproc; + register struct tty *tp = &pt_tty[minor(dev)]; + register struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + int error = 0; + +again: + if (pti->pt_flags & PF_REMOTE) { + while (isbackground(p, tp)) { + if ((p->p_sigignore & sigmask(SIGTTIN)) || + (p->p_sigmask & sigmask(SIGTTIN)) || + p->p_pgrp->pg_jobc == 0 || + p->p_flag & P_PPWAIT) + return (EIO); + pgsignal(p->p_pgrp, SIGTTIN, 1); + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ptsbg", + 0); + if (error) + return (error); + } + if (tp->t_canq.c_cc == 0) { + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + error = ttysleep(tp, TSA_PTS_READ(tp), TTIPRI | PCATCH, + "ptsin", 0); + if (error) + return (error); + goto again; + } + while (tp->t_canq.c_cc > 1 && uio->uio_resid > 0) + if (ureadc(getc(&tp->t_canq), uio) < 0) { + error = EFAULT; + break; + } + if (tp->t_canq.c_cc == 1) + (void) getc(&tp->t_canq); + if (tp->t_canq.c_cc) + return (error); + } else + if (tp->t_oproc) + error = (*linesw[tp->t_line].l_read)(tp, uio, flag); + ptcwakeup(tp, FWRITE); + return (error); +} + +/* + * Write to pseudo-tty. + * Wakeups of controlling tty will happen + * indirectly, when tty driver calls ptsstart. + */ +static int +ptswrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct tty *tp; + + tp = &pt_tty[minor(dev)]; + if (tp->t_oproc == 0) + return (EIO); + return ((*linesw[tp->t_line].l_write)(tp, uio, flag)); +} + +/* + * Start output on pseudo-tty. + * Wake up process selecting or sleeping for input from controlling tty. 
+ */ +static void +ptsstart(tp) + struct tty *tp; +{ + register struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)]; + + if (tp->t_state & TS_TTSTOP) + return; + if (pti->pt_flags & PF_STOPPED) { + pti->pt_flags &= ~PF_STOPPED; + pti->pt_send = TIOCPKT_START; + } + ptcwakeup(tp, FREAD); +} + +static void +ptcwakeup(tp, flag) + struct tty *tp; + int flag; +{ + struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)]; + + if (flag & FREAD) { + selwakeup(&pti->pt_selr); + wakeup(TSA_PTC_READ(tp)); + } + if (flag & FWRITE) { + selwakeup(&pti->pt_selw); + wakeup(TSA_PTC_WRITE(tp)); + } +} + +static int +ptcopen(dev, flag, devtype, p) + dev_t dev; + int flag, devtype; + struct proc *p; +{ + register struct tty *tp; + struct pt_ioctl *pti; + + if (minor(dev) >= npty) + return (ENXIO); + tp = &pt_tty[minor(dev)]; + if (tp->t_oproc) + return (EIO); + tp->t_oproc = ptsstart; +#ifdef sun4c + tp->t_stop = ptsstop; +#endif + (void)(*linesw[tp->t_line].l_modem)(tp, 1); + tp->t_lflag &= ~EXTPROC; + pti = &pt_ioctl[minor(dev)]; + pti->pt_flags = 0; + pti->pt_send = 0; + pti->pt_ucntl = 0; + return (0); +} + +static int +ptcclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + register struct tty *tp; + + tp = &pt_tty[minor(dev)]; + (void)(*linesw[tp->t_line].l_modem)(tp, 0); + + /* + * XXX MDMBUF makes no sense for ptys but would inhibit the above + * l_modem(). CLOCAL makes sense but isn't supported. Special + * l_modem()s that ignore carrier drop make no sense for ptys but + * may be in use because other parts of the line discipline make + * sense for ptys. Recover by doing everything that a normal + * ttymodem() would have done except for sending a SIGHUP. + */ + if (tp->t_state & TS_ISOPEN) { + tp->t_state &= ~(TS_CARR_ON | TS_CONNECTED); + tp->t_state |= TS_ZOMBIE; + ttyflush(tp, FREAD | FWRITE); + } + + tp->t_oproc = 0; /* mark closed */ + return (0); +} + +static int +ptcread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + char buf[BUFSIZ]; + int error = 0, cc; + + /* + * We want to block until the slave + * is open, and there's something to read; + * but if we lost the slave or we're NBIO, + * then return the appropriate error instead. 
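+ *
+ * In packet mode (PF_PKT) or user-control mode (PF_UCNTL) the data is
+ * preceded by a status byte: e.g. after ptsstart() restarts stopped
+ * output the next master read begins with TIOCPKT_START, and ordinary
+ * data reads are prefixed with a zero byte.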
+ */ + for (;;) { + if (tp->t_state&TS_ISOPEN) { + if (pti->pt_flags&PF_PKT && pti->pt_send) { + error = ureadc((int)pti->pt_send, uio); + if (error) + return (error); + if (pti->pt_send & TIOCPKT_IOCTL) { + cc = min(uio->uio_resid, + sizeof(tp->t_termios)); + uiomove((caddr_t)&tp->t_termios, cc, + uio); + } + pti->pt_send = 0; + return (0); + } + if (pti->pt_flags&PF_UCNTL && pti->pt_ucntl) { + error = ureadc((int)pti->pt_ucntl, uio); + if (error) + return (error); + pti->pt_ucntl = 0; + return (0); + } + if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) + break; + } + if ((tp->t_state & TS_CONNECTED) == 0) + return (0); /* EOF */ + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + error = tsleep(TSA_PTC_READ(tp), TTIPRI | PCATCH, "ptcin", 0); + if (error) + return (error); + } + if (pti->pt_flags & (PF_PKT|PF_UCNTL)) + error = ureadc(0, uio); + while (uio->uio_resid > 0 && error == 0) { + cc = q_to_b(&tp->t_outq, buf, min(uio->uio_resid, BUFSIZ)); + if (cc <= 0) + break; + error = uiomove(buf, cc, uio); + } + ttwwakeup(tp); + return (error); +} + +static void +ptsstop(tp, flush) + register struct tty *tp; + int flush; +{ + struct pt_ioctl *pti = &pt_ioctl[minor(tp->t_dev)]; + int flag; + + /* note: FLUSHREAD and FLUSHWRITE already ok */ + if (flush == 0) { + flush = TIOCPKT_STOP; + pti->pt_flags |= PF_STOPPED; + } else + pti->pt_flags &= ~PF_STOPPED; + pti->pt_send |= flush; + /* change of perspective */ + flag = 0; + if (flush & FREAD) + flag |= FWRITE; + if (flush & FWRITE) + flag |= FREAD; + ptcwakeup(tp, flag); +} + +static int +ptcpoll(dev, events, p) + dev_t dev; + int events; + struct proc *p; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + int revents = 0; + int s; + + if ((tp->t_state & TS_CONNECTED) == 0) + return (seltrue(dev, events, p) | POLLHUP); + + /* + * Need to block timeouts (ttrstart). + */ + s = spltty(); + + if (events & (POLLIN | POLLRDNORM)) + if ((tp->t_state & TS_ISOPEN) && + ((tp->t_outq.c_cc && (tp->t_state & TS_TTSTOP) == 0) || + ((pti->pt_flags & PF_PKT) && pti->pt_send) || + ((pti->pt_flags & PF_UCNTL) && pti->pt_ucntl))) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (tp->t_state & TS_ISOPEN && + ((pti->pt_flags & PF_REMOTE) ? 
+ (tp->t_canq.c_cc == 0) : + ((tp->t_rawq.c_cc + tp->t_canq.c_cc < TTYHOG - 2) || + (tp->t_canq.c_cc == 0 && (tp->t_iflag & ICANON))))) + revents |= events & (POLLOUT | POLLWRNORM); + + if (events & POLLHUP) + if ((tp->t_state & TS_CARR_ON) == 0) + revents |= POLLHUP; + + if (revents == 0) { + if (events & (POLLIN | POLLRDNORM)) + selrecord(p, &pti->pt_selr); + + if (events & (POLLOUT | POLLWRNORM)) + selrecord(p, &pti->pt_selw); + } + splx(s); + + return (revents); +} + +static int +ptcwrite(dev, uio, flag) + dev_t dev; + register struct uio *uio; + int flag; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + register u_char *cp = 0; + register int cc = 0; + u_char locbuf[BUFSIZ]; + int cnt = 0; + struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + int error = 0; + +again: + if ((tp->t_state&TS_ISOPEN) == 0) + goto block; + if (pti->pt_flags & PF_REMOTE) { + if (tp->t_canq.c_cc) + goto block; + while ((uio->uio_resid > 0 || cc > 0) && + tp->t_canq.c_cc < TTYHOG - 1) { + if (cc == 0) { + cc = min(uio->uio_resid, BUFSIZ); + cc = min(cc, TTYHOG - 1 - tp->t_canq.c_cc); + cp = locbuf; + error = uiomove((caddr_t)cp, cc, uio); + if (error) + return (error); + /* check again for safety */ + if ((tp->t_state & TS_ISOPEN) == 0) { + /* adjust as usual */ + uio->uio_resid += cc; + return (EIO); + } + } + if (cc > 0) { + cc = b_to_q((char *)cp, cc, &tp->t_canq); + /* + * XXX we don't guarantee that the canq size + * is >= TTYHOG, so the above b_to_q() may + * leave some bytes uncopied. However, space + * is guaranteed for the null terminator if + * we don't fail here since (TTYHOG - 1) is + * not a multiple of CBSIZE. + */ + if (cc > 0) + break; + } + } + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + (void) putc(0, &tp->t_canq); + ttwakeup(tp); + wakeup(TSA_PTS_READ(tp)); + return (0); + } + while (uio->uio_resid > 0 || cc > 0) { + if (cc == 0) { + cc = min(uio->uio_resid, BUFSIZ); + cp = locbuf; + error = uiomove((caddr_t)cp, cc, uio); + if (error) + return (error); + /* check again for safety */ + if ((tp->t_state & TS_ISOPEN) == 0) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (EIO); + } + } + while (cc > 0) { + if ((tp->t_rawq.c_cc + tp->t_canq.c_cc) >= TTYHOG - 2 && + (tp->t_canq.c_cc > 0 || !(tp->t_iflag&ICANON))) { + wakeup(TSA_HUP_OR_INPUT(tp)); + goto block; + } + (*linesw[tp->t_line].l_rint)(*cp++, tp); + cnt++; + cc--; + } + cc = 0; + } + return (0); +block: + /* + * Come here to wait for slave to open, for space + * in outq, or space in rawq, or an empty canq. 
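The PF_REMOTE branch of ptcwrite() above implements TIOCREMOTE ("remote") mode: each controller write is queued on the slave's canonical queue as a single record, terminated internally with a NUL that ptsread() strips, and the usual input editing and echo are bypassed. A minimal sketch of how a program driving the pty might use it (mfd is assumed to be the controller descriptor):

#include <sys/ioctl.h>
#include <unistd.h>

/* Hand one already-edited input record to the slave via remote mode. */
static int
send_record(int mfd, const char *rec, size_t len)
{
	int on = 1;

	if (ioctl(mfd, TIOCREMOTE, &on) < 0)
		return (-1);
	/* One write() becomes one record on the slave's canonical queue. */
	return (write(mfd, rec, len) == (ssize_t)len ? 0 : -1);
}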
+ */ + if ((tp->t_state & TS_CONNECTED) == 0) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (EIO); + } + if (flag & IO_NDELAY) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + if (cnt == 0) + return (EWOULDBLOCK); + return (0); + } + error = tsleep(TSA_PTC_WRITE(tp), TTOPRI | PCATCH, "ptcout", 0); + if (error) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; + return (error); + } + goto again; +} + +static struct tty * +ptydevtotty(dev) + dev_t dev; +{ + if (minor(dev) >= npty) + return (NULL); + + return &pt_tty[minor(dev)]; +} + +/*ARGSUSED*/ +static int +ptyioctl(dev, cmd, data, flag, p) + dev_t dev; + u_long cmd; + caddr_t data; + int flag; + struct proc *p; +{ + register struct tty *tp = &pt_tty[minor(dev)]; + register struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; + register u_char *cc = tp->t_cc; + int stop, error; + + /* + * IF CONTROLLER STTY THEN MUST FLUSH TO PREVENT A HANG. + * ttywflush(tp) will hang if there are characters in the outq. + */ + if (cmd == TIOCEXT) { + /* + * When the EXTPROC bit is being toggled, we need + * to send an TIOCPKT_IOCTL if the packet driver + * is turned on. + */ + if (*(int *)data) { + if (pti->pt_flags & PF_PKT) { + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + } + tp->t_lflag |= EXTPROC; + } else { + if ((tp->t_lflag & EXTPROC) && + (pti->pt_flags & PF_PKT)) { + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + } + tp->t_lflag &= ~EXTPROC; + } + return(0); + } else + if (cdevsw[major(dev)]->d_open == ptcopen) + switch (cmd) { + + case TIOCGPGRP: + /* + * We avoid calling ttioctl on the controller since, + * in that case, tp must be the controlling terminal. + */ + *(int *)data = tp->t_pgrp ? tp->t_pgrp->pg_id : 0; + return (0); + + case TIOCPKT: + if (*(int *)data) { + if (pti->pt_flags & PF_UCNTL) + return (EINVAL); + pti->pt_flags |= PF_PKT; + } else + pti->pt_flags &= ~PF_PKT; + return (0); + + case TIOCUCNTL: + if (*(int *)data) { + if (pti->pt_flags & PF_PKT) + return (EINVAL); + pti->pt_flags |= PF_UCNTL; + } else + pti->pt_flags &= ~PF_UCNTL; + return (0); + + case TIOCREMOTE: + if (*(int *)data) + pti->pt_flags |= PF_REMOTE; + else + pti->pt_flags &= ~PF_REMOTE; + ttyflush(tp, FREAD|FWRITE); + return (0); + +#ifdef COMPAT_43 + case TIOCSETP: + case TIOCSETN: +#endif + case TIOCSETD: + case TIOCSETA: + case TIOCSETAW: + case TIOCSETAF: + ndflush(&tp->t_outq, tp->t_outq.c_cc); + break; + + case TIOCSIG: + if (*(unsigned int *)data >= NSIG || + *(unsigned int *)data == 0) + return(EINVAL); + if ((tp->t_lflag&NOFLSH) == 0) + ttyflush(tp, FREAD|FWRITE); + pgsignal(tp->t_pgrp, *(unsigned int *)data, 1); + if ((*(unsigned int *)data == SIGINFO) && + ((tp->t_lflag&NOKERNINFO) == 0)) + ttyinfo(tp); + return(0); + } + error = (*linesw[tp->t_line].l_ioctl)(tp, cmd, data, flag, p); + if (error == ENOIOCTL) + error = ttioctl(tp, cmd, data, flag); + if (error == ENOIOCTL) { + if (pti->pt_flags & PF_UCNTL && + (cmd & ~0xff) == UIOCCMD(0)) { + if (cmd & 0xff) { + pti->pt_ucntl = (u_char)cmd; + ptcwakeup(tp, FREAD); + } + return (0); + } + error = ENOTTY; + } + /* + * If external processing and packet mode send ioctl packet. 
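Packet mode, enabled through the TIOCPKT case of ptyioctl() above, prefixes every controller read with a control byte: zero (TIOCPKT_DATA) when ordinary slave output follows, otherwise the accumulated TIOCPKT_* events from pt_send, such as the flow-control and ioctl notifications set elsewhere in this file. A hedged userland sketch of a reader in the style of rlogind (mfd again assumed to be the controller):

#include <sys/ioctl.h>
#include <stdio.h>
#include <unistd.h>

static void
packet_read(int mfd)
{
	char buf[512];
	ssize_t n;
	int on = 1;

	if (ioctl(mfd, TIOCPKT, &on) < 0)
		return;
	while ((n = read(mfd, buf, sizeof(buf))) > 0) {
		if (buf[0] == TIOCPKT_DATA) {		/* 0: payload follows */
			(void)fwrite(buf + 1, 1, (size_t)n - 1, stdout);
			continue;
		}
		if (buf[0] & (TIOCPKT_STOP | TIOCPKT_START))
			(void)printf("<flow control changed>\n");
		if (buf[0] & TIOCPKT_IOCTL)
			(void)printf("<termios changed; new state follows in this read>\n");
	}
}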
+ */ + if ((tp->t_lflag&EXTPROC) && (pti->pt_flags & PF_PKT)) { + switch(cmd) { + case TIOCSETA: + case TIOCSETAW: + case TIOCSETAF: +#ifdef COMPAT_43 + case TIOCSETP: + case TIOCSETN: +#endif +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) + case TIOCSETC: + case TIOCSLTC: + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: +#endif + pti->pt_send |= TIOCPKT_IOCTL; + ptcwakeup(tp, FREAD); + default: + break; + } + } + stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s')) + && CCEQ(cc[VSTART], CTRL('q')); + if (pti->pt_flags & PF_NOSTOP) { + if (stop) { + pti->pt_send &= ~TIOCPKT_NOSTOP; + pti->pt_send |= TIOCPKT_DOSTOP; + pti->pt_flags &= ~PF_NOSTOP; + ptcwakeup(tp, FREAD); + } + } else { + if (!stop) { + pti->pt_send &= ~TIOCPKT_DOSTOP; + pti->pt_send |= TIOCPKT_NOSTOP; + pti->pt_flags |= PF_NOSTOP; + ptcwakeup(tp, FREAD); + } + } + return (error); +} + +static int ptc_devsw_installed; + +static void ptc_drvinit __P((void *unused)); +static void +ptc_drvinit(unused) + void *unused; +{ +#ifdef DEVFS + int i,j,k; +#endif + dev_t dev; + + if( ! ptc_devsw_installed ) { + dev = makedev(CDEV_MAJOR_S, 0); + cdevsw_add(&dev, &pts_cdevsw, NULL); + dev = makedev(CDEV_MAJOR_C, 0); + cdevsw_add(&dev, &ptc_cdevsw, NULL); + ptc_devsw_installed = 1; +#ifdef DEVFS + for ( i = 0 ; i<NPTY ; i++ ) { + j = i / 32; + k = i % 32; + devfs_token_pts[i] = + devfs_add_devswf(&pts_cdevsw,i, + DV_CHR,0,0,0666, + "tty%c%r",jnames[j],k); + devfs_token_ptc[i] = + devfs_add_devswf(&ptc_cdevsw,i, + DV_CHR,0,0,0666, + "pty%c%r",jnames[j],k); + } +#endif + } +} + +SYSINIT(ptcdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR_C,ptc_drvinit,NULL) diff --git a/sys/kern/tty_snoop.c b/sys/kern/tty_snoop.c new file mode 100644 index 0000000..ba71a94 --- /dev/null +++ b/sys/kern/tty_snoop.c @@ -0,0 +1,541 @@ +/* + * Copyright (c) 1995 Ugen J.S.Antsilevich + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * Snoop stuff. 
+ */ + +#include "snp.h" + +#if NSNP > 0 + +#include "opt_compat.h" +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/filio.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/tty.h> +#include <sys/conf.h> +#include <sys/poll.h> +#include <sys/kernel.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ +#include <sys/snoop.h> +#include <sys/vnode.h> + +static d_open_t snpopen; +static d_close_t snpclose; +static d_read_t snpread; +static d_write_t snpwrite; +static d_ioctl_t snpioctl; +static d_poll_t snppoll; + +#define CDEV_MAJOR 53 +static struct cdevsw snp_cdevsw = + { snpopen, snpclose, snpread, snpwrite, /*53*/ + snpioctl, nostop, nullreset, nodevtotty,/* snoop */ + snppoll, nommap, NULL, "snp", NULL, -1 }; + + +#ifndef MIN +#define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +static struct snoop snoopsw[NSNP]; + +static struct tty *snpdevtotty __P((dev_t dev)); +static int snp_detach __P((struct snoop *snp)); + +static struct tty * +snpdevtotty (dev) + dev_t dev; +{ + struct cdevsw *cdp; + int maj; + + maj = major(dev); + if ((u_int)maj >= nchrdev) + return (NULL); + cdp = cdevsw[maj]; + if (cdp == NULL) + return (NULL); + return ((*cdp->d_devtotty)(dev)); +} + +#define SNP_INPUT_BUF 5 /* This is even too much,the maximal + * interactive mode write is 3 bytes + * length for function keys... + */ + +static int +snpwrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + int unit = minor(dev), len, i, error; + struct snoop *snp = &snoopsw[unit]; + struct tty *tp; + char c[SNP_INPUT_BUF]; + + if (snp->snp_tty == NULL) + return (EIO); + + tp = snp->snp_tty; + + if ((tp->t_sc == snp) && (tp->t_state & TS_SNOOP) && + (tp->t_line == OTTYDISC || tp->t_line == NTTYDISC)) + goto tty_input; + + printf("Snoop: attempt to write to bad tty.\n"); + return (EIO); + +tty_input: + if (!(tp->t_state & TS_ISOPEN)) + return (EIO); + + while (uio->uio_resid > 0) { + len = MIN(uio->uio_resid,SNP_INPUT_BUF); + if ((error = uiomove(c, len, uio)) != 0) + return (error); + for (i=0;i<len;i++) { + if (ttyinput(c[i] , tp)) + return (EIO); + } + } + return 0; + +} + + +static int +snpread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + int unit = minor(dev), s; + struct snoop *snp = &snoopsw[unit]; + int len, n, nblen, error = 0; + caddr_t from; + char *nbuf; + + KASSERT(snp->snp_len + snp->snp_base <= snp->snp_blen, + ("snoop buffer error")); + + if (snp->snp_tty == NULL) + return (EIO); + + snp->snp_flags &= ~SNOOP_RWAIT; + + do { + if (snp->snp_len == 0) { + if (flag & IO_NDELAY) + return (EWOULDBLOCK); + snp->snp_flags |= SNOOP_RWAIT; + tsleep((caddr_t) snp, (PZERO + 1) | PCATCH, "snoopread", 0); + } + } while (snp->snp_len == 0); + + n = snp->snp_len; + + while (snp->snp_len > 0 && uio->uio_resid > 0 && error == 0) { + len = MIN(uio->uio_resid, snp->snp_len); + from = (caddr_t) (snp->snp_buf + snp->snp_base); + if (len == 0) + break; + + error = uiomove(from, len, uio); + snp->snp_base += len; + snp->snp_len -= len; + } + if ((snp->snp_flags & SNOOP_OFLOW) && (n < snp->snp_len)) { + snp->snp_flags &= ~SNOOP_OFLOW; + } + s = spltty(); + nblen = snp->snp_blen; + if (((nblen / 2) >= SNOOP_MINLEN) && (nblen / 2) >= snp->snp_len) { + while (((nblen / 2) >= snp->snp_len) && ((nblen / 2) >= SNOOP_MINLEN)) + nblen = nblen / 2; + if (nbuf = malloc(nblen, M_TTYS, M_NOWAIT)) { + bcopy(snp->snp_buf + snp->snp_base, nbuf, snp->snp_len); + 
free(snp->snp_buf, M_TTYS); + snp->snp_buf = nbuf; + snp->snp_blen = nblen; + snp->snp_base = 0; + } + } + splx(s); + + return error; +} + +int +snpinc(struct snoop *snp, char c) +{ + char buf[1]; + + buf[0]=c; + return (snpin(snp,buf,1)); +} + + +int +snpin(snp, buf, n) + struct snoop *snp; + char *buf; + int n; +{ + int s_free, s_tail; + int s, len, nblen; + caddr_t from, to; + char *nbuf; + + KASSERT(n >= 0, ("negative snoop char count")); + + if (n == 0) + return 0; + +#ifdef DIAGNOSTIC + if (!(snp->snp_flags & SNOOP_OPEN)) { + printf("Snoop: data coming to closed device.\n"); + return 0; + } +#endif + if (snp->snp_flags & SNOOP_DOWN) { + printf("Snoop: more data to down interface.\n"); + return 0; + } + + if (snp->snp_flags & SNOOP_OFLOW) { + printf("Snoop: buffer overflow.\n"); + /* + * On overflow we just repeat the standart close + * procedure...yes , this is waste of space but.. Then next + * read from device will fail if one would recall he is + * snooping and retry... + */ + + return (snpdown(snp)); + } + s_tail = snp->snp_blen - (snp->snp_len + snp->snp_base); + s_free = snp->snp_blen - snp->snp_len; + + + if (n > s_free) { + s = spltty(); + nblen = snp->snp_blen; + while ((n > s_free) && ((nblen * 2) <= SNOOP_MAXLEN)) { + nblen = snp->snp_blen * 2; + s_free = nblen - (snp->snp_len + snp->snp_base); + } + if ((n <= s_free) && (nbuf = malloc(nblen, M_TTYS, M_NOWAIT))) { + bcopy(snp->snp_buf + snp->snp_base, nbuf, snp->snp_len); + free(snp->snp_buf, M_TTYS); + snp->snp_buf = nbuf; + snp->snp_blen = nblen; + snp->snp_base = 0; + } else { + snp->snp_flags |= SNOOP_OFLOW; + if (snp->snp_flags & SNOOP_RWAIT) { + snp->snp_flags &= ~SNOOP_RWAIT; + wakeup((caddr_t) snp); + } + splx(s); + return 0; + } + splx(s); + } + if (n > s_tail) { + from = (caddr_t) (snp->snp_buf + snp->snp_base); + to = (caddr_t) (snp->snp_buf); + len = snp->snp_len; + bcopy(from, to, len); + snp->snp_base = 0; + } + to = (caddr_t) (snp->snp_buf + snp->snp_base + snp->snp_len); + bcopy(buf, to, n); + snp->snp_len += n; + + if (snp->snp_flags & SNOOP_RWAIT) { + snp->snp_flags &= ~SNOOP_RWAIT; + wakeup((caddr_t) snp); + } + selwakeup(&snp->snp_sel); + snp->snp_sel.si_pid = 0; + + return n; +} + +static int +snpopen(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + struct snoop *snp; + register int unit, error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + + if ((unit = minor(dev)) >= NSNP) + return (ENXIO); + + snp = &snoopsw[unit]; + + if (snp->snp_flags & SNOOP_OPEN) + return (ENXIO); + + /* + * We intentionally do not OR flags with SNOOP_OPEN,but set them so + * all previous settings (especially SNOOP_OFLOW) will be cleared. + */ + snp->snp_flags = SNOOP_OPEN; + + snp->snp_buf = malloc(SNOOP_MINLEN, M_TTYS, M_WAITOK); + snp->snp_blen = SNOOP_MINLEN; + snp->snp_base = 0; + snp->snp_len = 0; + + /* + * snp_tty == NULL is for inactive snoop devices. + */ + snp->snp_tty = NULL; + snp->snp_target = -1; + return (0); +} + + +static int +snp_detach(snp) + struct snoop *snp; +{ + struct tty *tp; + + snp->snp_base = 0; + snp->snp_len = 0; + + /* + * If line disc. changed we do not touch this pointer,SLIP/PPP will + * change it anyway. 
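snpopen() above only prepares an unattached snoop instance with a SNOOP_MINLEN buffer; a target terminal is attached afterwards through the SNPSTTY ioctl handled further below, after which reads return whatever is written to that tty. A hedged userland sketch (the /dev/snp0 node name and the target path are assumptions for illustration):

#include <sys/types.h>
#include <sys/ioctl.h>
#include <sys/snoop.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical watcher: mirror output of another terminal to stdout. */
static int
watch_tty(const char *ttypath)
{
	struct stat st;
	dev_t tdev;
	char buf[1024];
	ssize_t n;
	int snpfd;

	if ((snpfd = open("/dev/snp0", O_RDONLY)) < 0)
		return (-1);
	if (stat(ttypath, &st) < 0)
		return (-1);
	tdev = st.st_rdev;
	if (ioctl(snpfd, SNPSTTY, &tdev) < 0)	/* attach to the target tty */
		return (-1);
	while ((n = read(snpfd, buf, sizeof(buf))) > 0)
		(void)fwrite(buf, 1, (size_t)n, stdout);
	return (0);
}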
+ */ + + if (snp->snp_tty == NULL) + goto detach_notty; + + tp = snp->snp_tty; + + if (tp && (tp->t_sc == snp) && (tp->t_state & TS_SNOOP) && + (tp->t_line == OTTYDISC || tp->t_line == NTTYDISC)) { + tp->t_sc = NULL; + tp->t_state &= ~TS_SNOOP; + } else + printf("Snoop: bad attached tty data.\n"); + + snp->snp_tty = NULL; + snp->snp_target = -1; + +detach_notty: + selwakeup(&snp->snp_sel); + snp->snp_sel.si_pid = 0; + + return (0); +} + +static int +snpclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + register int unit = minor(dev); + struct snoop *snp = &snoopsw[unit]; + + snp->snp_blen = 0; + free(snp->snp_buf, M_TTYS); + snp->snp_flags &= ~SNOOP_OPEN; + + return (snp_detach(snp)); +} + +int +snpdown(snp) + struct snoop *snp; +{ + snp->snp_blen = SNOOP_MINLEN; + free(snp->snp_buf, M_TTYS); + snp->snp_buf = malloc(SNOOP_MINLEN, M_TTYS, M_WAITOK); + snp->snp_flags |= SNOOP_DOWN; + + return (snp_detach(snp)); +} + + +static int +snpioctl(dev, cmd, data, flags, p) + dev_t dev; + u_long cmd; + caddr_t data; + int flags; + struct proc *p; +{ + int unit = minor(dev), s; + dev_t tdev; + struct snoop *snp = &snoopsw[unit]; + struct tty *tp, *tpo; + + switch (cmd) { + case SNPSTTY: + tdev = *((dev_t *) data); + if (tdev == -1) + return (snpdown(snp)); + + tp = snpdevtotty(tdev); + if (!tp) + return (EINVAL); + + if ((tp->t_sc != (caddr_t) snp) && (tp->t_state & TS_SNOOP)) + return (EBUSY); + + if ((tp->t_line != OTTYDISC) && (tp->t_line != NTTYDISC)) + return (EBUSY); + + s = spltty(); + + if (snp->snp_target == -1) { + tpo = snp->snp_tty; + if (tpo) + tpo->t_state &= ~TS_SNOOP; + } + + tp->t_sc = (caddr_t) snp; + tp->t_state |= TS_SNOOP; + snp->snp_tty = tp; + snp->snp_target = tdev; + + /* + * Clean overflow and down flags - + * we'll have a chance to get them in the future :))) + */ + snp->snp_flags &= ~SNOOP_OFLOW; + snp->snp_flags &= ~SNOOP_DOWN; + splx(s); + break; + + case SNPGTTY: + /* + * We keep snp_target field specially to make + * SNPGTTY happy,else we can't know what is device + * major/minor for tty. + */ + *((dev_t *) data) = snp->snp_target; + break; + + case FIONBIO: + break; + + case FIOASYNC: + if (*(int *) data) + snp->snp_flags |= SNOOP_ASYNC; + else + snp->snp_flags &= ~SNOOP_ASYNC; + break; + + case FIONREAD: + s = spltty(); + if (snp->snp_tty != NULL) + *(int *) data = snp->snp_len; + else + if (snp->snp_flags & SNOOP_DOWN) { + if (snp->snp_flags & SNOOP_OFLOW) + *(int *) data = SNP_OFLOW; + else + *(int *) data = SNP_TTYCLOSE; + } else { + *(int *) data = SNP_DETACH; + } + splx(s); + break; + + default: + return (ENOTTY); + } + return (0); +} + + +static int +snppoll(dev, events, p) + dev_t dev; + int events; + struct proc *p; +{ + int unit = minor(dev); + struct snoop *snp = &snoopsw[unit]; + int revents = 0; + + + /* + * If snoop is down,we don't want to poll() forever so we return 1. + * Caller should see if we down via FIONREAD ioctl().The last should + * return -1 to indicate down state. + */ + if (events & (POLLIN | POLLRDNORM)) + if (snp->snp_flags & SNOOP_DOWN || snp->snp_len > 0) + revents |= events & (POLLIN | POLLRDNORM); + else + selrecord(p, &snp->snp_sel); + + return (revents); +} + +#ifdef DEVFS +static void *snp_devfs_token[NSNP]; +#endif +static int snp_devsw_installed; + +static void snp_drvinit __P((void *unused)); +static void +snp_drvinit(unused) + void *unused; +{ + dev_t dev; +#ifdef DEVFS + int i; +#endif + + if( ! 
snp_devsw_installed ) { + dev = makedev(CDEV_MAJOR, 0); + cdevsw_add(&dev,&snp_cdevsw, NULL); + snp_devsw_installed = 1; +#ifdef DEVFS + for ( i = 0 ; i < NSNP ; i++) { + snp_devfs_token[i] = + devfs_add_devswf(&snp_cdevsw, i, DV_CHR, 0, 0, + 0600, "snp%d", i); + } +#endif + } +} + +SYSINIT(snpdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,snp_drvinit,NULL) + + +#endif diff --git a/sys/kern/tty_subr.c b/sys/kern/tty_subr.c new file mode 100644 index 0000000..593d00c --- /dev/null +++ b/sys/kern/tty_subr.c @@ -0,0 +1,694 @@ +/* + * Copyright (c) 1994, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: tty_subr.c,v 1.29 1998/04/15 17:46:27 bde Exp $ + */ + +/* + * clist support routines + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/tty.h> +#include <sys/clist.h> + +static void clist_init __P((void *)); +SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL) + +static struct cblock *cfreelist = 0; +int cfreecount = 0; +static int cslushcount; +static int ctotcount; + +#ifndef INITIAL_CBLOCKS +#define INITIAL_CBLOCKS 50 +#endif + +static struct cblock *cblock_alloc __P((void)); +static void cblock_alloc_cblocks __P((int number)); +static void cblock_free __P((struct cblock *cblockp)); +static void cblock_free_cblocks __P((int number)); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(cbstat, cbstat) +{ + printf( + "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n", + ctotcount * CBSIZE, ctotcount * CBSIZE - cfreecount, cfreecount, + cfreecount - cslushcount * CBSIZE, cslushcount * CBSIZE); +} +#endif /* DDB */ + +/* + * Called from init_main.c + */ +/* ARGSUSED*/ +static void +clist_init(dummy) + void *dummy; +{ + /* + * Allocate an initial base set of cblocks as a 'slush'. + * We allocate non-slush cblocks with each initial ttyopen() and + * deallocate them with each ttyclose(). + * We should adjust the slush allocation. This can't be done in + * the i/o routines because they are sometimes called from + * interrupt handlers when it may be unsafe to call malloc(). 
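As the comment above explains, the slush pool only absorbs transient peaks; each tty reserves its own cblocks when it is opened and frees them on close, using the clist_alloc_cblocks() routine defined below. A rough kernel-context sketch of the kind of reservation a driver's open path makes (the queue sizes here are illustrative guesses, not the values any particular driver in this tree uses):

	/* At spltty(), in a hypothetical driver open routine: */
	clist_alloc_cblocks(&tp->t_rawq, TTYHOG, 512);
	clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512);
	clist_alloc_cblocks(&tp->t_outq, 1024, 1024);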
+ */ + cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS); +} + +/* + * Remove a cblock from the cfreelist queue and return a pointer + * to it. + */ +static __inline struct cblock * +cblock_alloc() +{ + struct cblock *cblockp; + + cblockp = cfreelist; + if (cblockp == NULL) + panic("clist reservation botch"); + cfreelist = cblockp->c_next; + cblockp->c_next = NULL; + cfreecount -= CBSIZE; + return (cblockp); +} + +/* + * Add a cblock to the cfreelist queue. + */ +static __inline void +cblock_free(cblockp) + struct cblock *cblockp; +{ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) + bzero(cblockp->c_quote, sizeof cblockp->c_quote); + cblockp->c_next = cfreelist; + cfreelist = cblockp; + cfreecount += CBSIZE; +} + +/* + * Allocate some cblocks for the cfreelist queue. + */ +static void +cblock_alloc_cblocks(number) + int number; +{ + int i; + struct cblock *cbp; + + for (i = 0; i < number; ++i) { + cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT); + if (cbp == NULL) { + printf( +"clist_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n"); + cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK); + } + /* + * Freed cblocks have zero quotes and garbage elsewhere. + * Set the may-have-quote bit to force zeroing the quotes. + */ + setbit(cbp->c_quote, CBQSIZE * NBBY - 1); + cblock_free(cbp); + } + ctotcount += number; +} + +/* + * Set the cblock allocation policy for a a clist. + * Must be called in process context at spltty(). + */ +void +clist_alloc_cblocks(clistp, ccmax, ccreserved) + struct clist *clistp; + int ccmax; + int ccreserved; +{ + int dcbr; + + /* + * Allow for wasted space at the head. + */ + if (ccmax != 0) + ccmax += CBSIZE - 1; + if (ccreserved != 0) + ccreserved += CBSIZE - 1; + + clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE; + dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved; + if (dcbr >= 0) + cblock_alloc_cblocks(dcbr); + else { + if (clistp->c_cbreserved + dcbr < clistp->c_cbcount) + dcbr = clistp->c_cbcount - clistp->c_cbreserved; + cblock_free_cblocks(-dcbr); + } + clistp->c_cbreserved += dcbr; +} + +/* + * Free some cblocks from the cfreelist queue back to the + * system malloc pool. + */ +static void +cblock_free_cblocks(number) + int number; +{ + int i; + + for (i = 0; i < number; ++i) + free(cblock_alloc(), M_TTYS); + ctotcount -= number; +} + +/* + * Free the cblocks reserved for a clist. + * Must be called at spltty(). + */ +void +clist_free_cblocks(clistp) + struct clist *clistp; +{ + if (clistp->c_cbcount != 0) + panic("freeing active clist cblocks"); + cblock_free_cblocks(clistp->c_cbreserved); + clistp->c_cbmax = 0; + clistp->c_cbreserved = 0; +} + +/* + * Get a character from the head of a clist. + */ +int +getc(clistp) + struct clist *clistp; +{ + int chr = -1; + int s; + struct cblock *cblockp; + + s = spltty(); + + /* If there are characters in the list, get one */ + if (clistp->c_cc) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + chr = (u_char)*clistp->c_cf; + + /* + * If this char is quoted, set the flag. + */ + if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * Advance to next character. + */ + clistp->c_cf++; + clistp->c_cc--; + /* + * If we have advanced the 'first' character pointer + * past the end of this cblock, advance to the next one. + * If there are no more characters, set the first and + * last pointers to NULL. In either case, free the + * current cblock. 
+ */ + if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (chr); +} + +/* + * Copy 'amount' of chars, beginning at head of clist 'clistp' to + * destination linear buffer 'dest'. Return number of characters + * actually copied. + */ +int +q_to_b(clistp, dest, amount) + struct clist *clistp; + char *dest; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + char *dest_orig = dest; + int numc; + int s; + + s = spltty(); + + while (clistp && amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + bcopy(clistp->c_cf, dest, numc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + dest += numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (dest - dest_orig); +} + +/* + * Flush 'amount' of chars, beginning at head of clist 'clistp'. + */ +void +ndflush(clistp, amount) + struct clist *clistp; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + int numc; + int s; + + s = spltty(); + + while (amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); +} + +/* + * Add a character to the end of a clist. Return -1 is no + * more clists, or 0 for success. 
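getc() and q_to_b() above, together with putc() and b_to_q() below, make up the byte-at-a-time and block-move halves of the clist API; every caller is expected to be at spltty() and to have reserved cblocks for the clist beforehand. A rough kernel-context sketch of the usual round trip (illustrative only):

/*
 * Copy a linear buffer into a clist and drain it back out again.
 * Assumes clp already has reserved cblocks (see clist_alloc_cblocks()).
 */
static void
clist_roundtrip(struct clist *clp, char *src, int len)
{
	char sink[64];
	int s, notq, got;

	s = spltty();
	notq = b_to_q(src, len, clp);		/* returns bytes NOT queued */
	got = q_to_b(clp, sink, sizeof(sink));	/* returns bytes copied out */
	splx(s);
	(void)notq;
	(void)got;
}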
+ */ +int +putc(chr, clistp) + int chr; + struct clist *clistp; +{ + struct cblock *cblockp; + int s; + + s = spltty(); + + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("putc to a clist with no reserved cblocks\n"); + return (-1); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = (cblockp - 1); + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (-1); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + } + + /* + * If this character is quoted, set the quote bit, if not, clear it. + */ + if (chr & TTY_QUOTE) { + setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + /* + * Use one of the spare quote bits to record that something + * may be quoted. + */ + setbit(cblockp->c_quote, CBQSIZE * NBBY - 1); + } else + clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + + *clistp->c_cl++ = chr; + clistp->c_cc++; + + splx(s); + return (0); +} + +/* + * Copy data from linear buffer to clist chain. Return the + * number of characters not copied. + */ +int +b_to_q(src, amount, clistp) + char *src; + int amount; + struct clist *clistp; +{ + struct cblock *cblockp; + char *firstbyte, *lastbyte; + u_char startmask, endmask; + int startbit, endbit, num_between, numc; + int s; + + /* + * Avoid allocating an initial cblock and then not using it. + * c_cc == 0 must imply c_cbount == 0. + */ + if (amount <= 0) + return (amount); + + s = spltty(); + + /* + * If there are no cblocks assigned to this clist yet, + * then get one. + */ + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("b_to_q to a clist with no reserved cblocks.\n"); + return (amount); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + } + + while (amount) { + /* + * Get another cblock if needed. + */ + if (((intptr_t)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = cblockp - 1; + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (amount); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + + /* + * Copy a chunk of the linear buffer up to the end + * of this cblock. + */ + numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl); + bcopy(src, clistp->c_cl, numc); + + /* + * Clear quote bits if they aren't known to be clear. + * The following could probably be made into a seperate + * "bitzero()" routine, but why bother? + */ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) { + startbit = clistp->c_cl - (char *)cblockp->c_info; + endbit = startbit + numc - 1; + + firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY); + lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY); + + /* + * Calculate mask of bits to preserve in first and + * last bytes. 
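As a concrete check of the mask arithmetic that follows: with startbit = 3 and numc = 10, endbit = 12, so startmask = 0xff >> (8 - 3) = 0x07 (keeping the three low bits of the first quote byte, which belong to characters already in the cblock) and endmask = 0xff << ((12 % 8) + 1) = 0xe0 (keeping the three high bits of the second byte, beyond endbit); the remaining bits in those two bytes are cleared because they are the quote bits of the characters just copied in, and in this example num_between is 0, so no whole bytes need bzero().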
+ */ + startmask = NBBY - (startbit % NBBY); + startmask = 0xff >> startmask; + endmask = (endbit % NBBY); + endmask = 0xff << (endmask + 1); + + if (firstbyte != lastbyte) { + *firstbyte &= startmask; + *lastbyte &= endmask; + + num_between = lastbyte - firstbyte - 1; + if (num_between) + bzero(firstbyte + 1, num_between); + } else { + *firstbyte &= (startmask | endmask); + } + } + + /* + * ...and update pointer for the next chunk. + */ + src += numc; + clistp->c_cl += numc; + clistp->c_cc += numc; + amount -= numc; + /* + * If we go through the loop again, it's always + * for data in the next cblock, so by adding one (cblock), + * (which makes the pointer 1 beyond the end of this + * cblock) we prepare for the assignment of 'prev' + * above. + */ + cblockp += 1; + + } + + splx(s); + return (amount); +} + +/* + * Get the next character in the clist. Store it at dst. Don't + * advance any clist pointers, but return a pointer to the next + * character position. + */ +char * +nextc(clistp, cp, dst) + struct clist *clistp; + char *cp; + int *dst; +{ + struct cblock *cblockp; + + ++cp; + /* + * See if the next character is beyond the end of + * the clist. + */ + if (clistp->c_cc && (cp != clistp->c_cl)) { + /* + * If the next character is beyond the end of this + * cblock, advance to the next cblock. + */ + if (((intptr_t)cp & CROUND) == 0) + cp = ((struct cblock *)cp - 1)->c_next->c_info; + cblockp = (struct cblock *)((intptr_t)cp & ~CROUND); + + /* + * Get the character. Set the quote flag if this character + * is quoted. + */ + *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0); + + return (cp); + } + + return (NULL); +} + +/* + * "Unput" a character from a clist. + */ +int +unputc(clistp) + struct clist *clistp; +{ + struct cblock *cblockp = 0, *cbp = 0; + int s; + int chr = -1; + + + s = spltty(); + + if (clistp->c_cc) { + --clistp->c_cc; + --clistp->c_cl; + + chr = (u_char)*clistp->c_cl; + + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + + /* + * Set quote flag if this character was quoted. + */ + if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * If all of the characters have been unput in this + * cblock, then find the previous one and free this + * one. + */ + if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) { + cbp = (struct cblock *)((intptr_t)clistp->c_cf & ~CROUND); + + while (cbp->c_next != cblockp) + cbp = cbp->c_next; + + /* + * When the previous cblock is at the end, the 'last' + * pointer always points (invalidly) one past. + */ + clistp->c_cl = (char *)(cbp+1); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + cbp->c_next = NULL; + } + } + + /* + * If there are no more characters on the list, then + * free the last cblock. + */ + if ((clistp->c_cc == 0) && clistp->c_cl) { + cblockp = (struct cblock *)((intptr_t)clistp->c_cl & ~CROUND); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + clistp->c_cf = clistp->c_cl = NULL; + } + + splx(s); + return (chr); +} + +/* + * Move characters in source clist to destination clist, + * preserving quote bits. + */ +void +catq(src_clistp, dest_clistp) + struct clist *src_clistp, *dest_clistp; +{ + int chr, s; + + s = spltty(); + /* + * If the destination clist is empty (has no cblocks atttached), + * and there are no possible complications with the resource counters, + * then we simply assign the current clist to the destination. 
+ */ + if (!dest_clistp->c_cf + && src_clistp->c_cbcount <= src_clistp->c_cbmax + && src_clistp->c_cbcount <= dest_clistp->c_cbmax) { + dest_clistp->c_cf = src_clistp->c_cf; + dest_clistp->c_cl = src_clistp->c_cl; + src_clistp->c_cf = src_clistp->c_cl = NULL; + + dest_clistp->c_cc = src_clistp->c_cc; + src_clistp->c_cc = 0; + dest_clistp->c_cbcount = src_clistp->c_cbcount; + src_clistp->c_cbcount = 0; + + splx(s); + return; + } + + splx(s); + + /* + * XXX This should probably be optimized to more than one + * character at a time. + */ + while ((chr = getc(src_clistp)) != -1) + putc(chr, dest_clistp); +} diff --git a/sys/kern/tty_tb.c b/sys/kern/tty_tb.c new file mode 100644 index 0000000..8f4c84c --- /dev/null +++ b/sys/kern/tty_tb.c @@ -0,0 +1,367 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_tb.c 8.1 (Berkeley) 6/10/93 + * $Id$ + */ + +#include "tb.h" +#if NTB > 0 + +/* + * Line discipline for RS232 tablets; + * supplies binary coordinate data. + */ +#include <sys/param.h> +#include <sys/tablet.h> +#include <sys/tty.h> + +/* + * Tablet configuration table. 
+ */ +struct tbconf { + short tbc_recsize; /* input record size in bytes */ + short tbc_uiosize; /* size of data record returned user */ + int tbc_sync; /* mask for finding sync byte/bit */ + int (*tbc_decode)();/* decoding routine */ + char *tbc_run; /* enter run mode sequence */ + char *tbc_point; /* enter point mode sequence */ + char *tbc_stop; /* stop sequence */ + char *tbc_start; /* start/restart sequence */ + int tbc_flags; +#define TBF_POL 0x1 /* polhemus hack */ +#define TBF_INPROX 0x2 /* tablet has proximity info */ +}; + +static int tbdecode(), gtcodecode(), poldecode(); +static int tblresdecode(), tbhresdecode(); + +struct tbconf tbconf[TBTYPE] = { +{ 0 }, +{ 5, sizeof (struct tbpos), 0200, tbdecode, "6", "4" }, +{ 5, sizeof (struct tbpos), 0200, tbdecode, "\1CN", "\1RT", "\2", "\4" }, +{ 8, sizeof (struct gtcopos), 0200, gtcodecode }, +{17, sizeof (struct polpos), 0200, poldecode, 0, 0, "\21", "\5\22\2\23", + TBF_POL }, +{ 5, sizeof (struct tbpos), 0100, tblresdecode, "\1CN", "\1PT", "\2", "\4", + TBF_INPROX }, +{ 6, sizeof (struct tbpos), 0200, tbhresdecode, "\1CN", "\1PT", "\2", "\4", + TBF_INPROX }, +{ 5, sizeof (struct tbpos), 0100, tblresdecode, "\1CL\33", "\1PT\33", 0, 0}, +{ 6, sizeof (struct tbpos), 0200, tbhresdecode, "\1CL\33", "\1PT\33", 0, 0}, +}; + +/* + * Tablet state + */ +struct tb { + int tbflags; /* mode & type bits */ +#define TBMAXREC 17 /* max input record size */ + char cbuf[TBMAXREC]; /* input buffer */ + union { + struct tbpos tbpos; + struct gtcopos gtcopos; + struct polpos polpos; + } rets; /* processed state */ +#define NTBS 16 +} tb[NTBS]; + +/* + * Open as tablet discipline; called on discipline change. + */ +/*ARGSUSED*/ +tbopen(dev, tp) + dev_t dev; + register struct tty *tp; +{ + register struct tb *tbp; + + if (tp->t_line == TABLDISC) + return (ENODEV); + ttywflush(tp); + for (tbp = tb; tbp < &tb[NTBS]; tbp++) + if (tbp->tbflags == 0) + break; + if (tbp >= &tb[NTBS]) + return (EBUSY); + tbp->tbflags = TBTIGER|TBPOINT; /* default */ + tp->t_cp = tbp->cbuf; + tp->t_inbuf = 0; + bzero((caddr_t)&tbp->rets, sizeof (tbp->rets)); + tp->T_LINEP = (caddr_t)tbp; + tp->t_flags |= LITOUT; + return (0); +} + +/* + * Line discipline change or last device close. + */ +tbclose(tp) + register struct tty *tp; +{ + register int s; + int modebits = TBPOINT|TBSTOP; + + tbioctl(tp, BIOSMODE, &modebits, 0); + s = spltty(); + ((struct tb *)tp->T_LINEP)->tbflags = 0; + tp->t_cp = 0; + tp->t_inbuf = 0; + tp->t_rawq.c_cc = 0; /* clear queues -- paranoid */ + tp->t_canq.c_cc = 0; + tp->t_line = 0; /* paranoid: avoid races */ + splx(s); +} + +/* + * Read from a tablet line. + * Characters have been buffered in a buffer and decoded. + */ +tbread(tp, uio) + register struct tty *tp; + struct uio *uio; +{ + register struct tb *tbp = (struct tb *)tp->T_LINEP; + register struct tbconf *tc = &tbconf[tbp->tbflags & TBTYPE]; + int ret; + + if ((tp->t_state&TS_CARR_ON) == 0) + return (EIO); + ret = uiomove(&tbp->rets, tc->tbc_uiosize, uio); + if (tc->tbc_flags&TBF_POL) + tbp->rets.polpos.p_key = ' '; + return (ret); +} + +/* + * Low level character input routine. + * Stuff the character in the buffer, and decode + * if all the chars are there. + * + * This routine could be expanded in-line in the receiver + * interrupt routine to make it run as fast as possible. 
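As a concrete example of the framing these routines handle: tbinput() below synchronizes on the tbc_sync bit (0200 for the Hitachi formats) and strips the high bit (c & 0177) before buffering, so a 5-byte low-resolution record whose buffered bytes are 0x40 0x12 0x34 0x05 0x66 decodes in tbdecode() to proximity asserted (the 0100 bit of the first byte), xpos = (0x12 << 7) | 0x34 = 2356, and ypos = (0x05 << 7) | 0x66 = 742.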
+ */ +tbinput(c, tp) + register int c; + register struct tty *tp; +{ + register struct tb *tbp = (struct tb *)tp->T_LINEP; + register struct tbconf *tc = &tbconf[tbp->tbflags & TBTYPE]; + + if (tc->tbc_recsize == 0 || tc->tbc_decode == 0) /* paranoid? */ + return; + /* + * Locate sync bit/byte or reset input buffer. + */ + if (c&tc->tbc_sync || tp->t_inbuf == tc->tbc_recsize) { + tp->t_cp = tbp->cbuf; + tp->t_inbuf = 0; + } + *tp->t_cp++ = c&0177; + /* + * Call decode routine only if a full record has been collected. + */ + if (++tp->t_inbuf == tc->tbc_recsize) + (*tc->tbc_decode)(tc, tbp->cbuf, &tbp->rets); +} + +/* + * Decode GTCO 8 byte format (high res, tilt, and pressure). + */ +static +gtcodecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct gtcopos *tbpos; +{ + + tbpos->pressure = *cp >> 2; + tbpos->status = (tbpos->pressure > 16) | TBINPROX; /* half way down */ + tbpos->xpos = (*cp++ & 03) << 14; + tbpos->xpos |= *cp++ << 7; + tbpos->xpos |= *cp++; + tbpos->ypos = (*cp++ & 03) << 14; + tbpos->ypos |= *cp++ << 7; + tbpos->ypos |= *cp++; + tbpos->xtilt = *cp++; + tbpos->ytilt = *cp++; + tbpos->scount++; +} + +/* + * Decode old Hitachi 5 byte format (low res). + */ +static +tbdecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct tbpos *tbpos; +{ + register char byte; + + byte = *cp++; + tbpos->status = (byte&0100) ? TBINPROX : 0; + byte &= ~0100; + if (byte > 036) + tbpos->status |= 1 << ((byte-040)/2); + tbpos->xpos = *cp++ << 7; + tbpos->xpos |= *cp++; + if (tbpos->xpos < 256) /* tablet wraps around at 256 */ + tbpos->status &= ~TBINPROX; /* make it out of proximity */ + tbpos->ypos = *cp++ << 7; + tbpos->ypos |= *cp++; + tbpos->scount++; +} + +/* + * Decode new Hitach 5-byte format (low res). + */ +static +tblresdecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct tbpos *tbpos; +{ + + *cp &= ~0100; /* mask sync bit */ + tbpos->status = (*cp++ >> 2) | TBINPROX; + if (tc->tbc_flags&TBF_INPROX && tbpos->status&020) + tbpos->status &= ~(020|TBINPROX); + tbpos->xpos = *cp++; + tbpos->xpos |= *cp++ << 6; + tbpos->ypos = *cp++; + tbpos->ypos |= *cp++ << 6; + tbpos->scount++; +} + +/* + * Decode new Hitach 6-byte format (high res). + */ +static +tbhresdecode(tc, cp, tbpos) + struct tbconf *tc; + register char *cp; + register struct tbpos *tbpos; +{ + char byte; + + byte = *cp++; + tbpos->xpos = (byte & 03) << 14; + tbpos->xpos |= *cp++ << 7; + tbpos->xpos |= *cp++; + tbpos->ypos = *cp++ << 14; + tbpos->ypos |= *cp++ << 7; + tbpos->ypos |= *cp++; + tbpos->status = (byte >> 2) | TBINPROX; + if (tc->tbc_flags&TBF_INPROX && tbpos->status&020) + tbpos->status &= ~(020|TBINPROX); + tbpos->scount++; +} + +/* + * Polhemus decode. 
+ */ +static +poldecode(tc, cp, polpos) + struct tbconf *tc; + register char *cp; + register struct polpos *polpos; +{ + + polpos->p_x = cp[4] | cp[3]<<7 | (cp[9] & 0x03) << 14; + polpos->p_y = cp[6] | cp[5]<<7 | (cp[9] & 0x0c) << 12; + polpos->p_z = cp[8] | cp[7]<<7 | (cp[9] & 0x30) << 10; + polpos->p_azi = cp[11] | cp[10]<<7 | (cp[16] & 0x03) << 14; + polpos->p_pit = cp[13] | cp[12]<<7 | (cp[16] & 0x0c) << 12; + polpos->p_rol = cp[15] | cp[14]<<7 | (cp[16] & 0x30) << 10; + polpos->p_stat = cp[1] | cp[0]<<7; + if (cp[2] != ' ') + polpos->p_key = cp[2]; +} + +/*ARGSUSED*/ +tbioctl(tp, cmd, data, flag) + struct tty *tp; + caddr_t data; +{ + register struct tb *tbp = (struct tb *)tp->T_LINEP; + + switch (cmd) { + + case BIOGMODE: + *(int *)data = tbp->tbflags & TBMODE; + break; + + case BIOSTYPE: + if (tbconf[*(int *)data & TBTYPE].tbc_recsize == 0 || + tbconf[*(int *)data & TBTYPE].tbc_decode == 0) + return (EINVAL); + tbp->tbflags &= ~TBTYPE; + tbp->tbflags |= *(int *)data & TBTYPE; + /* fall thru... to set mode bits */ + + case BIOSMODE: { + register struct tbconf *tc; + + tbp->tbflags &= ~TBMODE; + tbp->tbflags |= *(int *)data & TBMODE; + tc = &tbconf[tbp->tbflags & TBTYPE]; + if (tbp->tbflags&TBSTOP) { + if (tc->tbc_stop) + ttyout(tc->tbc_stop, tp); + } else if (tc->tbc_start) + ttyout(tc->tbc_start, tp); + if (tbp->tbflags&TBPOINT) { + if (tc->tbc_point) + ttyout(tc->tbc_point, tp); + } else if (tc->tbc_run) + ttyout(tc->tbc_run, tp); + ttstart(tp); + break; + } + + case BIOGTYPE: + *(int *)data = tbp->tbflags & TBTYPE; + break; + + case TIOCSETD: + case TIOCGETD: + case TIOCGETP: + case TIOCGETC: + return (-1); /* pass thru... */ + + default: + return (ENOTTY); + } + return (0); +} +#endif diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c new file mode 100644 index 0000000..889c935 --- /dev/null +++ b/sys/kern/tty_tty.c @@ -0,0 +1,206 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)tty_tty.c 8.2 (Berkeley) 9/23/93 + * $Id: tty_tty.c,v 1.24 1998/06/07 17:11:44 dfr Exp $ + */ + +/* + * Indirect driver for controlling tty. + */ + +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/lock.h> +#include <sys/proc.h> +#include <sys/ttycom.h> +#include <sys/vnode.h> +#include <sys/kernel.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +static d_open_t cttyopen; +static d_read_t cttyread; +static d_write_t cttywrite; +static d_ioctl_t cttyioctl; +static d_poll_t cttypoll; + +#define CDEV_MAJOR 1 +/* Don't make this static, since fdesc_vnops uses it. */ +struct cdevsw ctty_cdevsw = { + cttyopen, nullclose, cttyread, cttywrite, + cttyioctl, nullstop, nullreset, nodevtotty, + cttypoll, nommap, NULL, "ctty", + NULL, -1, nodump, nopsize, + D_TTY, +}; + +#define cttyvp(p) ((p)->p_flag & P_CONTROLT ? (p)->p_session->s_ttyvp : NULL) + +/*ARGSUSED*/ +static int +cttyopen(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + struct vnode *ttyvp = cttyvp(p); + int error; + + if (ttyvp == NULL) + return (ENXIO); + vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p); +#ifdef PARANOID + /* + * Since group is tty and mode is 620 on most terminal lines + * and since sessions protect terminals from processes outside + * your session, this check is probably no longer necessary. + * Since it inhibits setuid root programs that later switch + * to another user from accessing /dev/tty, we have decided + * to delete this test. (mckusick 5/93) + */ + error = VOP_ACCESS(ttyvp, + (flag&FREAD ? VREAD : 0) | (flag&FWRITE ? 
VWRITE : 0), p->p_ucred, p); + if (!error) +#endif /* PARANOID */ + error = VOP_OPEN(ttyvp, flag, NOCRED, p); + VOP_UNLOCK(ttyvp, 0, p); + return (error); +} + +/*ARGSUSED*/ +static int +cttyread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct proc *p = uio->uio_procp; + register struct vnode *ttyvp = cttyvp(p); + int error; + + if (ttyvp == NULL) + return (EIO); + vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p); + error = VOP_READ(ttyvp, uio, flag, NOCRED); + VOP_UNLOCK(ttyvp, 0, p); + return (error); +} + +/*ARGSUSED*/ +static int +cttywrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + struct proc *p = uio->uio_procp; + struct vnode *ttyvp = cttyvp(uio->uio_procp); + int error; + + if (ttyvp == NULL) + return (EIO); + vn_lock(ttyvp, LK_EXCLUSIVE | LK_RETRY, p); + error = VOP_WRITE(ttyvp, uio, flag, NOCRED); + VOP_UNLOCK(ttyvp, 0, p); + return (error); +} + +/*ARGSUSED*/ +static int +cttyioctl(dev, cmd, addr, flag, p) + dev_t dev; + u_long cmd; + caddr_t addr; + int flag; + struct proc *p; +{ + struct vnode *ttyvp = cttyvp(p); + + if (ttyvp == NULL) + return (EIO); + if (cmd == TIOCSCTTY) /* don't allow controlling tty to be set */ + return EINVAL; /* to controlling tty -- infinite recursion */ + if (cmd == TIOCNOTTY) { + if (!SESS_LEADER(p)) { + p->p_flag &= ~P_CONTROLT; + return (0); + } else + return (EINVAL); + } + return (VOP_IOCTL(ttyvp, cmd, addr, flag, NOCRED, p)); +} + +/*ARGSUSED*/ +static int +cttypoll(dev, events, p) + dev_t dev; + int events; + struct proc *p; +{ + struct vnode *ttyvp = cttyvp(p); + + if (ttyvp == NULL) + /* try operation to get EOF/failure */ + return (seltrue(dev, events, p)); + return (VOP_POLL(ttyvp, events, p->p_ucred, p)); +} + +static int ctty_devsw_installed; +#ifdef DEVFS +static void *ctty_devfs_token; +#endif + +static void ctty_drvinit __P((void *unused)); +static void +ctty_drvinit(unused) + void *unused; +{ + dev_t dev; + + if( ! ctty_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&ctty_cdevsw,NULL); + ctty_devsw_installed = 1; +#ifdef DEVFS + ctty_devfs_token = + devfs_add_devswf(&ctty_cdevsw, 0, DV_CHR, 0, 0, + 0666, "tty"); +#endif + } +} + +SYSINIT(cttydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,ctty_drvinit,NULL) diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c new file mode 100644 index 0000000..929da87 --- /dev/null +++ b/sys/kern/uipc_domain.c @@ -0,0 +1,294 @@ +/* + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
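The ctty indirect driver above does nothing more than forward each operation to the vnode of the caller's controlling terminal (the cttyvp() macro), so /dev/tty reaches the terminal even when the standard descriptors have been redirected, and open() fails with ENXIO for a process that has no controlling tty. A small userland illustration:

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[64];
	ssize_t n;
	int fd;

	/* Works even if stdin/stdout point at a file or a pipe. */
	fd = open("/dev/tty", O_RDWR);
	if (fd < 0) {
		perror("/dev/tty");	/* ENXIO: no controlling terminal */
		return (1);
	}
	(void)write(fd, "prompt: ", 8);
	n = read(fd, buf, sizeof(buf));
	if (n > 0)
		(void)write(fd, buf, (size_t)n);
	(void)close(fd);
	return (0);
}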
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93 + * $Id: uipc_domain.c,v 1.19 1998/05/15 20:11:29 wollman Exp $ + */ + +#include <sys/param.h> +#include <sys/socket.h> +#include <sys/protosw.h> +#include <sys/domain.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/socketvar.h> +#include <sys/systm.h> +#include <vm/vm_zone.h> + +/* + * System initialization + * + * Note: domain initialization wants to take place on a per domain basis + * as a result of traversing a linker set. Most likely, each domain + * want to call a registration function rather than being handled here + * in domaininit(). Probably this will look like: + * + * SYSINIT(unique, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, domain_add, xxx) + * + * Where 'xxx' is replaced by the address of a parameter struct to be + * passed to the doamin_add() function. + */ + +static int x_save_spl; /* used by kludge*/ +static void kludge_splimp __P((void *)); +static void kludge_splx __P((void *)); +static void domaininit __P((void *)); +SYSINIT(splimp, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, kludge_splimp, &x_save_spl) +SYSINIT(domain, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, domaininit, NULL) +SYSINIT(splx, SI_SUB_PROTO_END, SI_ORDER_FIRST, kludge_splx, &x_save_spl) + +static void pffasttimo __P((void *)); +static void pfslowtimo __P((void *)); + +struct domain *domains; + +/* + * Add a new protocol domain to the list of supported domains + * Note: you cant unload it again because a socket may be using it. + * XXX can't fail at this time. + */ +static int +net_init_domain(struct domain *dp) +{ + register struct protosw *pr; + int s; + + s = splnet(); + if (dp->dom_init) + (*dp->dom_init)(); + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++){ + if (pr->pr_usrreqs == 0) + panic("domaininit: %ssw[%d] has no usrreqs!", + dp->dom_name, + (int)(pr - dp->dom_protosw)); + if (pr->pr_init) + (*pr->pr_init)(); + } + /* + * update global informatio about maximums + */ + max_hdr = max_linkhdr + max_protohdr; + max_datalen = MHLEN - max_hdr; + splx(s); + return (0); +} + +/* + * Add a new protocol domain to the list of supported domains + * Note: you cant unload it again because a socket may be using it. + * XXX can't fail at this time. + */ +int +net_add_domain(struct domain *dp) +{ + int s, error; + + s = splnet(); + dp->dom_next = domains; + domains = dp; + splx(s); + error = net_init_domain(dp); + max_hdr = max_linkhdr + max_protohdr; + max_datalen = MHLEN - max_hdr; + return (error); +} + +extern struct linker_set domain_set; + +/* ARGSUSED*/ +static void +domaininit(void *dummy) +{ + register struct domain *dp, **dpp; + /* + * Before we do any setup, make sure to initialize the + * zone allocator we get struct sockets from. 
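net_add_domain() above is the registration hook the comment at the top of this file anticipates: it links the new domain onto the global list, runs its dom_init and every protosw pr_init, and refreshes max_hdr and max_datalen. A hedged sketch of how a protocol family outside the linker set might register itself (foodomain and the SYSINIT name are hypothetical placeholders, not part of this commit):

/* Hypothetical protocol family registering at protocol-domain time. */
extern struct domain foodomain;		/* assumed to be defined elsewhere */

static void
foo_register(void *dummy)
{
	net_add_domain(&foodomain);	/* cannot be unloaded; see comment above */
}
SYSINIT(foodomain_reg, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, foo_register, NULL)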
The obvious + * maximum number of sockets is `maxfiles', but it is possible + * to have a socket without an open file (e.g., a connection waiting + * to be accept(2)ed). Rather than think up and define a + * better value, we just use nmbclusters, since that's what people + * are told to increase first when the network runs out of memory. + * Perhaps we should have two pools, one of unlimited size + * for use during socreate(), and one ZONE_INTERRUPT pool for + * use in sonewconn(). + */ + socket_zone = zinit("socket", sizeof(struct socket), maxsockets, + ZONE_INTERRUPT, 0); + + if (max_linkhdr < 16) /* XXX */ + max_linkhdr = 16; + + /* + * NB - local domain is always present. + */ + net_add_domain(&localdomain); + + /* + * gather up as many protocols as we have statically linked. + * XXX we need to do this because when we ask the routing + * protocol to initialise it will want to examine all + * installed protocols. This needs fixing before protocols + * that use the standard routing can become modules. + */ + for (dpp = (struct domain **)domain_set.ls_items; *dpp; dpp++) { + (**dpp).dom_next = domains; + domains = *dpp; + } + + /* + * Now ask them all to init (XXX including the routing domain, + * see above) + */ + for (dp = domains; dp; dp = dp->dom_next) + net_init_domain(dp); + + timeout(pffasttimo, (void *)0, 1); + timeout(pfslowtimo, (void *)0, 1); +} + + +/* + * The following two operations are kludge code. Most likely, they should + * be done as a "domainpreinit()" for the first function and then rolled + * in as the last act of "domaininit()" for the second. + * + * In point of fact, it is questionable why other initialization prior + * to this does not also take place at splimp by default. + */ +static void +kludge_splimp(udata) + void *udata; +{ + int *savesplp = udata; + + *savesplp = splimp(); +} + +static void +kludge_splx(udata) + void *udata; +{ + int *savesplp = udata; + + splx(*savesplp); +} + + + +struct protosw * +pffindtype(family, type) + int family; + int type; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + if (dp->dom_family == family) + goto found; + return (0); +found: + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_type && pr->pr_type == type) + return (pr); + return (0); +} + +struct protosw * +pffindproto(family, protocol, type) + int family; + int protocol; + int type; +{ + register struct domain *dp; + register struct protosw *pr; + struct protosw *maybe = 0; + + if (family == 0) + return (0); + for (dp = domains; dp; dp = dp->dom_next) + if (dp->dom_family == family) + goto found; + return (0); +found: + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) { + if ((pr->pr_protocol == protocol) && (pr->pr_type == type)) + return (pr); + + if (type == SOCK_RAW && pr->pr_type == SOCK_RAW && + pr->pr_protocol == 0 && maybe == (struct protosw *)0) + maybe = pr; + } + return (maybe); +} + +void +pfctlinput(cmd, sa) + int cmd; + struct sockaddr *sa; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_ctlinput) + (*pr->pr_ctlinput)(cmd, sa, (void *)0); +} + +static void +pfslowtimo(arg) + void *arg; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_slowtimo) + (*pr->pr_slowtimo)(); + 
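+ /*
+ * Re-arm below so the pr_slowtimo handlers keep running roughly
+ * twice a second; pffasttimo re-arms itself the same way at hz/5.
+ */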
timeout(pfslowtimo, (void *)0, hz/2); +} + +static void +pffasttimo(arg) + void *arg; +{ + register struct domain *dp; + register struct protosw *pr; + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_fasttimo) + (*pr->pr_fasttimo)(); + timeout(pffasttimo, (void *)0, hz/5); +} diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c new file mode 100644 index 0000000..09ddd23 --- /dev/null +++ b/sys/kern/uipc_mbuf.c @@ -0,0 +1,945 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 + * $Id: uipc_mbuf.c,v 1.36 1998/07/03 08:36:48 phk Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/domain.h> +#include <sys/protosw.h> + +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +static void mbinit __P((void *)); +SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL) + +struct mbuf *mbutl; +char *mclrefcnt; +struct mbstat mbstat; +struct mbuf *mmbfree; +union mcluster *mclfree; +int max_linkhdr; +int max_protohdr; +int max_hdr; +int max_datalen; + +SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW, + &max_linkhdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW, + &max_protohdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW, + &max_datalen, 0, ""); +SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, ""); + +static void m_reclaim __P((void)); + +/* "number of clusters of pages" */ +#define NCL_INIT 1 + +#define NMB_INIT 16 + +/* ARGSUSED*/ +static void +mbinit(dummy) + void *dummy; +{ + int s; + + mmbfree = NULL; mclfree = NULL; + mbstat.m_msize = MSIZE; + mbstat.m_mclbytes = MCLBYTES; + mbstat.m_minclsize = MINCLSIZE; + mbstat.m_mlen = MLEN; + mbstat.m_mhlen = MHLEN; + + s = splimp(); + if (m_mballoc(NMB_INIT, M_DONTWAIT) == 0) + goto bad; +#if MCLBYTES <= PAGE_SIZE + if (m_clalloc(NCL_INIT, M_DONTWAIT) == 0) + goto bad; +#else + /* It's OK to call contigmalloc in this context. */ + if (m_clalloc(16, M_WAIT) == 0) + goto bad; +#endif + splx(s); + return; +bad: + panic("mbinit"); +} + +/* + * Allocate at least nmb mbufs and place on mbuf free list. + * Must be called at splimp. + */ +/* ARGSUSED */ +int +m_mballoc(nmb, how) + register int nmb; + int how; +{ + register caddr_t p; + register int i; + int nbytes; + + /* Once we run out of map space, it will be impossible to get + * any more (nothing is ever freed back to the map) (XXX which + * is dumb). (however you are not dead as m_reclaim might + * still be able to free a substantial amount of space). + */ + if (mb_map_full) + return (0); + + nbytes = round_page(nmb * MSIZE); + p = (caddr_t)kmem_malloc(mb_map, nbytes, M_NOWAIT); + if (p == 0 && how == M_WAIT) { + mbstat.m_wait++; + p = (caddr_t)kmem_malloc(mb_map, nbytes, M_WAITOK); + } + + /* + * Either the map is now full, or `how' is M_NOWAIT and there + * are no pages left. + */ + if (p == NULL) + return (0); + + nmb = nbytes / MSIZE; + for (i = 0; i < nmb; i++) { + ((struct mbuf *)p)->m_next = mmbfree; + mmbfree = (struct mbuf *)p; + p += MSIZE; + } + mbstat.m_mbufs += nmb; + return (1); +} + +#if MCLBYTES > PAGE_SIZE +static int i_want_my_mcl; + +static void +kproc_mclalloc(void) +{ + int status; + + while (1) { + tsleep(&i_want_my_mcl, PVM, "mclalloc", 0); + + for (; i_want_my_mcl; i_want_my_mcl--) { + if (m_clalloc(1, M_WAIT) == 0) + printf("m_clalloc failed even in process context!\n"); + } + } +} + +static struct proc *mclallocproc; +static struct kproc_desc mclalloc_kp = { + "mclalloc", + kproc_mclalloc, + &mclallocproc +}; +SYSINIT_KT(mclallocproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, + &mclalloc_kp); +#endif + +/* + * Allocate some number of mbuf clusters + * and place on cluster free list. + * Must be called at splimp. 
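+ *
+ * When MCLBYTES is larger than PAGE_SIZE the clusters are obtained
+ * with contigmalloc1(); a caller that cannot sleep instead bumps
+ * i_want_my_mcl, wakes the mclalloc kernel thread above and takes a
+ * failure return for now. Otherwise the pages come straight out of
+ * mb_map via kmem_malloc().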
+ */ +/* ARGSUSED */ +int +m_clalloc(ncl, how) + register int ncl; + int how; +{ + register caddr_t p; + register int i; + int npg; + + /* + * Once we run out of map space, it will be impossible + * to get any more (nothing is ever freed back to the + * map). + */ + if (mb_map_full) { + mbstat.m_drops++; + return (0); + } + +#if MCLBYTES > PAGE_SIZE + if (how != M_WAIT) { + i_want_my_mcl += ncl; + wakeup(&i_want_my_mcl); + mbstat.m_wait++; + p = 0; + } else { + p = contigmalloc1(MCLBYTES * ncl, M_DEVBUF, M_WAITOK, 0ul, + ~0ul, PAGE_SIZE, 0, mb_map); + } +#else + npg = ncl; + p = (caddr_t)kmem_malloc(mb_map, ctob(npg), + how != M_WAIT ? M_NOWAIT : M_WAITOK); + ncl = ncl * PAGE_SIZE / MCLBYTES; +#endif + /* + * Either the map is now full, or `how' is M_NOWAIT and there + * are no pages left. + */ + if (p == NULL) { + mbstat.m_drops++; + return (0); + } + + for (i = 0; i < ncl; i++) { + ((union mcluster *)p)->mcl_next = mclfree; + mclfree = (union mcluster *)p; + p += MCLBYTES; + mbstat.m_clfree++; + } + mbstat.m_clusters += ncl; + return (1); +} + +/* + * When MGET failes, ask protocols to free space when short of memory, + * then re-attempt to allocate an mbuf. + */ +struct mbuf * +m_retry(i, t) + int i, t; +{ + register struct mbuf *m; + + /* + * Must only do the reclaim if not in an interrupt context. + */ + if (i == M_WAIT) + m_reclaim(); +#define m_retry(i, t) (struct mbuf *)0 + MGET(m, i, t); +#undef m_retry + if (m != NULL) { + mbstat.m_wait++; + } else { + if (i == M_DONTWAIT) + mbstat.m_drops++; + else + panic("Out of mbuf clusters"); + } + return (m); +} + +/* + * As above; retry an MGETHDR. + */ +struct mbuf * +m_retryhdr(i, t) + int i, t; +{ + register struct mbuf *m; + + /* + * Must only do the reclaim if not in an interrupt context. + */ + if (i == M_WAIT) + m_reclaim(); +#define m_retryhdr(i, t) (struct mbuf *)0 + MGETHDR(m, i, t); +#undef m_retryhdr + if (m != NULL) { + mbstat.m_wait++; + } else { + if (i == M_DONTWAIT) + mbstat.m_drops++; + else + panic("Out of mbuf clusters"); + } + return (m); +} + +static void +m_reclaim() +{ + register struct domain *dp; + register struct protosw *pr; + int s = splimp(); + + for (dp = domains; dp; dp = dp->dom_next) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + if (pr->pr_drain) + (*pr->pr_drain)(); + splx(s); + mbstat.m_drain++; +} + +/* + * Space allocation routines. + * These are also available as macros + * for critical paths. + */ +struct mbuf * +m_get(how, type) + int how, type; +{ + register struct mbuf *m; + + MGET(m, how, type); + return (m); +} + +struct mbuf * +m_gethdr(how, type) + int how, type; +{ + register struct mbuf *m; + + MGETHDR(m, how, type); + return (m); +} + +struct mbuf * +m_getclr(how, type) + int how, type; +{ + register struct mbuf *m; + + MGET(m, how, type); + if (m == 0) + return (0); + bzero(mtod(m, caddr_t), MLEN); + return (m); +} + +struct mbuf * +m_free(m) + struct mbuf *m; +{ + register struct mbuf *n; + + MFREE(m, n); + return (n); +} + +void +m_freem(m) + register struct mbuf *m; +{ + register struct mbuf *n; + + if (m == NULL) + return; + do { + MFREE(m, n); + m = n; + } while (m); +} + +/* + * Mbuffer utility routines. + */ + +/* + * Lesser-used path for M_PREPEND: + * allocate new mbuf to prepend to chain, + * copy junk along. 
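+ *
+ * On allocation failure the original chain is freed and a null pointer
+ * returned, so the caller must not touch the old chain again; if the
+ * old first mbuf carried a packet header it is moved to the new front
+ * mbuf.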
+ */ +struct mbuf * +m_prepend(m, len, how) + register struct mbuf *m; + int len, how; +{ + struct mbuf *mn; + + MGET(mn, how, m->m_type); + if (mn == (struct mbuf *)NULL) { + m_freem(m); + return ((struct mbuf *)NULL); + } + if (m->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(mn, m); + m->m_flags &= ~M_PKTHDR; + } + mn->m_next = m; + m = mn; + if (len < MHLEN) + MH_ALIGN(m, len); + m->m_len = len; + return (m); +} + +/* + * Make a copy of an mbuf chain starting "off0" bytes from the beginning, + * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. + * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. + */ +#define MCFail (mbstat.m_mcfail) + +struct mbuf * +m_copym(m, off0, len, wait) + register struct mbuf *m; + int off0, wait; + register int len; +{ + register struct mbuf *n, **np; + register int off = off0; + struct mbuf *top; + int copyhdr = 0; + + if (off < 0 || len < 0) + panic("m_copym"); + if (off == 0 && m->m_flags & M_PKTHDR) + copyhdr = 1; + while (off > 0) { + if (m == 0) + panic("m_copym"); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + np = ⊤ + top = 0; + while (len > 0) { + if (m == 0) { + if (len != M_COPYALL) + panic("m_copym"); + break; + } + MGET(n, wait, m->m_type); + *np = n; + if (n == 0) + goto nospace; + if (copyhdr) { + M_COPY_PKTHDR(n, m); + if (len == M_COPYALL) + n->m_pkthdr.len -= off0; + else + n->m_pkthdr.len = len; + copyhdr = 0; + } + n->m_len = min(len, m->m_len - off); + if (m->m_flags & M_EXT) { + n->m_data = m->m_data + off; + if(!m->m_ext.ext_ref) + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + else + (*(m->m_ext.ext_ref))(m->m_ext.ext_buf, + m->m_ext.ext_size); + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + } else + bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), + (unsigned)n->m_len); + if (len != M_COPYALL) + len -= n->m_len; + off = 0; + m = m->m_next; + np = &n->m_next; + } + if (top == 0) + MCFail++; + return (top); +nospace: + m_freem(top); + MCFail++; + return (0); +} + +/* + * Copy an entire packet, including header (which must be present). + * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. + */ +struct mbuf * +m_copypacket(m, how) + struct mbuf *m; + int how; +{ + struct mbuf *top, *n, *o; + + MGET(n, how, m->m_type); + top = n; + if (!n) + goto nospace; + + M_COPY_PKTHDR(n, m); + n->m_len = m->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data; + if(!m->m_ext.ext_ref) + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + else + (*(m->m_ext.ext_ref))(m->m_ext.ext_buf, + m->m_ext.ext_size); + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + } else { + bcopy(mtod(m, char *), mtod(n, char *), n->m_len); + } + + m = m->m_next; + while (m) { + MGET(o, how, m->m_type); + if (!o) + goto nospace; + + n->m_next = o; + n = n->m_next; + + n->m_len = m->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data; + if(!m->m_ext.ext_ref) + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + else + (*(m->m_ext.ext_ref))(m->m_ext.ext_buf, + m->m_ext.ext_size); + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + } else { + bcopy(mtod(m, char *), mtod(n, char *), n->m_len); + } + + m = m->m_next; + } + return top; +nospace: + m_freem(top); + MCFail++; + return 0; +} + +/* + * Copy data from an mbuf chain starting "off" bytes from the beginning, + * continuing for "len" bytes, into the indicated buffer. 
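+ *
+ * The destination buffer must have room for "len" bytes; the routine
+ * panics if the chain is shorter than off + len. A typical use is to
+ * pull a header into local storage, e.g. (names and sizes illustrative
+ * only):
+ *
+ *	char hdr[HDR_LEN];
+ *
+ *	m_copydata(m, hdroff, sizeof(hdr), hdr);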
+ */ +void +m_copydata(m, off, len, cp) + register struct mbuf *m; + register int off; + register int len; + caddr_t cp; +{ + register unsigned count; + + if (off < 0 || len < 0) + panic("m_copydata"); + while (off > 0) { + if (m == 0) + panic("m_copydata"); + if (off < m->m_len) + break; + off -= m->m_len; + m = m->m_next; + } + while (len > 0) { + if (m == 0) + panic("m_copydata"); + count = min(m->m_len - off, len); + bcopy(mtod(m, caddr_t) + off, cp, count); + len -= count; + cp += count; + off = 0; + m = m->m_next; + } +} + +/* + * Concatenate mbuf chain n to m. + * Both chains must be of the same type (e.g. MT_DATA). + * Any m_pkthdr is not updated. + */ +void +m_cat(m, n) + register struct mbuf *m, *n; +{ + while (m->m_next) + m = m->m_next; + while (n) { + if (m->m_flags & M_EXT || + m->m_data + m->m_len + n->m_len >= &m->m_dat[MLEN]) { + /* just join the two chains */ + m->m_next = n; + return; + } + /* splat the data from one into the other */ + bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, + (u_int)n->m_len); + m->m_len += n->m_len; + n = m_free(n); + } +} + +void +m_adj(mp, req_len) + struct mbuf *mp; + int req_len; +{ + register int len = req_len; + register struct mbuf *m; + register int count; + + if ((m = mp) == NULL) + return; + if (len >= 0) { + /* + * Trim from head. + */ + while (m != NULL && len > 0) { + if (m->m_len <= len) { + len -= m->m_len; + m->m_len = 0; + m = m->m_next; + } else { + m->m_len -= len; + m->m_data += len; + len = 0; + } + } + m = mp; + if (mp->m_flags & M_PKTHDR) + m->m_pkthdr.len -= (req_len - len); + } else { + /* + * Trim from tail. Scan the mbuf chain, + * calculating its length and finding the last mbuf. + * If the adjustment only affects this mbuf, then just + * adjust and return. Otherwise, rescan and truncate + * after the remaining size. + */ + len = -len; + count = 0; + for (;;) { + count += m->m_len; + if (m->m_next == (struct mbuf *)0) + break; + m = m->m_next; + } + if (m->m_len >= len) { + m->m_len -= len; + if (mp->m_flags & M_PKTHDR) + mp->m_pkthdr.len -= len; + return; + } + count -= len; + if (count < 0) + count = 0; + /* + * Correct length for chain is "count". + * Find the mbuf with last data, adjust its length, + * and toss data from remaining mbufs on chain. + */ + m = mp; + if (m->m_flags & M_PKTHDR) + m->m_pkthdr.len = count; + for (; m; m = m->m_next) { + if (m->m_len >= count) { + m->m_len = count; + break; + } + count -= m->m_len; + } + while (m->m_next) + (m = m->m_next) ->m_len = 0; + } +} + +/* + * Rearange an mbuf chain so that len bytes are contiguous + * and in the data area of an mbuf (so that mtod and dtom + * will work for a structure of size len). Returns the resulting + * mbuf chain on success, frees it and returns null on failure. + * If there is room, it will add up to max_protohdr-len extra bytes to the + * contiguous region in an attempt to avoid being called next time. + */ +#define MPFail (mbstat.m_mpfail) + +struct mbuf * +m_pullup(n, len) + register struct mbuf *n; + int len; +{ + register struct mbuf *m; + register int count; + int space; + + /* + * If first mbuf has no cluster, and has room for len bytes + * without shifting current data, pullup into it, + * otherwise allocate a new mbuf to prepend to the chain. 
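+ *
+ * The new mbuf is grabbed with M_DONTWAIT, so m_pullup() can fail even
+ * when memory would become available by sleeping; on any failure the
+ * chain is freed and a null pointer returned, hence the usual caller
+ * idiom (header size illustrative):
+ *
+ *	if ((m = m_pullup(m, sizeof(struct ip))) == 0)
+ *		return;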
+ */ + if ((n->m_flags & M_EXT) == 0 && + n->m_data + len < &n->m_dat[MLEN] && n->m_next) { + if (n->m_len >= len) + return (n); + m = n; + n = n->m_next; + len -= m->m_len; + } else { + if (len > MHLEN) + goto bad; + MGET(m, M_DONTWAIT, n->m_type); + if (m == 0) + goto bad; + m->m_len = 0; + if (n->m_flags & M_PKTHDR) { + M_COPY_PKTHDR(m, n); + n->m_flags &= ~M_PKTHDR; + } + } + space = &m->m_dat[MLEN] - (m->m_data + m->m_len); + do { + count = min(min(max(len, max_protohdr), space), n->m_len); + bcopy(mtod(n, caddr_t), mtod(m, caddr_t) + m->m_len, + (unsigned)count); + len -= count; + m->m_len += count; + n->m_len -= count; + space -= count; + if (n->m_len) + n->m_data += count; + else + n = m_free(n); + } while (len > 0 && n); + if (len > 0) { + (void) m_free(m); + goto bad; + } + m->m_next = n; + return (m); +bad: + m_freem(n); + MPFail++; + return (0); +} + +/* + * Partition an mbuf chain in two pieces, returning the tail -- + * all but the first len0 bytes. In case of failure, it returns NULL and + * attempts to restore the chain to its original state. + */ +struct mbuf * +m_split(m0, len0, wait) + register struct mbuf *m0; + int len0, wait; +{ + register struct mbuf *m, *n; + unsigned len = len0, remain; + + for (m = m0; m && len > m->m_len; m = m->m_next) + len -= m->m_len; + if (m == 0) + return (0); + remain = m->m_len - len; + if (m0->m_flags & M_PKTHDR) { + MGETHDR(n, wait, m0->m_type); + if (n == 0) + return (0); + n->m_pkthdr.rcvif = m0->m_pkthdr.rcvif; + n->m_pkthdr.len = m0->m_pkthdr.len - len0; + m0->m_pkthdr.len = len0; + if (m->m_flags & M_EXT) + goto extpacket; + if (remain > MHLEN) { + /* m can't be the lead packet */ + MH_ALIGN(n, 0); + n->m_next = m_split(m, len, wait); + if (n->m_next == 0) { + (void) m_free(n); + return (0); + } else + return (n); + } else + MH_ALIGN(n, remain); + } else if (remain == 0) { + n = m->m_next; + m->m_next = 0; + return (n); + } else { + MGET(n, wait, m->m_type); + if (n == 0) + return (0); + M_ALIGN(n, remain); + } +extpacket: + if (m->m_flags & M_EXT) { + n->m_flags |= M_EXT; + n->m_ext = m->m_ext; + if(!m->m_ext.ext_ref) + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + else + (*(m->m_ext.ext_ref))(m->m_ext.ext_buf, + m->m_ext.ext_size); + m->m_ext.ext_size = 0; /* For Accounting XXXXXX danger */ + n->m_data = m->m_data + len; + } else { + bcopy(mtod(m, caddr_t) + len, mtod(n, caddr_t), remain); + } + n->m_len = remain; + m->m_len = len; + n->m_next = m->m_next; + m->m_next = 0; + return (n); +} +/* + * Routine to copy from device local memory into mbufs. + */ +struct mbuf * +m_devget(buf, totlen, off0, ifp, copy) + char *buf; + int totlen, off0; + struct ifnet *ifp; + void (*copy) __P((char *from, caddr_t to, u_int len)); +{ + register struct mbuf *m; + struct mbuf *top = 0, **mp = ⊤ + register int off = off0, len; + register char *cp; + char *epkt; + + cp = buf; + epkt = cp + totlen; + if (off) { + cp += off + 2 * sizeof(u_short); + totlen -= 2 * sizeof(u_short); + } + MGETHDR(m, M_DONTWAIT, MT_DATA); + if (m == 0) + return (0); + m->m_pkthdr.rcvif = ifp; + m->m_pkthdr.len = totlen; + m->m_len = MHLEN; + + while (totlen > 0) { + if (top) { + MGET(m, M_DONTWAIT, MT_DATA); + if (m == 0) { + m_freem(top); + return (0); + } + m->m_len = MLEN; + } + len = min(totlen, epkt - cp); + if (len >= MINCLSIZE) { + MCLGET(m, M_DONTWAIT); + if (m->m_flags & M_EXT) + m->m_len = len = min(len, MCLBYTES); + else + len = m->m_len; + } else { + /* + * Place initial small packet/header at end of mbuf. 
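+ *
+ * Leaving max_linkhdr bytes of space in front of the data lets a
+ * link-level header be prepended later without allocating another
+ * mbuf.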
+ */ + if (len < m->m_len) { + if (top == 0 && len + max_linkhdr <= m->m_len) + m->m_data += max_linkhdr; + m->m_len = len; + } else + len = m->m_len; + } + if (copy) + copy(cp, mtod(m, caddr_t), (unsigned)len); + else + bcopy(cp, mtod(m, caddr_t), (unsigned)len); + cp += len; + *mp = m; + mp = &m->m_next; + totlen -= len; + if (cp == epkt) + cp = buf; + } + return (top); +} + +/* + * Copy data from a buffer back into the indicated mbuf chain, + * starting "off" bytes from the beginning, extending the mbuf + * chain if necessary. + */ +void +m_copyback(m0, off, len, cp) + struct mbuf *m0; + register int off; + register int len; + caddr_t cp; +{ + register int mlen; + register struct mbuf *m = m0, *n; + int totlen = 0; + + if (m0 == 0) + return; + while (off > (mlen = m->m_len)) { + off -= mlen; + totlen += mlen; + if (m->m_next == 0) { + n = m_getclr(M_DONTWAIT, m->m_type); + if (n == 0) + goto out; + n->m_len = min(MLEN, len + off); + m->m_next = n; + } + m = m->m_next; + } + while (len > 0) { + mlen = min (m->m_len - off, len); + bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen); + cp += mlen; + len -= mlen; + mlen += off; + off = 0; + totlen += mlen; + if (len == 0) + break; + if (m->m_next == 0) { + n = m_get(M_DONTWAIT, m->m_type); + if (n == 0) + break; + n->m_len = min(MLEN, len); + m->m_next = n; + } + m = m->m_next; + } +out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) + m->m_pkthdr.len = totlen; +} diff --git a/sys/kern/uipc_proto.c b/sys/kern/uipc_proto.c new file mode 100644 index 0000000..094d1bf --- /dev/null +++ b/sys/kern/uipc_proto.c @@ -0,0 +1,79 @@ +/*- + * Copyright (c) 1982, 1986, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)uipc_proto.c 8.1 (Berkeley) 6/10/93 + * $Id: uipc_proto.c,v 1.16 1998/06/21 14:53:18 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/domain.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/un.h> +#include <sys/unpcb.h> + +#include <net/raw_cb.h> + +/* + * Definitions of protocols supported in the LOCAL domain. + */ + +static struct protosw localsw[] = { +{ SOCK_STREAM, &localdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, + 0, 0, 0, 0, + 0, + 0, 0, 0, 0, + &uipc_usrreqs +}, +{ SOCK_DGRAM, &localdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS, + 0, 0, 0, 0, + 0, + 0, 0, 0, 0, + &uipc_usrreqs +}, +{ 0, 0, 0, 0, + 0, 0, raw_ctlinput, 0, + 0, + raw_init, 0, 0, 0, + &raw_usrreqs +} +}; + +struct domain localdomain = + { AF_LOCAL, "local", unp_init, unp_externalize, unp_dispose, + localsw, &localsw[sizeof(localsw)/sizeof(localsw[0])] }; + +SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain"); +SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM"); +SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM"); diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c new file mode 100644 index 0000000..e718c62 --- /dev/null +++ b/sys/kern/uipc_sockbuf.c @@ -0,0 +1,954 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + * $Id: uipc_socket2.c,v 1.42 1998/11/23 00:45:38 truckman Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/domain.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> + +/* + * Primitive routines for operating on sockets and socket buffers + */ + +u_long sb_max = SB_MAX; /* XXX should be static */ + +static u_long sb_efficiency = 8; /* parameter for sbreserve() */ + +/* + * Procedures to manipulate state flags of socket + * and do appropriate wakeups. Normal sequence from the + * active (originating) side is that soisconnecting() is + * called during processing of connect() call, + * resulting in an eventual call to soisconnected() if/when the + * connection is established. When the connection is torn down + * soisdisconnecting() is called during processing of disconnect() call, + * and soisdisconnected() is called when the connection to the peer + * is totally severed. The semantics of these routines are such that + * connectionless protocols can call soisconnected() and soisdisconnected() + * only, bypassing the in-progress calls when setting up a ``connection'' + * takes no time. + * + * From the passive side, a socket is created with + * two queues of sockets: so_q0 for connections in progress + * and so_q for connections already made and awaiting user acceptance. + * As a protocol is preparing incoming connections, it creates a socket + * structure queued on so_q0 by calling sonewconn(). When the connection + * is established, soisconnected() is called, and transfers the + * socket structure to so_q, making it available to accept(). + * + * If a socket is closed with sockets on either + * so_q0 or so_q, these sockets are dropped. + * + * If higher level protocols are implemented in + * the kernel, the wakeups done here will sometimes + * cause software-interrupt process scheduling. + */ + +void +soisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= SS_ISCONNECTING; +} + +void +soisconnected(so) + register struct socket *so; +{ + register struct socket *head = so->so_head; + + so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); + so->so_state |= SS_ISCONNECTED; + if (head && (so->so_state & SS_INCOMP)) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_state &= ~SS_INCOMP; + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + sorwakeup(head); + wakeup_one(&head->so_timeo); + } else { + wakeup(&so->so_timeo); + sorwakeup(so); + sowwakeup(so); + } +} + +void +soisdisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~SS_ISCONNECTING; + so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +void +soisdisconnected(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +/* + * Return a random connection that hasn't been serviced yet and + * is eligible for discard. There is a one in qlen chance that + * we will return a null, saying that there are no dropable + * requests. 
In this case, the protocol specific code should drop + * the new request. This insures fairness. + * + * This may be used in conjunction with protocol specific queue + * congestion routines. + */ +struct socket * +sodropablereq(head) + register struct socket *head; +{ + register struct socket *so; + unsigned int i, j, qlen; + static int rnd; + static struct timeval old_runtime; + static unsigned int cur_cnt, old_cnt; + struct timeval tv; + + getmicrouptime(&tv); + if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) { + old_runtime = tv; + old_cnt = cur_cnt / i; + cur_cnt = 0; + } + + so = TAILQ_FIRST(&head->so_incomp); + if (!so) + return (so); + + qlen = head->so_incqlen; + if (++cur_cnt > qlen || old_cnt > qlen) { + rnd = (314159 * rnd + 66329) & 0xffff; + j = ((qlen + 1) * rnd) >> 16; + + while (j-- && so) + so = TAILQ_NEXT(so, so_list); + } + + return (so); +} + +/* + * When an attempt at a new connection is noted on a socket + * which accepts connections, sonewconn is called. If the + * connection is possible (subject to space constraints, etc.) + * then we allocate a new structure, propoerly linked into the + * data structure of the original socket, and return this. + * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. + */ +struct socket * +sonewconn(head, connstatus) + register struct socket *head; + int connstatus; +{ + register struct socket *so; + + if (head->so_qlen > 3 * head->so_qlimit / 2) + return ((struct socket *)0); + so = soalloc(0); + if (so == NULL) + return ((struct socket *)0); + so->so_head = head; + so->so_type = head->so_type; + so->so_options = head->so_options &~ SO_ACCEPTCONN; + so->so_linger = head->so_linger; + so->so_state = head->so_state | SS_NOFDREF; + so->so_proto = head->so_proto; + so->so_timeo = head->so_timeo; + so->so_uid = head->so_uid; + (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat); + + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sodealloc(so); + return ((struct socket *)0); + } + + if (connstatus) { + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + } else { + TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); + so->so_state |= SS_INCOMP; + head->so_incqlen++; + } + head->so_qlen++; + if (connstatus) { + sorwakeup(head); + wakeup((caddr_t)&head->so_timeo); + so->so_state |= connstatus; + } + return (so); +} + +/* + * Socantsendmore indicates that no more data will be sent on the + * socket; it would normally be applied to a socket when the user + * informs the system that no more data is to be sent, by the protocol + * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data + * will be received, and will normally be applied to the socket by a + * protocol when it detects that the peer will send no more data. + * Data queued for reading in the socket may yet be read. + */ + +void +socantsendmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTSENDMORE; + sowwakeup(so); +} + +void +socantrcvmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTRCVMORE; + sorwakeup(so); +} + +/* + * Wait for data to arrive at/drain from a socket buffer. + */ +int +sbwait(sb) + struct sockbuf *sb; +{ + + sb->sb_flags |= SB_WAIT; + return (tsleep((caddr_t)&sb->sb_cc, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", + sb->sb_timeo)); +} + +/* + * Lock a sockbuf already known to be locked; + * return any error returned from sleep (EINTR). 
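+ *
+ * Callers normally go through the sblock() macro and only end up here
+ * when the buffer is already locked by another process.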
+ */ +int +sb_lock(sb) + register struct sockbuf *sb; +{ + int error; + + while (sb->sb_flags & SB_LOCK) { + sb->sb_flags |= SB_WANT; + error = tsleep((caddr_t)&sb->sb_flags, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, + "sblock", 0); + if (error) + return (error); + } + sb->sb_flags |= SB_LOCK; + return (0); +} + +/* + * Wakeup processes waiting on a socket buffer. + * Do asynchronous notification via SIGIO + * if the socket has the SS_ASYNC flag set. + */ +void +sowakeup(so, sb) + register struct socket *so; + register struct sockbuf *sb; +{ + selwakeup(&sb->sb_sel); + sb->sb_flags &= ~SB_SEL; + if (sb->sb_flags & SB_WAIT) { + sb->sb_flags &= ~SB_WAIT; + wakeup((caddr_t)&sb->sb_cc); + } + if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) + pgsigio(so->so_sigio, SIGIO, 0); + if (sb->sb_flags & SB_UPCALL) + (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); +} + +/* + * Socket buffer (struct sockbuf) utility routines. + * + * Each socket contains two socket buffers: one for sending data and + * one for receiving data. Each buffer contains a queue of mbufs, + * information about the number of mbufs and amount of data in the + * queue, and other fields allowing select() statements and notification + * on data availability to be implemented. + * + * Data stored in a socket buffer is maintained as a list of records. + * Each record is a list of mbufs chained together with the m_next + * field. Records are chained together with the m_nextpkt field. The upper + * level routine soreceive() expects the following conventions to be + * observed when placing information in the receive buffer: + * + * 1. If the protocol requires each message be preceded by the sender's + * name, then a record containing that name must be present before + * any associated data (mbuf's must be of type MT_SONAME). + * 2. If the protocol supports the exchange of ``access rights'' (really + * just additional data associated with the message), and there are + * ``rights'' to be received, then a record containing this data + * should be present (mbuf's must be of type MT_RIGHTS). + * 3. If a name or rights record exists, then it must be followed by + * a data record, perhaps of zero length. + * + * Before using a new socket structure it is first necessary to reserve + * buffer space to the socket, by calling sbreserve(). This should commit + * some of the available buffer space in the system buffer pool for the + * socket (currently, it does nothing but enforce limits). The space + * should be released by calling sbrelease() when the socket is destroyed. + */ + +int +soreserve(so, sndcc, rcvcc) + register struct socket *so; + u_long sndcc, rcvcc; +{ + + if (sbreserve(&so->so_snd, sndcc) == 0) + goto bad; + if (sbreserve(&so->so_rcv, rcvcc) == 0) + goto bad2; + if (so->so_rcv.sb_lowat == 0) + so->so_rcv.sb_lowat = 1; + if (so->so_snd.sb_lowat == 0) + so->so_snd.sb_lowat = MCLBYTES; + if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) + so->so_snd.sb_lowat = so->so_snd.sb_hiwat; + return (0); +bad2: + sbrelease(&so->so_snd); +bad: + return (ENOBUFS); +} + +/* + * Allot mbufs to a sockbuf. + * Attempt to scale mbmax so that mbcnt doesn't become limiting + * if buffering efficiency is near the normal case. 
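+ *
+ * The character count is capped at sb_max scaled by
+ * MCLBYTES / (MSIZE + MCLBYTES), i.e. what sb_max bytes can hold once
+ * per-mbuf overhead is charged, and sb_mbmax is set to
+ * min(cc * sb_efficiency, sb_max).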
+ */ +int +sbreserve(sb, cc) + struct sockbuf *sb; + u_long cc; +{ + if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES)) + return (0); + sb->sb_hiwat = cc; + sb->sb_mbmax = min(cc * sb_efficiency, sb_max); + if (sb->sb_lowat > sb->sb_hiwat) + sb->sb_lowat = sb->sb_hiwat; + return (1); +} + +/* + * Free mbufs held by a socket, and reserved mbuf space. + */ +void +sbrelease(sb) + struct sockbuf *sb; +{ + + sbflush(sb); + sb->sb_hiwat = sb->sb_mbmax = 0; +} + +/* + * Routines to add and remove + * data from an mbuf queue. + * + * The routines sbappend() or sbappendrecord() are normally called to + * append new mbufs to a socket buffer, after checking that adequate + * space is available, comparing the function sbspace() with the amount + * of data to be added. sbappendrecord() differs from sbappend() in + * that data supplied is treated as the beginning of a new record. + * To place a sender's address, optional access rights, and data in a + * socket receive buffer, sbappendaddr() should be used. To place + * access rights and data in a socket receive buffer, sbappendrights() + * should be used. In either case, the new data begins a new record. + * Note that unlike sbappend() and sbappendrecord(), these routines check + * for the caller that there will be enough space to store the data. + * Each fails if there is not enough space, or if it cannot find mbufs + * to store additional information in. + * + * Reliable protocols may use the socket send buffer to hold data + * awaiting acknowledgement. Data is normally copied from a socket + * send buffer in a protocol with m_copy for output to a peer, + * and then removing the data from the socket buffer with sbdrop() + * or sbdroprecord() when the data is acknowledged by the peer. + */ + +/* + * Append mbuf chain m to the last record in the + * socket buffer sb. The additional space associated + * the mbuf chain is recorded in sb. Empty mbufs are + * discarded and mbufs are compacted where possible. + */ +void +sbappend(sb, m) + struct sockbuf *sb; + struct mbuf *m; +{ + register struct mbuf *n; + + if (m == 0) + return; + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + do { + if (n->m_flags & M_EOR) { + sbappendrecord(sb, m); /* XXXXXX!!!! */ + return; + } + } while (n->m_next && (n = n->m_next)); + } + sbcompress(sb, m, n); +} + +#ifdef SOCKBUF_DEBUG +void +sbcheck(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + register struct mbuf *n = 0; + register u_long len = 0, mbcnt = 0; + + for (m = sb->sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + len += m->m_len; + mbcnt += MSIZE; + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ + mbcnt += m->m_ext.ext_size; + } + } + if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { + printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc, + mbcnt, sb->sb_mbcnt); + panic("sbcheck"); + } +} +#endif + +/* + * As above, except the mbuf chain + * begins a new record. + */ +void +sbappendrecord(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + + if (m0 == 0) + return; + m = sb->sb_mb; + if (m) + while (m->m_nextpkt) + m = m->m_nextpkt; + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. 
+ */ + sballoc(sb, m0); + if (m) + m->m_nextpkt = m0; + else + sb->sb_mb = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * As above except that OOB data + * is inserted at the beginning of the sockbuf, + * but after any other OOB data. + */ +void +sbinsertoob(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + register struct mbuf **mp; + + if (m0 == 0) + return; + for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { + m = *mp; + again: + switch (m->m_type) { + + case MT_OOBDATA: + continue; /* WANT next train */ + + case MT_CONTROL: + m = m->m_next; + if (m) + goto again; /* inspect THIS train further */ + } + break; + } + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + m0->m_nextpkt = *mp; + *mp = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * Append address and data, and optionally, control (ancillary) data + * to the receive queue of a socket. If present, + * m0 must include a packet header with total length. + * Returns 0 if no space in sockbuf or insufficient mbufs. + */ +int +sbappendaddr(sb, asa, m0, control) + register struct sockbuf *sb; + struct sockaddr *asa; + struct mbuf *m0, *control; +{ + register struct mbuf *m, *n; + int space = asa->sa_len; + +if (m0 && (m0->m_flags & M_PKTHDR) == 0) +panic("sbappendaddr"); + if (m0) + space += m0->m_pkthdr.len; + for (n = control; n; n = n->m_next) { + space += n->m_len; + if (n->m_next == 0) /* keep pointer to last control buf */ + break; + } + if (space > sbspace(sb)) + return (0); + if (asa->sa_len > MLEN) + return (0); + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == 0) + return (0); + m->m_len = asa->sa_len; + bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len); + if (n) + n->m_next = m0; /* concatenate data to control */ + else + control = m0; + m->m_next = control; + for (n = m; n; n = n->m_next) + sballoc(sb, n); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = m; + } else + sb->sb_mb = m; + return (1); +} + +int +sbappendcontrol(sb, m0, control) + struct sockbuf *sb; + struct mbuf *control, *m0; +{ + register struct mbuf *m, *n; + int space = 0; + + if (control == 0) + panic("sbappendcontrol"); + for (m = control; ; m = m->m_next) { + space += m->m_len; + if (m->m_next == 0) + break; + } + n = m; /* save pointer to last control buffer */ + for (m = m0; m; m = m->m_next) + space += m->m_len; + if (space > sbspace(sb)) + return (0); + n->m_next = m0; /* concatenate data to control */ + for (m = control; m; m = m->m_next) + sballoc(sb, m); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = control; + } else + sb->sb_mb = control; + return (1); +} + +/* + * Compress mbuf chain m into the socket + * buffer sb following mbuf n. If n + * is null, the buffer is presumed empty. 
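+ *
+ * Zero-length mbufs are freed, and data from small mbufs is copied
+ * into spare room at the tail of the preceding mbuf when the types
+ * match, so a run of small appends does not burn a whole mbuf each;
+ * any M_EOR mark is carried forward to whatever mbuf ends up last.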
+ */ +void +sbcompress(sb, m, n) + register struct sockbuf *sb; + register struct mbuf *m, *n; +{ + register int eor = 0; + register struct mbuf *o; + + while (m) { + eor |= m->m_flags & M_EOR; + if (m->m_len == 0 && + (eor == 0 || + (((o = m->m_next) || (o = n)) && + o->m_type == m->m_type))) { + m = m_free(m); + continue; + } + if (n && (n->m_flags & (M_EXT | M_EOR)) == 0 && + (n->m_data + n->m_len + m->m_len) < &n->m_dat[MLEN] && + n->m_type == m->m_type) { + bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, + (unsigned)m->m_len); + n->m_len += m->m_len; + sb->sb_cc += m->m_len; + m = m_free(m); + continue; + } + if (n) + n->m_next = m; + else + sb->sb_mb = m; + sballoc(sb, m); + n = m; + m->m_flags &= ~M_EOR; + m = m->m_next; + n->m_next = 0; + } + if (eor) { + if (n) + n->m_flags |= eor; + else + printf("semi-panic: sbcompress\n"); + } +} + +/* + * Free all mbufs in a sockbuf. + * Check that all resources are reclaimed. + */ +void +sbflush(sb) + register struct sockbuf *sb; +{ + + if (sb->sb_flags & SB_LOCK) + panic("sbflush: locked"); + while (sb->sb_mbcnt && sb->sb_cc) + sbdrop(sb, (int)sb->sb_cc); + if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) + panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); +} + +/* + * Drop data from (the front of) a sockbuf. + */ +void +sbdrop(sb, len) + register struct sockbuf *sb; + register int len; +{ + register struct mbuf *m, *mn; + struct mbuf *next; + + next = (m = sb->sb_mb) ? m->m_nextpkt : 0; + while (len > 0) { + if (m == 0) { + if (next == 0) + panic("sbdrop"); + m = next; + next = m->m_nextpkt; + continue; + } + if (m->m_len > len) { + m->m_len -= len; + m->m_data += len; + sb->sb_cc -= len; + break; + } + len -= m->m_len; + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + while (m && m->m_len == 0) { + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + if (m) { + sb->sb_mb = m; + m->m_nextpkt = next; + } else + sb->sb_mb = next; +} + +/* + * Drop a record off the front of a sockbuf + * and move the next record to the front. + */ +void +sbdroprecord(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m, *mn; + + m = sb->sb_mb; + if (m) { + sb->sb_mb = m->m_nextpkt; + do { + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } while (m); + } +} + +/* + * Create a "control" mbuf containing the specified data + * with the specified type for presentation on a socket buffer. + */ +struct mbuf * +sbcreatecontrol(p, size, type, level) + caddr_t p; + register int size; + int type, level; +{ + register struct cmsghdr *cp; + struct mbuf *m; + + if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + return ((struct mbuf *) NULL); + cp = mtod(m, struct cmsghdr *); + /* XXX check size? */ + (void)memcpy(CMSG_DATA(cp), p, size); + size += sizeof(*cp); + m->m_len = size; + cp->cmsg_len = size; + cp->cmsg_level = level; + cp->cmsg_type = type; + return (m); +} + +/* + * Some routines that return EOPNOTSUPP for entry points that are not + * supported by a protocol. Fill in as needed. 
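+ *
+ * Protocols point the pr_usrreqs entries they do not implement at
+ * these stubs rather than leaving null pointers for the socket layer
+ * to call through.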
+ */ +int +pru_accept_notsupp(struct socket *so, struct sockaddr **nam) +{ + return EOPNOTSUPP; +} + +int +pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + return EOPNOTSUPP; +} + +int +pru_connect2_notsupp(struct socket *so1, struct socket *so2) +{ + return EOPNOTSUPP; +} + +int +pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, + struct ifnet *ifp, struct proc *p) +{ + return EOPNOTSUPP; +} + +int +pru_listen_notsupp(struct socket *so, struct proc *p) +{ + return EOPNOTSUPP; +} + +int +pru_rcvd_notsupp(struct socket *so, int flags) +{ + return EOPNOTSUPP; +} + +int +pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) +{ + return EOPNOTSUPP; +} + +/* + * This isn't really a ``null'' operation, but it's the default one + * and doesn't do anything destructive. + */ +int +pru_sense_null(struct socket *so, struct stat *sb) +{ + sb->st_blksize = so->so_snd.sb_hiwat; + return 0; +} + +/* + * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. + */ +struct sockaddr * +dup_sockaddr(sa, canwait) + struct sockaddr *sa; + int canwait; +{ + struct sockaddr *sa2; + + MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME, + canwait ? M_WAITOK : M_NOWAIT); + if (sa2) + bcopy(sa, sa2, sa->sa_len); + return sa2; +} + +/* + * Create an external-format (``xsocket'') structure using the information + * in the kernel-format socket structure pointed to by so. This is done + * to reduce the spew of irrelevant information over this interface, + * to isolate user code from changes in the kernel structure, and + * potentially to provide information-hiding if we decide that + * some of this information should be hidden from users. + */ +void +sotoxsocket(struct socket *so, struct xsocket *xso) +{ + xso->xso_len = sizeof *xso; + xso->xso_so = so; + xso->so_type = so->so_type; + xso->so_options = so->so_options; + xso->so_linger = so->so_linger; + xso->so_state = so->so_state; + xso->so_pcb = so->so_pcb; + xso->xso_protocol = so->so_proto->pr_protocol; + xso->xso_family = so->so_proto->pr_domain->dom_family; + xso->so_qlen = so->so_qlen; + xso->so_incqlen = so->so_incqlen; + xso->so_qlimit = so->so_qlimit; + xso->so_timeo = so->so_timeo; + xso->so_error = so->so_error; + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + xso->so_uid = so->so_uid; +} + +/* + * This does the same for sockbufs. Note that the xsockbuf structure, + * since it is always embedded in a socket, does not include a self + * pointer nor a length. We make this entry point public in case + * some other mechanism needs it. + */ +void +sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) +{ + xsb->sb_cc = sb->sb_cc; + xsb->sb_hiwat = sb->sb_hiwat; + xsb->sb_mbcnt = sb->sb_mbcnt; + xsb->sb_mbmax = sb->sb_mbmax; + xsb->sb_lowat = sb->sb_lowat; + xsb->sb_flags = sb->sb_flags; + xsb->sb_timeo = sb->sb_timeo; +} + +/* + * Here is the definition of some of the basic objects in the kern.ipc + * branch of the MIB. + */ +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); + +/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. 
*/ +static int dummy; +SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); + +SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, &sb_max, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, &maxsockets, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, + &sb_efficiency, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, ""); + diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c new file mode 100644 index 0000000..1efa8c5 --- /dev/null +++ b/sys/kern/uipc_socket.c @@ -0,0 +1,1216 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 + * $Id: uipc_socket.c,v 1.50 1999/01/20 17:31:54 fenner Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/domain.h> +#include <sys/kernel.h> +#include <sys/poll.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> +#include <sys/uio.h> +#include <vm/vm_zone.h> + +#include <machine/limits.h> + +struct vm_zone *socket_zone; +so_gen_t so_gencnt; /* generation count for sockets */ + +MALLOC_DEFINE(M_SONAME, "soname", "socket name"); +MALLOC_DEFINE(M_PCB, "pcb", "protocol control block"); + +static int somaxconn = SOMAXCONN; +SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, + 0, ""); + +/* + * Socket operation routines. 
+ * These routines are called by the routines in + * sys_socket.c or from a system process, and + * implement the semantics of socket operations by + * switching out to the protocol specific routines. + */ + +/* + * Get a socket structure from our zone, and initialize it. + * We don't implement `waitok' yet (see comments in uipc_domain.c). + * Note that it would probably be better to allocate socket + * and PCB at the same time, but I'm not convinced that all + * the protocols can be easily modified to do this. + */ +struct socket * +soalloc(waitok) + int waitok; +{ + struct socket *so; + + so = zalloci(socket_zone); + if (so) { + /* XXX race condition for reentrant kernel */ + bzero(so, sizeof *so); + so->so_gencnt = ++so_gencnt; + so->so_zone = socket_zone; + } + return so; +} + +int +socreate(dom, aso, type, proto, p) + int dom; + struct socket **aso; + register int type; + int proto; + struct proc *p; +{ + register struct protosw *prp; + register struct socket *so; + register int error; + + if (proto) + prp = pffindproto(dom, proto, type); + else + prp = pffindtype(dom, type); + if (prp == 0 || prp->pr_usrreqs->pru_attach == 0) + return (EPROTONOSUPPORT); + if (prp->pr_type != type) + return (EPROTOTYPE); + so = soalloc(p != 0); + if (so == 0) + return (ENOBUFS); + + TAILQ_INIT(&so->so_incomp); + TAILQ_INIT(&so->so_comp); + so->so_type = type; + if (p != 0) + so->so_uid = p->p_ucred->cr_uid; + so->so_proto = prp; + error = (*prp->pr_usrreqs->pru_attach)(so, proto, p); + if (error) { + so->so_state |= SS_NOFDREF; + sofree(so); + return (error); + } + *aso = so; + return (0); +} + +int +sobind(so, nam, p) + struct socket *so; + struct sockaddr *nam; + struct proc *p; +{ + int s = splnet(); + int error; + + error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam, p); + splx(s); + return (error); +} + +void +sodealloc(so) + struct socket *so; +{ + so->so_gencnt = ++so_gencnt; + zfreei(so->so_zone, so); +} + +int +solisten(so, backlog, p) + register struct socket *so; + int backlog; + struct proc *p; +{ + int s, error; + + s = splnet(); + error = (*so->so_proto->pr_usrreqs->pru_listen)(so, p); + if (error) { + splx(s); + return (error); + } + if (so->so_comp.tqh_first == NULL) + so->so_options |= SO_ACCEPTCONN; + if (backlog < 0 || backlog > somaxconn) + backlog = somaxconn; + so->so_qlimit = backlog; + splx(s); + return (0); +} + +void +sofree(so) + register struct socket *so; +{ + struct socket *head = so->so_head; + + if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) + return; + if (head != NULL) { + if (so->so_state & SS_INCOMP) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + } else if (so->so_state & SS_COMP) { + TAILQ_REMOVE(&head->so_comp, so, so_list); + } else { + panic("sofree: not queued"); + } + head->so_qlen--; + so->so_state &= ~(SS_INCOMP|SS_COMP); + so->so_head = NULL; + } + sbrelease(&so->so_snd); + sorflush(so); + sodealloc(so); +} + +/* + * Close a socket on last file table reference removal. + * Initiate disconnect if connected. + * Free socket when disconnect complete. 
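+ *
+ * A listening socket first aborts any connections still queued on its
+ * incomplete and completed queues. With SO_LINGER set and the socket
+ * not marked non-blocking, the close sleeps (up to the linger interval)
+ * waiting for the disconnect to complete before detaching the pcb.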
+ */ +int +soclose(so) + register struct socket *so; +{ + int s = splnet(); /* conservative */ + int error = 0; + + funsetown(so->so_sigio); + if (so->so_options & SO_ACCEPTCONN) { + struct socket *sp, *sonext; + + for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) { + sonext = sp->so_list.tqe_next; + (void) soabort(sp); + } + for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) { + sonext = sp->so_list.tqe_next; + (void) soabort(sp); + } + } + if (so->so_pcb == 0) + goto discard; + if (so->so_state & SS_ISCONNECTED) { + if ((so->so_state & SS_ISDISCONNECTING) == 0) { + error = sodisconnect(so); + if (error) + goto drop; + } + if (so->so_options & SO_LINGER) { + if ((so->so_state & SS_ISDISCONNECTING) && + (so->so_state & SS_NBIO)) + goto drop; + while (so->so_state & SS_ISCONNECTED) { + error = tsleep((caddr_t)&so->so_timeo, + PSOCK | PCATCH, "soclos", so->so_linger); + if (error) + break; + } + } + } +drop: + if (so->so_pcb) { + int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); + if (error == 0) + error = error2; + } +discard: + if (so->so_state & SS_NOFDREF) + panic("soclose: NOFDREF"); + so->so_state |= SS_NOFDREF; + sofree(so); + splx(s); + return (error); +} + +/* + * Must be called at splnet... + */ +int +soabort(so) + struct socket *so; +{ + + return (*so->so_proto->pr_usrreqs->pru_abort)(so); +} + +int +soaccept(so, nam) + register struct socket *so; + struct sockaddr **nam; +{ + int s = splnet(); + int error; + + if ((so->so_state & SS_NOFDREF) == 0) + panic("soaccept: !NOFDREF"); + so->so_state &= ~SS_NOFDREF; + error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); + splx(s); + return (error); +} + +int +soconnect(so, nam, p) + register struct socket *so; + struct sockaddr *nam; + struct proc *p; +{ + int s; + int error; + + if (so->so_options & SO_ACCEPTCONN) + return (EOPNOTSUPP); + s = splnet(); + /* + * If protocol is connection-based, can only connect once. + * Otherwise, if connected, try to disconnect first. + * This allows user to disconnect by connecting to, e.g., + * a null address. + */ + if (so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING) && + ((so->so_proto->pr_flags & PR_CONNREQUIRED) || + (error = sodisconnect(so)))) + error = EISCONN; + else + error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam, p); + splx(s); + return (error); +} + +int +soconnect2(so1, so2) + register struct socket *so1; + struct socket *so2; +{ + int s = splnet(); + int error; + + error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); + splx(s); + return (error); +} + +int +sodisconnect(so) + register struct socket *so; +{ + int s = splnet(); + int error; + + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + goto bad; + } + if (so->so_state & SS_ISDISCONNECTING) { + error = EALREADY; + goto bad; + } + error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); +bad: + splx(s); + return (error); +} + +#define SBLOCKWAIT(f) (((f) & MSG_DONTWAIT) ? M_NOWAIT : M_WAITOK) +/* + * Send on a socket. + * If send must go all at once and message is larger than + * send buffering, then hard error. + * Lock against other senders. + * If must go all at once and not enough room now, then + * inform user that this would block and do nothing. + * Otherwise, if nonblocking, send as much as possible. + * The data to be sent is described by "uio" if nonzero, + * otherwise by the mbuf chain "top" (which must be null + * if uio is not). Data provided in mbuf chain must be small + * enough to send all at once. 
+ * + * Returns nonzero on error, timeout or signal; callers + * must check for short counts if EINTR/ERESTART are returned. + * Data and control buffers are freed on return. + */ +int +sosend(so, addr, uio, top, control, flags, p) + register struct socket *so; + struct sockaddr *addr; + struct uio *uio; + struct mbuf *top; + struct mbuf *control; + int flags; + struct proc *p; +{ + struct mbuf **mp; + register struct mbuf *m; + register long space, len, resid; + int clen = 0, error, s, dontroute, mlen; + int atomic = sosendallatonce(so) || top; + + if (uio) + resid = uio->uio_resid; + else + resid = top->m_pkthdr.len; + /* + * In theory resid should be unsigned. + * However, space must be signed, as it might be less than 0 + * if we over-committed, and we must use a signed comparison + * of space and resid. On the other hand, a negative resid + * causes us to loop sending 0-length segments to the protocol. + * + * Also check to make sure that MSG_EOR isn't used on SOCK_STREAM + * type sockets since that's an error. + */ + if (resid < 0 || so->so_type == SOCK_STREAM && (flags & MSG_EOR)) { + error = EINVAL; + goto out; + } + + dontroute = + (flags & MSG_DONTROUTE) && (so->so_options & SO_DONTROUTE) == 0 && + (so->so_proto->pr_flags & PR_ATOMIC); + if (p) + p->p_stats->p_ru.ru_msgsnd++; + if (control) + clen = control->m_len; +#define snderr(errno) { error = errno; splx(s); goto release; } + +restart: + error = sblock(&so->so_snd, SBLOCKWAIT(flags)); + if (error) + goto out; + do { + s = splnet(); + if (so->so_state & SS_CANTSENDMORE) + snderr(EPIPE); + if (so->so_error) { + error = so->so_error; + so->so_error = 0; + splx(s); + goto release; + } + if ((so->so_state & SS_ISCONNECTED) == 0) { + /* + * `sendto' and `sendmsg' is allowed on a connection- + * based socket if it supports implied connect. + * Return ENOTCONN if not connected and no address is + * supplied. + */ + if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && + (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { + if ((so->so_state & SS_ISCONFIRMING) == 0 && + !(resid == 0 && clen != 0)) + snderr(ENOTCONN); + } else if (addr == 0) + snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? + ENOTCONN : EDESTADDRREQ); + } + space = sbspace(&so->so_snd); + if (flags & MSG_OOB) + space += 1024; + if ((atomic && resid > so->so_snd.sb_hiwat) || + clen > so->so_snd.sb_hiwat) + snderr(EMSGSIZE); + if (space < resid + clen && uio && + (atomic || space < so->so_snd.sb_lowat || space < clen)) { + if (so->so_state & SS_NBIO) + snderr(EWOULDBLOCK); + sbunlock(&so->so_snd); + error = sbwait(&so->so_snd); + splx(s); + if (error) + goto out; + goto restart; + } + splx(s); + mp = ⊤ + space -= clen; + do { + if (uio == NULL) { + /* + * Data is prepackaged in "top". + */ + resid = 0; + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + } else do { + if (top == 0) { + MGETHDR(m, M_WAIT, MT_DATA); + mlen = MHLEN; + m->m_pkthdr.len = 0; + m->m_pkthdr.rcvif = (struct ifnet *)0; + } else { + MGET(m, M_WAIT, MT_DATA); + mlen = MLEN; + } + if (resid >= MINCLSIZE) { + MCLGET(m, M_WAIT); + if ((m->m_flags & M_EXT) == 0) + goto nopages; + mlen = MCLBYTES; + len = min(min(mlen, resid), space); + } else { +nopages: + len = min(min(mlen, resid), space); + /* + * For datagram protocols, leave room + * for protocol headers in first mbuf. 
+ */ + if (atomic && top == 0 && len < mlen) + MH_ALIGN(m, len); + } + space -= len; + error = uiomove(mtod(m, caddr_t), (int)len, uio); + resid = uio->uio_resid; + m->m_len = len; + *mp = m; + top->m_pkthdr.len += len; + if (error) + goto release; + mp = &m->m_next; + if (resid <= 0) { + if (flags & MSG_EOR) + top->m_flags |= M_EOR; + break; + } + } while (space > 0 && atomic); + if (dontroute) + so->so_options |= SO_DONTROUTE; + s = splnet(); /* XXX */ + error = (*so->so_proto->pr_usrreqs->pru_send)(so, + (flags & MSG_OOB) ? PRUS_OOB : + /* + * If the user set MSG_EOF, the protocol + * understands this flag and nothing left to + * send then use PRU_SEND_EOF instead of PRU_SEND. + */ + ((flags & MSG_EOF) && + (so->so_proto->pr_flags & PR_IMPLOPCL) && + (resid <= 0)) ? + PRUS_EOF : + /* If there is more to send set PRUS_MORETOCOME */ + (resid > 0 && space > 0) ? PRUS_MORETOCOME : 0, + top, addr, control, p); + splx(s); + if (dontroute) + so->so_options &= ~SO_DONTROUTE; + clen = 0; + control = 0; + top = 0; + mp = ⊤ + if (error) + goto release; + } while (resid && space > 0); + } while (resid); + +release: + sbunlock(&so->so_snd); +out: + if (top) + m_freem(top); + if (control) + m_freem(control); + return (error); +} + +/* + * Implement receive operations on a socket. + * We depend on the way that records are added to the sockbuf + * by sbappend*. In particular, each record (mbufs linked through m_next) + * must begin with an address if the protocol so specifies, + * followed by an optional mbuf or mbufs containing ancillary data, + * and then zero or more mbufs of data. + * In order to avoid blocking network interrupts for the entire time here, + * we splx() while doing the actual copy to user space. + * Although the sockbuf is locked, new data may still be appended, + * and thus we must maintain consistency of the sockbuf during that time. + * + * The caller may receive the data as a single mbuf chain by supplying + * an mbuf **mp0 for use in returning the chain. The uio is then used + * only for the count in uio_resid. + */ +int +soreceive(so, psa, uio, mp0, controlp, flagsp) + register struct socket *so; + struct sockaddr **psa; + struct uio *uio; + struct mbuf **mp0; + struct mbuf **controlp; + int *flagsp; +{ + register struct mbuf *m, **mp; + register int flags, len, error, s, offset; + struct protosw *pr = so->so_proto; + struct mbuf *nextrecord; + int moff, type = 0; + int orig_resid = uio->uio_resid; + + mp = mp0; + if (psa) + *psa = 0; + if (controlp) + *controlp = 0; + if (flagsp) + flags = *flagsp &~ MSG_EOR; + else + flags = 0; + if (flags & MSG_OOB) { + m = m_get(M_WAIT, MT_DATA); + error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); + if (error) + goto bad; + do { + error = uiomove(mtod(m, caddr_t), + (int) min(uio->uio_resid, m->m_len), uio); + m = m_free(m); + } while (uio->uio_resid && error == 0 && m); +bad: + if (m) + m_freem(m); + return (error); + } + if (mp) + *mp = (struct mbuf *)0; + if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) + (*pr->pr_usrreqs->pru_rcvd)(so, 0); + +restart: + error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); + if (error) + return (error); + s = splnet(); + + m = so->so_rcv.sb_mb; + /* + * If we have less data than requested, block awaiting more + * (subject to any timeout) if: + * 1. the current count is less than the low water mark, or + * 2. MSG_WAITALL is set, and it is possible to do the entire + * receive operation at once if we block (resid <= hiwat). + * 3. 
MSG_DONTWAIT is not set + * If MSG_WAITALL is set but resid is larger than the receive buffer, + * we have to do the receive in sections, and thus risk returning + * a short count if a timeout or signal occurs after we start. + */ + if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && + so->so_rcv.sb_cc < uio->uio_resid) && + (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || + ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && + m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { + KASSERT(m != 0 || !so->so_rcv.sb_cc, ("receive 1")); + if (so->so_error) { + if (m) + goto dontblock; + error = so->so_error; + if ((flags & MSG_PEEK) == 0) + so->so_error = 0; + goto release; + } + if (so->so_state & SS_CANTRCVMORE) { + if (m) + goto dontblock; + else + goto release; + } + for (; m; m = m->m_next) + if (m->m_type == MT_OOBDATA || (m->m_flags & M_EOR)) { + m = so->so_rcv.sb_mb; + goto dontblock; + } + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONNECTING)) == 0 && + (so->so_proto->pr_flags & PR_CONNREQUIRED)) { + error = ENOTCONN; + goto release; + } + if (uio->uio_resid == 0) + goto release; + if ((so->so_state & SS_NBIO) || (flags & MSG_DONTWAIT)) { + error = EWOULDBLOCK; + goto release; + } + sbunlock(&so->so_rcv); + error = sbwait(&so->so_rcv); + splx(s); + if (error) + return (error); + goto restart; + } +dontblock: + if (uio->uio_procp) + uio->uio_procp->p_stats->p_ru.ru_msgrcv++; + nextrecord = m->m_nextpkt; + if (pr->pr_flags & PR_ADDR) { + KASSERT(m->m_type == MT_SONAME, ("receive 1a")); + orig_resid = 0; + if (psa) + *psa = dup_sockaddr(mtod(m, struct sockaddr *), + mp0 == 0); + if (flags & MSG_PEEK) { + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + MFREE(m, so->so_rcv.sb_mb); + m = so->so_rcv.sb_mb; + } + } + while (m && m->m_type == MT_CONTROL && error == 0) { + if (flags & MSG_PEEK) { + if (controlp) + *controlp = m_copy(m, 0, m->m_len); + m = m->m_next; + } else { + sbfree(&so->so_rcv, m); + if (controlp) { + if (pr->pr_domain->dom_externalize && + mtod(m, struct cmsghdr *)->cmsg_type == + SCM_RIGHTS) + error = (*pr->pr_domain->dom_externalize)(m); + *controlp = m; + so->so_rcv.sb_mb = m->m_next; + m->m_next = 0; + m = so->so_rcv.sb_mb; + } else { + MFREE(m, so->so_rcv.sb_mb); + m = so->so_rcv.sb_mb; + } + } + if (controlp) { + orig_resid = 0; + controlp = &(*controlp)->m_next; + } + } + if (m) { + if ((flags & MSG_PEEK) == 0) + m->m_nextpkt = nextrecord; + type = m->m_type; + if (type == MT_OOBDATA) + flags |= MSG_OOB; + } + moff = 0; + offset = 0; + while (m && uio->uio_resid > 0 && error == 0) { + if (m->m_type == MT_OOBDATA) { + if (type != MT_OOBDATA) + break; + } else if (type == MT_OOBDATA) + break; + else + KASSERT(m->m_type == MT_DATA || m->m_type == MT_HEADER, + ("receive 3")); + so->so_state &= ~SS_RCVATMARK; + len = uio->uio_resid; + if (so->so_oobmark && len > so->so_oobmark - offset) + len = so->so_oobmark - offset; + if (len > m->m_len - moff) + len = m->m_len - moff; + /* + * If mp is set, just pass back the mbufs. + * Otherwise copy them out via the uio, then free. + * Sockbuf must be consistent here (points to current mbuf, + * it points to next record) when we drop priority; + * we must note any additions to the sockbuf when we + * block interrupts again. 
+ */ + if (mp == 0) { + splx(s); + error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); + s = splnet(); + if (error) + goto release; + } else + uio->uio_resid -= len; + if (len == m->m_len - moff) { + if (m->m_flags & M_EOR) + flags |= MSG_EOR; + if (flags & MSG_PEEK) { + m = m->m_next; + moff = 0; + } else { + nextrecord = m->m_nextpkt; + sbfree(&so->so_rcv, m); + if (mp) { + *mp = m; + mp = &m->m_next; + so->so_rcv.sb_mb = m = m->m_next; + *mp = (struct mbuf *)0; + } else { + MFREE(m, so->so_rcv.sb_mb); + m = so->so_rcv.sb_mb; + } + if (m) + m->m_nextpkt = nextrecord; + } + } else { + if (flags & MSG_PEEK) + moff += len; + else { + if (mp) + *mp = m_copym(m, 0, len, M_WAIT); + m->m_data += len; + m->m_len -= len; + so->so_rcv.sb_cc -= len; + } + } + if (so->so_oobmark) { + if ((flags & MSG_PEEK) == 0) { + so->so_oobmark -= len; + if (so->so_oobmark == 0) { + so->so_state |= SS_RCVATMARK; + break; + } + } else { + offset += len; + if (offset == so->so_oobmark) + break; + } + } + if (flags & MSG_EOR) + break; + /* + * If the MSG_WAITALL flag is set (for non-atomic socket), + * we must not quit until "uio->uio_resid == 0" or an error + * termination. If a signal/timeout occurs, return + * with a short count but without error. + * Keep sockbuf locked against other readers. + */ + while (flags & MSG_WAITALL && m == 0 && uio->uio_resid > 0 && + !sosendallatonce(so) && !nextrecord) { + if (so->so_error || so->so_state & SS_CANTRCVMORE) + break; + error = sbwait(&so->so_rcv); + if (error) { + sbunlock(&so->so_rcv); + splx(s); + return (0); + } + m = so->so_rcv.sb_mb; + if (m) + nextrecord = m->m_nextpkt; + } + } + + if (m && pr->pr_flags & PR_ATOMIC) { + flags |= MSG_TRUNC; + if ((flags & MSG_PEEK) == 0) + (void) sbdroprecord(&so->so_rcv); + } + if ((flags & MSG_PEEK) == 0) { + if (m == 0) + so->so_rcv.sb_mb = nextrecord; + if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) + (*pr->pr_usrreqs->pru_rcvd)(so, flags); + } + if (orig_resid == uio->uio_resid && orig_resid && + (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { + sbunlock(&so->so_rcv); + splx(s); + goto restart; + } + + if (flagsp) + *flagsp |= flags; +release: + sbunlock(&so->so_rcv); + splx(s); + return (error); +} + +int +soshutdown(so, how) + register struct socket *so; + register int how; +{ + register struct protosw *pr = so->so_proto; + + how++; + if (how & FREAD) + sorflush(so); + if (how & FWRITE) + return ((*pr->pr_usrreqs->pru_shutdown)(so)); + return (0); +} + +void +sorflush(so) + register struct socket *so; +{ + register struct sockbuf *sb = &so->so_rcv; + register struct protosw *pr = so->so_proto; + register int s; + struct sockbuf asb; + + sb->sb_flags |= SB_NOINTR; + (void) sblock(sb, M_WAITOK); + s = splimp(); + socantrcvmore(so); + sbunlock(sb); + asb = *sb; + bzero((caddr_t)sb, sizeof (*sb)); + splx(s); + if (pr->pr_flags & PR_RIGHTS && pr->pr_domain->dom_dispose) + (*pr->pr_domain->dom_dispose)(asb.sb_mb); + sbrelease(&asb); +} + +/* + * Perhaps this routine, and sooptcopyout(), below, ought to come in + * an additional variant to handle the case where the option value needs + * to be some kind of integer, but not a specific size. + * In addition to their use here, these functions are also called by the + * protocol-level pr_ctloutput() routines. 
+ */ +int +sooptcopyin(sopt, buf, len, minlen) + struct sockopt *sopt; + void *buf; + size_t len; + size_t minlen; +{ + size_t valsize; + + /* + * If the user gives us more than we wanted, we ignore it, + * but if we don't get the minimum length the caller + * wants, we return EINVAL. On success, sopt->sopt_valsize + * is set to however much we actually retrieved. + */ + if ((valsize = sopt->sopt_valsize) < minlen) + return EINVAL; + if (valsize > len) + sopt->sopt_valsize = valsize = len; + + if (sopt->sopt_p != 0) + return (copyin(sopt->sopt_val, buf, valsize)); + + bcopy(sopt->sopt_val, buf, valsize); + return 0; +} + +int +sosetopt(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, optval; + struct linger l; + struct timeval tv; + short val; + + error = 0; + if (sopt->sopt_level != SOL_SOCKET) { + if (so->so_proto && so->so_proto->pr_ctloutput) + return ((*so->so_proto->pr_ctloutput) + (so, sopt)); + error = ENOPROTOOPT; + } else { + switch (sopt->sopt_name) { + case SO_LINGER: + error = sooptcopyin(sopt, &l, sizeof l, sizeof l); + if (error) + goto bad; + + so->so_linger = l.l_linger; + if (l.l_onoff) + so->so_options |= SO_LINGER; + else + so->so_options &= ~SO_LINGER; + break; + + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_DONTROUTE: + case SO_USELOOPBACK: + case SO_BROADCAST: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_OOBINLINE: + case SO_TIMESTAMP: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + goto bad; + if (optval) + so->so_options |= sopt->sopt_name; + else + so->so_options &= ~sopt->sopt_name; + break; + + case SO_SNDBUF: + case SO_RCVBUF: + case SO_SNDLOWAT: + case SO_RCVLOWAT: + error = sooptcopyin(sopt, &optval, sizeof optval, + sizeof optval); + if (error) + goto bad; + + /* + * Values < 1 make no sense for any of these + * options, so disallow them. + */ + if (optval < 1) { + error = EINVAL; + goto bad; + } + + switch (sopt->sopt_name) { + case SO_SNDBUF: + case SO_RCVBUF: + if (sbreserve(sopt->sopt_name == SO_SNDBUF ? + &so->so_snd : &so->so_rcv, + (u_long) optval) == 0) { + error = ENOBUFS; + goto bad; + } + break; + + /* + * Make sure the low-water is never greater than + * the high-water. + */ + case SO_SNDLOWAT: + so->so_snd.sb_lowat = + (optval > so->so_snd.sb_hiwat) ? + so->so_snd.sb_hiwat : optval; + break; + case SO_RCVLOWAT: + so->so_rcv.sb_lowat = + (optval > so->so_rcv.sb_hiwat) ? + so->so_rcv.sb_hiwat : optval; + break; + } + break; + + case SO_SNDTIMEO: + case SO_RCVTIMEO: + error = sooptcopyin(sopt, &tv, sizeof tv, + sizeof tv); + if (error) + goto bad; + + if (tv.tv_sec > SHRT_MAX / hz - hz) { + error = EDOM; + goto bad; + } + val = tv.tv_sec * hz + tv.tv_usec / tick; + + switch (sopt->sopt_name) { + case SO_SNDTIMEO: + so->so_snd.sb_timeo = val; + break; + case SO_RCVTIMEO: + so->so_rcv.sb_timeo = val; + break; + } + break; + + default: + error = ENOPROTOOPT; + break; + } + if (error == 0 && so->so_proto && so->so_proto->pr_ctloutput) { + (void) ((*so->so_proto->pr_ctloutput) + (so, sopt)); + } + } +bad: + return (error); +} + +/* Helper routine for getsockopt */ +int +sooptcopyout(sopt, buf, len) + struct sockopt *sopt; + void *buf; + size_t len; +{ + int error; + size_t valsize; + + error = 0; + + /* + * Documented get behavior is that we always return a value, + * possibly truncated to fit in the user's buffer. + * Traditional behavior is that we always tell the user + * precisely how much we copied, rather than something useful + * like the total amount we had available for her. 
+ * Note that this interface is not idempotent; the entire answer must + * generated ahead of time. + */ + valsize = min(len, sopt->sopt_valsize); + sopt->sopt_valsize = valsize; + if (sopt->sopt_val != 0) { + if (sopt->sopt_p != 0) + error = copyout(buf, sopt->sopt_val, valsize); + else + bcopy(buf, sopt->sopt_val, valsize); + } + return error; +} + +int +sogetopt(so, sopt) + struct socket *so; + struct sockopt *sopt; +{ + int error, optval; + struct linger l; + struct timeval tv; + + error = 0; + if (sopt->sopt_level != SOL_SOCKET) { + if (so->so_proto && so->so_proto->pr_ctloutput) { + return ((*so->so_proto->pr_ctloutput) + (so, sopt)); + } else + return (ENOPROTOOPT); + } else { + switch (sopt->sopt_name) { + case SO_LINGER: + l.l_onoff = so->so_options & SO_LINGER; + l.l_linger = so->so_linger; + error = sooptcopyout(sopt, &l, sizeof l); + break; + + case SO_USELOOPBACK: + case SO_DONTROUTE: + case SO_DEBUG: + case SO_KEEPALIVE: + case SO_REUSEADDR: + case SO_REUSEPORT: + case SO_BROADCAST: + case SO_OOBINLINE: + case SO_TIMESTAMP: + optval = so->so_options & sopt->sopt_name; +integer: + error = sooptcopyout(sopt, &optval, sizeof optval); + break; + + case SO_TYPE: + optval = so->so_type; + goto integer; + + case SO_ERROR: + optval = so->so_error; + so->so_error = 0; + goto integer; + + case SO_SNDBUF: + optval = so->so_snd.sb_hiwat; + goto integer; + + case SO_RCVBUF: + optval = so->so_rcv.sb_hiwat; + goto integer; + + case SO_SNDLOWAT: + optval = so->so_snd.sb_lowat; + goto integer; + + case SO_RCVLOWAT: + optval = so->so_rcv.sb_lowat; + goto integer; + + case SO_SNDTIMEO: + case SO_RCVTIMEO: + optval = (sopt->sopt_name == SO_SNDTIMEO ? + so->so_snd.sb_timeo : so->so_rcv.sb_timeo); + + tv.tv_sec = optval / hz; + tv.tv_usec = (optval % hz) * tick; + error = sooptcopyout(sopt, &tv, sizeof tv); + break; + + default: + error = ENOPROTOOPT; + break; + } + return (error); + } +} + +void +sohasoutofband(so) + register struct socket *so; +{ + if (so->so_sigio != NULL) + pgsigio(so->so_sigio, SIGURG, 0); + selwakeup(&so->so_rcv.sb_sel); +} + +int +sopoll(struct socket *so, int events, struct ucred *cred, struct proc *p) +{ + int revents = 0; + int s = splnet(); + + if (events & (POLLIN | POLLRDNORM)) + if (soreadable(so)) + revents |= events & (POLLIN | POLLRDNORM); + + if (events & (POLLOUT | POLLWRNORM)) + if (sowriteable(so)) + revents |= events & (POLLOUT | POLLWRNORM); + + if (events & (POLLPRI | POLLRDBAND)) + if (so->so_oobmark || (so->so_state & SS_RCVATMARK)) + revents |= events & (POLLPRI | POLLRDBAND); + + if (revents == 0) { + if (events & (POLLIN | POLLPRI | POLLRDNORM | POLLRDBAND)) { + selrecord(p, &so->so_rcv.sb_sel); + so->so_rcv.sb_flags |= SB_SEL; + } + + if (events & (POLLOUT | POLLWRNORM)) { + selrecord(p, &so->so_snd.sb_sel); + so->so_snd.sb_flags |= SB_SEL; + } + } + + splx(s); + return (revents); +} diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c new file mode 100644 index 0000000..e718c62 --- /dev/null +++ b/sys/kern/uipc_socket2.c @@ -0,0 +1,954 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + * $Id: uipc_socket2.c,v 1.42 1998/11/23 00:45:38 truckman Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/domain.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> + +/* + * Primitive routines for operating on sockets and socket buffers + */ + +u_long sb_max = SB_MAX; /* XXX should be static */ + +static u_long sb_efficiency = 8; /* parameter for sbreserve() */ + +/* + * Procedures to manipulate state flags of socket + * and do appropriate wakeups. Normal sequence from the + * active (originating) side is that soisconnecting() is + * called during processing of connect() call, + * resulting in an eventual call to soisconnected() if/when the + * connection is established. When the connection is torn down + * soisdisconnecting() is called during processing of disconnect() call, + * and soisdisconnected() is called when the connection to the peer + * is totally severed. The semantics of these routines are such that + * connectionless protocols can call soisconnected() and soisdisconnected() + * only, bypassing the in-progress calls when setting up a ``connection'' + * takes no time. + * + * From the passive side, a socket is created with + * two queues of sockets: so_q0 for connections in progress + * and so_q for connections already made and awaiting user acceptance. + * As a protocol is preparing incoming connections, it creates a socket + * structure queued on so_q0 by calling sonewconn(). When the connection + * is established, soisconnected() is called, and transfers the + * socket structure to so_q, making it available to accept(). + * + * If a socket is closed with sockets on either + * so_q0 or so_q, these sockets are dropped. + * + * If higher level protocols are implemented in + * the kernel, the wakeups done here will sometimes + * cause software-interrupt process scheduling. 
+ */ + +void +soisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= SS_ISCONNECTING; +} + +void +soisconnected(so) + register struct socket *so; +{ + register struct socket *head = so->so_head; + + so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); + so->so_state |= SS_ISCONNECTED; + if (head && (so->so_state & SS_INCOMP)) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_state &= ~SS_INCOMP; + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + sorwakeup(head); + wakeup_one(&head->so_timeo); + } else { + wakeup(&so->so_timeo); + sorwakeup(so); + sowwakeup(so); + } +} + +void +soisdisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~SS_ISCONNECTING; + so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +void +soisdisconnected(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +/* + * Return a random connection that hasn't been serviced yet and + * is eligible for discard. There is a one in qlen chance that + * we will return a null, saying that there are no dropable + * requests. In this case, the protocol specific code should drop + * the new request. This insures fairness. + * + * This may be used in conjunction with protocol specific queue + * congestion routines. + */ +struct socket * +sodropablereq(head) + register struct socket *head; +{ + register struct socket *so; + unsigned int i, j, qlen; + static int rnd; + static struct timeval old_runtime; + static unsigned int cur_cnt, old_cnt; + struct timeval tv; + + getmicrouptime(&tv); + if ((i = (tv.tv_sec - old_runtime.tv_sec)) != 0) { + old_runtime = tv; + old_cnt = cur_cnt / i; + cur_cnt = 0; + } + + so = TAILQ_FIRST(&head->so_incomp); + if (!so) + return (so); + + qlen = head->so_incqlen; + if (++cur_cnt > qlen || old_cnt > qlen) { + rnd = (314159 * rnd + 66329) & 0xffff; + j = ((qlen + 1) * rnd) >> 16; + + while (j-- && so) + so = TAILQ_NEXT(so, so_list); + } + + return (so); +} + +/* + * When an attempt at a new connection is noted on a socket + * which accepts connections, sonewconn is called. If the + * connection is possible (subject to space constraints, etc.) + * then we allocate a new structure, propoerly linked into the + * data structure of the original socket, and return this. + * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. 
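Note that solisten() (in uipc_socket.c above) clamps the backlog passed to listen(2) to the somaxconn limit, and the length check at the top of sonewconn() below lets the queues grow to three halves of that limit before new connections are refused. The limit is visible from userland as kern.ipc.somaxconn; a small sketch of querying it, assuming the FreeBSD sysctlbyname(3) interface is available:

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>
#include <err.h>

int
main(void)
{
	size_t len;
	int maxconn;

	/* KIPC_SOMAXCONN is exported as the kern.ipc.somaxconn MIB entry. */
	len = sizeof maxconn;
	if (sysctlbyname("kern.ipc.somaxconn", &maxconn, &len, NULL, 0) < 0)
		err(1, "sysctlbyname");
	printf("listen(2) backlogs are clamped to %d\n", maxconn);
	return (0);
}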
+ */ +struct socket * +sonewconn(head, connstatus) + register struct socket *head; + int connstatus; +{ + register struct socket *so; + + if (head->so_qlen > 3 * head->so_qlimit / 2) + return ((struct socket *)0); + so = soalloc(0); + if (so == NULL) + return ((struct socket *)0); + so->so_head = head; + so->so_type = head->so_type; + so->so_options = head->so_options &~ SO_ACCEPTCONN; + so->so_linger = head->so_linger; + so->so_state = head->so_state | SS_NOFDREF; + so->so_proto = head->so_proto; + so->so_timeo = head->so_timeo; + so->so_uid = head->so_uid; + (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat); + + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0, NULL)) { + sodealloc(so); + return ((struct socket *)0); + } + + if (connstatus) { + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + } else { + TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); + so->so_state |= SS_INCOMP; + head->so_incqlen++; + } + head->so_qlen++; + if (connstatus) { + sorwakeup(head); + wakeup((caddr_t)&head->so_timeo); + so->so_state |= connstatus; + } + return (so); +} + +/* + * Socantsendmore indicates that no more data will be sent on the + * socket; it would normally be applied to a socket when the user + * informs the system that no more data is to be sent, by the protocol + * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data + * will be received, and will normally be applied to the socket by a + * protocol when it detects that the peer will send no more data. + * Data queued for reading in the socket may yet be read. + */ + +void +socantsendmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTSENDMORE; + sowwakeup(so); +} + +void +socantrcvmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTRCVMORE; + sorwakeup(so); +} + +/* + * Wait for data to arrive at/drain from a socket buffer. + */ +int +sbwait(sb) + struct sockbuf *sb; +{ + + sb->sb_flags |= SB_WAIT; + return (tsleep((caddr_t)&sb->sb_cc, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", + sb->sb_timeo)); +} + +/* + * Lock a sockbuf already known to be locked; + * return any error returned from sleep (EINTR). + */ +int +sb_lock(sb) + register struct sockbuf *sb; +{ + int error; + + while (sb->sb_flags & SB_LOCK) { + sb->sb_flags |= SB_WANT; + error = tsleep((caddr_t)&sb->sb_flags, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, + "sblock", 0); + if (error) + return (error); + } + sb->sb_flags |= SB_LOCK; + return (0); +} + +/* + * Wakeup processes waiting on a socket buffer. + * Do asynchronous notification via SIGIO + * if the socket has the SS_ASYNC flag set. + */ +void +sowakeup(so, sb) + register struct socket *so; + register struct sockbuf *sb; +{ + selwakeup(&sb->sb_sel); + sb->sb_flags &= ~SB_SEL; + if (sb->sb_flags & SB_WAIT) { + sb->sb_flags &= ~SB_WAIT; + wakeup((caddr_t)&sb->sb_cc); + } + if ((so->so_state & SS_ASYNC) && so->so_sigio != NULL) + pgsigio(so->so_sigio, SIGIO, 0); + if (sb->sb_flags & SB_UPCALL) + (*so->so_upcall)(so, so->so_upcallarg, M_DONTWAIT); +} + +/* + * Socket buffer (struct sockbuf) utility routines. + * + * Each socket contains two socket buffers: one for sending data and + * one for receiving data. Each buffer contains a queue of mbufs, + * information about the number of mbufs and amount of data in the + * queue, and other fields allowing select() statements and notification + * on data availability to be implemented. + * + * Data stored in a socket buffer is maintained as a list of records. 
+ * Each record is a list of mbufs chained together with the m_next + * field. Records are chained together with the m_nextpkt field. The upper + * level routine soreceive() expects the following conventions to be + * observed when placing information in the receive buffer: + * + * 1. If the protocol requires each message be preceded by the sender's + * name, then a record containing that name must be present before + * any associated data (mbuf's must be of type MT_SONAME). + * 2. If the protocol supports the exchange of ``access rights'' (really + * just additional data associated with the message), and there are + * ``rights'' to be received, then a record containing this data + * should be present (mbuf's must be of type MT_RIGHTS). + * 3. If a name or rights record exists, then it must be followed by + * a data record, perhaps of zero length. + * + * Before using a new socket structure it is first necessary to reserve + * buffer space to the socket, by calling sbreserve(). This should commit + * some of the available buffer space in the system buffer pool for the + * socket (currently, it does nothing but enforce limits). The space + * should be released by calling sbrelease() when the socket is destroyed. + */ + +int +soreserve(so, sndcc, rcvcc) + register struct socket *so; + u_long sndcc, rcvcc; +{ + + if (sbreserve(&so->so_snd, sndcc) == 0) + goto bad; + if (sbreserve(&so->so_rcv, rcvcc) == 0) + goto bad2; + if (so->so_rcv.sb_lowat == 0) + so->so_rcv.sb_lowat = 1; + if (so->so_snd.sb_lowat == 0) + so->so_snd.sb_lowat = MCLBYTES; + if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) + so->so_snd.sb_lowat = so->so_snd.sb_hiwat; + return (0); +bad2: + sbrelease(&so->so_snd); +bad: + return (ENOBUFS); +} + +/* + * Allot mbufs to a sockbuf. + * Attempt to scale mbmax so that mbcnt doesn't become limiting + * if buffering efficiency is near the normal case. + */ +int +sbreserve(sb, cc) + struct sockbuf *sb; + u_long cc; +{ + if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES)) + return (0); + sb->sb_hiwat = cc; + sb->sb_mbmax = min(cc * sb_efficiency, sb_max); + if (sb->sb_lowat > sb->sb_hiwat) + sb->sb_lowat = sb->sb_hiwat; + return (1); +} + +/* + * Free mbufs held by a socket, and reserved mbuf space. + */ +void +sbrelease(sb) + struct sockbuf *sb; +{ + + sbflush(sb); + sb->sb_hiwat = sb->sb_mbmax = 0; +} + +/* + * Routines to add and remove + * data from an mbuf queue. + * + * The routines sbappend() or sbappendrecord() are normally called to + * append new mbufs to a socket buffer, after checking that adequate + * space is available, comparing the function sbspace() with the amount + * of data to be added. sbappendrecord() differs from sbappend() in + * that data supplied is treated as the beginning of a new record. + * To place a sender's address, optional access rights, and data in a + * socket receive buffer, sbappendaddr() should be used. To place + * access rights and data in a socket receive buffer, sbappendrights() + * should be used. In either case, the new data begins a new record. + * Note that unlike sbappend() and sbappendrecord(), these routines check + * for the caller that there will be enough space to store the data. + * Each fails if there is not enough space, or if it cannot find mbufs + * to store additional information in. + * + * Reliable protocols may use the socket send buffer to hold data + * awaiting acknowledgement. 
Data is normally copied from a socket + * send buffer in a protocol with m_copy for output to a peer, + * and then removing the data from the socket buffer with sbdrop() + * or sbdroprecord() when the data is acknowledged by the peer. + */ + +/* + * Append mbuf chain m to the last record in the + * socket buffer sb. The additional space associated + * the mbuf chain is recorded in sb. Empty mbufs are + * discarded and mbufs are compacted where possible. + */ +void +sbappend(sb, m) + struct sockbuf *sb; + struct mbuf *m; +{ + register struct mbuf *n; + + if (m == 0) + return; + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + do { + if (n->m_flags & M_EOR) { + sbappendrecord(sb, m); /* XXXXXX!!!! */ + return; + } + } while (n->m_next && (n = n->m_next)); + } + sbcompress(sb, m, n); +} + +#ifdef SOCKBUF_DEBUG +void +sbcheck(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + register struct mbuf *n = 0; + register u_long len = 0, mbcnt = 0; + + for (m = sb->sb_mb; m; m = n) { + n = m->m_nextpkt; + for (; m; m = m->m_next) { + len += m->m_len; + mbcnt += MSIZE; + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ + mbcnt += m->m_ext.ext_size; + } + } + if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { + printf("cc %ld != %ld || mbcnt %ld != %ld\n", len, sb->sb_cc, + mbcnt, sb->sb_mbcnt); + panic("sbcheck"); + } +} +#endif + +/* + * As above, except the mbuf chain + * begins a new record. + */ +void +sbappendrecord(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + + if (m0 == 0) + return; + m = sb->sb_mb; + if (m) + while (m->m_nextpkt) + m = m->m_nextpkt; + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + if (m) + m->m_nextpkt = m0; + else + sb->sb_mb = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * As above except that OOB data + * is inserted at the beginning of the sockbuf, + * but after any other OOB data. + */ +void +sbinsertoob(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + register struct mbuf **mp; + + if (m0 == 0) + return; + for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { + m = *mp; + again: + switch (m->m_type) { + + case MT_OOBDATA: + continue; /* WANT next train */ + + case MT_CONTROL: + m = m->m_next; + if (m) + goto again; /* inspect THIS train further */ + } + break; + } + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + m0->m_nextpkt = *mp; + *mp = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * Append address and data, and optionally, control (ancillary) data + * to the receive queue of a socket. If present, + * m0 must include a packet header with total length. + * Returns 0 if no space in sockbuf or insufficient mbufs. 
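The record built here (an MT_SONAME mbuf, optional control mbufs, then the data) is what soreceive() later unpacks into the msg_name, msg_control and iovec fields of a recvmsg(2) call. A sketch of the consuming side for a bound datagram socket; the port is illustrative.

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <string.h>
#include <stdio.h>
#include <err.h>

int
main(void)
{
	struct sockaddr_in from, sin;
	struct msghdr msg;
	struct iovec iov;
	char buf[2048];
	ssize_t n;
	int s;

	if ((s = socket(AF_INET, SOCK_DGRAM, 0)) < 0)
		err(1, "socket");
	memset(&sin, 0, sizeof sin);
	sin.sin_family = AF_INET;
	sin.sin_port = htons(9999);		/* illustrative port */
	sin.sin_addr.s_addr = htonl(INADDR_ANY);
	if (bind(s, (struct sockaddr *)&sin, sizeof sin) < 0)
		err(1, "bind");

	iov.iov_base = buf;
	iov.iov_len = sizeof buf;
	memset(&msg, 0, sizeof msg);
	msg.msg_name = &from;			/* filled from the MT_SONAME record */
	msg.msg_namelen = sizeof from;
	msg.msg_iov = &iov;			/* data mbufs are copied out here */
	msg.msg_iovlen = 1;

	if ((n = recvmsg(s, &msg, 0)) < 0)
		err(1, "recvmsg");
	printf("%ld bytes from %s\n", (long)n, inet_ntoa(from.sin_addr));
	return (0);
}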
+ */ +int +sbappendaddr(sb, asa, m0, control) + register struct sockbuf *sb; + struct sockaddr *asa; + struct mbuf *m0, *control; +{ + register struct mbuf *m, *n; + int space = asa->sa_len; + +if (m0 && (m0->m_flags & M_PKTHDR) == 0) +panic("sbappendaddr"); + if (m0) + space += m0->m_pkthdr.len; + for (n = control; n; n = n->m_next) { + space += n->m_len; + if (n->m_next == 0) /* keep pointer to last control buf */ + break; + } + if (space > sbspace(sb)) + return (0); + if (asa->sa_len > MLEN) + return (0); + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == 0) + return (0); + m->m_len = asa->sa_len; + bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len); + if (n) + n->m_next = m0; /* concatenate data to control */ + else + control = m0; + m->m_next = control; + for (n = m; n; n = n->m_next) + sballoc(sb, n); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = m; + } else + sb->sb_mb = m; + return (1); +} + +int +sbappendcontrol(sb, m0, control) + struct sockbuf *sb; + struct mbuf *control, *m0; +{ + register struct mbuf *m, *n; + int space = 0; + + if (control == 0) + panic("sbappendcontrol"); + for (m = control; ; m = m->m_next) { + space += m->m_len; + if (m->m_next == 0) + break; + } + n = m; /* save pointer to last control buffer */ + for (m = m0; m; m = m->m_next) + space += m->m_len; + if (space > sbspace(sb)) + return (0); + n->m_next = m0; /* concatenate data to control */ + for (m = control; m; m = m->m_next) + sballoc(sb, m); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = control; + } else + sb->sb_mb = control; + return (1); +} + +/* + * Compress mbuf chain m into the socket + * buffer sb following mbuf n. If n + * is null, the buffer is presumed empty. + */ +void +sbcompress(sb, m, n) + register struct sockbuf *sb; + register struct mbuf *m, *n; +{ + register int eor = 0; + register struct mbuf *o; + + while (m) { + eor |= m->m_flags & M_EOR; + if (m->m_len == 0 && + (eor == 0 || + (((o = m->m_next) || (o = n)) && + o->m_type == m->m_type))) { + m = m_free(m); + continue; + } + if (n && (n->m_flags & (M_EXT | M_EOR)) == 0 && + (n->m_data + n->m_len + m->m_len) < &n->m_dat[MLEN] && + n->m_type == m->m_type) { + bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, + (unsigned)m->m_len); + n->m_len += m->m_len; + sb->sb_cc += m->m_len; + m = m_free(m); + continue; + } + if (n) + n->m_next = m; + else + sb->sb_mb = m; + sballoc(sb, m); + n = m; + m->m_flags &= ~M_EOR; + m = m->m_next; + n->m_next = 0; + } + if (eor) { + if (n) + n->m_flags |= eor; + else + printf("semi-panic: sbcompress\n"); + } +} + +/* + * Free all mbufs in a sockbuf. + * Check that all resources are reclaimed. + */ +void +sbflush(sb) + register struct sockbuf *sb; +{ + + if (sb->sb_flags & SB_LOCK) + panic("sbflush: locked"); + while (sb->sb_mbcnt && sb->sb_cc) + sbdrop(sb, (int)sb->sb_cc); + if (sb->sb_cc || sb->sb_mb || sb->sb_mbcnt) + panic("sbflush: cc %ld || mb %p || mbcnt %ld", sb->sb_cc, (void *)sb->sb_mb, sb->sb_mbcnt); +} + +/* + * Drop data from (the front of) a sockbuf. + */ +void +sbdrop(sb, len) + register struct sockbuf *sb; + register int len; +{ + register struct mbuf *m, *mn; + struct mbuf *next; + + next = (m = sb->sb_mb) ? 
m->m_nextpkt : 0; + while (len > 0) { + if (m == 0) { + if (next == 0) + panic("sbdrop"); + m = next; + next = m->m_nextpkt; + continue; + } + if (m->m_len > len) { + m->m_len -= len; + m->m_data += len; + sb->sb_cc -= len; + break; + } + len -= m->m_len; + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + while (m && m->m_len == 0) { + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + if (m) { + sb->sb_mb = m; + m->m_nextpkt = next; + } else + sb->sb_mb = next; +} + +/* + * Drop a record off the front of a sockbuf + * and move the next record to the front. + */ +void +sbdroprecord(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m, *mn; + + m = sb->sb_mb; + if (m) { + sb->sb_mb = m->m_nextpkt; + do { + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } while (m); + } +} + +/* + * Create a "control" mbuf containing the specified data + * with the specified type for presentation on a socket buffer. + */ +struct mbuf * +sbcreatecontrol(p, size, type, level) + caddr_t p; + register int size; + int type, level; +{ + register struct cmsghdr *cp; + struct mbuf *m; + + if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + return ((struct mbuf *) NULL); + cp = mtod(m, struct cmsghdr *); + /* XXX check size? */ + (void)memcpy(CMSG_DATA(cp), p, size); + size += sizeof(*cp); + m->m_len = size; + cp->cmsg_len = size; + cp->cmsg_level = level; + cp->cmsg_type = type; + return (m); +} + +/* + * Some routines that return EOPNOTSUPP for entry points that are not + * supported by a protocol. Fill in as needed. + */ +int +pru_accept_notsupp(struct socket *so, struct sockaddr **nam) +{ + return EOPNOTSUPP; +} + +int +pru_connect_notsupp(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + return EOPNOTSUPP; +} + +int +pru_connect2_notsupp(struct socket *so1, struct socket *so2) +{ + return EOPNOTSUPP; +} + +int +pru_control_notsupp(struct socket *so, u_long cmd, caddr_t data, + struct ifnet *ifp, struct proc *p) +{ + return EOPNOTSUPP; +} + +int +pru_listen_notsupp(struct socket *so, struct proc *p) +{ + return EOPNOTSUPP; +} + +int +pru_rcvd_notsupp(struct socket *so, int flags) +{ + return EOPNOTSUPP; +} + +int +pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) +{ + return EOPNOTSUPP; +} + +/* + * This isn't really a ``null'' operation, but it's the default one + * and doesn't do anything destructive. + */ +int +pru_sense_null(struct socket *so, struct stat *sb) +{ + sb->st_blksize = so->so_snd.sb_hiwat; + return 0; +} + +/* + * Make a copy of a sockaddr in a malloced buffer of type M_SONAME. + */ +struct sockaddr * +dup_sockaddr(sa, canwait) + struct sockaddr *sa; + int canwait; +{ + struct sockaddr *sa2; + + MALLOC(sa2, struct sockaddr *, sa->sa_len, M_SONAME, + canwait ? M_WAITOK : M_NOWAIT); + if (sa2) + bcopy(sa, sa2, sa->sa_len); + return sa2; +} + +/* + * Create an external-format (``xsocket'') structure using the information + * in the kernel-format socket structure pointed to by so. This is done + * to reduce the spew of irrelevant information over this interface, + * to isolate user code from changes in the kernel structure, and + * potentially to provide information-hiding if we decide that + * some of this information should be hidden from users. 
+ */ +void +sotoxsocket(struct socket *so, struct xsocket *xso) +{ + xso->xso_len = sizeof *xso; + xso->xso_so = so; + xso->so_type = so->so_type; + xso->so_options = so->so_options; + xso->so_linger = so->so_linger; + xso->so_state = so->so_state; + xso->so_pcb = so->so_pcb; + xso->xso_protocol = so->so_proto->pr_protocol; + xso->xso_family = so->so_proto->pr_domain->dom_family; + xso->so_qlen = so->so_qlen; + xso->so_incqlen = so->so_incqlen; + xso->so_qlimit = so->so_qlimit; + xso->so_timeo = so->so_timeo; + xso->so_error = so->so_error; + xso->so_pgid = so->so_sigio ? so->so_sigio->sio_pgid : 0; + xso->so_oobmark = so->so_oobmark; + sbtoxsockbuf(&so->so_snd, &xso->so_snd); + sbtoxsockbuf(&so->so_rcv, &xso->so_rcv); + xso->so_uid = so->so_uid; +} + +/* + * This does the same for sockbufs. Note that the xsockbuf structure, + * since it is always embedded in a socket, does not include a self + * pointer nor a length. We make this entry point public in case + * some other mechanism needs it. + */ +void +sbtoxsockbuf(struct sockbuf *sb, struct xsockbuf *xsb) +{ + xsb->sb_cc = sb->sb_cc; + xsb->sb_hiwat = sb->sb_hiwat; + xsb->sb_mbcnt = sb->sb_mbcnt; + xsb->sb_mbmax = sb->sb_mbmax; + xsb->sb_lowat = sb->sb_lowat; + xsb->sb_flags = sb->sb_flags; + xsb->sb_timeo = sb->sb_timeo; +} + +/* + * Here is the definition of some of the basic objects in the kern.ipc + * branch of the MIB. + */ +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); + +/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ +static int dummy; +SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); + +SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, &sb_max, 0, ""); +SYSCTL_INT(_kern_ipc, OID_AUTO, maxsockets, CTLFLAG_RD, &maxsockets, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, + &sb_efficiency, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_NMBCLUSTERS, nmbclusters, CTLFLAG_RD, &nmbclusters, 0, ""); + diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c new file mode 100644 index 0000000..bd5149f --- /dev/null +++ b/sys/kern/uipc_syscalls.c @@ -0,0 +1,1701 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * sendfile(2) and related extensions: + * Copyright (c) 1998, David Greenman. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 + * $Id: uipc_syscalls.c,v 1.50 1999/01/21 08:29:04 dillon Exp $ + */ + +#include "opt_compat.h" +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysproto.h> +#include <sys/malloc.h> +#include <sys/filedesc.h> +#include <sys/proc.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/uio.h> +#include <sys/vnode.h> +#include <sys/lock.h> +#include <sys/mount.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vm_pageout.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <machine/limits.h> + +static void sf_buf_init(void *arg); +SYSINIT(sock_sf, SI_SUB_MBUF, SI_ORDER_ANY, sf_buf_init, NULL) +static struct sf_buf *sf_buf_alloc(void); +static void sf_buf_ref(caddr_t addr, u_int size); +static void sf_buf_free(caddr_t addr, u_int size); + +static int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags)); +static int recvit __P((struct proc *p, int s, struct msghdr *mp, + caddr_t namelenp)); + +static int accept1 __P((struct proc *p, struct accept_args *uap, int compat)); +static int getsockname1 __P((struct proc *p, struct getsockname_args *uap, + int compat)); +static int getpeername1 __P((struct proc *p, struct getpeername_args *uap, + int compat)); + +static SLIST_HEAD(, sf_buf) sf_freelist; +static vm_offset_t sf_base; +static struct sf_buf *sf_bufs; +static int sf_buf_alloc_want; + +/* + * System call interface to the socket abstraction. 
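One consequence of this layer worth illustrating: connect() below returns EINPROGRESS rather than sleeping when the socket is non-blocking, and completion is then observed through sopoll() and the SO_ERROR option in uipc_socket.c. A userland sketch of that pattern follows; the peer address is purely illustrative.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <poll.h>
#include <fcntl.h>
#include <string.h>
#include <errno.h>
#include <err.h>

int
main(void)
{
	struct sockaddr_in sin;
	struct pollfd pfd;
	socklen_t len;
	int s, soerr;

	if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0)
		err(1, "socket");
	/* O_NONBLOCK sets SS_NBIO on the socket. */
	if (fcntl(s, F_SETFL, fcntl(s, F_GETFL, 0) | O_NONBLOCK) < 0)
		err(1, "fcntl");

	memset(&sin, 0, sizeof sin);
	sin.sin_family = AF_INET;
	sin.sin_port = htons(80);			/* illustrative peer */
	sin.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
	if (connect(s, (struct sockaddr *)&sin, sizeof sin) < 0 &&
	    errno != EINPROGRESS)
		err(1, "connect");

	pfd.fd = s;
	pfd.events = POLLOUT;			/* sopoll() reports writability */
	if (poll(&pfd, 1, 5000) != 1)
		errx(1, "connect timed out");

	len = sizeof soerr;
	if (getsockopt(s, SOL_SOCKET, SO_ERROR, &soerr, &len) < 0)
		err(1, "getsockopt");
	if (soerr != 0)
		errx(1, "connect failed with error %d", soerr);
	return (0);
}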
+ */ +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#define COMPAT_OLDSOCK +#endif + +extern struct fileops socketops; + +int +socket(p, uap) + struct proc *p; + register struct socket_args /* { + int domain; + int type; + int protocol; + } */ *uap; +{ + struct filedesc *fdp = p->p_fd; + struct socket *so; + struct file *fp; + int fd, error; + + error = falloc(p, &fp, &fd); + if (error) + return (error); + fp->f_flag = FREAD|FWRITE; + fp->f_type = DTYPE_SOCKET; + fp->f_ops = &socketops; + error = socreate(uap->domain, &so, uap->type, uap->protocol, p); + if (error) { + fdp->fd_ofiles[fd] = 0; + ffree(fp); + } else { + fp->f_data = (caddr_t)so; + p->p_retval[0] = fd; + } + return (error); +} + +/* ARGSUSED */ +int +bind(p, uap) + struct proc *p; + register struct bind_args /* { + int s; + caddr_t name; + int namelen; + } */ *uap; +{ + struct file *fp; + struct sockaddr *sa; + int error; + + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + error = getsockaddr(&sa, uap->name, uap->namelen); + if (error) + return (error); + error = sobind((struct socket *)fp->f_data, sa, p); + FREE(sa, M_SONAME); + return (error); +} + +/* ARGSUSED */ +int +listen(p, uap) + struct proc *p; + register struct listen_args /* { + int s; + int backlog; + } */ *uap; +{ + struct file *fp; + int error; + + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + return (solisten((struct socket *)fp->f_data, uap->backlog, p)); +} + +static int +accept1(p, uap, compat) + struct proc *p; + register struct accept_args /* { + int s; + caddr_t name; + int *anamelen; + } */ *uap; + int compat; +{ + struct file *fp; + struct sockaddr *sa; + int namelen, error, s; + struct socket *head, *so; + int fd; + short fflag; /* type must match fp->f_flag */ + + if (uap->name) { + error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen, + sizeof (namelen)); + if(error) + return (error); + } + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + s = splnet(); + head = (struct socket *)fp->f_data; + if ((head->so_options & SO_ACCEPTCONN) == 0) { + splx(s); + return (EINVAL); + } + if ((head->so_state & SS_NBIO) && head->so_comp.tqh_first == NULL) { + splx(s); + return (EWOULDBLOCK); + } + while (head->so_comp.tqh_first == NULL && head->so_error == 0) { + if (head->so_state & SS_CANTRCVMORE) { + head->so_error = ECONNABORTED; + break; + } + error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH, + "accept", 0); + if (error) { + splx(s); + return (error); + } + } + if (head->so_error) { + error = head->so_error; + head->so_error = 0; + splx(s); + return (error); + } + + /* + * At this point we know that there is at least one connection + * ready to be accepted. Remove it from the queue prior to + * allocating the file descriptor for it since falloc() may + * block allowing another process to accept the connection + * instead. + */ + so = head->so_comp.tqh_first; + TAILQ_REMOVE(&head->so_comp, so, so_list); + head->so_qlen--; + + fflag = fp->f_flag; + error = falloc(p, &fp, &fd); + if (error) { + /* + * Probably ran out of file descriptors. Put the + * unaccepted connection back onto the queue and + * do another wakeup so some other process might + * have a chance at it. 
+ */ + TAILQ_INSERT_HEAD(&head->so_comp, so, so_list); + head->so_qlen++; + wakeup_one(&head->so_timeo); + splx(s); + return (error); + } else + p->p_retval[0] = fd; + + so->so_state &= ~SS_COMP; + so->so_head = NULL; + if (head->so_sigio != NULL) + fsetown(fgetown(head->so_sigio), &so->so_sigio); + + fp->f_type = DTYPE_SOCKET; + fp->f_flag = fflag; + fp->f_ops = &socketops; + fp->f_data = (caddr_t)so; + sa = 0; + (void) soaccept(so, &sa); + if (sa == 0) { + namelen = 0; + if (uap->name) + goto gotnoname; + return 0; + } + if (uap->name) { + /* check sa_len before it is destroyed */ + if (namelen > sa->sa_len) + namelen = sa->sa_len; +#ifdef COMPAT_OLDSOCK + if (compat) + ((struct osockaddr *)sa)->sa_family = + sa->sa_family; +#endif + error = copyout(sa, (caddr_t)uap->name, (u_int)namelen); + if (!error) +gotnoname: + error = copyout((caddr_t)&namelen, + (caddr_t)uap->anamelen, sizeof (*uap->anamelen)); + } + FREE(sa, M_SONAME); + splx(s); + return (error); +} + +int +accept(p, uap) + struct proc *p; + struct accept_args *uap; +{ + + return (accept1(p, uap, 0)); +} + +#ifdef COMPAT_OLDSOCK +int +oaccept(p, uap) + struct proc *p; + struct accept_args *uap; +{ + + return (accept1(p, uap, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +/* ARGSUSED */ +int +connect(p, uap) + struct proc *p; + register struct connect_args /* { + int s; + caddr_t name; + int namelen; + } */ *uap; +{ + struct file *fp; + register struct socket *so; + struct sockaddr *sa; + int error, s; + + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + so = (struct socket *)fp->f_data; + if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) + return (EALREADY); + error = getsockaddr(&sa, uap->name, uap->namelen); + if (error) + return (error); + error = soconnect(so, sa, p); + if (error) + goto bad; + if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) { + FREE(sa, M_SONAME); + return (EINPROGRESS); + } + s = splnet(); + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { + error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, + "connec", 0); + if (error) + break; + } + if (error == 0) { + error = so->so_error; + so->so_error = 0; + } + splx(s); +bad: + so->so_state &= ~SS_ISCONNECTING; + FREE(sa, M_SONAME); + if (error == ERESTART) + error = EINTR; + return (error); +} + +int +socketpair(p, uap) + struct proc *p; + register struct socketpair_args /* { + int domain; + int type; + int protocol; + int *rsv; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + struct file *fp1, *fp2; + struct socket *so1, *so2; + int fd, error, sv[2]; + + error = socreate(uap->domain, &so1, uap->type, uap->protocol, p); + if (error) + return (error); + error = socreate(uap->domain, &so2, uap->type, uap->protocol, p); + if (error) + goto free1; + error = falloc(p, &fp1, &fd); + if (error) + goto free2; + sv[0] = fd; + fp1->f_flag = FREAD|FWRITE; + fp1->f_type = DTYPE_SOCKET; + fp1->f_ops = &socketops; + fp1->f_data = (caddr_t)so1; + error = falloc(p, &fp2, &fd); + if (error) + goto free3; + fp2->f_flag = FREAD|FWRITE; + fp2->f_type = DTYPE_SOCKET; + fp2->f_ops = &socketops; + fp2->f_data = (caddr_t)so2; + sv[1] = fd; + error = soconnect2(so1, so2); + if (error) + goto free4; + if (uap->type == SOCK_DGRAM) { + /* + * Datagram socket connection is asymmetric. 
+ */ + error = soconnect2(so2, so1); + if (error) + goto free4; + } + error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int)); + return (error); +free4: + ffree(fp2); + fdp->fd_ofiles[sv[1]] = 0; +free3: + ffree(fp1); + fdp->fd_ofiles[sv[0]] = 0; +free2: + (void)soclose(so2); +free1: + (void)soclose(so1); + return (error); +} + +static int +sendit(p, s, mp, flags) + register struct proc *p; + int s; + register struct msghdr *mp; + int flags; +{ + struct file *fp; + struct uio auio; + register struct iovec *iov; + register int i; + struct mbuf *control; + struct sockaddr *to; + int len, error; + struct socket *so; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + error = getsock(p->p_fd, s, &fp); + if (error) + return (error); + auio.uio_iov = mp->msg_iov; + auio.uio_iovcnt = mp->msg_iovlen; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_rw = UIO_WRITE; + auio.uio_procp = p; + auio.uio_offset = 0; /* XXX */ + auio.uio_resid = 0; + iov = mp->msg_iov; + for (i = 0; i < mp->msg_iovlen; i++, iov++) { + if ((auio.uio_resid += iov->iov_len) < 0) + return (EINVAL); + } + if (mp->msg_name) { + error = getsockaddr(&to, mp->msg_name, mp->msg_namelen); + if (error) + return (error); + } else + to = 0; + if (mp->msg_control) { + if (mp->msg_controllen < sizeof(struct cmsghdr) +#ifdef COMPAT_OLDSOCK + && mp->msg_flags != MSG_COMPAT +#endif + ) { + error = EINVAL; + goto bad; + } + error = sockargs(&control, mp->msg_control, + mp->msg_controllen, MT_CONTROL); + if (error) + goto bad; +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags == MSG_COMPAT) { + register struct cmsghdr *cm; + + M_PREPEND(control, sizeof(*cm), M_WAIT); + if (control == 0) { + error = ENOBUFS; + goto bad; + } else { + cm = mtod(control, struct cmsghdr *); + cm->cmsg_len = control->m_len; + cm->cmsg_level = SOL_SOCKET; + cm->cmsg_type = SCM_RIGHTS; + } + } +#endif + } else + control = 0; +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO)) { + int iovlen = auio.uio_iovcnt * sizeof (struct iovec); + + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + len = auio.uio_resid; + so = (struct socket *)fp->f_data; + error = so->so_proto->pr_usrreqs->pru_sosend(so, to, &auio, 0, control, + flags, p); + if (error) { + if (auio.uio_resid != len && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + if (error == EPIPE) + psignal(p, SIGPIPE); + } + if (error == 0) + p->p_retval[0] = len - auio.uio_resid; +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, s, UIO_WRITE, + ktriov, p->p_retval[0], error); + FREE(ktriov, M_TEMP); + } +#endif +bad: + if (to) + FREE(to, M_SONAME); + return (error); +} + +int +sendto(p, uap) + struct proc *p; + register struct sendto_args /* { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t to; + int tolen; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = uap->to; + msg.msg_namelen = uap->tolen; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + msg.msg_control = 0; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + return (sendit(p, uap->s, &msg, uap->flags)); +} + +#ifdef COMPAT_OLDSOCK +int +osend(p, uap) + struct proc *p; + register struct osend_args /* { + int s; + caddr_t buf; + int len; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + 
aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = 0; + return (sendit(p, uap->s, &msg, uap->flags)); +} + +int +osendmsg(p, uap) + struct proc *p; + register struct osendmsg_args /* { + int s; + caddr_t msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr)); + if (error) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + msg.msg_flags = MSG_COMPAT; + msg.msg_iov = iov; + error = sendit(p, uap->s, &msg, uap->flags); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} +#endif + +int +sendmsg(p, uap) + struct proc *p; + register struct sendmsg_args /* { + int s; + caddr_t msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg)); + if (error) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + if (msg.msg_iovlen && + (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) + goto done; + msg.msg_iov = iov; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + error = sendit(p, uap->s, &msg, uap->flags); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} + +static int +recvit(p, s, mp, namelenp) + register struct proc *p; + int s; + register struct msghdr *mp; + caddr_t namelenp; +{ + struct file *fp; + struct uio auio; + register struct iovec *iov; + register int i; + int len, error; + struct mbuf *m, *control = 0; + caddr_t ctlbuf; + struct socket *so; + struct sockaddr *fromsa = 0; +#ifdef KTRACE + struct iovec *ktriov = NULL; +#endif + + error = getsock(p->p_fd, s, &fp); + if (error) + return (error); + auio.uio_iov = mp->msg_iov; + auio.uio_iovcnt = mp->msg_iovlen; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_rw = UIO_READ; + auio.uio_procp = p; + auio.uio_offset = 0; /* XXX */ + auio.uio_resid = 0; + iov = mp->msg_iov; + for (i = 0; i < mp->msg_iovlen; i++, iov++) { + if ((auio.uio_resid += iov->iov_len) < 0) + return (EINVAL); + } +#ifdef KTRACE + if (KTRPOINT(p, KTR_GENIO)) { + int iovlen = auio.uio_iovcnt * sizeof (struct iovec); + + MALLOC(ktriov, struct iovec *, iovlen, M_TEMP, M_WAITOK); + bcopy((caddr_t)auio.uio_iov, (caddr_t)ktriov, iovlen); + } +#endif + len = auio.uio_resid; + so = (struct socket *)fp->f_data; + error = so->so_proto->pr_usrreqs->pru_soreceive(so, &fromsa, &auio, + (struct mbuf **)0, mp->msg_control ? 
&control : (struct mbuf **)0, + &mp->msg_flags); + if (error) { + if (auio.uio_resid != len && (error == ERESTART || + error == EINTR || error == EWOULDBLOCK)) + error = 0; + } +#ifdef KTRACE + if (ktriov != NULL) { + if (error == 0) + ktrgenio(p->p_tracep, s, UIO_READ, + ktriov, len - auio.uio_resid, error); + FREE(ktriov, M_TEMP); + } +#endif + if (error) + goto out; + p->p_retval[0] = len - auio.uio_resid; + if (mp->msg_name) { + len = mp->msg_namelen; + if (len <= 0 || fromsa == 0) + len = 0; + else { +#ifndef MIN +#define MIN(a,b) ((a)>(b)?(b):(a)) +#endif + /* save sa_len before it is destroyed by MSG_COMPAT */ + len = MIN(len, fromsa->sa_len); +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags & MSG_COMPAT) + ((struct osockaddr *)fromsa)->sa_family = + fromsa->sa_family; +#endif + error = copyout(fromsa, + (caddr_t)mp->msg_name, (unsigned)len); + if (error) + goto out; + } + mp->msg_namelen = len; + if (namelenp && + (error = copyout((caddr_t)&len, namelenp, sizeof (int)))) { +#ifdef COMPAT_OLDSOCK + if (mp->msg_flags & MSG_COMPAT) + error = 0; /* old recvfrom didn't check */ + else +#endif + goto out; + } + } + if (mp->msg_control) { +#ifdef COMPAT_OLDSOCK + /* + * We assume that old recvmsg calls won't receive access + * rights and other control info, esp. as control info + * is always optional and those options didn't exist in 4.3. + * If we receive rights, trim the cmsghdr; anything else + * is tossed. + */ + if (control && mp->msg_flags & MSG_COMPAT) { + if (mtod(control, struct cmsghdr *)->cmsg_level != + SOL_SOCKET || + mtod(control, struct cmsghdr *)->cmsg_type != + SCM_RIGHTS) { + mp->msg_controllen = 0; + goto out; + } + control->m_len -= sizeof (struct cmsghdr); + control->m_data += sizeof (struct cmsghdr); + } +#endif + len = mp->msg_controllen; + m = control; + mp->msg_controllen = 0; + ctlbuf = (caddr_t) mp->msg_control; + + while (m && len > 0) { + unsigned int tocopy; + + if (len >= m->m_len) + tocopy = m->m_len; + else { + mp->msg_flags |= MSG_CTRUNC; + tocopy = len; + } + + if (error = copyout((caddr_t)mtod(m, caddr_t), + ctlbuf, tocopy)) + goto out; + + ctlbuf += tocopy; + len -= tocopy; + m = m->m_next; + } + mp->msg_controllen = ctlbuf - mp->msg_control; + } +out: + if (fromsa) + FREE(fromsa, M_SONAME); + if (control) + m_freem(control); + return (error); +} + +int +recvfrom(p, uap) + struct proc *p; + register struct recvfrom_args /* { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t from; + int *fromlenaddr; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + if (uap->fromlenaddr) { + error = copyin((caddr_t)uap->fromlenaddr, + (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen)); + if (error) + return (error); + } else + msg.msg_namelen = 0; + msg.msg_name = uap->from; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = uap->flags; + return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr)); +} + +#ifdef COMPAT_OLDSOCK +int +orecvfrom(p, uap) + struct proc *p; + struct recvfrom_args *uap; +{ + + uap->flags |= MSG_COMPAT; + return (recvfrom(p, uap)); +} +#endif + + +#ifdef COMPAT_OLDSOCK +int +orecv(p, uap) + struct proc *p; + register struct orecv_args /* { + int s; + caddr_t buf; + int len; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = 
uap->flags; + return (recvit(p, uap->s, &msg, (caddr_t)0)); +} + +/* + * Old recvmsg. This code takes advantage of the fact that the old msghdr + * overlays the new one, missing only the flags, and with the (old) access + * rights where the control fields are now. + */ +int +orecvmsg(p, uap) + struct proc *p; + register struct orecvmsg_args /* { + int s; + struct omsghdr *msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + error = copyin((caddr_t)uap->msg, (caddr_t)&msg, + sizeof (struct omsghdr)); + if (error) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + msg.msg_flags = uap->flags | MSG_COMPAT; + error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + msg.msg_iov = iov; + error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen); + + if (msg.msg_controllen && error == 0) + error = copyout((caddr_t)&msg.msg_controllen, + (caddr_t)&uap->msg->msg_accrightslen, sizeof (int)); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} +#endif + +int +recvmsg(p, uap) + struct proc *p; + register struct recvmsg_args /* { + int s; + struct msghdr *msg; + int flags; + } */ *uap; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; + register int error; + + error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg)); + if (error) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = uap->flags &~ MSG_COMPAT; +#else + msg.msg_flags = uap->flags; +#endif + uiov = msg.msg_iov; + msg.msg_iov = iov; + error = copyin((caddr_t)uiov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + error = recvit(p, uap->s, &msg, (caddr_t)0); + if (!error) { + msg.msg_iov = uiov; + error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg)); + } +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} + +/* ARGSUSED */ +int +shutdown(p, uap) + struct proc *p; + register struct shutdown_args /* { + int s; + int how; + } */ *uap; +{ + struct file *fp; + int error; + + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + return (soshutdown((struct socket *)fp->f_data, uap->how)); +} + +/* ARGSUSED */ +int +setsockopt(p, uap) + struct proc *p; + register struct setsockopt_args /* { + int s; + int level; + int name; + caddr_t val; + int valsize; + } */ *uap; +{ + struct file *fp; + struct sockopt sopt; + int error; + + if (uap->val == 0 && uap->valsize != 0) + return (EFAULT); + if (uap->valsize < 0) + return (EINVAL); + + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + + sopt.sopt_dir = SOPT_SET; + sopt.sopt_level = uap->level; + sopt.sopt_name = uap->name; + sopt.sopt_val = uap->val; + sopt.sopt_valsize = uap->valsize; + sopt.sopt_p = p; + + return (sosetopt((struct socket *)fp->f_data, &sopt)); +} + +/* ARGSUSED */ +int +getsockopt(p, uap) + struct proc *p; + register struct getsockopt_args /* { + int s; + int level; + int name; + caddr_t val; + int *avalsize; + } */ *uap; +{ + int valsize, error; + struct file 
*fp; + struct sockopt sopt; + + error = getsock(p->p_fd, uap->s, &fp); + if (error) + return (error); + if (uap->val) { + error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize, + sizeof (valsize)); + if (error) + return (error); + if (valsize < 0) + return (EINVAL); + } else + valsize = 0; + + sopt.sopt_dir = SOPT_GET; + sopt.sopt_level = uap->level; + sopt.sopt_name = uap->name; + sopt.sopt_val = uap->val; + sopt.sopt_valsize = (size_t)valsize; /* checked non-negative above */ + sopt.sopt_p = p; + + error = sogetopt((struct socket *)fp->f_data, &sopt); + if (error == 0) { + valsize = sopt.sopt_valsize; + error = copyout((caddr_t)&valsize, + (caddr_t)uap->avalsize, sizeof (valsize)); + } + return (error); +} + +/* + * Get socket name. + */ +/* ARGSUSED */ +static int +getsockname1(p, uap, compat) + struct proc *p; + register struct getsockname_args /* { + int fdes; + caddr_t asa; + int *alen; + } */ *uap; + int compat; +{ + struct file *fp; + register struct socket *so; + struct sockaddr *sa; + int len, error; + + error = getsock(p->p_fd, uap->fdes, &fp); + if (error) + return (error); + error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); + if (error) + return (error); + so = (struct socket *)fp->f_data; + sa = 0; + error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, &sa); + if (error) + goto bad; + if (sa == 0) { + len = 0; + goto gotnothing; + } + + len = MIN(len, sa->sa_len); +#ifdef COMPAT_OLDSOCK + if (compat) + ((struct osockaddr *)sa)->sa_family = sa->sa_family; +#endif + error = copyout(sa, (caddr_t)uap->asa, (u_int)len); + if (error == 0) +gotnothing: + error = copyout((caddr_t)&len, (caddr_t)uap->alen, + sizeof (len)); +bad: + if (sa) + FREE(sa, M_SONAME); + return (error); +} + +int +getsockname(p, uap) + struct proc *p; + struct getsockname_args *uap; +{ + + return (getsockname1(p, uap, 0)); +} + +#ifdef COMPAT_OLDSOCK +int +ogetsockname(p, uap) + struct proc *p; + struct getsockname_args *uap; +{ + + return (getsockname1(p, uap, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +/* + * Get name of peer for connected socket. + */ +/* ARGSUSED */ +static int +getpeername1(p, uap, compat) + struct proc *p; + register struct getpeername_args /* { + int fdes; + caddr_t asa; + int *alen; + } */ *uap; + int compat; +{ + struct file *fp; + register struct socket *so; + struct sockaddr *sa; + int len, error; + + error = getsock(p->p_fd, uap->fdes, &fp); + if (error) + return (error); + so = (struct socket *)fp->f_data; + if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) + return (ENOTCONN); + error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); + if (error) + return (error); + sa = 0; + error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, &sa); + if (error) + goto bad; + if (sa == 0) { + len = 0; + goto gotnothing; + } + len = MIN(len, sa->sa_len); +#ifdef COMPAT_OLDSOCK + if (compat) + ((struct osockaddr *)sa)->sa_family = + sa->sa_family; +#endif + error = copyout(sa, (caddr_t)uap->asa, (u_int)len); + if (error) + goto bad; +gotnothing: + error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); +bad: + if (sa) FREE(sa, M_SONAME); + return (error); +} + +int +getpeername(p, uap) + struct proc *p; + struct getpeername_args *uap; +{ + + return (getpeername1(p, uap, 0)); +} + +#ifdef COMPAT_OLDSOCK +int +ogetpeername(p, uap) + struct proc *p; + struct ogetpeername_args *uap; +{ + + /* XXX uap should have type `getpeername_args *' to begin with. 
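Both getsockname1() and getpeername1() above treat the length argument as value-result: the caller passes the buffer size in, the kernel truncates to MIN(len, sa->sa_len) and writes the actual length back. A typical userland call, using sockaddr_storage so any address family fits (function and variable names are illustrative):

#include <sys/types.h>
#include <sys/socket.h>
#include <stdio.h>

/* Print the local and remote address lengths of a connected socket. */
static int
show_names(int s)
{
	struct sockaddr_storage local, remote;
	socklen_t llen = sizeof(local), rlen = sizeof(remote);

	if (getsockname(s, (struct sockaddr *)&local, &llen) < 0)
		return (-1);
	if (getpeername(s, (struct sockaddr *)&remote, &rlen) < 0)
		return (-1);	/* ENOTCONN if the socket is not connected */
	printf("local %u bytes (family %d), peer %u bytes (family %d)\n",
	    (unsigned)llen, local.ss_family, (unsigned)rlen, remote.ss_family);
	return (0);
}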
*/ + return (getpeername1(p, (struct getpeername_args *)uap, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +int +sockargs(mp, buf, buflen, type) + struct mbuf **mp; + caddr_t buf; + int buflen, type; +{ + register struct sockaddr *sa; + register struct mbuf *m; + int error; + + if ((u_int)buflen > MLEN) { +#ifdef COMPAT_OLDSOCK + if (type == MT_SONAME && (u_int)buflen <= 112) + buflen = MLEN; /* unix domain compat. hack */ + else +#endif + return (EINVAL); + } + m = m_get(M_WAIT, type); + if (m == NULL) + return (ENOBUFS); + m->m_len = buflen; + error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); + if (error) + (void) m_free(m); + else { + *mp = m; + if (type == MT_SONAME) { + sa = mtod(m, struct sockaddr *); + +#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN + if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + sa->sa_family = sa->sa_len; +#endif + sa->sa_len = buflen; + } + } + return (error); +} + +int +getsockaddr(namp, uaddr, len) + struct sockaddr **namp; + caddr_t uaddr; + size_t len; +{ + struct sockaddr *sa; + int error; + + if (len > SOCK_MAXADDRLEN) + return ENAMETOOLONG; + MALLOC(sa, struct sockaddr *, len, M_SONAME, M_WAITOK); + error = copyin(uaddr, sa, len); + if (error) { + FREE(sa, M_SONAME); + } else { +#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN + if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + sa->sa_family = sa->sa_len; +#endif + sa->sa_len = len; + *namp = sa; + } + return error; +} + +int +getsock(fdp, fdes, fpp) + struct filedesc *fdp; + int fdes; + struct file **fpp; +{ + register struct file *fp; + + if ((unsigned)fdes >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fdes]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_SOCKET) + return (ENOTSOCK); + *fpp = fp; + return (0); +} + +/* + * Allocate a pool of sf_bufs (sendfile(2) or "super-fast" if you prefer. :-)) + * XXX - The sf_buf functions are currently private to sendfile(2), so have + * been made static, but may be useful in the future for doing zero-copy in + * other parts of the networking code. + */ +static void +sf_buf_init(void *arg) +{ + int i; + + SLIST_INIT(&sf_freelist); + sf_base = kmem_alloc_pageable(kernel_map, nsfbufs * PAGE_SIZE); + sf_bufs = malloc(nsfbufs * sizeof(struct sf_buf), M_TEMP, M_NOWAIT); + bzero(sf_bufs, nsfbufs * sizeof(struct sf_buf)); + for (i = 0; i < nsfbufs; i++) { + sf_bufs[i].kva = sf_base + i * PAGE_SIZE; + SLIST_INSERT_HEAD(&sf_freelist, &sf_bufs[i], free_list); + } +} + +/* + * Get an sf_buf from the freelist. Will block if none are available. + */ +static struct sf_buf * +sf_buf_alloc() +{ + struct sf_buf *sf; + int s; + + s = splimp(); + while ((sf = SLIST_FIRST(&sf_freelist)) == NULL) { + sf_buf_alloc_want = 1; + tsleep(&sf_freelist, PVM, "sfbufa", 0); + } + SLIST_REMOVE_HEAD(&sf_freelist, free_list); + splx(s); + sf->refcnt = 1; + return (sf); +} + +#define dtosf(x) (&sf_bufs[((uintptr_t)(x) - (uintptr_t)sf_base) >> PAGE_SHIFT]) +static void +sf_buf_ref(caddr_t addr, u_int size) +{ + struct sf_buf *sf; + + sf = dtosf(addr); + if (sf->refcnt == 0) + panic("sf_buf_ref: referencing a free sf_buf"); + sf->refcnt++; +} + +/* + * Lose a reference to an sf_buf. When none left, detach mapped page + * and release resources back to the system. + * + * Must be called at splimp. 
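getsockaddr() and sockargs() above copy a caller-supplied address into the kernel, reject anything longer than SOCK_MAXADDRLEN with ENAMETOOLONG, and then force sa_len to the length the caller passed. The practical consequence for userland is that the address-length argument, not the sun_len field, is what the kernel trusts. A sketch of building an AF_LOCAL address the conventional BSD way (the helper name is illustrative):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <string.h>

/* Fill in a sockaddr_un and return the length to pass to bind/connect. */
static socklen_t
make_local_addr(struct sockaddr_un *sun, const char *path)
{
	memset(sun, 0, sizeof(*sun));
	sun->sun_family = AF_LOCAL;
	strlcpy(sun->sun_path, path, sizeof(sun->sun_path));
	sun->sun_len = SUN_LEN(sun);	/* BSD convenience; the kernel rewrites it anyway */
	return (SUN_LEN(sun));		/* offsetof(sun_path) + strlen(path) */
}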
+ */ +static void +sf_buf_free(caddr_t addr, u_int size) +{ + struct sf_buf *sf; + struct vm_page *m; + int s; + + sf = dtosf(addr); + if (sf->refcnt == 0) + panic("sf_buf_free: freeing free sf_buf"); + sf->refcnt--; + if (sf->refcnt == 0) { + pmap_qremove((vm_offset_t)addr, 1); + m = sf->m; + s = splvm(); + vm_page_unwire(m, 0); + /* + * Check for the object going away on us. This can + * happen since we don't hold a reference to it. + * If so, we're responsible for freeing the page. + */ + if (m->wire_count == 0 && m->object == NULL) + vm_page_free(m); + splx(s); + sf->m = NULL; + SLIST_INSERT_HEAD(&sf_freelist, sf, free_list); + if (sf_buf_alloc_want) { + sf_buf_alloc_want = 0; + wakeup(&sf_freelist); + } + } +} + +/* + * sendfile(2). + * int sendfile(int fd, int s, off_t offset, size_t nbytes, + * struct sf_hdtr *hdtr, off_t *sbytes, int flags) + * + * Send a file specified by 'fd' and starting at 'offset' to a socket + * specified by 's'. Send only 'nbytes' of the file or until EOF if + * nbytes == 0. Optionally add a header and/or trailer to the socket + * output. If specified, write the total number of bytes sent into *sbytes. + */ +int +sendfile(struct proc *p, struct sendfile_args *uap) +{ + struct file *fp; + struct filedesc *fdp = p->p_fd; + struct vnode *vp; + struct vm_object *obj; + struct socket *so; + struct mbuf *m; + struct sf_buf *sf; + struct vm_page *pg; + struct writev_args nuap; + struct sf_hdtr hdtr; + off_t off, xfsize, sbytes = 0; + int error = 0, s; + + /* + * Do argument checking. Must be a regular file in, stream + * type and connected socket out, positive offset. + */ + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || + (fp->f_flag & FREAD) == 0) { + error = EBADF; + goto done; + } + if (fp->f_type != DTYPE_VNODE) { + error = EINVAL; + goto done; + } + vp = (struct vnode *)fp->f_data; + obj = vp->v_object; + if (vp->v_type != VREG || obj == NULL) { + error = EINVAL; + goto done; + } + error = getsock(p->p_fd, uap->s, &fp); + if (error) + goto done; + so = (struct socket *)fp->f_data; + if (so->so_type != SOCK_STREAM) { + error = EINVAL; + goto done; + } + if ((so->so_state & SS_ISCONNECTED) == 0) { + error = ENOTCONN; + goto done; + } + if (uap->offset < 0) { + error = EINVAL; + goto done; + } + + /* + * If specified, get the pointer to the sf_hdtr struct for + * any headers/trailers. + */ + if (uap->hdtr != NULL) { + error = copyin(uap->hdtr, &hdtr, sizeof(hdtr)); + if (error) + goto done; + /* + * Send any headers. Wimp out and use writev(2). + */ + if (hdtr.headers != NULL) { + nuap.fd = uap->s; + nuap.iovp = hdtr.headers; + nuap.iovcnt = hdtr.hdr_cnt; + error = writev(p, &nuap); + if (error) + goto done; + sbytes += p->p_retval[0]; + } + } + + /* + * Protect against multiple writers to the socket. + */ + (void) sblock(&so->so_snd, M_WAITOK); + + /* + * Loop through the pages in the file, starting with the requested + * offset. Get a file page (do I/O if necessary), map the file page + * into an sf_buf, attach an mbuf header to the sf_buf, and queue + * it on the socket. + */ + for (off = uap->offset; ; off += xfsize, sbytes += xfsize) { + vm_pindex_t pindex; + vm_offset_t pgoff; + + pindex = OFF_TO_IDX(off); +retry_lookup: + /* + * Calculate the amount to transfer. Not to exceed a page, + * the EOF, or the passed in nbytes. 
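The clamping that follows bounds each iteration's transfer by three limits: bytes remaining to EOF, bytes remaining in the current page (the first page may start at a nonzero page offset), and bytes remaining of the caller's nbytes request. The same arithmetic written as a standalone helper, purely for clarity (a sketch; the kernel code below does this inline):

#include <sys/types.h>

/* How much may be sent this iteration, or 0 when the loop should stop. */
static off_t
xfer_size(off_t off, off_t file_size, size_t nbytes, off_t sent, u_int page_size)
{
	off_t left = file_size - off;			/* do not read past EOF */
	u_int pgoff = (u_int)(off & (page_size - 1));	/* offset within this page */

	if (left > (off_t)page_size)
		left = page_size;
	if (left > (off_t)(page_size - pgoff))
		left = page_size - pgoff;		/* stay within one page mapping */
	if (nbytes != 0 && left > (off_t)nbytes - sent)
		left = (off_t)nbytes - sent;		/* honor the caller's limit */
	return (left > 0 ? left : 0);
}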
+ */ + xfsize = obj->un_pager.vnp.vnp_size - off; + if (xfsize > PAGE_SIZE) + xfsize = PAGE_SIZE; + pgoff = (vm_offset_t)(off & PAGE_MASK); + if (PAGE_SIZE - pgoff < xfsize) + xfsize = PAGE_SIZE - pgoff; + if (uap->nbytes && xfsize > (uap->nbytes - sbytes)) + xfsize = uap->nbytes - sbytes; + if (xfsize <= 0) + break; + /* + * Optimize the non-blocking case by looking at the socket space + * before going to the extra work of constituting the sf_buf. + */ + if ((so->so_state & SS_NBIO) && sbspace(&so->so_snd) <= 0) { + if (so->so_state & SS_CANTSENDMORE) + error = EPIPE; + else + error = EAGAIN; + sbunlock(&so->so_snd); + goto done; + } + /* + * Attempt to look up the page. If the page doesn't exist or the + * part we're interested in isn't valid, then read it from disk. + * If some other part of the kernel has this page (i.e. it's busy), + * then disk I/O may be occuring on it, so wait and retry. + */ + pg = vm_page_lookup(obj, pindex); + if (pg == NULL || (!(pg->flags & PG_BUSY) && !pg->busy && + !vm_page_is_valid(pg, pgoff, xfsize))) { + struct uio auio; + struct iovec aiov; + int bsize; + + if (pg == NULL) { + pg = vm_page_alloc(obj, pindex, VM_ALLOC_NORMAL); + if (pg == NULL) { + VM_WAIT; + goto retry_lookup; + } + /* + * don't just clear PG_BUSY manually - + * vm_page_alloc() should be considered opaque, + * use the VM routine provided to clear + * PG_BUSY. + */ + vm_page_wakeup(pg); + } + /* + * Ensure that our page is still around when the I/O completes. + */ + vm_page_io_start(pg); + vm_page_wire(pg); + /* + * Get the page from backing store. + */ + bsize = vp->v_mount->mnt_stat.f_iosize; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = 0; + aiov.iov_len = MAXBSIZE; + auio.uio_resid = MAXBSIZE; + auio.uio_offset = trunc_page(off); + auio.uio_segflg = UIO_NOCOPY; + auio.uio_rw = UIO_READ; + auio.uio_procp = p; + vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p); + error = VOP_READ(vp, &auio, IO_VMIO | ((MAXBSIZE / bsize) << 16), + p->p_ucred); + VOP_UNLOCK(vp, 0, p); + vm_page_flag_clear(pg, PG_ZERO); + vm_page_io_finish(pg); + if (error) { + vm_page_unwire(pg, 0); + /* + * See if anyone else might know about this page. + * If not and it is not valid, then free it. + */ + if (pg->wire_count == 0 && pg->valid == 0 && + pg->busy == 0 && !(pg->flags & PG_BUSY) && + pg->hold_count == 0) + vm_page_free(pg); + sbunlock(&so->so_snd); + goto done; + } + } else { + if (vm_page_sleep_busy(pg, TRUE, "sfpbsy")) + goto retry_lookup; + + /* + * Protect from having the page ripped out from + * beneath us. + */ + vm_page_wire(pg); + } + /* + * Allocate a kernel virtual page and insert the physical page + * into it. + */ + sf = sf_buf_alloc(); + sf->m = pg; + pmap_qenter(sf->kva, &pg, 1); + /* + * Get an mbuf header and set it up as having external storage. + */ + MGETHDR(m, M_WAIT, MT_DATA); + m->m_ext.ext_free = sf_buf_free; + m->m_ext.ext_ref = sf_buf_ref; + m->m_ext.ext_buf = (void *)sf->kva; + m->m_ext.ext_size = PAGE_SIZE; + m->m_data = (char *) sf->kva + pgoff; + m->m_flags |= M_EXT; + m->m_pkthdr.len = m->m_len = xfsize; + /* + * Add the buffer to the socket buffer chain. + */ + s = splnet(); +retry_space: + /* + * Make sure that the socket is still able to take more data. + * CANTSENDMORE being true usually means that the connection + * was closed. so_error is true when an error was sensed after + * a previous send. + * The state is checked after the page mapping and buffer + * allocation above since those operations may block and make + * any socket checks stale. 
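The syscall implemented here takes the argument order documented in the block comment earlier in this function: file descriptor in, stream socket out, optional header/trailer iovecs, optional total-bytes-out pointer. A minimal userland use, sending a whole file preceded by a header (header text and the helper name are illustrative):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

/* Send all of 'fd' to connected stream socket 's', preceded by 'hdr'. */
static int
send_file_with_header(int fd, int s, char *hdr)
{
	struct iovec hiov;
	struct sf_hdtr hdtr;
	off_t sbytes = 0;

	hiov.iov_base = hdr;
	hiov.iov_len = strlen(hdr);
	memset(&hdtr, 0, sizeof(hdtr));
	hdtr.headers = &hiov;
	hdtr.hdr_cnt = 1;
	/* offset 0, nbytes 0 == send until EOF */
	return (sendfile(fd, s, 0, 0, &hdtr, &sbytes, 0));
}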
From this point forward, nothing + * blocks before the pru_send (or more accurately, any blocking + * results in a loop back to here to re-check). + */ + if ((so->so_state & SS_CANTSENDMORE) || so->so_error) { + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + } else { + error = so->so_error; + so->so_error = 0; + } + m_freem(m); + sbunlock(&so->so_snd); + splx(s); + goto done; + } + /* + * Wait for socket space to become available. We do this just + * after checking the connection state above in order to avoid + * a race condition with sbwait(). + */ + if (sbspace(&so->so_snd) < so->so_snd.sb_lowat) { + if (so->so_state & SS_NBIO) { + m_freem(m); + sbunlock(&so->so_snd); + splx(s); + error = EAGAIN; + goto done; + } + error = sbwait(&so->so_snd); + /* + * An error from sbwait usually indicates that we've + * been interrupted by a signal. If we've sent anything + * then return bytes sent, otherwise return the error. + */ + if (error) { + m_freem(m); + sbunlock(&so->so_snd); + splx(s); + goto done; + } + goto retry_space; + } + error = (*so->so_proto->pr_usrreqs->pru_send)(so, 0, m, 0, 0, p); + splx(s); + if (error) { + sbunlock(&so->so_snd); + goto done; + } + } + sbunlock(&so->so_snd); + + /* + * Send trailers. Wimp out and use writev(2). + */ + if (uap->hdtr != NULL && hdtr.trailers != NULL) { + nuap.fd = uap->s; + nuap.iovp = hdtr.trailers; + nuap.iovcnt = hdtr.trl_cnt; + error = writev(p, &nuap); + if (error) + goto done; + sbytes += p->p_retval[0]; + } + +done: + if (uap->sbytes != NULL) { + copyout(&sbytes, uap->sbytes, sizeof(off_t)); + } + return (error); +} diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c new file mode 100644 index 0000000..abdb71e --- /dev/null +++ b/sys/kern/uipc_usrreq.c @@ -0,0 +1,1186 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 + * $Id: uipc_usrreq.c,v 1.38 1999/01/21 08:29:04 dillon Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/domain.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> /* XXX must be before <sys/file.h> */ +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/lock.h> +#include <sys/mbuf.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/stat.h> +#include <sys/sysctl.h> +#include <sys/un.h> +#include <sys/unpcb.h> +#include <sys/vnode.h> + +#include <vm/vm_zone.h> + +struct vm_zone *unp_zone; +static unp_gen_t unp_gencnt; +static u_int unp_count; + +static struct unp_head unp_shead, unp_dhead; + +/* + * Unix communications domain. + * + * TODO: + * SEQPACKET, RDM + * rethink name space problems + * need a proper out-of-band + * lock pushdown + */ +static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; +static ino_t unp_ino; /* prototype for fake inode numbers */ + +static int unp_attach __P((struct socket *)); +static void unp_detach __P((struct unpcb *)); +static int unp_bind __P((struct unpcb *,struct sockaddr *, struct proc *)); +static int unp_connect __P((struct socket *,struct sockaddr *, + struct proc *)); +static void unp_disconnect __P((struct unpcb *)); +static void unp_shutdown __P((struct unpcb *)); +static void unp_drop __P((struct unpcb *, int)); +static void unp_gc __P((void)); +static void unp_scan __P((struct mbuf *, void (*)(struct file *))); +static void unp_mark __P((struct file *)); +static void unp_discard __P((struct file *)); +static int unp_internalize __P((struct mbuf *, struct proc *)); + +static int +uipc_abort(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + unp_drop(unp, ECONNABORTED); + return 0; +} + +static int +uipc_accept(struct socket *so, struct sockaddr **nam) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + + /* + * Pass back name of connected socket, + * if it was bound and we are still connected + * (our peer may have closed already!). 
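uipc_accept() above hands back either the bound address of the peer or sun_noname when the peer never bound or has already gone away. The corresponding userland pattern for a PF_LOCAL stream server looks like this (the socket path and backlog are illustrative; error handling is trimmed):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	struct sockaddr_un sun;
	int s, c;

	s = socket(PF_LOCAL, SOCK_STREAM, 0);
	memset(&sun, 0, sizeof(sun));
	sun.sun_family = AF_LOCAL;
	strlcpy(sun.sun_path, "/tmp/demo.sock", sizeof(sun.sun_path));
	unlink(sun.sun_path);		/* unp_bind fails with EADDRINUSE otherwise */
	if (bind(s, (struct sockaddr *)&sun, SUN_LEN(&sun)) < 0 ||
	    listen(s, 5) < 0)
		return (1);
	c = accept(s, NULL, NULL);	/* peer address is optional */
	if (c >= 0) {
		write(c, "hello\n", 6);
		close(c);
	}
	close(s);
	return (0);
}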
+ */ + if (unp->unp_conn && unp->unp_conn->unp_addr) { + *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr, + 1); + } else { + *nam = dup_sockaddr((struct sockaddr *)&sun_noname, 1); + } + return 0; +} + +static int +uipc_attach(struct socket *so, int proto, struct proc *p) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp != 0) + return EISCONN; + return unp_attach(so); +} + +static int +uipc_bind(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + + return unp_bind(unp, nam, p); +} + +static int +uipc_connect(struct socket *so, struct sockaddr *nam, struct proc *p) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + return unp_connect(so, nam, curproc); +} + +static int +uipc_connect2(struct socket *so1, struct socket *so2) +{ + struct unpcb *unp = sotounpcb(so1); + + if (unp == 0) + return EINVAL; + + return unp_connect2(so1, so2); +} + +/* control is EOPNOTSUPP */ + +static int +uipc_detach(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + + unp_detach(unp); + return 0; +} + +static int +uipc_disconnect(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + unp_disconnect(unp); + return 0; +} + +static int +uipc_listen(struct socket *so, struct proc *p) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0 || unp->unp_vnode == 0) + return EINVAL; + return 0; +} + +static int +uipc_peeraddr(struct socket *so, struct sockaddr **nam) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + if (unp->unp_conn && unp->unp_conn->unp_addr) + *nam = dup_sockaddr((struct sockaddr *)unp->unp_conn->unp_addr, + 1); + return 0; +} + +static int +uipc_rcvd(struct socket *so, int flags) +{ + struct unpcb *unp = sotounpcb(so); + struct socket *so2; + + if (unp == 0) + return EINVAL; + switch (so->so_type) { + case SOCK_DGRAM: + panic("uipc_rcvd DGRAM?"); + /*NOTREACHED*/ + + case SOCK_STREAM: +#define rcv (&so->so_rcv) +#define snd (&so2->so_snd) + if (unp->unp_conn == 0) + break; + so2 = unp->unp_conn->unp_socket; + /* + * Adjust backpressure on sender + * and wakeup any waiting to write. 
+ */ + snd->sb_mbmax += unp->unp_mbcnt - rcv->sb_mbcnt; + unp->unp_mbcnt = rcv->sb_mbcnt; + snd->sb_hiwat += unp->unp_cc - rcv->sb_cc; + unp->unp_cc = rcv->sb_cc; + sowwakeup(so2); +#undef snd +#undef rcv + break; + + default: + panic("uipc_rcvd unknown socktype"); + } + return 0; +} + +/* pru_rcvoob is EOPNOTSUPP */ + +static int +uipc_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, + struct mbuf *control, struct proc *p) +{ + int error = 0; + struct unpcb *unp = sotounpcb(so); + struct socket *so2; + + if (unp == 0) { + error = EINVAL; + goto release; + } + if (flags & PRUS_OOB) { + error = EOPNOTSUPP; + goto release; + } + + if (control && (error = unp_internalize(control, p))) + goto release; + + switch (so->so_type) { + case SOCK_DGRAM: + { + struct sockaddr *from; + + if (nam) { + if (unp->unp_conn) { + error = EISCONN; + break; + } + error = unp_connect(so, nam, p); + if (error) + break; + } else { + if (unp->unp_conn == 0) { + error = ENOTCONN; + break; + } + } + so2 = unp->unp_conn->unp_socket; + if (unp->unp_addr) + from = (struct sockaddr *)unp->unp_addr; + else + from = &sun_noname; + if (sbappendaddr(&so2->so_rcv, from, m, control)) { + sorwakeup(so2); + m = 0; + control = 0; + } else + error = ENOBUFS; + if (nam) + unp_disconnect(unp); + break; + } + + case SOCK_STREAM: +#define rcv (&so2->so_rcv) +#define snd (&so->so_snd) + /* Connect if not connected yet. */ + /* + * Note: A better implementation would complain + * if not equal to the peer's address. + */ + if ((so->so_state & SS_ISCONNECTED) == 0) { + if (nam) { + error = unp_connect(so, nam, p); + if (error) + break; /* XXX */ + } else { + error = ENOTCONN; + break; + } + } + + if (so->so_state & SS_CANTSENDMORE) { + error = EPIPE; + break; + } + if (unp->unp_conn == 0) + panic("uipc_send connected but no connection?"); + so2 = unp->unp_conn->unp_socket; + /* + * Send to paired receive port, and then reduce + * send buffer hiwater marks to maintain backpressure. + * Wake up readers. + */ + if (control) { + if (sbappendcontrol(rcv, m, control)) + control = 0; + } else + sbappend(rcv, m); + snd->sb_mbmax -= + rcv->sb_mbcnt - unp->unp_conn->unp_mbcnt; + unp->unp_conn->unp_mbcnt = rcv->sb_mbcnt; + snd->sb_hiwat -= rcv->sb_cc - unp->unp_conn->unp_cc; + unp->unp_conn->unp_cc = rcv->sb_cc; + sorwakeup(so2); + m = 0; +#undef snd +#undef rcv + break; + + default: + panic("uipc_send unknown socktype"); + } + + /* + * SEND_EOF is equivalent to a SEND followed by + * a SHUTDOWN. 
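PRUS_EOF lets the protocol deliver the final data and the half-close in one call. Userland gets the same effect on any stream socket by following the last write with shutdown() of the send side; a small sketch (the helper name is illustrative):

#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>

/* Write the final chunk of a request and signal EOF to the peer. */
static int
send_last(int s, const void *buf, size_t len)
{
	if (write(s, buf, len) != (ssize_t)len)
		return (-1);
	/* The peer's read() returns 0 once the queued data drains. */
	return (shutdown(s, SHUT_WR));
}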
+ */ + if (flags & PRUS_EOF) { + socantsendmore(so); + unp_shutdown(unp); + } + +release: + if (control) + m_freem(control); + if (m) + m_freem(m); + return error; +} + +static int +uipc_sense(struct socket *so, struct stat *sb) +{ + struct unpcb *unp = sotounpcb(so); + struct socket *so2; + + if (unp == 0) + return EINVAL; + sb->st_blksize = so->so_snd.sb_hiwat; + if (so->so_type == SOCK_STREAM && unp->unp_conn != 0) { + so2 = unp->unp_conn->unp_socket; + sb->st_blksize += so2->so_rcv.sb_cc; + } + sb->st_dev = NODEV; + if (unp->unp_ino == 0) + unp->unp_ino = unp_ino++; + sb->st_ino = unp->unp_ino; + return (0); +} + +static int +uipc_shutdown(struct socket *so) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + socantsendmore(so); + unp_shutdown(unp); + return 0; +} + +static int +uipc_sockaddr(struct socket *so, struct sockaddr **nam) +{ + struct unpcb *unp = sotounpcb(so); + + if (unp == 0) + return EINVAL; + if (unp->unp_addr) + *nam = dup_sockaddr((struct sockaddr *)unp->unp_addr, 1); + return 0; +} + +struct pr_usrreqs uipc_usrreqs = { + uipc_abort, uipc_accept, uipc_attach, uipc_bind, uipc_connect, + uipc_connect2, pru_control_notsupp, uipc_detach, uipc_disconnect, + uipc_listen, uipc_peeraddr, uipc_rcvd, pru_rcvoob_notsupp, + uipc_send, uipc_sense, uipc_shutdown, uipc_sockaddr, + sosend, soreceive, sopoll +}; + +/* + * Both send and receive buffers are allocated PIPSIZ bytes of buffering + * for stream sockets, although the total for sender and receiver is + * actually only PIPSIZ. + * Datagram sockets really use the sendspace as the maximum datagram size, + * and don't really want to reserve the sendspace. Their recvspace should + * be large enough for at least one max-size datagram plus address. + */ +#ifndef PIPSIZ +#define PIPSIZ 8192 +#endif +static u_long unpst_sendspace = PIPSIZ; +static u_long unpst_recvspace = PIPSIZ; +static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ +static u_long unpdg_recvspace = 4*1024; + +static int unp_rights; /* file descriptors in flight */ + +SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, + &unpst_sendspace, 0, ""); +SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, + &unpst_recvspace, 0, ""); +SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, + &unpdg_sendspace, 0, ""); +SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, + &unpdg_recvspace, 0, ""); +SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); + +static int +unp_attach(so) + struct socket *so; +{ + register struct unpcb *unp; + int error; + + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { + switch (so->so_type) { + + case SOCK_STREAM: + error = soreserve(so, unpst_sendspace, unpst_recvspace); + break; + + case SOCK_DGRAM: + error = soreserve(so, unpdg_sendspace, unpdg_recvspace); + break; + + default: + panic("unp_attach"); + } + if (error) + return (error); + } + unp = zalloc(unp_zone); + if (unp == NULL) + return (ENOBUFS); + bzero(unp, sizeof *unp); + unp->unp_gencnt = ++unp_gencnt; + unp_count++; + LIST_INIT(&unp->unp_refs); + unp->unp_socket = so; + LIST_INSERT_HEAD(so->so_type == SOCK_DGRAM ? 
&unp_dhead + : &unp_shead, unp, unp_link); + so->so_pcb = (caddr_t)unp; + return (0); +} + +static void +unp_detach(unp) + register struct unpcb *unp; +{ + LIST_REMOVE(unp, unp_link); + unp->unp_gencnt = ++unp_gencnt; + --unp_count; + if (unp->unp_vnode) { + unp->unp_vnode->v_socket = 0; + vrele(unp->unp_vnode); + unp->unp_vnode = 0; + } + if (unp->unp_conn) + unp_disconnect(unp); + while (unp->unp_refs.lh_first) + unp_drop(unp->unp_refs.lh_first, ECONNRESET); + soisdisconnected(unp->unp_socket); + unp->unp_socket->so_pcb = 0; + if (unp_rights) { + /* + * Normally the receive buffer is flushed later, + * in sofree, but if our receive buffer holds references + * to descriptors that are now garbage, we will dispose + * of those descriptor references after the garbage collector + * gets them (resulting in a "panic: closef: count < 0"). + */ + sorflush(unp->unp_socket); + unp_gc(); + } + if (unp->unp_addr) + FREE(unp->unp_addr, M_SONAME); + zfree(unp_zone, unp); +} + +static int +unp_bind(unp, nam, p) + struct unpcb *unp; + struct sockaddr *nam; + struct proc *p; +{ + struct sockaddr_un *soun = (struct sockaddr_un *)nam; + register struct vnode *vp; + struct vattr vattr; + int error, namelen; + struct nameidata nd; + char buf[SOCK_MAXADDRLEN]; + + if (unp->unp_vnode != NULL) + return (EINVAL); +#define offsetof(s, e) ((char *)&((s *)0)->e - (char *)((s *)0)) + namelen = soun->sun_len - offsetof(struct sockaddr_un, sun_path); + if (namelen <= 0) + return EINVAL; + strncpy(buf, soun->sun_path, namelen); + buf[namelen] = 0; /* null-terminate the string */ + NDINIT(&nd, CREATE, FOLLOW | LOCKPARENT, UIO_SYSSPACE, + buf, p); +/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ + error = namei(&nd); + if (error) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EADDRINUSE); + } + VATTR_NULL(&vattr); + vattr.va_type = VSOCK; + vattr.va_mode = (ACCESSPERMS & ~p->p_fd->fd_cmask); + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_CREATE(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + if (error) + return (error); + vp = nd.ni_vp; + vp->v_socket = unp->unp_socket; + unp->unp_vnode = vp; + unp->unp_addr = (struct sockaddr_un *)dup_sockaddr(nam, 1); + VOP_UNLOCK(vp, 0, p); + return (0); +} + +static int +unp_connect(so, nam, p) + struct socket *so; + struct sockaddr *nam; + struct proc *p; +{ + register struct sockaddr_un *soun = (struct sockaddr_un *)nam; + register struct vnode *vp; + register struct socket *so2, *so3; + struct unpcb *unp2, *unp3; + int error, len; + struct nameidata nd; + char buf[SOCK_MAXADDRLEN]; + + len = nam->sa_len - offsetof(struct sockaddr_un, sun_path); + if (len <= 0) + return EINVAL; + strncpy(buf, soun->sun_path, len); + buf[len] = 0; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, buf, p); + error = namei(&nd); + if (error) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VSOCK) { + error = ENOTSOCK; + goto bad; + } + error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p); + if (error) + goto bad; + so2 = vp->v_socket; + if (so2 == 0) { + error = ECONNREFUSED; + goto bad; + } + if (so->so_type != so2->so_type) { + error = EPROTOTYPE; + goto bad; + } + if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + if ((so2->so_options & SO_ACCEPTCONN) == 0 || + (so3 = sonewconn(so2, 0)) == 0) { + error = ECONNREFUSED; + goto bad; + } + unp2 = sotounpcb(so2); + unp3 = sotounpcb(so3); + if (unp2->unp_addr) + 
unp3->unp_addr = (struct sockaddr_un *) + dup_sockaddr((struct sockaddr *) + unp2->unp_addr, 1); + so2 = so3; + } + error = unp_connect2(so, so2); +bad: + vput(vp); + return (error); +} + +int +unp_connect2(so, so2) + register struct socket *so; + register struct socket *so2; +{ + register struct unpcb *unp = sotounpcb(so); + register struct unpcb *unp2; + + if (so2->so_type != so->so_type) + return (EPROTOTYPE); + unp2 = sotounpcb(so2); + unp->unp_conn = unp2; + switch (so->so_type) { + + case SOCK_DGRAM: + LIST_INSERT_HEAD(&unp2->unp_refs, unp, unp_reflink); + soisconnected(so); + break; + + case SOCK_STREAM: + unp2->unp_conn = unp; + soisconnected(so); + soisconnected(so2); + break; + + default: + panic("unp_connect2"); + } + return (0); +} + +static void +unp_disconnect(unp) + struct unpcb *unp; +{ + register struct unpcb *unp2 = unp->unp_conn; + + if (unp2 == 0) + return; + unp->unp_conn = 0; + switch (unp->unp_socket->so_type) { + + case SOCK_DGRAM: + LIST_REMOVE(unp, unp_reflink); + unp->unp_socket->so_state &= ~SS_ISCONNECTED; + break; + + case SOCK_STREAM: + soisdisconnected(unp->unp_socket); + unp2->unp_conn = 0; + soisdisconnected(unp2->unp_socket); + break; + } +} + +#ifdef notdef +void +unp_abort(unp) + struct unpcb *unp; +{ + + unp_detach(unp); +} +#endif + +static int +unp_pcblist SYSCTL_HANDLER_ARGS +{ + int error, i, n; + struct unpcb *unp, **unp_list; + unp_gen_t gencnt; + struct xunpgen xug; + struct unp_head *head; + + head = ((intptr_t)arg1 == SOCK_DGRAM ? &unp_dhead : &unp_shead); + + /* + * The process of preparing the PCB list is too time-consuming and + * resource-intensive to repeat twice on every request. + */ + if (req->oldptr == 0) { + n = unp_count; + req->oldidx = 2 * (sizeof xug) + + (n + n/8) * sizeof(struct xunpcb); + return 0; + } + + if (req->newptr != 0) + return EPERM; + + /* + * OK, now we're committed to doing something. + */ + gencnt = unp_gencnt; + n = unp_count; + + xug.xug_len = sizeof xug; + xug.xug_count = n; + xug.xug_gen = gencnt; + xug.xug_sogen = so_gencnt; + error = SYSCTL_OUT(req, &xug, sizeof xug); + if (error) + return error; + + unp_list = malloc(n * sizeof *unp_list, M_TEMP, M_WAITOK); + if (unp_list == 0) + return ENOMEM; + + for (unp = head->lh_first, i = 0; unp && i < n; + unp = unp->unp_link.le_next) { + if (unp->unp_gencnt <= gencnt) + unp_list[i++] = unp; + } + n = i; /* in case we lost some during malloc */ + + error = 0; + for (i = 0; i < n; i++) { + unp = unp_list[i]; + if (unp->unp_gencnt <= gencnt) { + struct xunpcb xu; + xu.xu_len = sizeof xu; + xu.xu_unpp = unp; + /* + * XXX - need more locking here to protect against + * connect/disconnect races for SMP. + */ + if (unp->unp_addr) + bcopy(unp->unp_addr, &xu.xu_addr, + unp->unp_addr->sun_len); + if (unp->unp_conn && unp->unp_conn->unp_addr) + bcopy(unp->unp_conn->unp_addr, + &xu.xu_caddr, + unp->unp_conn->unp_addr->sun_len); + bcopy(unp, &xu.xu_unp, sizeof *unp); + sotoxsocket(unp->unp_socket, &xu.xu_socket); + error = SYSCTL_OUT(req, &xu, sizeof xu); + } + } + if (!error) { + /* + * Give the user an updated idea of our state. + * If the generation differs from what we told + * her before, she knows that something happened + * while we were processing this request, and it + * might be necessary to retry. 
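A consumer of the net.local.*.pcblist sysctls is expected to compare the generation numbers in the xunpgen records at the head and tail of the returned buffer and refetch if they differ. A hedged sketch of that loop with sysctlbyname(), assuming struct xunpgen as declared in <sys/unpcb.h> (record parsing omitted; the slack in the size estimate above normally lets the second call succeed):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/sysctl.h>
#include <sys/un.h>
#include <sys/unpcb.h>
#include <stdlib.h>

/* Fetch a consistent snapshot of the local-domain stream PCB list. */
static void *
fetch_unp_pcblist(size_t *lenp)
{
	struct xunpgen *head, *tail;
	void *buf = NULL, *nbuf;
	size_t len;

	for (;;) {
		if (sysctlbyname("net.local.stream.pcblist", NULL, &len,
		    NULL, 0) < 0)
			break;
		if ((nbuf = realloc(buf, len)) == NULL)
			break;
		buf = nbuf;
		if (sysctlbyname("net.local.stream.pcblist", buf, &len,
		    NULL, 0) < 0)
			break;
		head = (struct xunpgen *)buf;
		tail = (struct xunpgen *)(void *)((char *)buf + len -
		    sizeof(*tail));
		if (head->xug_gen == tail->xug_gen) {
			*lenp = len;
			return (buf);	/* consistent snapshot */
		}
		/* Generation moved while the kernel walked the list: retry. */
	}
	free(buf);
	return (NULL);
}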
+ */ + xug.xug_gen = unp_gencnt; + xug.xug_sogen = so_gencnt; + xug.xug_count = unp_count; + error = SYSCTL_OUT(req, &xug, sizeof xug); + } + free(unp_list, M_TEMP); + return error; +} + +SYSCTL_PROC(_net_local_dgram, OID_AUTO, pcblist, CTLFLAG_RD, + (caddr_t)(long)SOCK_DGRAM, 0, unp_pcblist, "S,xunpcb", + "List of active local datagram sockets"); +SYSCTL_PROC(_net_local_stream, OID_AUTO, pcblist, CTLFLAG_RD, + (caddr_t)(long)SOCK_STREAM, 0, unp_pcblist, "S,xunpcb", + "List of active local stream sockets"); + +static void +unp_shutdown(unp) + struct unpcb *unp; +{ + struct socket *so; + + if (unp->unp_socket->so_type == SOCK_STREAM && unp->unp_conn && + (so = unp->unp_conn->unp_socket)) + socantrcvmore(so); +} + +static void +unp_drop(unp, errno) + struct unpcb *unp; + int errno; +{ + struct socket *so = unp->unp_socket; + + so->so_error = errno; + unp_disconnect(unp); + if (so->so_head) { + LIST_REMOVE(unp, unp_link); + unp->unp_gencnt = ++unp_gencnt; + unp_count--; + so->so_pcb = (caddr_t) 0; + if (unp->unp_addr) + FREE(unp->unp_addr, M_SONAME); + zfree(unp_zone, unp); + sofree(so); + } +} + +#ifdef notdef +void +unp_drain() +{ + +} +#endif + +int +unp_externalize(rights) + struct mbuf *rights; +{ + struct proc *p = curproc; /* XXX */ + register int i; + register struct cmsghdr *cm = mtod(rights, struct cmsghdr *); + register struct file **rp = (struct file **)(cm + 1); + register struct file *fp; + int newfds = (cm->cmsg_len - sizeof(*cm)) / sizeof (int); + int f; + + /* + * if the new FD's will not fit, then we free them all + */ + if (!fdavail(p, newfds)) { + for (i = 0; i < newfds; i++) { + fp = *rp; + unp_discard(fp); + *rp++ = 0; + } + return (EMSGSIZE); + } + /* + * now change each pointer to an fd in the global table to + * an integer that is the index to the local fd table entry + * that we set up to point to the global one we are transferring. + * XXX this assumes a pointer and int are the same size...! + */ + for (i = 0; i < newfds; i++) { + if (fdalloc(p, 0, &f)) + panic("unp_externalize"); + fp = *rp; + p->p_fd->fd_ofiles[f] = fp; + fp->f_msgcount--; + unp_rights--; + *(int *)rp++ = f; + } + return (0); +} + +void +unp_init(void) +{ + unp_zone = zinit("unpcb", sizeof(struct unpcb), nmbclusters, 0, 0); + if (unp_zone == 0) + panic("unp_init"); + LIST_INIT(&unp_dhead); + LIST_INIT(&unp_shead); +} + +#ifndef MIN +#define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +static int +unp_internalize(control, p) + struct mbuf *control; + struct proc *p; +{ + struct filedesc *fdp = p->p_fd; + register struct cmsghdr *cm = mtod(control, struct cmsghdr *); + register struct file **rp; + register struct file *fp; + register int i, fd; + register struct cmsgcred *cmcred; + int oldfds; + + if ((cm->cmsg_type != SCM_RIGHTS && cm->cmsg_type != SCM_CREDS) || + cm->cmsg_level != SOL_SOCKET || cm->cmsg_len != control->m_len) + return (EINVAL); + + /* + * Fill in credential information. + */ + if (cm->cmsg_type == SCM_CREDS) { + cmcred = (struct cmsgcred *)(cm + 1); + cmcred->cmcred_pid = p->p_pid; + cmcred->cmcred_uid = p->p_cred->p_ruid; + cmcred->cmcred_gid = p->p_cred->p_rgid; + cmcred->cmcred_euid = p->p_ucred->cr_uid; + cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups, + CMGROUP_MAX); + for (i = 0; i < cmcred->cmcred_ngroups; i++) + cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i]; + return(0); + } + + oldfds = (cm->cmsg_len - sizeof (*cm)) / sizeof (int); + /* + * check that all the FDs passed in refer to legal OPEN files + * If not, reject the entire operation. 
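unp_internalize() above converts the integers carried in an SCM_RIGHTS control message into file table references after verifying that every one names an open descriptor. The sending side builds that message with the standard CMSG_* macros; a sketch (the one-byte payload and the helper name are illustrative):

#include <sys/types.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <string.h>

/* Pass an open descriptor across an AF_LOCAL socket. */
static int
send_fd(int sock, int fd_to_send)
{
	struct msghdr msg;
	struct iovec iov;
	union {
		struct cmsghdr hdr;
		char buf[CMSG_SPACE(sizeof(int))];
	} cmsgbuf;
	struct cmsghdr *cm;
	char byte = 0;

	memset(&msg, 0, sizeof(msg));
	iov.iov_base = &byte;		/* must carry at least one data byte */
	iov.iov_len = 1;
	msg.msg_iov = &iov;
	msg.msg_iovlen = 1;
	msg.msg_control = cmsgbuf.buf;
	msg.msg_controllen = sizeof(cmsgbuf.buf);
	cm = CMSG_FIRSTHDR(&msg);
	cm->cmsg_len = CMSG_LEN(sizeof(int));
	cm->cmsg_level = SOL_SOCKET;
	cm->cmsg_type = SCM_RIGHTS;
	memcpy(CMSG_DATA(cm), &fd_to_send, sizeof(int));
	return (sendmsg(sock, &msg, 0) < 0 ? -1 : 0);
}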
+ */ + rp = (struct file **)(cm + 1); + for (i = 0; i < oldfds; i++) { + fd = *(int *)rp++; + if ((unsigned)fd >= fdp->fd_nfiles || + fdp->fd_ofiles[fd] == NULL) + return (EBADF); + } + /* + * Now replace the integer FDs with pointers to + * the associated global file table entry.. + * XXX this assumes a pointer and an int are the same size! + */ + rp = (struct file **)(cm + 1); + for (i = 0; i < oldfds; i++) { + fp = fdp->fd_ofiles[*(int *)rp]; + *rp++ = fp; + fp->f_count++; + fp->f_msgcount++; + unp_rights++; + } + return (0); +} + +static int unp_defer, unp_gcing; + +static void +unp_gc() +{ + register struct file *fp, *nextfp; + register struct socket *so; + struct file **extra_ref, **fpp; + int nunref, i; + + if (unp_gcing) + return; + unp_gcing = 1; + unp_defer = 0; + /* + * before going through all this, set all FDs to + * be NOT defered and NOT externally accessible + */ + for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) + fp->f_flag &= ~(FMARK|FDEFER); + do { + for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) { + /* + * If the file is not open, skip it + */ + if (fp->f_count == 0) + continue; + /* + * If we already marked it as 'defer' in a + * previous pass, then try process it this time + * and un-mark it + */ + if (fp->f_flag & FDEFER) { + fp->f_flag &= ~FDEFER; + unp_defer--; + } else { + /* + * if it's not defered, then check if it's + * already marked.. if so skip it + */ + if (fp->f_flag & FMARK) + continue; + /* + * If all references are from messages + * in transit, then skip it. it's not + * externally accessible. + */ + if (fp->f_count == fp->f_msgcount) + continue; + /* + * If it got this far then it must be + * externally accessible. + */ + fp->f_flag |= FMARK; + } + /* + * either it was defered, or it is externally + * accessible and not already marked so. + * Now check if it is possibly one of OUR sockets. + */ + if (fp->f_type != DTYPE_SOCKET || + (so = (struct socket *)fp->f_data) == 0) + continue; + if (so->so_proto->pr_domain != &localdomain || + (so->so_proto->pr_flags&PR_RIGHTS) == 0) + continue; +#ifdef notdef + if (so->so_rcv.sb_flags & SB_LOCK) { + /* + * This is problematical; it's not clear + * we need to wait for the sockbuf to be + * unlocked (on a uniprocessor, at least), + * and it's also not clear what to do + * if sbwait returns an error due to receipt + * of a signal. If sbwait does return + * an error, we'll go into an infinite + * loop. Delete all of this for now. + */ + (void) sbwait(&so->so_rcv); + goto restart; + } +#endif + /* + * So, Ok, it's one of our sockets and it IS externally + * accessible (or was defered). Now we look + * to see if we hold any file descriptors in its + * message buffers. Follow those links and mark them + * as accessible too. + */ + unp_scan(so->so_rcv.sb_mb, unp_mark); + } + } while (unp_defer); + /* + * We grab an extra reference to each of the file table entries + * that are not otherwise accessible and then free the rights + * that are stored in messages on them. + * + * The bug in the orginal code is a little tricky, so I'll describe + * what's wrong with it here. + * + * It is incorrect to simply unp_discard each entry for f_msgcount + * times -- consider the case of sockets A and B that contain + * references to each other. On a last close of some other socket, + * we trigger a gc since the number of outstanding rights (unp_rights) + * is non-zero. If during the sweep phase the gc code un_discards, + * we end up doing a (full) closef on the descriptor. 
A closef on A + * results in the following chain. Closef calls soo_close, which + * calls soclose. Soclose calls first (through the switch + * uipc_usrreq) unp_detach, which re-invokes unp_gc. Unp_gc simply + * returns because the previous instance had set unp_gcing, and + * we return all the way back to soclose, which marks the socket + * with SS_NOFDREF, and then calls sofree. Sofree calls sorflush + * to free up the rights that are queued in messages on the socket A, + * i.e., the reference on B. The sorflush calls via the dom_dispose + * switch unp_dispose, which unp_scans with unp_discard. This second + * instance of unp_discard just calls closef on B. + * + * Well, a similar chain occurs on B, resulting in a sorflush on B, + * which results in another closef on A. Unfortunately, A is already + * being closed, and the descriptor has already been marked with + * SS_NOFDREF, and soclose panics at this point. + * + * Here, we first take an extra reference to each inaccessible + * descriptor. Then, we call sorflush ourself, since we know + * it is a Unix domain socket anyhow. After we destroy all the + * rights carried in messages, we do a last closef to get rid + * of our extra reference. This is the last close, and the + * unp_detach etc will shut down the socket. + * + * 91/09/19, bsy@cs.cmu.edu + */ + extra_ref = malloc(nfiles * sizeof(struct file *), M_FILE, M_WAITOK); + for (nunref = 0, fp = filehead.lh_first, fpp = extra_ref; fp != 0; + fp = nextfp) { + nextfp = fp->f_list.le_next; + /* + * If it's not open, skip it + */ + if (fp->f_count == 0) + continue; + /* + * If all refs are from msgs, and it's not marked accessible + * then it must be referenced from some unreachable cycle + * of (shut-down) FDs, so include it in our + * list of FDs to remove + */ + if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) { + *fpp++ = fp; + nunref++; + fp->f_count++; + } + } + /* + * for each FD on our hit list, do the following two things + */ + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) { + struct file *tfp = *fpp; + if (tfp->f_type == DTYPE_SOCKET && tfp->f_data != NULL) + sorflush((struct socket *)(tfp->f_data)); + } + for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) + closef(*fpp, (struct proc *) NULL); + free((caddr_t)extra_ref, M_FILE); + unp_gcing = 0; +} + +void +unp_dispose(m) + struct mbuf *m; +{ + + if (m) + unp_scan(m, unp_discard); +} + +static void +unp_scan(m0, op) + register struct mbuf *m0; + void (*op) __P((struct file *)); +{ + register struct mbuf *m; + register struct file **rp; + register struct cmsghdr *cm; + register int i; + int qfds; + + while (m0) { + for (m = m0; m; m = m->m_next) + if (m->m_type == MT_CONTROL && + m->m_len >= sizeof(*cm)) { + cm = mtod(m, struct cmsghdr *); + if (cm->cmsg_level != SOL_SOCKET || + cm->cmsg_type != SCM_RIGHTS) + continue; + qfds = (cm->cmsg_len - sizeof *cm) + / sizeof (struct file *); + rp = (struct file **)(cm + 1); + for (i = 0; i < qfds; i++) + (*op)(*rp++); + break; /* XXX, but saves time */ + } + m0 = m0->m_act; + } +} + +static void +unp_mark(fp) + struct file *fp; +{ + + if (fp->f_flag & FMARK) + return; + unp_defer++; + fp->f_flag |= (FMARK|FDEFER); +} + +static void +unp_discard(fp) + struct file *fp; +{ + + fp->f_msgcount--; + unp_rights--; + (void) closef(fp, (struct proc *)NULL); +} diff --git a/sys/kern/vfs_aio.c b/sys/kern/vfs_aio.c new file mode 100644 index 0000000..c1af873 --- /dev/null +++ b/sys/kern/vfs_aio.c @@ -0,0 +1,2046 @@ +/* + * Copyright (c) 1997 John S. Dyson. All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. John S. Dyson's name may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * DISCLAIMER: This code isn't warranted to do anything useful. Anything + * bad that happens because of using this software isn't the responsibility + * of the author. This software is distributed AS-IS. + * + * $Id: vfs_aio.c,v 1.36 1998/12/15 17:38:33 des Exp $ + */ + +/* + * This file contains support for the POSIX 1003.1B AIO/LIO facility. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/lock.h> +#include <sys/unistd.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> +#include <sys/conf.h> +#include <miscfs/specfs/specdev.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_zone.h> +#include <sys/aio.h> +#include <sys/shm.h> + +#include <machine/cpu.h> +#include <machine/limits.h> + +static long jobrefid; + +#define JOBST_NULL 0x0 +#define JOBST_JOBQPROC 0x1 +#define JOBST_JOBQGLOBAL 0x2 +#define JOBST_JOBRUNNING 0x3 +#define JOBST_JOBFINISHED 0x4 +#define JOBST_JOBQBUF 0x5 +#define JOBST_JOBBFINISHED 0x6 + +#ifndef MAX_AIO_PER_PROC +#define MAX_AIO_PER_PROC 32 +#endif + +#ifndef MAX_AIO_QUEUE_PER_PROC +#define MAX_AIO_QUEUE_PER_PROC 256 /* Bigger than AIO_LISTIO_MAX */ +#endif + +#ifndef MAX_AIO_PROCS +#define MAX_AIO_PROCS 32 +#endif + +#ifndef MAX_AIO_QUEUE +#define MAX_AIO_QUEUE 1024 /* Bigger than AIO_LISTIO_MAX */ +#endif + +#ifndef TARGET_AIO_PROCS +#define TARGET_AIO_PROCS 0 +#endif + +#ifndef MAX_BUF_AIO +#define MAX_BUF_AIO 16 +#endif + +#ifndef AIOD_TIMEOUT_DEFAULT +#define AIOD_TIMEOUT_DEFAULT (10 * hz) +#endif + +#ifndef AIOD_LIFETIME_DEFAULT +#define AIOD_LIFETIME_DEFAULT (30 * hz) +#endif + +static int max_aio_procs = MAX_AIO_PROCS; +static int num_aio_procs = 0; +static int target_aio_procs = TARGET_AIO_PROCS; +static int max_queue_count = MAX_AIO_QUEUE; +static int num_queue_count = 0; +static int num_buf_aio = 0; +static int num_aio_resv_start = 0; +static int aiod_timeout; +static int aiod_lifetime; + +static int max_aio_per_proc = MAX_AIO_PER_PROC, + max_aio_queue_per_proc=MAX_AIO_QUEUE_PER_PROC; + +static int max_buf_aio = MAX_BUF_AIO; + +SYSCTL_NODE(_vfs, OID_AUTO, aio, CTLFLAG_RW, 0, "AIO mgmt"); + +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_per_proc, + CTLFLAG_RW, &max_aio_per_proc, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue_per_proc, + CTLFLAG_RW, &max_aio_queue_per_proc, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_procs, + CTLFLAG_RW, &max_aio_procs, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, num_aio_procs, + CTLFLAG_RD, &num_aio_procs, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, num_queue_count, + CTLFLAG_RD, &num_queue_count, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, max_aio_queue, + CTLFLAG_RW, &max_queue_count, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, target_aio_procs, + CTLFLAG_RW, &target_aio_procs, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, max_buf_aio, + CTLFLAG_RW, &max_buf_aio, 0, ""); 
+ +SYSCTL_INT(_vfs_aio, OID_AUTO, num_buf_aio, + CTLFLAG_RD, &num_buf_aio, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_lifetime, + CTLFLAG_RW, &aiod_lifetime, 0, ""); + +SYSCTL_INT(_vfs_aio, OID_AUTO, aiod_timeout, + CTLFLAG_RW, &aiod_timeout, 0, ""); + + +/* + * Job queue item + */ + +#define AIOCBLIST_CANCELLED 0x1 +#define AIOCBLIST_RUNDOWN 0x4 +#define AIOCBLIST_ASYNCFREE 0x8 +#define AIOCBLIST_DONE 0x10 + +struct aiocblist { + TAILQ_ENTRY (aiocblist) list; /* List of jobs */ + TAILQ_ENTRY (aiocblist) plist; /* List of jobs for proc */ + int jobflags; + int jobstate; + int inputcharge, outputcharge; + struct buf *bp; /* buffer pointer */ + struct proc *userproc; /* User process */ + struct aioproclist *jobaioproc; /* AIO process descriptor */ + struct aio_liojob *lio; /* optional lio job */ + struct aiocb *uuaiocb; /* pointer in userspace of aiocb */ + struct aiocb uaiocb; /* Kernel I/O control block */ +}; + + +/* + * AIO process info + */ +#define AIOP_FREE 0x1 /* proc on free queue */ +#define AIOP_SCHED 0x2 /* proc explicitly scheduled */ + +struct aioproclist { + int aioprocflags; /* AIO proc flags */ + TAILQ_ENTRY(aioproclist) list; /* List of processes */ + struct proc *aioproc; /* The AIO thread */ + TAILQ_HEAD (,aiocblist) jobtorun; /* suggested job to run */ +}; + +/* + * data-structure for lio signal management + */ +struct aio_liojob { + int lioj_flags; + int lioj_buffer_count; + int lioj_buffer_finished_count; + int lioj_queue_count; + int lioj_queue_finished_count; + struct sigevent lioj_signal; /* signal on all I/O done */ + TAILQ_ENTRY (aio_liojob) lioj_list; + struct kaioinfo *lioj_ki; +}; +#define LIOJ_SIGNAL 0x1 /* signal on all done (lio) */ +#define LIOJ_SIGNAL_POSTED 0x2 /* signal has been posted */ + +/* + * per process aio data structure + */ +struct kaioinfo { + int kaio_flags; /* per process kaio flags */ + int kaio_maxactive_count; /* maximum number of AIOs */ + int kaio_active_count; /* number of currently used AIOs */ + int kaio_qallowed_count; /* maxiumu size of AIO queue */ + int kaio_queue_count; /* size of AIO queue */ + int kaio_ballowed_count; /* maximum number of buffers */ + int kaio_queue_finished_count; /* number of daemon jobs finished */ + int kaio_buffer_count; /* number of physio buffers */ + int kaio_buffer_finished_count; /* count of I/O done */ + struct proc *kaio_p; /* process that uses this kaio block */ + TAILQ_HEAD (,aio_liojob) kaio_liojoblist; /* list of lio jobs */ + TAILQ_HEAD (,aiocblist) kaio_jobqueue; /* job queue for process */ + TAILQ_HEAD (,aiocblist) kaio_jobdone; /* done queue for process */ + TAILQ_HEAD (,aiocblist) kaio_bufqueue; /* buffer job queue for process */ + TAILQ_HEAD (,aiocblist) kaio_bufdone; /* buffer done queue for process */ +}; + +#define KAIO_RUNDOWN 0x1 /* process is being run down */ +#define KAIO_WAKEUP 0x2 /* wakeup process when there is a significant + event */ + + +static TAILQ_HEAD (,aioproclist) aio_freeproc, aio_activeproc; +static TAILQ_HEAD(,aiocblist) aio_jobs; /* Async job list */ +static TAILQ_HEAD(,aiocblist) aio_bufjobs; /* Phys I/O job list */ +static TAILQ_HEAD(,aiocblist) aio_freejobs; /* Pool of free jobs */ + +static void aio_init_aioinfo(struct proc *p) ; +static void aio_onceonly(void *) ; +static int aio_free_entry(struct aiocblist *aiocbe); +static void aio_process(struct aiocblist *aiocbe); +static int aio_newproc(void) ; +static int aio_aqueue(struct proc *p, struct aiocb *job, int type) ; +static void aio_physwakeup(struct buf *bp); +static int aio_fphysio(struct proc *p, 
struct aiocblist *aiocbe, int type); +static int aio_qphysio(struct proc *p, struct aiocblist *iocb); +static void aio_daemon(void *uproc); + +SYSINIT(aio, SI_SUB_VFS, SI_ORDER_ANY, aio_onceonly, NULL); + +static vm_zone_t kaio_zone=0, aiop_zone=0, + aiocb_zone=0, aiol_zone=0, aiolio_zone=0; + +/* + * Single AIOD vmspace shared amongst all of them + */ +struct vmspace *aiovmspace = NULL; + +/* + * Startup initialization + */ +void +aio_onceonly(void *na) +{ + TAILQ_INIT(&aio_freeproc); + TAILQ_INIT(&aio_activeproc); + TAILQ_INIT(&aio_jobs); + TAILQ_INIT(&aio_bufjobs); + TAILQ_INIT(&aio_freejobs); + kaio_zone = zinit("AIO", sizeof (struct kaioinfo), 0, 0, 1); + aiop_zone = zinit("AIOP", sizeof (struct aioproclist), 0, 0, 1); + aiocb_zone = zinit("AIOCB", sizeof (struct aiocblist), 0, 0, 1); + aiol_zone = zinit("AIOL", AIO_LISTIO_MAX * sizeof (int), 0, 0, 1); + aiolio_zone = zinit("AIOLIO", + AIO_LISTIO_MAX * sizeof (struct aio_liojob), 0, 0, 1); + aiod_timeout = AIOD_TIMEOUT_DEFAULT; + aiod_lifetime = AIOD_LIFETIME_DEFAULT; + jobrefid = 1; +} + +/* + * Init the per-process aioinfo structure. + * The aioinfo limits are set per-process for user limit (resource) management. + */ +void +aio_init_aioinfo(struct proc *p) +{ + struct kaioinfo *ki; + if (p->p_aioinfo == NULL) { + ki = zalloc(kaio_zone); + p->p_aioinfo = ki; + ki->kaio_flags = 0; + ki->kaio_maxactive_count = max_aio_per_proc; + ki->kaio_active_count = 0; + ki->kaio_qallowed_count = max_aio_queue_per_proc; + ki->kaio_queue_count = 0; + ki->kaio_ballowed_count = max_buf_aio; + ki->kaio_buffer_count = 0; + ki->kaio_buffer_finished_count = 0; + ki->kaio_p = p; + TAILQ_INIT(&ki->kaio_jobdone); + TAILQ_INIT(&ki->kaio_jobqueue); + TAILQ_INIT(&ki->kaio_bufdone); + TAILQ_INIT(&ki->kaio_bufqueue); + TAILQ_INIT(&ki->kaio_liojoblist); + } +} + +/* + * Free a job entry. Wait for completion if it is currently + * active, but don't delay forever. If we delay, we return + * a flag that says that we have to restart the queue scan. 
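+ *
+ * A minimal sketch of the caller pattern this contract implies (it is
+ * exactly what aio_proc_rundown() below does):
+ *
+ *	restart:
+ *	for (aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); aiocbe; aiocbe = aiocbn) {
+ *		aiocbn = TAILQ_NEXT(aiocbe, plist);
+ *		if (aio_free_entry(aiocbe))
+ *			goto restart;
+ *	}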
+ */ +int +aio_free_entry(struct aiocblist *aiocbe) +{ + struct kaioinfo *ki; + struct aioproclist *aiop; + struct aio_liojob *lj; + struct proc *p; + int error; + int s; + + if (aiocbe->jobstate == JOBST_NULL) + panic("aio_free_entry: freeing already free job"); + + p = aiocbe->userproc; + ki = p->p_aioinfo; + lj = aiocbe->lio; + if (ki == NULL) + panic("aio_free_entry: missing p->p_aioinfo"); + + if (aiocbe->jobstate == JOBST_JOBRUNNING) { + if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) + return 0; + aiocbe->jobflags |= AIOCBLIST_RUNDOWN; + tsleep(aiocbe, PRIBIO|PCATCH, "jobwai", 0); + } + aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; + + if (aiocbe->bp == NULL) { + if (ki->kaio_queue_count <= 0) + panic("aio_free_entry: process queue size <= 0"); + if (num_queue_count <= 0) + panic("aio_free_entry: system wide queue size <= 0"); + + if(lj) { + lj->lioj_queue_count--; + if (aiocbe->jobflags & AIOCBLIST_DONE) + lj->lioj_queue_finished_count--; + } + ki->kaio_queue_count--; + if (aiocbe->jobflags & AIOCBLIST_DONE) + ki->kaio_queue_finished_count--; + num_queue_count--; + + } else { + if(lj) { + lj->lioj_buffer_count--; + if (aiocbe->jobflags & AIOCBLIST_DONE) + lj->lioj_buffer_finished_count--; + } + if (aiocbe->jobflags & AIOCBLIST_DONE) + ki->kaio_buffer_finished_count--; + ki->kaio_buffer_count--; + num_buf_aio--; + + } + + if ((ki->kaio_flags & KAIO_WAKEUP) || + (ki->kaio_flags & KAIO_RUNDOWN) && + ((ki->kaio_buffer_count == 0) && (ki->kaio_queue_count == 0))) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(p); + } + + if ( aiocbe->jobstate == JOBST_JOBQBUF) { + if ((error = aio_fphysio(p, aiocbe, 1)) != 0) + return error; + if (aiocbe->jobstate != JOBST_JOBBFINISHED) + panic("aio_free_entry: invalid physio finish-up state"); + s = splbio(); + TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); + splx(s); + } else if ( aiocbe->jobstate == JOBST_JOBQPROC) { + aiop = aiocbe->jobaioproc; + TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list); + } else if ( aiocbe->jobstate == JOBST_JOBQGLOBAL) { + TAILQ_REMOVE(&aio_jobs, aiocbe, list); + } else if ( aiocbe->jobstate == JOBST_JOBFINISHED) { + TAILQ_REMOVE(&ki->kaio_jobdone, aiocbe, plist); + } else if ( aiocbe->jobstate == JOBST_JOBBFINISHED) { + s = splbio(); + TAILQ_REMOVE(&ki->kaio_bufdone, aiocbe, plist); + splx(s); + if (aiocbe->bp) { + vunmapbuf(aiocbe->bp); + relpbuf(aiocbe->bp, NULL); + aiocbe->bp = NULL; + } + } + if (lj && (lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + zfree(aiolio_zone, lj); + } + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + aiocbe->jobstate = JOBST_NULL; + return 0; +} + +/* + * Rundown the jobs for a given process. 
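+ * It first waits for any jobs the daemons are actively servicing and for
+ * outstanding physio buffers to drain, then frees the completed and the
+ * still-queued entries, discards lio control blocks that are fully
+ * drained, and finally releases the per-process kaioinfo structure.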
+ */ +void +aio_proc_rundown(struct proc *p) +{ + int s; + struct kaioinfo *ki; + struct aio_liojob *lj, *ljn; + struct aiocblist *aiocbe, *aiocbn; + + ki = p->p_aioinfo; + if (ki == NULL) + return; + + ki->kaio_flags |= LIOJ_SIGNAL_POSTED; + while ((ki->kaio_active_count > 0) || + (ki->kaio_buffer_count > ki->kaio_buffer_finished_count)) { + ki->kaio_flags |= KAIO_RUNDOWN; + if (tsleep(p, PRIBIO, "kaiowt", aiod_timeout)) + break; + } + +restart1: + for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobdone); + aiocbe; + aiocbe = aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + if (aio_free_entry(aiocbe)) + goto restart1; + } + +restart2: + for ( aiocbe = TAILQ_FIRST(&ki->kaio_jobqueue); + aiocbe; + aiocbe = aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + if (aio_free_entry(aiocbe)) + goto restart2; + } + +/* + * Note the use of lots of splbio here, trying to avoid + * splbio for long chains of I/O. Probably unnecessary. + */ + +restart3: + s = splbio(); + while (TAILQ_FIRST(&ki->kaio_bufqueue)) { + ki->kaio_flags |= KAIO_WAKEUP; + tsleep (p, PRIBIO, "aioprn", 0); + splx(s); + goto restart3; + } + splx(s); + +restart4: + s = splbio(); + for ( aiocbe = TAILQ_FIRST(&ki->kaio_bufdone); + aiocbe; + aiocbe = aiocbn) { + aiocbn = TAILQ_NEXT(aiocbe, plist); + if (aio_free_entry(aiocbe)) { + splx(s); + goto restart4; + } + } + splx(s); + + for ( lj = TAILQ_FIRST(&ki->kaio_liojoblist); + lj; + lj = ljn) { + ljn = TAILQ_NEXT(lj, lioj_list); + if ((lj->lioj_buffer_count == 0) && (lj->lioj_queue_count == 0)) { + TAILQ_REMOVE(&ki->kaio_liojoblist, lj, lioj_list); + zfree(aiolio_zone, lj); + } else { +#if defined(DIAGNOSTIC) + printf("LIO job not cleaned up: B:%d, BF:%d, Q:%d, QF:%d\n", + lj->lioj_buffer_count, lj->lioj_buffer_finished_count, + lj->lioj_queue_count, lj->lioj_queue_finished_count); +#endif + } + } + + zfree(kaio_zone, ki); + p->p_aioinfo = NULL; +} + +/* + * Select a job to run (called by an AIO daemon) + */ +static struct aiocblist * +aio_selectjob(struct aioproclist *aiop) +{ + + struct aiocblist *aiocbe; + + aiocbe = TAILQ_FIRST(&aiop->jobtorun); + if (aiocbe) { + TAILQ_REMOVE(&aiop->jobtorun, aiocbe, list); + return aiocbe; + } + + for (aiocbe = TAILQ_FIRST(&aio_jobs); + aiocbe; + aiocbe = TAILQ_NEXT(aiocbe, list)) { + struct kaioinfo *ki; + struct proc *userp; + + userp = aiocbe->userproc; + ki = userp->p_aioinfo; + + if (ki->kaio_active_count < ki->kaio_maxactive_count) { + TAILQ_REMOVE(&aio_jobs, aiocbe, list); + return aiocbe; + } + } + + return NULL; +} + +/* + * The AIO processing activity. This is the code that does the + * I/O request for the non-physio version of the operations. The + * normal vn operations are used, and this code should work in + * all instances for every type of file, including pipes, sockets, + * fifos, and regular files. 
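+ * Each request is issued through the descriptor's own fo_read/fo_write
+ * entry points with a uio built from the user's aiocb, so no
+ * file-type-specific handling is needed here.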
+ */ +void +aio_process(struct aiocblist *aiocbe) +{ + struct filedesc *fdp; + struct proc *userp, *mycp; + struct aiocb *cb; + struct file *fp; + struct uio auio; + struct iovec aiov; + unsigned int fd; + int cnt; + int error; + off_t offset; + int oublock_st, oublock_end; + int inblock_st, inblock_end; + + userp = aiocbe->userproc; + cb = &aiocbe->uaiocb; + + mycp = curproc; + + fdp = mycp->p_fd; + fd = cb->aio_fildes; + fp = fdp->fd_ofiles[fd]; + + aiov.iov_base = (void *) cb->aio_buf; + aiov.iov_len = cb->aio_nbytes; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = offset = cb->aio_offset; + auio.uio_resid = cb->aio_nbytes; + cnt = cb->aio_nbytes; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = mycp; + + inblock_st = mycp->p_stats->p_ru.ru_inblock; + oublock_st = mycp->p_stats->p_ru.ru_oublock; + if (cb->aio_lio_opcode == LIO_READ) { + auio.uio_rw = UIO_READ; + error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred); + } else { + auio.uio_rw = UIO_WRITE; + error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred); + } + inblock_end = mycp->p_stats->p_ru.ru_inblock; + oublock_end = mycp->p_stats->p_ru.ru_oublock; + + aiocbe->inputcharge = inblock_end - inblock_st; + aiocbe->outputcharge = oublock_end - oublock_st; + + if (error) { + if (auio.uio_resid != cnt) { + if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) + error = 0; + if ((error == EPIPE) && (cb->aio_lio_opcode == LIO_WRITE)) + psignal(userp, SIGPIPE); + } + } + + cnt -= auio.uio_resid; + cb->_aiocb_private.error = error; + cb->_aiocb_private.status = cnt; + + return; + +} + +/* + * The AIO daemon, most of the actual work is done in aio_process, + * but the setup (and address space mgmt) is done in this routine. + */ +static void +aio_daemon(void *uproc) +{ + int s; + struct aioproclist *aiop; + struct vmspace *myvm, *aiovm; + struct proc *mycp; + + /* + * Local copies of curproc (cp) and vmspace (myvm) + */ + mycp = curproc; + myvm = mycp->p_vmspace; + + /* + * We manage to create only one VM space for all AIOD processes. + * The VM space for the first AIOD created becomes the shared VM + * space for all of them. We add an additional reference count, + * even for the first AIOD, so the address space does not go away, + * and we continue to use that original VM space even if the first + * AIOD exits. + */ + if ((aiovm = aiovmspace) == NULL) { + aiovmspace = myvm; + myvm->vm_refcnt++; + /* + * Remove userland cruft from address space. + */ + if (myvm->vm_shm) + shmexit(mycp); + pmap_remove_pages(&myvm->vm_pmap, 0, USRSTACK); + vm_map_remove(&myvm->vm_map, 0, USRSTACK); + myvm->vm_tsize = 0; + myvm->vm_dsize = 0; + myvm->vm_ssize = 0; + } else { + aiovm->vm_refcnt++; + mycp->p_vmspace = aiovm; + pmap_activate(mycp); + vmspace_free(myvm); + myvm = aiovm; + } + + if (mycp->p_textvp) { + vrele(mycp->p_textvp); + mycp->p_textvp = NULL; + } + + /* + * Allocate and ready the aio control info. There is one + * aiop structure per daemon. + */ + aiop = zalloc(aiop_zone); + aiop->aioproc = mycp; + aiop->aioprocflags |= AIOP_FREE; + TAILQ_INIT(&aiop->jobtorun); + + /* + * Place thread (lightweight process) onto the AIO free thread list + */ + if (TAILQ_EMPTY(&aio_freeproc)) + wakeup(&aio_freeproc); + TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); + + /* + * Make up a name for the daemon + */ + strcpy(mycp->p_comm, "aiod"); + + /* + * Get rid of our current filedescriptors. AIOD's don't need any + * filedescriptors, except as temporarily inherited from the client. 
+ * Credentials are also cloned, and made equivalent to "root." + */ + fdfree(mycp); + mycp->p_fd = NULL; + mycp->p_ucred = crcopy(mycp->p_ucred); + mycp->p_ucred->cr_uid = 0; + mycp->p_ucred->cr_ngroups = 1; + mycp->p_ucred->cr_groups[0] = 1; + + /* + * The daemon resides in its own pgrp. + */ + enterpgrp(mycp, mycp->p_pid, 1); + + /* + * Mark special process type + */ + mycp->p_flag |= P_SYSTEM|P_KTHREADP; + + /* + * Wakeup parent process. (Parent sleeps to keep from blasting away + * creating to many daemons.) + */ + wakeup(mycp); + + while(1) { + struct proc *curcp; + struct aiocblist *aiocbe; + + /* + * curcp is the current daemon process context. + * userp is the current user process context. + */ + curcp = mycp; + + /* + * Take daemon off of free queue + */ + if (aiop->aioprocflags & AIOP_FREE) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); + aiop->aioprocflags &= ~AIOP_FREE; + } + aiop->aioprocflags &= ~AIOP_SCHED; + + /* + * Check for jobs + */ + while ( aiocbe = aio_selectjob(aiop)) { + struct proc *userp; + struct aiocb *cb; + struct kaioinfo *ki; + struct aio_liojob *lj; + + cb = &aiocbe->uaiocb; + userp = aiocbe->userproc; + + aiocbe->jobstate = JOBST_JOBRUNNING; + + /* + * Connect to process address space for user program + */ + if (userp != curcp) { + struct vmspace *tmpvm; + /* + * Save the current address space that we are connected to. + */ + tmpvm = mycp->p_vmspace; + /* + * Point to the new user address space, and refer to it. + */ + mycp->p_vmspace = userp->p_vmspace; + mycp->p_vmspace->vm_refcnt++; + /* + * Activate the new mapping. + */ + pmap_activate(mycp); + /* + * If the old address space wasn't the daemons own address + * space, then we need to remove the daemon's reference from + * the other process that it was acting on behalf of. + */ + if (tmpvm != myvm) { + vmspace_free(tmpvm); + } + /* + * Disassociate from previous clients file descriptors, and + * associate to the new clients descriptors. Note that + * the daemon doesn't need to worry about its orginal + * descriptors, because they were originally freed. + */ + if (mycp->p_fd) + fdfree(mycp); + mycp->p_fd = fdshare(userp); + curcp = userp; + } + + ki = userp->p_aioinfo; + lj = aiocbe->lio; + + /* + * Account for currently active jobs + */ + ki->kaio_active_count++; + + /* + * Do the I/O function + */ + aiocbe->jobaioproc = aiop; + aio_process(aiocbe); + + /* + * decrement the active job count + */ + ki->kaio_active_count--; + + /* + * increment the completion count for wakeup/signal comparisons + */ + aiocbe->jobflags |= AIOCBLIST_DONE; + ki->kaio_queue_finished_count++; + if (lj) { + lj->lioj_queue_finished_count++; + } + if ((ki->kaio_flags & KAIO_WAKEUP) || + (ki->kaio_flags & KAIO_RUNDOWN) && + (ki->kaio_active_count == 0)) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(userp); + } + + s = splbio(); + if (lj && (lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == + LIOJ_SIGNAL) { + if ((lj->lioj_queue_finished_count == lj->lioj_queue_count) && + (lj->lioj_buffer_finished_count == lj->lioj_buffer_count)) { + psignal(userp, lj->lioj_signal.sigev_signo); + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + } + } + splx(s); + + aiocbe->jobstate = JOBST_JOBFINISHED; + + /* + * If the I/O request should be automatically rundown, do the + * needed cleanup. Otherwise, place the queue entry for + * the just finished I/O request into the done queue for the + * associated client. 
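+ *
+ * (From the client's side, an entry parked on the done queue is normally
+ * reaped with the POSIX calls, roughly:
+ *
+ *	const struct aiocb *list[1] = { &acb };
+ *
+ *	while (aio_error(&acb) == EINPROGRESS)
+ *		aio_suspend(list, 1, NULL);
+ *	nbytes = aio_return(&acb);
+ *
+ * where the final aio_return() is what releases the kernel entry.)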
+ */ + if (aiocbe->jobflags & AIOCBLIST_ASYNCFREE) { + aiocbe->jobflags &= ~AIOCBLIST_ASYNCFREE; + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + } else { + TAILQ_REMOVE(&ki->kaio_jobqueue, + aiocbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_jobdone, + aiocbe, plist); + } + + if (aiocbe->jobflags & AIOCBLIST_RUNDOWN) { + wakeup(aiocbe); + aiocbe->jobflags &= ~AIOCBLIST_RUNDOWN; + } + + if (cb->aio_sigevent.sigev_notify == SIGEV_SIGNAL) { + psignal(userp, cb->aio_sigevent.sigev_signo); + } + } + + /* + * Disconnect from user address space + */ + if (curcp != mycp) { + struct vmspace *tmpvm; + /* + * Get the user address space to disconnect from. + */ + tmpvm = mycp->p_vmspace; + /* + * Get original address space for daemon. + */ + mycp->p_vmspace = myvm; + /* + * Activate the daemon's address space. + */ + pmap_activate(mycp); +#if defined(DIAGNOSTIC) + if (tmpvm == myvm) + printf("AIOD: vmspace problem -- %d\n", mycp->p_pid); +#endif + /* + * remove our vmspace reference. + */ + vmspace_free(tmpvm); + /* + * disassociate from the user process's file descriptors. + */ + if (mycp->p_fd) + fdfree(mycp); + mycp->p_fd = NULL; + curcp = mycp; + } + + /* + * If we are the first to be put onto the free queue, wakeup + * anyone waiting for a daemon. + */ + TAILQ_REMOVE(&aio_activeproc, aiop, list); + if (TAILQ_EMPTY(&aio_freeproc)) + wakeup(&aio_freeproc); + TAILQ_INSERT_HEAD(&aio_freeproc, aiop, list); + aiop->aioprocflags |= AIOP_FREE; + + /* + * If daemon is inactive for a long time, allow it to exit, thereby + * freeing resources. + */ + if (((aiop->aioprocflags & AIOP_SCHED) == 0) && + tsleep(mycp, PRIBIO, "aiordy", aiod_lifetime)) { + if ((TAILQ_FIRST(&aio_jobs) == NULL) && + (TAILQ_FIRST(&aiop->jobtorun) == NULL)) { + if ((aiop->aioprocflags & AIOP_FREE) && + (num_aio_procs > target_aio_procs)) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + zfree(aiop_zone, aiop); + num_aio_procs--; +#if defined(DIAGNOSTIC) + if (mycp->p_vmspace->vm_refcnt <= 1) + printf("AIOD: bad vm refcnt for exiting daemon: %d\n", + mycp->p_vmspace->vm_refcnt); +#endif + exit1(mycp, 0); + } + } + } + } +} + +/* + * Create a new AIO daemon. This is mostly a kernel-thread fork routine. + * The AIO daemon modifies its environment itself. + */ +static int +aio_newproc() +{ + int error; + struct rfork_args rfa; + struct proc *p, *np; + + rfa.flags = RFPROC | RFCFDG; + + p = curproc; + if (error = rfork(p, &rfa)) + return error; + + np = pfind(p->p_retval[0]); + cpu_set_fork_handler(np, aio_daemon, p); + + /* + * Wait until daemon is started, but continue on just in case (to + * handle error conditions. + */ + error = tsleep(np, PZERO, "aiosta", aiod_timeout); + num_aio_procs++; + + return error; + +} + +/* + * Try the high-performance physio method for eligible VCHR devices. This + * routine doesn't require the use of any additional threads, and have + * overhead. 
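+ *
+ * A request only qualifies for this path when (see the checks below) the
+ * descriptor refers to a VCHR vnode whose character device also has a
+ * block major (d_bmaj), the device is not a tty, the transfer length is
+ * a multiple of DEV_BSIZE and no larger than MAXPHYS, and the process
+ * still has physio buffers left under kaio_ballowed_count.  Anything
+ * else returns -1 here and is serviced by the AIO daemons instead.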
+ */ +int +aio_qphysio(p, aiocbe) + struct proc *p; + struct aiocblist *aiocbe; +{ + int error; + struct aiocb *cb; + struct file *fp; + struct buf *bp; + int bflags; + struct vnode *vp; + struct kaioinfo *ki; + struct filedesc *fdp; + struct aio_liojob *lj; + int fd; + int majordev; + int s; + int cnt; + dev_t dev; + int rw; + d_strategy_t *fstrategy; + struct cdevsw *cdev; + struct cdevsw *bdev; + + cb = &aiocbe->uaiocb; + fdp = p->p_fd; + fd = cb->aio_fildes; + fp = fdp->fd_ofiles[fd]; + + if (fp->f_type != DTYPE_VNODE) { + return -1; + } + + vp = (struct vnode *)fp->f_data; + if (vp->v_type != VCHR || ((cb->aio_nbytes & (DEV_BSIZE - 1)) != 0)) { + return -1; + } + + if ((cb->aio_nbytes > MAXPHYS) && (num_buf_aio >= max_buf_aio)) { + return -1; + } + + if ((vp->v_specinfo == NULL) || (vp->v_flag & VISTTY)) { + return -1; + } + + majordev = major(vp->v_rdev); + if (majordev == NODEV) { + return -1; + } + + cdev = cdevsw[major(vp->v_rdev)]; + if (cdev == NULL) { + return -1; + } + + if (cdev->d_bmaj == -1) { + return -1; + } + bdev = cdev; + + ki = p->p_aioinfo; + if (ki->kaio_buffer_count >= ki->kaio_ballowed_count) { + return -1; + } + + cnt = cb->aio_nbytes; + if (cnt > MAXPHYS) { + return -1; + } + + dev = makedev(bdev->d_bmaj, minor(vp->v_rdev)); + + /* + * Physical I/O is charged directly to the process, so we don't have + * to fake it. + */ + aiocbe->inputcharge = 0; + aiocbe->outputcharge = 0; + + ki->kaio_buffer_count++; + + lj = aiocbe->lio; + if (lj) { + lj->lioj_buffer_count++; + } + + /* create and build a buffer header for a transfer */ + bp = (struct buf *)getpbuf(NULL); + + /* + * get a copy of the kva from the physical buffer + */ + bp->b_proc = p; + bp->b_dev = dev; + error = bp->b_error = 0; + + if (cb->aio_lio_opcode == LIO_WRITE) { + rw = 0; + bflags = B_WRITE; + } else { + rw = 1; + bflags = B_READ; + } + + bp->b_bcount = cb->aio_nbytes; + bp->b_bufsize = cb->aio_nbytes; + bp->b_flags = B_BUSY | B_PHYS | B_CALL | bflags; + bp->b_iodone = aio_physwakeup; + bp->b_saveaddr = bp->b_data; + bp->b_data = (void *) cb->aio_buf; + bp->b_blkno = btodb(cb->aio_offset); + + if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) { + error = EFAULT; + goto doerror; + } + if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) { + error = EFAULT; + goto doerror; + } + + /* bring buffer into kernel space */ + vmapbuf(bp); + + s = splbio(); + aiocbe->bp = bp; + bp->b_spc = (void *)aiocbe; + TAILQ_INSERT_TAIL(&aio_bufjobs, aiocbe, list); + TAILQ_INSERT_TAIL(&ki->kaio_bufqueue, aiocbe, plist); + aiocbe->jobstate = JOBST_JOBQBUF; + cb->_aiocb_private.status = cb->aio_nbytes; + num_buf_aio++; + fstrategy = bdev->d_strategy; + bp->b_error = 0; + + splx(s); + /* perform transfer */ + (*fstrategy)(bp); + + s = splbio(); + /* + * If we had an error invoking the request, or an error in processing + * the request before we have returned, we process it as an error + * in transfer. Note that such an I/O error is not indicated immediately, + * but is returned using the aio_error mechanism. In this case, aio_suspend + * will return immediately. 
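+ *
+ * (Sketch of what the client then sees: aio_suspend() comes back at once
+ * and the failure is picked up as
+ *
+ *	err = aio_error(&acb);
+ *
+ * which yields the errno value recorded here rather than EINPROGRESS,
+ * after which aio_return(&acb) reaps the request.)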
+ */ + if (bp->b_error || (bp->b_flags & B_ERROR)) { + struct aiocb *job = aiocbe->uuaiocb; + + aiocbe->uaiocb._aiocb_private.status = 0; + suword(&job->_aiocb_private.status, 0); + aiocbe->uaiocb._aiocb_private.error = bp->b_error; + suword(&job->_aiocb_private.error, bp->b_error); + + ki->kaio_buffer_finished_count++; + + if (aiocbe->jobstate != JOBST_JOBBFINISHED) { + aiocbe->jobstate = JOBST_JOBBFINISHED; + aiocbe->jobflags |= AIOCBLIST_DONE; + TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); + TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); + } + } + splx(s); + return 0; + +doerror: + ki->kaio_buffer_count--; + if (lj) { + lj->lioj_buffer_count--; + } + aiocbe->bp = NULL; + relpbuf(bp, NULL); + return error; +} + +/* + * This waits/tests physio completion. + */ +int +aio_fphysio(p, iocb, flgwait) + struct proc *p; + struct aiocblist *iocb; + int flgwait; +{ + int s; + struct buf *bp; + int error; + + bp = iocb->bp; + + s = splbio(); + if (flgwait == 0) { + if ((bp->b_flags & B_DONE) == 0) { + splx(s); + return EINPROGRESS; + } + } + + while ((bp->b_flags & B_DONE) == 0) { + if (tsleep((caddr_t)bp, PRIBIO, "physstr", aiod_timeout)) { + if ((bp->b_flags & B_DONE) == 0) { + splx(s); + return EINPROGRESS; + } else { + break; + } + } + } + + /* release mapping into kernel space */ + vunmapbuf(bp); + iocb->bp = 0; + + error = 0; + /* + * check for an error + */ + if (bp->b_flags & B_ERROR) { + error = bp->b_error; + } + + relpbuf(bp, NULL); + return (error); +} + +/* + * Queue a new AIO request. Choosing either the threaded or direct physio + * VCHR technique is done in this code. + */ +static int +_aio_aqueue(struct proc *p, struct aiocb *job, struct aio_liojob *lj, int type) +{ + struct filedesc *fdp; + struct file *fp; + unsigned int fd; + + int error; + int opcode; + struct aiocblist *aiocbe; + struct aioproclist *aiop; + struct kaioinfo *ki; + + if (aiocbe = TAILQ_FIRST(&aio_freejobs)) { + TAILQ_REMOVE(&aio_freejobs, aiocbe, list); + } else { + aiocbe = zalloc (aiocb_zone); + } + + aiocbe->inputcharge = 0; + aiocbe->outputcharge = 0; + + suword(&job->_aiocb_private.status, -1); + suword(&job->_aiocb_private.error, 0); + suword(&job->_aiocb_private.kernelinfo, -1); + + error = copyin((caddr_t)job, + (caddr_t) &aiocbe->uaiocb, sizeof aiocbe->uaiocb); + if (error) { + suword(&job->_aiocb_private.error, error); + + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + return error; + } + + /* + * Save userspace address of the job info + */ + aiocbe->uuaiocb = job; + + /* + * Get the opcode + */ + if (type != LIO_NOP) { + aiocbe->uaiocb.aio_lio_opcode = type; + } + opcode = aiocbe->uaiocb.aio_lio_opcode; + + /* + * Get the fd info for process + */ + fdp = p->p_fd; + + /* + * Range check file descriptor + */ + fd = aiocbe->uaiocb.aio_fildes; + if (fd >= fdp->fd_nfiles) { + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + if (type == 0) { + suword(&job->_aiocb_private.error, EBADF); + } + return EBADF; + } + + fp = fdp->fd_ofiles[fd]; + if ((fp == NULL) || + ((opcode == LIO_WRITE) && ((fp->f_flag & FWRITE) == 0))) { + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + if (type == 0) { + suword(&job->_aiocb_private.error, EBADF); + } + return EBADF; + } + + if (aiocbe->uaiocb.aio_offset == -1LL) { + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + if (type == 0) { + suword(&job->_aiocb_private.error, EINVAL); + } + return EINVAL; + } + + error = suword(&job->_aiocb_private.kernelinfo, jobrefid); + if (error) { + TAILQ_INSERT_HEAD(&aio_freejobs, 
aiocbe, list); + if (type == 0) { + suword(&job->_aiocb_private.error, EINVAL); + } + return error; + } + + aiocbe->uaiocb._aiocb_private.kernelinfo = (void *)(intptr_t)jobrefid; + if (jobrefid == LONG_MAX) + jobrefid = 1; + else + jobrefid++; + + if (opcode == LIO_NOP) { + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + if (type == 0) { + suword(&job->_aiocb_private.error, 0); + suword(&job->_aiocb_private.status, 0); + suword(&job->_aiocb_private.kernelinfo, 0); + } + return 0; + } + + if ((opcode != LIO_READ) && (opcode != LIO_WRITE)) { + TAILQ_INSERT_HEAD(&aio_freejobs, aiocbe, list); + if (type == 0) { + suword(&job->_aiocb_private.status, 0); + suword(&job->_aiocb_private.error, EINVAL); + } + return EINVAL; + } + + suword(&job->_aiocb_private.error, EINPROGRESS); + aiocbe->uaiocb._aiocb_private.error = EINPROGRESS; + aiocbe->userproc = p; + aiocbe->jobflags = 0; + aiocbe->lio = lj; + ki = p->p_aioinfo; + + if ((error = aio_qphysio(p, aiocbe)) == 0) { + return 0; + } else if (error > 0) { + suword(&job->_aiocb_private.status, 0); + aiocbe->uaiocb._aiocb_private.error = error; + suword(&job->_aiocb_private.error, error); + return error; + } + + /* + * No buffer for daemon I/O + */ + aiocbe->bp = NULL; + + ki->kaio_queue_count++; + if (lj) { + lj->lioj_queue_count++; + } + TAILQ_INSERT_TAIL(&ki->kaio_jobqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&aio_jobs, aiocbe, list); + aiocbe->jobstate = JOBST_JOBQGLOBAL; + + num_queue_count++; + error = 0; + + /* + * If we don't have a free AIO process, and we are below our + * quota, then start one. Otherwise, depend on the subsequent + * I/O completions to pick-up this job. If we don't sucessfully + * create the new process (thread) due to resource issues, we + * return an error for now (EAGAIN), which is likely not the + * correct thing to do. + */ +retryproc: + if (aiop = TAILQ_FIRST(&aio_freeproc)) { + TAILQ_REMOVE(&aio_freeproc, aiop, list); + TAILQ_INSERT_TAIL(&aio_activeproc, aiop, list); + aiop->aioprocflags &= ~AIOP_FREE; + wakeup(aiop->aioproc); + } else if (((num_aio_resv_start + num_aio_procs) < max_aio_procs) && + ((ki->kaio_active_count + num_aio_resv_start) < + ki->kaio_maxactive_count)) { + num_aio_resv_start++; + if ((error = aio_newproc()) == 0) { + num_aio_resv_start--; + p->p_retval[0] = 0; + goto retryproc; + } + num_aio_resv_start--; + } + return error; +} + +/* + * This routine queues an AIO request, checking for quotas. + */ +static int +aio_aqueue(struct proc *p, struct aiocb *job, int type) +{ + struct kaioinfo *ki; + + if (p->p_aioinfo == NULL) { + aio_init_aioinfo(p); + } + + if (num_queue_count >= max_queue_count) + return EAGAIN; + + ki = p->p_aioinfo; + if (ki->kaio_queue_count >= ki->kaio_qallowed_count) + return EAGAIN; + + return _aio_aqueue(p, job, NULL, type); +} + +/* + * Support the aio_return system call, as a side-effect, kernel + * resources are released. 
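+ * Once a job has been reaped here its entry is freed via aio_free_entry(),
+ * so a second aio_return() on the same control block no longer finds a
+ * matching kernelinfo id and fails with EINVAL.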
+ */ +int +aio_return(struct proc *p, struct aio_return_args *uap) +{ + int s; + int jobref; + struct aiocblist *cb, *ncb; + struct aiocb *ujob; + struct kaioinfo *ki; + + ki = p->p_aioinfo; + if (ki == NULL) { + return EINVAL; + } + + ujob = uap->aiocbp; + + jobref = fuword(&ujob->_aiocb_private.kernelinfo); + if (jobref == -1 || jobref == 0) + return EINVAL; + + for (cb = TAILQ_FIRST(&ki->kaio_jobdone); + cb; + cb = TAILQ_NEXT(cb, plist)) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { + if (ujob == cb->uuaiocb) { + p->p_retval[0] = cb->uaiocb._aiocb_private.status; + } else { + p->p_retval[0] = EFAULT; + } + if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { + curproc->p_stats->p_ru.ru_oublock += cb->outputcharge; + cb->outputcharge = 0; + } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { + curproc->p_stats->p_ru.ru_inblock += cb->inputcharge; + cb->inputcharge = 0; + } + aio_free_entry(cb); + return 0; + } + } + + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); + cb; + cb = ncb) { + ncb = TAILQ_NEXT(cb, plist); + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { + splx(s); + if (ujob == cb->uuaiocb) { + p->p_retval[0] = cb->uaiocb._aiocb_private.status; + } else { + p->p_retval[0] = EFAULT; + } + aio_free_entry(cb); + return 0; + } + } + splx(s); + + return (EINVAL); +} + +/* + * Allow a process to wakeup when any of the I/O requests are + * completed. + */ +int +aio_suspend(struct proc *p, struct aio_suspend_args *uap) +{ + struct timeval atv; + struct timespec ts; + struct aiocb *const *cbptr, *cbp; + struct kaioinfo *ki; + struct aiocblist *cb; + int i; + int njoblist; + int error, s, timo; + int *ijoblist; + struct aiocb **ujoblist; + + if (uap->nent >= AIO_LISTIO_MAX) + return EINVAL; + + timo = 0; + if (uap->timeout) { + /* + * Get timespec struct + */ + if (error = copyin((caddr_t) uap->timeout, (caddr_t) &ts, sizeof ts)) { + return error; + } + + if (ts.tv_nsec < 0 || ts.tv_nsec >= 1000000000) + return (EINVAL); + + TIMESPEC_TO_TIMEVAL(&atv, &ts); + if (itimerfix(&atv)) + return (EINVAL); + timo = tvtohz(&atv); + } + + ki = p->p_aioinfo; + if (ki == NULL) + return EAGAIN; + + njoblist = 0; + ijoblist = zalloc(aiol_zone); + ujoblist = zalloc(aiol_zone); + cbptr = uap->aiocbp; + + for(i = 0; i < uap->nent; i++) { + cbp = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]); + if (cbp == 0) + continue; + ujoblist[njoblist] = cbp; + ijoblist[njoblist] = fuword(&cbp->_aiocb_private.kernelinfo); + njoblist++; + } + if (njoblist == 0) { + zfree(aiol_zone, ijoblist); + zfree(aiol_zone, ujoblist); + return 0; + } + + error = 0; + while (1) { + for (cb = TAILQ_FIRST(&ki->kaio_jobdone); + cb; cb = TAILQ_NEXT(cb, plist)) { + for(i = 0; i < njoblist; i++) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == + ijoblist[i]) { + if (ujoblist[i] != cb->uuaiocb) + error = EINVAL; + zfree(aiol_zone, ijoblist); + zfree(aiol_zone, ujoblist); + return error; + } + } + } + + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); + cb; cb = TAILQ_NEXT(cb, plist)) { + for(i = 0; i < njoblist; i++) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == + ijoblist[i]) { + splx(s); + if (ujoblist[i] != cb->uuaiocb) + error = EINVAL; + zfree(aiol_zone, ijoblist); + zfree(aiol_zone, ujoblist); + return error; + } + } + } + + ki->kaio_flags |= KAIO_WAKEUP; + error = tsleep(p, PRIBIO|PCATCH, "aiospn", timo); + splx(s); + + if (error == EINTR) { + zfree(aiol_zone, ijoblist); + zfree(aiol_zone, ujoblist); + return EINTR; + } else if (error == 
EWOULDBLOCK) { + zfree(aiol_zone, ijoblist); + zfree(aiol_zone, ujoblist); + return EAGAIN; + } + } + +/* NOTREACHED */ + return EINVAL; +} + +/* + * aio_cancel at the kernel level is a NOOP right now. It + * might be possible to support it partially in user mode, or + * in kernel mode later on. + */ +int +aio_cancel(struct proc *p, struct aio_cancel_args *uap) +{ + return ENOSYS; +} + +/* + * aio_error is implemented in the kernel level for compatibility + * purposes only. For a user mode async implementation, it would be + * best to do it in a userland subroutine. + */ +int +aio_error(struct proc *p, struct aio_error_args *uap) +{ + int s; + struct aiocblist *cb; + struct kaioinfo *ki; + int jobref; + + ki = p->p_aioinfo; + if (ki == NULL) + return EINVAL; + + jobref = fuword(&uap->aiocbp->_aiocb_private.kernelinfo); + if ((jobref == -1) || (jobref == 0)) + return EINVAL; + + for (cb = TAILQ_FIRST(&ki->kaio_jobdone); + cb; + cb = TAILQ_NEXT(cb, plist)) { + + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { + p->p_retval[0] = cb->uaiocb._aiocb_private.error; + return 0; + } + } + + for (cb = TAILQ_FIRST(&ki->kaio_jobqueue); + cb; + cb = TAILQ_NEXT(cb, plist)) { + + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { + p->p_retval[0] = EINPROGRESS; + return 0; + } + } + + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); + cb; + cb = TAILQ_NEXT(cb, plist)) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { + p->p_retval[0] = cb->uaiocb._aiocb_private.error; + splx(s); + return 0; + } + } + + for (cb = TAILQ_FIRST(&ki->kaio_bufqueue); + cb; + cb = TAILQ_NEXT(cb, plist)) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == jobref) { + p->p_retval[0] = EINPROGRESS; + splx(s); + return 0; + } + } + splx(s); + + + /* + * Hack for lio + */ +/* + status = fuword(&uap->aiocbp->_aiocb_private.status); + if (status == -1) { + return fuword(&uap->aiocbp->_aiocb_private.error); + } +*/ + return EINVAL; +} + +int +aio_read(struct proc *p, struct aio_read_args *uap) +{ + struct filedesc *fdp; + struct file *fp; + struct uio auio; + struct iovec aiov; + unsigned int fd; + int cnt; + struct aiocb iocb; + int error, pmodes; + + pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes); + if ((pmodes & AIO_PMODE_SYNC) == 0) { + return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ); + } + + /* + * Get control block + */ + if (error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) + return error; + + /* + * Get the fd info for process + */ + fdp = p->p_fd; + + /* + * Range check file descriptor + */ + fd = iocb.aio_fildes; + if (fd >= fdp->fd_nfiles) + return EBADF; + fp = fdp->fd_ofiles[fd]; + if ((fp == NULL) || ((fp->f_flag & FREAD) == 0)) + return EBADF; + if (iocb.aio_offset == -1LL) + return EINVAL; + + auio.uio_resid = iocb.aio_nbytes; + if (auio.uio_resid < 0) + return (EINVAL); + + /* + * Process sync simply -- queue async request. 
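+ * (That is, a request without AIO_PMODE_SYNC set is handed to aio_aqueue()
+ * and serviced asynchronously; only the synchronous case falls through to
+ * the in-line read below.)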
+ */ + if ((iocb._aiocb_private.privatemodes & AIO_PMODE_SYNC) == 0) { + return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_READ); + } + + aiov.iov_base = (void *) iocb.aio_buf; + aiov.iov_len = iocb.aio_nbytes; + + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = iocb.aio_offset; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + + cnt = iocb.aio_nbytes; + error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred); + if (error && + (auio.uio_resid != cnt) && + (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) + error = 0; + cnt -= auio.uio_resid; + p->p_retval[0] = cnt; + return error; +} + +int +aio_write(struct proc *p, struct aio_write_args *uap) +{ + struct filedesc *fdp; + struct file *fp; + struct uio auio; + struct iovec aiov; + unsigned int fd; + int cnt; + struct aiocb iocb; + int error; + int pmodes; + + /* + * Process sync simply -- queue async request. + */ + pmodes = fuword(&uap->aiocbp->_aiocb_private.privatemodes); + if ((pmodes & AIO_PMODE_SYNC) == 0) { + return aio_aqueue(p, (struct aiocb *) uap->aiocbp, LIO_WRITE); + } + + if (error = copyin((caddr_t) uap->aiocbp, (caddr_t) &iocb, sizeof iocb)) + return error; + + /* + * Get the fd info for process + */ + fdp = p->p_fd; + + /* + * Range check file descriptor + */ + fd = iocb.aio_fildes; + if (fd >= fdp->fd_nfiles) + return EBADF; + fp = fdp->fd_ofiles[fd]; + if ((fp == NULL) || ((fp->f_flag & FWRITE) == 0)) + return EBADF; + if (iocb.aio_offset == -1LL) + return EINVAL; + + aiov.iov_base = (void *) iocb.aio_buf; + aiov.iov_len = iocb.aio_nbytes; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = iocb.aio_offset; + + auio.uio_resid = iocb.aio_nbytes; + if (auio.uio_resid < 0) + return (EINVAL); + + auio.uio_rw = UIO_WRITE; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + + cnt = iocb.aio_nbytes; + error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred); + if (error) { + if (auio.uio_resid != cnt) { + if (error == ERESTART || error == EINTR || error == EWOULDBLOCK) + error = 0; + if (error == EPIPE) + psignal(p, SIGPIPE); + } + } + cnt -= auio.uio_resid; + p->p_retval[0] = cnt; + return error; +} + +int +lio_listio(struct proc *p, struct lio_listio_args *uap) +{ + int nent, nentqueued; + struct aiocb *iocb, * const *cbptr; + struct aiocblist *cb; + struct kaioinfo *ki; + struct aio_liojob *lj; + int error, runningcode; + int nerror; + int i; + int s; + + if ((uap->mode != LIO_NOWAIT) && (uap->mode != LIO_WAIT)) { + return EINVAL; + } + + nent = uap->nent; + if (nent > AIO_LISTIO_MAX) { + return EINVAL; + } + + if (p->p_aioinfo == NULL) { + aio_init_aioinfo(p); + } + + if ((nent + num_queue_count) > max_queue_count) { + return EAGAIN; + } + + ki = p->p_aioinfo; + if ((nent + ki->kaio_queue_count) > ki->kaio_qallowed_count) { + return EAGAIN; + } + + lj = zalloc(aiolio_zone); + if (!lj) { + return EAGAIN; + } + + lj->lioj_flags = 0; + lj->lioj_buffer_count = 0; + lj->lioj_buffer_finished_count = 0; + lj->lioj_queue_count = 0; + lj->lioj_queue_finished_count = 0; + lj->lioj_ki = ki; + TAILQ_INSERT_TAIL(&ki->kaio_liojoblist, lj, lioj_list); + + /* + * Setup signal + */ + if (uap->sig && (uap->mode == LIO_NOWAIT)) { + error = copyin(uap->sig, &lj->lioj_signal, sizeof lj->lioj_signal); + if (error) + return error; + lj->lioj_flags |= LIOJ_SIGNAL; + lj->lioj_flags &= ~LIOJ_SIGNAL_POSTED; + } else { + lj->lioj_flags &= ~LIOJ_SIGNAL; + } + +/* + * get pointers to the list of I/O requests + */ + + nerror = 0; + nentqueued = 0; + cbptr = 
uap->acb_list; + for(i = 0; i < uap->nent; i++) { + iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]); + if (((intptr_t) iocb != -1) && ((intptr_t) iocb != NULL)) { + error = _aio_aqueue(p, iocb, lj, 0); + if (error == 0) { + nentqueued++; + } else { + nerror++; + } + } + } + + /* + * If we haven't queued any, then just return error + */ + if (nentqueued == 0) { + return 0; + } + + /* + * Calculate the appropriate error return + */ + runningcode = 0; + if (nerror) + runningcode = EIO; + + if (uap->mode == LIO_WAIT) { + while (1) { + int found; + found = 0; + for(i = 0; i < uap->nent; i++) { + int jobref, command; + + /* + * Fetch address of the control buf pointer in user space + */ + iocb = (struct aiocb *) (intptr_t) fuword((caddr_t) &cbptr[i]); + if (((intptr_t) iocb == -1) || ((intptr_t) iocb == 0)) + continue; + + /* + * Fetch the associated command from user space + */ + command = fuword(&iocb->aio_lio_opcode); + if (command == LIO_NOP) { + found++; + continue; + } + + jobref = fuword(&iocb->_aiocb_private.kernelinfo); + + for (cb = TAILQ_FIRST(&ki->kaio_jobdone); + cb; + cb = TAILQ_NEXT(cb, plist)) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + if (cb->uaiocb.aio_lio_opcode == LIO_WRITE) { + curproc->p_stats->p_ru.ru_oublock += + cb->outputcharge; + cb->outputcharge = 0; + } else if (cb->uaiocb.aio_lio_opcode == LIO_READ) { + curproc->p_stats->p_ru.ru_inblock += + cb->inputcharge; + cb->inputcharge = 0; + } + found++; + break; + } + } + + s = splbio(); + for (cb = TAILQ_FIRST(&ki->kaio_bufdone); + cb; + cb = TAILQ_NEXT(cb, plist)) { + if (((intptr_t) cb->uaiocb._aiocb_private.kernelinfo) == + jobref) { + found++; + break; + } + } + splx(s); + + } + + /* + * If all I/Os have been disposed of, then we can return + */ + if (found == nentqueued) { + return runningcode; + } + + ki->kaio_flags |= KAIO_WAKEUP; + error = tsleep(p, PRIBIO|PCATCH, "aiospn", 0); + + if (error == EINTR) { + return EINTR; + } else if (error == EWOULDBLOCK) { + return EAGAIN; + } + + } + } + + return runningcode; +} + +/* + * This is a wierd hack so that we can post a signal. It is safe + * to do so from a timeout routine, but *not* from an interrupt routine. + */ +static void +process_signal(void *ljarg) +{ + struct aio_liojob *lj = ljarg; + if (lj->lioj_signal.sigev_notify == SIGEV_SIGNAL) { + if (lj->lioj_queue_count == lj->lioj_queue_finished_count) { + psignal(lj->lioj_ki->kaio_p, lj->lioj_signal.sigev_signo); + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + } + } +} + +/* + * Interrupt handler for physio, performs the necessary process wakeups, + * and signals. 
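+ * This is called from the buffer's b_iodone hook at interrupt time, which
+ * is why a pending lio completion signal is not posted directly: psignal()
+ * is only safe from a timeout routine in this context, so process_signal()
+ * above is scheduled via timeout() instead.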
+ */ +static void +aio_physwakeup(bp) + struct buf *bp; +{ + struct aiocblist *aiocbe; + struct proc *p; + struct kaioinfo *ki; + struct aio_liojob *lj; + int s; + s = splbio(); + + wakeup((caddr_t) bp); + bp->b_flags &= ~B_CALL; + bp->b_flags |= B_DONE; + + aiocbe = (struct aiocblist *)bp->b_spc; + if (aiocbe) { + p = bp->b_proc; + + aiocbe->jobstate = JOBST_JOBBFINISHED; + aiocbe->uaiocb._aiocb_private.status -= bp->b_resid; + aiocbe->uaiocb._aiocb_private.error = 0; + aiocbe->jobflags |= AIOCBLIST_DONE; + + if (bp->b_flags & B_ERROR) { + aiocbe->uaiocb._aiocb_private.error = bp->b_error; + } + + lj = aiocbe->lio; + if (lj) { + lj->lioj_buffer_finished_count++; + /* + * wakeup/signal if all of the interrupt jobs are done + */ + if (lj->lioj_buffer_finished_count == lj->lioj_buffer_count) { + /* + * post a signal if it is called for + */ + if ((lj->lioj_flags & (LIOJ_SIGNAL|LIOJ_SIGNAL_POSTED)) == + LIOJ_SIGNAL) { + lj->lioj_flags |= LIOJ_SIGNAL_POSTED; + timeout(process_signal, lj, 0); + } + } + } + + ki = p->p_aioinfo; + if (ki) { + ki->kaio_buffer_finished_count++; + TAILQ_REMOVE(&aio_bufjobs, aiocbe, list); + TAILQ_REMOVE(&ki->kaio_bufqueue, aiocbe, plist); + TAILQ_INSERT_TAIL(&ki->kaio_bufdone, aiocbe, plist); + /* + * and do the wakeup + */ + if (ki->kaio_flags & (KAIO_RUNDOWN|KAIO_WAKEUP)) { + ki->kaio_flags &= ~KAIO_WAKEUP; + wakeup(p); + } + } + } + splx(s); +} diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c new file mode 100644 index 0000000..3664ccd --- /dev/null +++ b/sys/kern/vfs_bio.c @@ -0,0 +1,2443 @@ +/* + * Copyright (c) 1994,1997 John S. Dyson + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * + * $Id: vfs_bio.c,v 1.194 1999/01/21 08:29:05 dillon Exp $ + */ + +/* + * this file contains a new buffer I/O scheme implementing a coherent + * VM object and buffer cache scheme. Pains have been taken to make + * sure that the performance degradation associated with schemes such + * as this is not realized. + * + * Author: John S. Dyson + * Significant help during the development and debugging phases + * had been provided by David Greenman, also of the FreeBSD core team. + * + * see man buf(9) for more info. 
+ */ + +#define VMIO +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/vmmeter.h> +#include <sys/lock.h> +#include <miscfs/specfs/specdev.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <vm/vm_kern.h> +#include <vm/vm_pageout.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_map.h> +#include <sys/buf.h> +#include <sys/mount.h> +#include <sys/malloc.h> +#include <sys/resourcevar.h> + +static MALLOC_DEFINE(M_BIOBUF, "BIO buffer", "BIO buffer"); + +struct bio_ops bioops; /* I/O operation notification */ + +#if 0 /* replaced bu sched_sync */ +static void vfs_update __P((void)); +static struct proc *updateproc; +static struct kproc_desc up_kp = { + "update", + vfs_update, + &updateproc +}; +SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) +#endif + +struct buf *buf; /* buffer header pool */ +struct swqueue bswlist; + +static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, + vm_offset_t to); +static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, + vm_offset_t to); +static void vfs_buf_set_valid(struct buf *bp, vm_ooffset_t foff, + vm_offset_t off, vm_offset_t size, + vm_page_t m); +static void vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, + int pageno, vm_page_t m); +static void vfs_clean_pages(struct buf * bp); +static void vfs_setdirty(struct buf *bp); +static void vfs_vmio_release(struct buf *bp); +static void flushdirtybuffers(int slpflag, int slptimeo); + +int needsbuffer; + +/* + * Internal update daemon, process 3 + * The variable vfs_update_wakeup allows for internal syncs. + */ +int vfs_update_wakeup; + + +/* + * buffers base kva + */ + +/* + * bogus page -- for I/O to/from partially complete buffers + * this is a temporary solution to the problem, but it is not + * really that bad. it would be better to split the buffer + * for input in the case of buffers partially already in memory, + * but the code is intricate enough already. 
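+ * (Roughly: when a read is issued on a buffer some of whose pages are
+ * already valid, those pages are swapped for bogus_page around the device
+ * transfer so the driver cannot clobber good data, and the real pages are
+ * put back when the I/O completes.)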
+ */ +vm_page_t bogus_page; +static vm_offset_t bogus_offset; + +static int bufspace, maxbufspace, vmiospace, maxvmiobufspace, + bufmallocspace, maxbufmallocspace; +int numdirtybuffers; +static int lodirtybuffers, hidirtybuffers; +static int numfreebuffers, lofreebuffers, hifreebuffers; +static int kvafreespace; + +SYSCTL_INT(_vfs, OID_AUTO, numdirtybuffers, CTLFLAG_RD, + &numdirtybuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, lodirtybuffers, CTLFLAG_RW, + &lodirtybuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, hidirtybuffers, CTLFLAG_RW, + &hidirtybuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, numfreebuffers, CTLFLAG_RD, + &numfreebuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, lofreebuffers, CTLFLAG_RW, + &lofreebuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, hifreebuffers, CTLFLAG_RW, + &hifreebuffers, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, maxbufspace, CTLFLAG_RW, + &maxbufspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, bufspace, CTLFLAG_RD, + &bufspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, maxvmiobufspace, CTLFLAG_RW, + &maxvmiobufspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, vmiospace, CTLFLAG_RD, + &vmiospace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, maxmallocbufspace, CTLFLAG_RW, + &maxbufmallocspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, bufmallocspace, CTLFLAG_RD, + &bufmallocspace, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, kvafreespace, CTLFLAG_RD, + &kvafreespace, 0, ""); + +static LIST_HEAD(bufhashhdr, buf) bufhashtbl[BUFHSZ], invalhash; +struct bqueues bufqueues[BUFFER_QUEUES] = {0}; + +extern int vm_swap_size; + +#define BUF_MAXUSE 24 + +#define VFS_BIO_NEED_ANY 1 +#define VFS_BIO_NEED_LOWLIMIT 2 +#define VFS_BIO_NEED_FREE 4 + +/* + * Initialize buffer headers and related structures. + */ +void +bufinit() +{ + struct buf *bp; + int i; + + TAILQ_INIT(&bswlist); + LIST_INIT(&invalhash); + + /* first, make a null hash table */ + for (i = 0; i < BUFHSZ; i++) + LIST_INIT(&bufhashtbl[i]); + + /* next, make a null set of free lists */ + for (i = 0; i < BUFFER_QUEUES; i++) + TAILQ_INIT(&bufqueues[i]); + + /* finally, initialize each buffer header and stick on empty q */ + for (i = 0; i < nbuf; i++) { + bp = &buf[i]; + bzero(bp, sizeof *bp); + bp->b_flags = B_INVAL; /* we're just an empty header */ + bp->b_dev = NODEV; + bp->b_rcred = NOCRED; + bp->b_wcred = NOCRED; + bp->b_qindex = QUEUE_EMPTY; + bp->b_xflags = 0; + LIST_INIT(&bp->b_dep); + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + } +/* + * maxbufspace is currently calculated to support all filesystem blocks + * to be 8K. If you happen to use a 16K filesystem, the size of the buffer + * cache is still the same as it would be for 8K filesystems. This + * keeps the size of the buffer cache "in check" for big block filesystems. + */ + maxbufspace = (nbuf + 8) * DFLTBSIZE; +/* + * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed + */ + maxvmiobufspace = 2 * maxbufspace / 3; +/* + * Limit the amount of malloc memory since it is wired permanently into + * the kernel space. Even though this is accounted for in the buffer + * allocation, we don't want the malloced region to grow uncontrolled. + * The malloc scheme improves memory utilization significantly on average + * (small) directories. + */ + maxbufmallocspace = maxbufspace / 20; + +/* + * Remove the probability of deadlock conditions by limiting the + * number of dirty buffers. 
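+ * For example, with the formulas below and nbuf = 1024, delayed writes
+ * start triggering flushdirtybuffers() at hidirtybuffers = 148 dirty
+ * buffers, with lodirtybuffers = 74 as the corresponding low-water mark.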
+ */ + hidirtybuffers = nbuf / 8 + 20; + lodirtybuffers = nbuf / 16 + 10; + numdirtybuffers = 0; + lofreebuffers = nbuf / 18 + 5; + hifreebuffers = 2 * lofreebuffers; + numfreebuffers = nbuf; + kvafreespace = 0; + + bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); + bogus_page = vm_page_alloc(kernel_object, + ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), + VM_ALLOC_NORMAL); + +} + +/* + * Free the kva allocation for a buffer + * Must be called only at splbio or higher, + * as this is the only locking for buffer_map. + */ +static void +bfreekva(struct buf * bp) +{ + if (bp->b_kvasize == 0) + return; + + vm_map_delete(buffer_map, + (vm_offset_t) bp->b_kvabase, + (vm_offset_t) bp->b_kvabase + bp->b_kvasize); + + bp->b_kvasize = 0; + +} + +/* + * remove the buffer from the appropriate free list + */ +void +bremfree(struct buf * bp) +{ + int s = splbio(); + + if (bp->b_qindex != QUEUE_NONE) { + if (bp->b_qindex == QUEUE_EMPTY) { + kvafreespace -= bp->b_kvasize; + } + TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); + bp->b_qindex = QUEUE_NONE; + } else { +#if !defined(MAX_PERF) + panic("bremfree: removing a buffer when not on a queue"); +#endif + } + if ((bp->b_flags & B_INVAL) || + (bp->b_flags & (B_DELWRI|B_LOCKED)) == 0) + --numfreebuffers; + splx(s); +} + + +/* + * Get a buffer with the specified data. Look in the cache first. + */ +int +bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, + struct buf ** bpp) +{ + struct buf *bp; + + bp = getblk(vp, blkno, size, 0, 0); + *bpp = bp; + + /* if not found in cache, do some I/O */ + if ((bp->b_flags & B_CACHE) == 0) { + if (curproc != NULL) + curproc->p_stats->p_ru.ru_inblock++; + bp->b_flags |= B_READ; + bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + if (bp->b_rcred == NOCRED) { + if (cred != NOCRED) + crhold(cred); + bp->b_rcred = cred; + } + vfs_busy_pages(bp, 0); + VOP_STRATEGY(vp, bp); + return (biowait(bp)); + } + return (0); +} + +/* + * Operates like bread, but also starts asynchronous I/O on + * read-ahead blocks. + */ +int +breadn(struct vnode * vp, daddr_t blkno, int size, + daddr_t * rablkno, int *rabsize, + int cnt, struct ucred * cred, struct buf ** bpp) +{ + struct buf *bp, *rabp; + int i; + int rv = 0, readwait = 0; + + *bpp = bp = getblk(vp, blkno, size, 0, 0); + + /* if not found in cache, do some I/O */ + if ((bp->b_flags & B_CACHE) == 0) { + if (curproc != NULL) + curproc->p_stats->p_ru.ru_inblock++; + bp->b_flags |= B_READ; + bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + if (bp->b_rcred == NOCRED) { + if (cred != NOCRED) + crhold(cred); + bp->b_rcred = cred; + } + vfs_busy_pages(bp, 0); + VOP_STRATEGY(vp, bp); + ++readwait; + } + for (i = 0; i < cnt; i++, rablkno++, rabsize++) { + if (inmem(vp, *rablkno)) + continue; + rabp = getblk(vp, *rablkno, *rabsize, 0, 0); + + if ((rabp->b_flags & B_CACHE) == 0) { + if (curproc != NULL) + curproc->p_stats->p_ru.ru_inblock++; + rabp->b_flags |= B_READ | B_ASYNC; + rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + if (rabp->b_rcred == NOCRED) { + if (cred != NOCRED) + crhold(cred); + rabp->b_rcred = cred; + } + vfs_busy_pages(rabp, 0); + VOP_STRATEGY(vp, rabp); + } else { + brelse(rabp); + } + } + + if (readwait) { + rv = biowait(bp); + } + return (rv); +} + +/* + * Write, release buffer on completion. (Done by iodone + * if async.) 
+ */ +int +bwrite(struct buf * bp) +{ + int oldflags, s; + struct vnode *vp; + struct mount *mp; + + + if (bp->b_flags & B_INVAL) { + brelse(bp); + return (0); + } + + oldflags = bp->b_flags; + +#if !defined(MAX_PERF) + if ((bp->b_flags & B_BUSY) == 0) + panic("bwrite: buffer is not busy???"); +#endif + + bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); + bp->b_flags |= B_WRITEINPROG; + + s = splbio(); + if ((oldflags & B_DELWRI) == B_DELWRI) { + --numdirtybuffers; + reassignbuf(bp, bp->b_vp); + } + + bp->b_vp->v_numoutput++; + vfs_busy_pages(bp, 1); + if (curproc != NULL) + curproc->p_stats->p_ru.ru_oublock++; + splx(s); + VOP_STRATEGY(bp->b_vp, bp); + + /* + * Collect statistics on synchronous and asynchronous writes. + * Writes to block devices are charged to their associated + * filesystem (if any). + */ + if ((vp = bp->b_vp) != NULL) { + if (vp->v_type == VBLK) + mp = vp->v_specmountpoint; + else + mp = vp->v_mount; + if (mp != NULL) + if ((oldflags & B_ASYNC) == 0) + mp->mnt_stat.f_syncwrites++; + else + mp->mnt_stat.f_asyncwrites++; + } + + if ((oldflags & B_ASYNC) == 0) { + int rtval = biowait(bp); + brelse(bp); + return (rtval); + } + return (0); +} + +void +vfs_bio_need_satisfy(void) { + ++numfreebuffers; + if (!needsbuffer) + return; + if (numdirtybuffers < lodirtybuffers) { + needsbuffer &= ~(VFS_BIO_NEED_ANY | VFS_BIO_NEED_LOWLIMIT); + } else { + needsbuffer &= ~VFS_BIO_NEED_ANY; + } + if (numfreebuffers >= hifreebuffers) { + needsbuffer &= ~VFS_BIO_NEED_FREE; + } + wakeup(&needsbuffer); +} + +/* + * Delayed write. (Buffer is marked dirty). + */ +void +bdwrite(struct buf * bp) +{ + struct vnode *vp; + +#if !defined(MAX_PERF) + if ((bp->b_flags & B_BUSY) == 0) { + panic("bdwrite: buffer is not busy"); + } +#endif + + if (bp->b_flags & B_INVAL) { + brelse(bp); + return; + } + bp->b_flags &= ~(B_READ|B_RELBUF); + if ((bp->b_flags & B_DELWRI) == 0) { + bp->b_flags |= B_DONE | B_DELWRI; + reassignbuf(bp, bp->b_vp); + ++numdirtybuffers; + } + + /* + * This bmap keeps the system from needing to do the bmap later, + * perhaps when the system is attempting to do a sync. Since it + * is likely that the indirect block -- or whatever other datastructure + * that the filesystem needs is still in memory now, it is a good + * thing to do this. Note also, that if the pageout daemon is + * requesting a sync -- there might not be enough memory to do + * the bmap then... So, this is important to do. + */ + if (bp->b_lblkno == bp->b_blkno) { + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); + } + + /* + * Set the *dirty* buffer range based upon the VM system dirty pages. + */ + vfs_setdirty(bp); + + /* + * We need to do this here to satisfy the vnode_pager and the + * pageout daemon, so that it thinks that the pages have been + * "cleaned". Note that since the pages are in a delayed write + * buffer -- the VFS layer "will" see that the pages get written + * out on the next sync, or perhaps the cluster will be completed. + */ + vfs_clean_pages(bp); + bqrelse(bp); + + /* + * XXX The soft dependency code is not prepared to + * have I/O done when a bdwrite is requested. For + * now we just let the write be delayed if it is + * requested by the soft dependency code. 
+ */ + if ((vp = bp->b_vp) && + ((vp->v_type == VBLK && vp->v_specmountpoint && + (vp->v_specmountpoint->mnt_flag & MNT_SOFTDEP)) || + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP)))) + return; + + if (numdirtybuffers >= hidirtybuffers) + flushdirtybuffers(0, 0); + + return; +} + + +/* + * Same as first half of bdwrite, mark buffer dirty, but do not release it. + * Check how this compares with vfs_setdirty(); XXX [JRE] + */ +void +bdirty(bp) + struct buf *bp; +{ + + bp->b_flags &= ~(B_READ|B_RELBUF); /* XXX ??? check this */ + if ((bp->b_flags & B_DELWRI) == 0) { + bp->b_flags |= B_DONE | B_DELWRI; /* why done? XXX JRE */ + reassignbuf(bp, bp->b_vp); + ++numdirtybuffers; + } +} + +/* + * Asynchronous write. + * Start output on a buffer, but do not wait for it to complete. + * The buffer is released when the output completes. + */ +void +bawrite(struct buf * bp) +{ + bp->b_flags |= B_ASYNC; + (void) VOP_BWRITE(bp); +} + +/* + * Ordered write. + * Start output on a buffer, and flag it so that the device will write + * it in the order it was queued. The buffer is released when the output + * completes. + */ +int +bowrite(struct buf * bp) +{ + bp->b_flags |= B_ORDERED|B_ASYNC; + return (VOP_BWRITE(bp)); +} + +/* + * Release a buffer. + */ +void +brelse(struct buf * bp) +{ + int s; + + if (bp->b_flags & B_CLUSTER) { + relpbuf(bp, NULL); + return; + } + + s = splbio(); + + /* anyone need this block? */ + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~(B_WANTED | B_AGE); + wakeup(bp); + } + + if (bp->b_flags & B_LOCKED) + bp->b_flags &= ~B_ERROR; + + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_FREEBUF)) || + (bp->b_bufsize <= 0)) { + bp->b_flags |= B_INVAL; + if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_deallocate) + (*bioops.io_deallocate)(bp); + if (bp->b_flags & B_DELWRI) + --numdirtybuffers; + bp->b_flags &= ~(B_DELWRI | B_CACHE | B_FREEBUF); + if ((bp->b_flags & B_VMIO) == 0) { + if (bp->b_bufsize) + allocbuf(bp, 0); + if (bp->b_vp) + brelvp(bp); + } + } + + /* + * We must clear B_RELBUF if B_DELWRI is set. If vfs_vmio_release() + * is called with B_DELWRI set, the underlying pages may wind up + * getting freed causing a previous write (bdwrite()) to get 'lost' + * because pages associated with a B_DELWRI bp are marked clean. + * + * We still allow the B_INVAL case to call vfs_vmio_release(), even + * if B_DELWRI is set. + */ + + if (bp->b_flags & B_DELWRI) + bp->b_flags &= ~B_RELBUF; + + /* + * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer + * constituted, so the B_INVAL flag is used to *invalidate* the buffer, + * but the VM object is kept around. The B_NOCACHE flag is used to + * invalidate the pages in the VM object. + * + * The b_{validoff,validend,dirtyoff,dirtyend} values are relative + * to b_offset and currently have byte granularity, whereas the + * valid flags in the vm_pages have only DEV_BSIZE resolution. + * The byte resolution fields are used to avoid unnecessary re-reads + * of the buffer but the code really needs to be genericized so + * other filesystem modules can take advantage of these fields. + * + * XXX this seems to cause performance problems. 
+ */ + if ((bp->b_flags & B_VMIO) + && !(bp->b_vp->v_tag == VT_NFS && + bp->b_vp->v_type != VBLK && + (bp->b_flags & B_DELWRI) != 0) +#ifdef notdef + && (bp->b_vp->v_tag != VT_NFS + || bp->b_vp->v_type == VBLK + || (bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) + || bp->b_validend == 0 + || (bp->b_validoff == 0 + && bp->b_validend == bp->b_bufsize)) +#endif + ) { + + int i, j, resid; + vm_page_t m; + off_t foff; + vm_pindex_t poff; + vm_object_t obj; + struct vnode *vp; + + vp = bp->b_vp; + + /* + * Get the base offset and length of the buffer. Note that + * for block sizes that are less then PAGE_SIZE, the b_data + * base of the buffer does not represent exactly b_offset and + * neither b_offset nor b_size are necessarily page aligned. + * Instead, the starting position of b_offset is: + * + * b_data + (b_offset & PAGE_MASK) + * + * block sizes less then DEV_BSIZE (usually 512) are not + * supported due to the page granularity bits (m->valid, + * m->dirty, etc...). + * + * See man buf(9) for more information + */ + + resid = bp->b_bufsize; + foff = bp->b_offset; + + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + vm_page_flag_clear(m, PG_ZERO); + if (m == bogus_page) { + + obj = (vm_object_t) vp->v_object; + poff = OFF_TO_IDX(bp->b_offset); + + for (j = i; j < bp->b_npages; j++) { + m = bp->b_pages[j]; + if (m == bogus_page) { + m = vm_page_lookup(obj, poff + j); +#if !defined(MAX_PERF) + if (!m) { + panic("brelse: page missing\n"); + } +#endif + bp->b_pages[j] = m; + } + } + + if ((bp->b_flags & B_INVAL) == 0) { + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } + } + if (bp->b_flags & (B_NOCACHE|B_ERROR)) { + int poffset = foff & PAGE_MASK; + int presid = resid > (PAGE_SIZE - poffset) ? + (PAGE_SIZE - poffset) : resid; + + KASSERT(presid >= 0, ("brelse: extra page")); + vm_page_set_invalid(m, poffset, presid); + } + resid -= PAGE_SIZE - (foff & PAGE_MASK); + foff = (foff + PAGE_SIZE) & ~PAGE_MASK; + } + + if (bp->b_flags & (B_INVAL | B_RELBUF)) + vfs_vmio_release(bp); + + } else if (bp->b_flags & B_VMIO) { + + if (bp->b_flags & (B_INVAL | B_RELBUF)) + vfs_vmio_release(bp); + + } + +#if !defined(MAX_PERF) + if (bp->b_qindex != QUEUE_NONE) + panic("brelse: free buffer onto another queue???"); +#endif + + /* enqueue */ + /* buffers with no memory */ + if (bp->b_bufsize == 0) { + bp->b_flags |= B_INVAL; + bp->b_qindex = QUEUE_EMPTY; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + bp->b_dev = NODEV; + kvafreespace += bp->b_kvasize; + + /* buffers with junk contents */ + } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { + bp->b_flags |= B_INVAL; + bp->b_qindex = QUEUE_AGE; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + bp->b_dev = NODEV; + + /* buffers that are locked */ + } else if (bp->b_flags & B_LOCKED) { + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + + /* buffers with stale but valid contents */ + } else if (bp->b_flags & B_AGE) { + bp->b_qindex = QUEUE_AGE; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist); + + /* buffers with valid and quite potentially reuseable contents */ + } else { + bp->b_qindex = QUEUE_LRU; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + } + + if ((bp->b_flags & B_INVAL) || + (bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) { + if (bp->b_flags & B_DELWRI) { + 
--numdirtybuffers; + bp->b_flags &= ~B_DELWRI; + } + vfs_bio_need_satisfy(); + } + + /* unlock */ + bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY | + B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + splx(s); +} + +/* + * Release a buffer. + */ +void +bqrelse(struct buf * bp) +{ + int s; + + s = splbio(); + + /* anyone need this block? */ + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~(B_WANTED | B_AGE); + wakeup(bp); + } + +#if !defined(MAX_PERF) + if (bp->b_qindex != QUEUE_NONE) + panic("bqrelse: free buffer onto another queue???"); +#endif + + if (bp->b_flags & B_LOCKED) { + bp->b_flags &= ~B_ERROR; + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + /* buffers with stale but valid contents */ + } else { + bp->b_qindex = QUEUE_LRU; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + } + + if ((bp->b_flags & (B_LOCKED|B_DELWRI)) == 0) { + vfs_bio_need_satisfy(); + } + + /* unlock */ + bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY | + B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + splx(s); +} + +static void +vfs_vmio_release(bp) + struct buf *bp; +{ + int i, s; + vm_page_t m; + + s = splvm(); + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + bp->b_pages[i] = NULL; + /* + * In order to keep page LRU ordering consistent, put + * everything on the inactive queue. + */ + vm_page_unwire(m, 0); + /* + * We don't mess with busy pages, it is + * the responsibility of the process that + * busied the pages to deal with them. + */ + if ((m->flags & PG_BUSY) || (m->busy != 0)) + continue; + + if (m->wire_count == 0) { + vm_page_flag_clear(m, PG_ZERO); + /* + * Might as well free the page if we can and it has + * no valid data. + */ + if ((bp->b_flags & B_ASYNC) == 0 && !m->valid && m->hold_count == 0) { + vm_page_busy(m); + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + } + } + } + splx(s); + bufspace -= bp->b_bufsize; + vmiospace -= bp->b_bufsize; + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + bp->b_npages = 0; + bp->b_bufsize = 0; + bp->b_flags &= ~B_VMIO; + if (bp->b_vp) + brelvp(bp); +} + +/* + * Check to see if a block is currently memory resident. + */ +struct buf * +gbincore(struct vnode * vp, daddr_t blkno) +{ + struct buf *bp; + struct bufhashhdr *bh; + + bh = BUFHASH(vp, blkno); + bp = bh->lh_first; + + /* Search hash chain */ + while (bp != NULL) { + /* hit */ + if (bp->b_vp == vp && bp->b_lblkno == blkno && + (bp->b_flags & B_INVAL) == 0) { + break; + } + bp = bp->b_hash.le_next; + } + return (bp); +} + +/* + * this routine implements clustered async writes for + * clearing out B_DELWRI buffers... This is much better + * than the old way of writing only one buffer at a time. 
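A worked example of the contiguity test the clustering code below applies: logical block lbn+i may join the cluster only when its disk address sits exactly i filesystem blocks past the first buffer's disk address. A minimal user-space sketch, assuming 512-byte device blocks (DEV_BSHIFT of 9) and an 8K f_iosize, both illustrative:

#include <stdio.h>

#define DEV_BSHIFT      9               /* assumption: 512-byte device blocks */

static int
contiguous(long bp_blkno, long bpa_blkno, int i, int size)
{
        /* i filesystem blocks is (i * size) >> DEV_BSHIFT device blocks */
        return (bpa_blkno == bp_blkno + ((long)(i * size) >> DEV_BSHIFT));
}

int
main(void)
{
        int size = 8192;                /* illustrative f_iosize */

        printf("%d\n", contiguous(100, 116, 1, size));  /* 1: adjacent on disk */
        printf("%d\n", contiguous(100, 120, 1, size));  /* 0: hole, stop the cluster */
        return (0);
}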
+ */ +int +vfs_bio_awrite(struct buf * bp) +{ + int i; + daddr_t lblkno = bp->b_lblkno; + struct vnode *vp = bp->b_vp; + int s; + int ncl; + struct buf *bpa; + int nwritten; + int size; + int maxcl; + + s = splbio(); + /* + * right now we support clustered writing only to regular files + */ + if ((vp->v_type == VREG) && + (vp->v_mount != 0) && /* Only on nodes that have the size info */ + (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { + + size = vp->v_mount->mnt_stat.f_iosize; + maxcl = MAXPHYS / size; + + for (i = 1; i < maxcl; i++) { + if ((bpa = gbincore(vp, lblkno + i)) && + ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) == + (B_DELWRI | B_CLUSTEROK)) && + (bpa->b_bufsize == size)) { + if ((bpa->b_blkno == bpa->b_lblkno) || + (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT))) + break; + } else { + break; + } + } + ncl = i; + /* + * this is a possible cluster write + */ + if (ncl != 1) { + nwritten = cluster_wbuild(vp, size, lblkno, ncl); + splx(s); + return nwritten; + } + } + + bremfree(bp); + bp->b_flags |= B_BUSY | B_ASYNC; + + splx(s); + /* + * default (old) behavior, writing out only one block + */ + nwritten = bp->b_bufsize; + (void) VOP_BWRITE(bp); + return nwritten; +} + + +/* + * Find a buffer header which is available for use. + */ +static struct buf * +getnewbuf(struct vnode *vp, daddr_t blkno, + int slpflag, int slptimeo, int size, int maxsize) +{ + struct buf *bp, *bp1; + int nbyteswritten = 0; + vm_offset_t addr; + static int writerecursion = 0; + +start: + if (bufspace >= maxbufspace) + goto trytofreespace; + + /* can we constitute a new buffer? */ + if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) { +#if !defined(MAX_PERF) + if (bp->b_qindex != QUEUE_EMPTY) + panic("getnewbuf: inconsistent EMPTY queue, qindex=%d", + bp->b_qindex); +#endif + bp->b_flags |= B_BUSY; + bremfree(bp); + goto fillbuf; + } +trytofreespace: + /* + * We keep the file I/O from hogging metadata I/O + * This is desirable because file data is cached in the + * VM/Buffer cache even if a buffer is freed. + */ + if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) { +#if !defined(MAX_PERF) + if (bp->b_qindex != QUEUE_AGE) + panic("getnewbuf: inconsistent AGE queue, qindex=%d", + bp->b_qindex); +#endif + } else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) { +#if !defined(MAX_PERF) + if (bp->b_qindex != QUEUE_LRU) + panic("getnewbuf: inconsistent LRU queue, qindex=%d", + bp->b_qindex); +#endif + } + if (!bp) { + /* wait for a free buffer of any kind */ + needsbuffer |= VFS_BIO_NEED_ANY; + do + tsleep(&needsbuffer, (PRIBIO + 4) | slpflag, "newbuf", + slptimeo); + while (needsbuffer & VFS_BIO_NEED_ANY); + return (0); + } + KASSERT(!(bp->b_flags & B_BUSY), + ("getnewbuf: busy buffer on free list\n")); + /* + * We are fairly aggressive about freeing VMIO buffers, but since + * the buffering is intact without buffer headers, there is not + * much loss. We gain by maintaining non-VMIOed metadata in buffers. 
+ */ + if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) { + if ((bp->b_flags & B_VMIO) == 0 || + (vmiospace < maxvmiobufspace)) { + --bp->b_usecount; + TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); + if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) { + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + goto start; + } + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + } + } + + + /* if we are a delayed write, convert to an async write */ + if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { + + /* + * If our delayed write is likely to be used soon, then + * recycle back onto the LRU queue. + */ + if (vp && (bp->b_vp == vp) && (bp->b_qindex == QUEUE_LRU) && + (bp->b_lblkno >= blkno) && (maxsize > 0)) { + + if (bp->b_usecount > 0) { + if (bp->b_lblkno < blkno + (MAXPHYS / maxsize)) { + + TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); + + if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) { + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + bp->b_usecount--; + goto start; + } + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + } + } + } + + /* + * Certain layered filesystems can recursively re-enter the vfs_bio + * code, due to delayed writes. This helps keep the system from + * deadlocking. + */ + if (writerecursion > 0) { + if (writerecursion > 5) { + bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); + while (bp) { + if ((bp->b_flags & B_DELWRI) == 0) + break; + bp = TAILQ_NEXT(bp, b_freelist); + } + if (bp == NULL) { + bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]); + while (bp) { + if ((bp->b_flags & B_DELWRI) == 0) + break; + bp = TAILQ_NEXT(bp, b_freelist); + } + } + if (bp == NULL) + panic("getnewbuf: cannot get buffer, infinite recursion failure"); + } else { + bremfree(bp); + bp->b_flags |= B_BUSY | B_AGE | B_ASYNC; + nbyteswritten += bp->b_bufsize; + ++writerecursion; + VOP_BWRITE(bp); + --writerecursion; + if (!slpflag && !slptimeo) { + return (0); + } + goto start; + } + } else { + ++writerecursion; + nbyteswritten += vfs_bio_awrite(bp); + --writerecursion; + if (!slpflag && !slptimeo) { + return (0); + } + goto start; + } + } + + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~B_WANTED; + wakeup(bp); + } + bremfree(bp); + bp->b_flags |= B_BUSY; + + if (bp->b_flags & B_VMIO) { + bp->b_flags &= ~B_ASYNC; + vfs_vmio_release(bp); + } + + if (bp->b_vp) + brelvp(bp); + +fillbuf: + + /* we are not free, nor do we contain interesting data */ + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + if (LIST_FIRST(&bp->b_dep) != NULL && + bioops.io_deallocate) + (*bioops.io_deallocate)(bp); + + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + if (bp->b_bufsize) { + allocbuf(bp, 0); + } + bp->b_flags = B_BUSY; + bp->b_dev = NODEV; + bp->b_vp = NULL; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_offset = NOOFFSET; + bp->b_iodone = 0; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_npages = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_validoff = bp->b_validend = 0; + bp->b_usecount = 5; + /* Here, not kern_physio.c, is where this should be done*/ + LIST_INIT(&bp->b_dep); + + maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK; + + /* + * we assume that buffer_map is not at address 0 + */ + addr = 0; + if (maxsize != bp->b_kvasize) { + bfreekva(bp); + +findkvaspace: + /* + * See if we have buffer kva space + */ + if (vm_map_findspace(buffer_map, + vm_map_min(buffer_map), maxsize, &addr)) { + if (kvafreespace 
> 0) { + int totfree = 0, freed; + do { + freed = 0; + for (bp1 = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]); + bp1 != NULL; bp1 = TAILQ_NEXT(bp1, b_freelist)) { + if (bp1->b_kvasize != 0) { + totfree += bp1->b_kvasize; + freed = bp1->b_kvasize; + bremfree(bp1); + bfreekva(bp1); + brelse(bp1); + break; + } + } + } while (freed); + /* + * if we found free space, then retry with the same buffer. + */ + if (totfree) + goto findkvaspace; + } + bp->b_flags |= B_INVAL; + brelse(bp); + goto trytofreespace; + } + } + + /* + * See if we are below are allocated minimum + */ + if (bufspace >= (maxbufspace + nbyteswritten)) { + bp->b_flags |= B_INVAL; + brelse(bp); + goto trytofreespace; + } + + /* + * create a map entry for the buffer -- in essence + * reserving the kva space. + */ + if (addr) { + vm_map_insert(buffer_map, NULL, 0, + addr, addr + maxsize, + VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); + + bp->b_kvabase = (caddr_t) addr; + bp->b_kvasize = maxsize; + } + bp->b_data = bp->b_kvabase; + + return (bp); +} + +static void +waitfreebuffers(int slpflag, int slptimeo) { + while (numfreebuffers < hifreebuffers) { + flushdirtybuffers(slpflag, slptimeo); + if (numfreebuffers < hifreebuffers) + break; + needsbuffer |= VFS_BIO_NEED_FREE; + if (tsleep(&needsbuffer, (PRIBIO + 4)|slpflag, "biofre", slptimeo)) + break; + } +} + +static void +flushdirtybuffers(int slpflag, int slptimeo) { + int s; + static pid_t flushing = 0; + + s = splbio(); + + if (flushing) { + if (flushing == curproc->p_pid) { + splx(s); + return; + } + while (flushing) { + if (tsleep(&flushing, (PRIBIO + 4)|slpflag, "biofls", slptimeo)) { + splx(s); + return; + } + } + } + flushing = curproc->p_pid; + + while (numdirtybuffers > lodirtybuffers) { + struct buf *bp; + needsbuffer |= VFS_BIO_NEED_LOWLIMIT; + bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]); + if (bp == NULL) + bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]); + + while (bp && ((bp->b_flags & B_DELWRI) == 0)) { + bp = TAILQ_NEXT(bp, b_freelist); + } + + if (bp) { + vfs_bio_awrite(bp); + continue; + } + break; + } + + flushing = 0; + wakeup(&flushing); + splx(s); +} + +/* + * Check to see if a block is currently memory resident. + */ +struct buf * +incore(struct vnode * vp, daddr_t blkno) +{ + struct buf *bp; + + int s = splbio(); + bp = gbincore(vp, blkno); + splx(s); + return (bp); +} + +/* + * Returns true if no I/O is needed to access the + * associated VM object. This is like incore except + * it also hunts around in the VM system for the data. + */ + +int +inmem(struct vnode * vp, daddr_t blkno) +{ + vm_object_t obj; + vm_offset_t toff, tinc, size; + vm_page_t m; + vm_ooffset_t off; + + if (incore(vp, blkno)) + return 1; + if (vp->v_mount == NULL) + return 0; + if ((vp->v_object == NULL) || (vp->v_flag & VOBJBUF) == 0) + return 0; + + obj = vp->v_object; + size = PAGE_SIZE; + if (size > vp->v_mount->mnt_stat.f_iosize) + size = vp->v_mount->mnt_stat.f_iosize; + off = (vm_ooffset_t)blkno * (vm_ooffset_t)vp->v_mount->mnt_stat.f_iosize; + + for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { + m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); + if (!m) + return 0; + tinc = size; + if (tinc > PAGE_SIZE - ((toff + off) & PAGE_MASK)) + tinc = PAGE_SIZE - ((toff + off) & PAGE_MASK); + if (vm_page_is_valid(m, + (vm_offset_t) ((toff + off) & PAGE_MASK), tinc) == 0) + return 0; + } + return 1; +} + +/* + * now we set the dirty range for the buffer -- + * for NFS -- if the file is mapped and pages have + * been written to, let it know. 
We want the + * entire range of the buffer to be marked dirty if + * any of the pages have been written to for consistancy + * with the b_validoff, b_validend set in the nfs write + * code, and used by the nfs read code. + */ +static void +vfs_setdirty(struct buf *bp) { + int i; + vm_object_t object; + vm_offset_t boffset; +#if 0 + vm_offset_t offset; +#endif + + /* + * We qualify the scan for modified pages on whether the + * object has been flushed yet. The OBJ_WRITEABLE flag + * is not cleared simply by protecting pages off. + */ + if ((bp->b_flags & B_VMIO) && + ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) { + /* + * test the pages to see if they have been modified directly + * by users through the VM system. + */ + for (i = 0; i < bp->b_npages; i++) { + vm_page_flag_clear(bp->b_pages[i], PG_ZERO); + vm_page_test_dirty(bp->b_pages[i]); + } + + /* + * scan forwards for the first page modified + */ + for (i = 0; i < bp->b_npages; i++) { + if (bp->b_pages[i]->dirty) { + break; + } + } + + boffset = (i << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); + if (boffset < bp->b_dirtyoff) { + bp->b_dirtyoff = max(boffset, 0); + } + + /* + * scan backwards for the last page modified + */ + for (i = bp->b_npages - 1; i >= 0; --i) { + if (bp->b_pages[i]->dirty) { + break; + } + } + boffset = (i + 1); +#if 0 + offset = boffset + bp->b_pages[0]->pindex; + if (offset >= object->size) + boffset = object->size - bp->b_pages[0]->pindex; +#endif + boffset = (boffset << PAGE_SHIFT) - (bp->b_offset & PAGE_MASK); + if (bp->b_dirtyend < boffset) + bp->b_dirtyend = min(boffset, bp->b_bufsize); + } +} + +/* + * Get a block given a specified block and offset into a file/device. + */ +struct buf * +getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) +{ + struct buf *bp; + int i, s; + struct bufhashhdr *bh; + +#if !defined(MAX_PERF) + if (size > MAXBSIZE) + panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); +#endif + + s = splbio(); +loop: + if (numfreebuffers < lofreebuffers) { + waitfreebuffers(slpflag, slptimeo); + } + + if ((bp = gbincore(vp, blkno))) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + if (bp->b_usecount < BUF_MAXUSE) + ++bp->b_usecount; + + if (!tsleep(bp, + (PRIBIO + 4) | slpflag, "getblk", slptimeo)) { + goto loop; + } + + splx(s); + return (struct buf *) NULL; + } + bp->b_flags |= B_BUSY | B_CACHE; + bremfree(bp); + + /* + * check for size inconsistancies for non-VMIO case. + */ + + if (bp->b_bcount != size) { + if ((bp->b_flags & B_VMIO) == 0 || + (size > bp->b_kvasize) + ) { + if (bp->b_flags & B_DELWRI) { + bp->b_flags |= B_NOCACHE; + VOP_BWRITE(bp); + } else { + if ((bp->b_flags & B_VMIO) && + (LIST_FIRST(&bp->b_dep) == NULL)) { + bp->b_flags |= B_RELBUF; + brelse(bp); + } else { + bp->b_flags |= B_NOCACHE; + VOP_BWRITE(bp); + } + } + goto loop; + } + } + + /* + * If the size is inconsistant in the VMIO case, we can resize + * the buffer. This might lead to B_CACHE getting cleared. + */ + + if (bp->b_bcount != size) + allocbuf(bp, size); + + KASSERT(bp->b_offset != NOOFFSET, + ("getblk: no buffer offset")); + + /* + * Check that the constituted buffer really deserves for the + * B_CACHE bit to be set. B_VMIO type buffers might not + * contain fully valid pages. Normal (old-style) buffers + * should be fully valid. This might also lead to B_CACHE + * getting clear. 
+ */ + if ((bp->b_flags & B_VMIO|B_CACHE) == (B_VMIO|B_CACHE)) { + int checksize = bp->b_bufsize; + int poffset = bp->b_offset & PAGE_MASK; + int resid; + for (i = 0; i < bp->b_npages; i++) { + resid = (checksize > (PAGE_SIZE - poffset)) ? + (PAGE_SIZE - poffset) : checksize; + if (!vm_page_is_valid(bp->b_pages[i], poffset, resid)) { + bp->b_flags &= ~(B_CACHE | B_DONE); + break; + } + checksize -= resid; + poffset = 0; + } + } + + /* + * If B_DELWRI is set and B_CACHE got cleared ( or was + * already clear ), we have to commit the write and + * retry. The NFS code absolutely depends on this, + * and so might the FFS code. In anycase, it formalizes + * the B_CACHE rules. See sys/buf.h. + */ + + if ((bp->b_flags & (B_CACHE|B_DELWRI)) == B_DELWRI) { + VOP_BWRITE(bp); + goto loop; + } + + if (bp->b_usecount < BUF_MAXUSE) + ++bp->b_usecount; + splx(s); + return (bp); + } else { + int bsize, maxsize, vmio; + off_t offset; + + if (vp->v_type == VBLK) + bsize = DEV_BSIZE; + else if (vp->v_mountedhere) + bsize = vp->v_mountedhere->mnt_stat.f_iosize; + else if (vp->v_mount) + bsize = vp->v_mount->mnt_stat.f_iosize; + else + bsize = size; + + offset = (off_t)blkno * bsize; + vmio = (vp->v_object != 0) && (vp->v_flag & VOBJBUF); + maxsize = vmio ? size + (offset & PAGE_MASK) : size; + maxsize = imax(maxsize, bsize); + + if ((bp = getnewbuf(vp, blkno, + slpflag, slptimeo, size, maxsize)) == 0) { + if (slpflag || slptimeo) { + splx(s); + return NULL; + } + goto loop; + } + + /* + * This code is used to make sure that a buffer is not + * created while the getnewbuf routine is blocked. + * Normally the vnode is locked so this isn't a problem. + * VBLK type I/O requests, however, don't lock the vnode. + */ + if (VOP_ISLOCKED(vp) != LK_EXCLUSIVE && gbincore(vp, blkno)) { + bp->b_flags |= B_INVAL; + brelse(bp); + goto loop; + } + + /* + * Insert the buffer into the hash, so that it can + * be found by incore. + */ + bp->b_blkno = bp->b_lblkno = blkno; + bp->b_offset = offset; + + bgetvp(vp, bp); + LIST_REMOVE(bp, b_hash); + bh = BUFHASH(vp, blkno); + LIST_INSERT_HEAD(bh, bp, b_hash); + + if (vmio) { + bp->b_flags |= (B_VMIO | B_CACHE); +#if defined(VFS_BIO_DEBUG) + if (vp->v_type != VREG && vp->v_type != VBLK) + printf("getblk: vmioing file type %d???\n", vp->v_type); +#endif + } else { + bp->b_flags &= ~B_VMIO; + } + + allocbuf(bp, size); + + splx(s); + return (bp); + } +} + +/* + * Get an empty, disassociated buffer of given size. + */ +struct buf * +geteblk(int size) +{ + struct buf *bp; + int s; + + s = splbio(); + while ((bp = getnewbuf(0, (daddr_t) 0, 0, 0, size, MAXBSIZE)) == 0); + splx(s); + allocbuf(bp, size); + bp->b_flags |= B_INVAL; /* b_dep cleared by getnewbuf() */ + return (bp); +} + + +/* + * This code constitutes the buffer memory from either anonymous system + * memory (in the case of non-VMIO operations) or from an associated + * VM object (in the case of VMIO operations). This code is able to + * resize a buffer up or down. + * + * Note that this code is tricky, and has many complications to resolve + * deadlock or inconsistant data situations. Tread lightly!!! + * There are B_CACHE and B_DELWRI interactions that must be dealt with by + * the caller. Calling this code willy nilly can result in the loss of data. 
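As a quick illustration of the size rounding allocbuf() performs below: in the non-VMIO path, malloc-backed buffers are rounded to DEV_BSIZE, page-backed buffers to a whole page, and only a fresh buffer no larger than half a page (and only while bufmallocspace is under its limit) is given malloced memory at all. A minimal user-space sketch with illustrative sizes; DEV_BSIZE of 512 and a 4K page are assumptions:

#include <stdio.h>

#define DEV_BSIZE       512             /* assumption */
#define PAGE_SIZE       4096            /* assumption: i386 page size */

int
main(void)
{
        int size = 1000;                /* requested buffer size, illustrative */
        int mbsize, newbsize;

        mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);     /* malloc case */
        newbsize = (size + PAGE_SIZE - 1) & ~(PAGE_SIZE - 1);   /* page case   */

        printf("size %d -> mbsize %d, newbsize %d\n", size, mbsize, newbsize);
        printf("eligible for malloc backing: %d\n", mbsize <= PAGE_SIZE / 2);
        return (0);
}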
+ */ + +int +allocbuf(struct buf *bp, int size) +{ + int newbsize, mbsize; + int i; + +#if !defined(MAX_PERF) + if (!(bp->b_flags & B_BUSY)) + panic("allocbuf: buffer not busy"); + + if (bp->b_kvasize < size) + panic("allocbuf: buffer too small"); +#endif + + if ((bp->b_flags & B_VMIO) == 0) { + caddr_t origbuf; + int origbufsize; + /* + * Just get anonymous memory from the kernel + */ + mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); +#if !defined(NO_B_MALLOC) + if (bp->b_flags & B_MALLOC) + newbsize = mbsize; + else +#endif + newbsize = round_page(size); + + if (newbsize < bp->b_bufsize) { +#if !defined(NO_B_MALLOC) + /* + * malloced buffers are not shrunk + */ + if (bp->b_flags & B_MALLOC) { + if (newbsize) { + bp->b_bcount = size; + } else { + free(bp->b_data, M_BIOBUF); + bufspace -= bp->b_bufsize; + bufmallocspace -= bp->b_bufsize; + bp->b_data = bp->b_kvabase; + bp->b_bufsize = 0; + bp->b_bcount = 0; + bp->b_flags &= ~B_MALLOC; + } + return 1; + } +#endif + vm_hold_free_pages( + bp, + (vm_offset_t) bp->b_data + newbsize, + (vm_offset_t) bp->b_data + bp->b_bufsize); + } else if (newbsize > bp->b_bufsize) { +#if !defined(NO_B_MALLOC) + /* + * We only use malloced memory on the first allocation. + * and revert to page-allocated memory when the buffer grows. + */ + if ( (bufmallocspace < maxbufmallocspace) && + (bp->b_bufsize == 0) && + (mbsize <= PAGE_SIZE/2)) { + + bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); + bp->b_bufsize = mbsize; + bp->b_bcount = size; + bp->b_flags |= B_MALLOC; + bufspace += mbsize; + bufmallocspace += mbsize; + return 1; + } +#endif + origbuf = NULL; + origbufsize = 0; +#if !defined(NO_B_MALLOC) + /* + * If the buffer is growing on its other-than-first allocation, + * then we revert to the page-allocation scheme. + */ + if (bp->b_flags & B_MALLOC) { + origbuf = bp->b_data; + origbufsize = bp->b_bufsize; + bp->b_data = bp->b_kvabase; + bufspace -= bp->b_bufsize; + bufmallocspace -= bp->b_bufsize; + bp->b_bufsize = 0; + bp->b_flags &= ~B_MALLOC; + newbsize = round_page(newbsize); + } +#endif + vm_hold_load_pages( + bp, + (vm_offset_t) bp->b_data + bp->b_bufsize, + (vm_offset_t) bp->b_data + newbsize); +#if !defined(NO_B_MALLOC) + if (origbuf) { + bcopy(origbuf, bp->b_data, origbufsize); + free(origbuf, M_BIOBUF); + } +#endif + } + } else { + vm_page_t m; + int desiredpages; + + newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + desiredpages = (size == 0) ? 
0 : + num_pages((bp->b_offset & PAGE_MASK) + newbsize); + +#if !defined(NO_B_MALLOC) + if (bp->b_flags & B_MALLOC) + panic("allocbuf: VMIO buffer can't be malloced"); +#endif + + if (newbsize < bp->b_bufsize) { + if (desiredpages < bp->b_npages) { + for (i = desiredpages; i < bp->b_npages; i++) { + /* + * the page is not freed here -- it + * is the responsibility of vnode_pager_setsize + */ + m = bp->b_pages[i]; + KASSERT(m != bogus_page, + ("allocbuf: bogus page found")); + while (vm_page_sleep_busy(m, TRUE, "biodep")) + ; + + bp->b_pages[i] = NULL; + vm_page_unwire(m, 0); + } + pmap_qremove((vm_offset_t) trunc_page((vm_offset_t)bp->b_data) + + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); + bp->b_npages = desiredpages; + } + } else if (newbsize > bp->b_bufsize) { + vm_object_t obj; + vm_offset_t tinc, toff; + vm_ooffset_t off; + vm_pindex_t objoff; + int pageindex, curbpnpages; + struct vnode *vp; + int bsize; + int orig_validoff = bp->b_validoff; + int orig_validend = bp->b_validend; + + vp = bp->b_vp; + + if (vp->v_type == VBLK) + bsize = DEV_BSIZE; + else + bsize = vp->v_mount->mnt_stat.f_iosize; + + if (bp->b_npages < desiredpages) { + obj = vp->v_object; + tinc = PAGE_SIZE; + + off = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("allocbuf: no buffer offset")); + curbpnpages = bp->b_npages; + doretry: + bp->b_validoff = orig_validoff; + bp->b_validend = orig_validend; + bp->b_flags |= B_CACHE; + for (toff = 0; toff < newbsize; toff += tinc) { + objoff = OFF_TO_IDX(off + toff); + pageindex = objoff - OFF_TO_IDX(off); + tinc = PAGE_SIZE - ((off + toff) & PAGE_MASK); + if (pageindex < curbpnpages) { + + m = bp->b_pages[pageindex]; +#ifdef VFS_BIO_DIAG + if (m->pindex != objoff) + panic("allocbuf: page changed offset?!!!?"); +#endif + if (tinc > (newbsize - toff)) + tinc = newbsize - toff; + if (bp->b_flags & B_CACHE) + vfs_buf_set_valid(bp, off, toff, tinc, m); + continue; + } + m = vm_page_lookup(obj, objoff); + if (!m) { + m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL); + if (!m) { + VM_WAIT; + vm_pageout_deficit += (desiredpages - curbpnpages); + goto doretry; + } + + vm_page_wire(m); + vm_page_wakeup(m); + bp->b_flags &= ~B_CACHE; + + } else if (vm_page_sleep_busy(m, FALSE, "pgtblk")) { + /* + * If we had to sleep, retry. + * + * Also note that we only test + * PG_BUSY here, not m->busy. + * + * We cannot sleep on m->busy + * here because a vm_fault -> + * getpages -> cluster-read -> + * ...-> allocbuf sequence + * will convert PG_BUSY to + * m->busy so we have to let + * m->busy through if we do + * not want to deadlock. 
+ */ + goto doretry; + } else { + if ((curproc != pageproc) && + ((m->queue - m->pc) == PQ_CACHE) && + ((cnt.v_free_count + cnt.v_cache_count) < + (cnt.v_free_min + cnt.v_cache_min))) { + pagedaemon_wakeup(); + } + if (tinc > (newbsize - toff)) + tinc = newbsize - toff; + if (bp->b_flags & B_CACHE) + vfs_buf_set_valid(bp, off, toff, tinc, m); + vm_page_flag_clear(m, PG_ZERO); + vm_page_wire(m); + } + bp->b_pages[pageindex] = m; + curbpnpages = pageindex + 1; + } + if (vp->v_tag == VT_NFS && + vp->v_type != VBLK) { + if (bp->b_dirtyend > 0) { + bp->b_validoff = min(bp->b_validoff, bp->b_dirtyoff); + bp->b_validend = max(bp->b_validend, bp->b_dirtyend); + } + if (bp->b_validend == 0) + bp->b_flags &= ~B_CACHE; + } + bp->b_data = (caddr_t) trunc_page((vm_offset_t)bp->b_data); + bp->b_npages = curbpnpages; + pmap_qenter((vm_offset_t) bp->b_data, + bp->b_pages, bp->b_npages); + ((vm_offset_t) bp->b_data) |= off & PAGE_MASK; + } + } + } + if (bp->b_flags & B_VMIO) + vmiospace += (newbsize - bp->b_bufsize); + bufspace += (newbsize - bp->b_bufsize); + bp->b_bufsize = newbsize; + bp->b_bcount = size; + return 1; +} + +/* + * Wait for buffer I/O completion, returning error status. + */ +int +biowait(register struct buf * bp) +{ + int s; + + s = splbio(); + while ((bp->b_flags & B_DONE) == 0) +#if defined(NO_SCHEDULE_MODS) + tsleep(bp, PRIBIO, "biowait", 0); +#else + if (bp->b_flags & B_READ) + tsleep(bp, PRIBIO, "biord", 0); + else + tsleep(bp, PRIBIO, "biowr", 0); +#endif + splx(s); + if (bp->b_flags & B_EINTR) { + bp->b_flags &= ~B_EINTR; + return (EINTR); + } + if (bp->b_flags & B_ERROR) { + return (bp->b_error ? bp->b_error : EIO); + } else { + return (0); + } +} + +/* + * Finish I/O on a buffer, calling an optional function. + * This is usually called from interrupt level, so process blocking + * is not *a good idea*. 
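The completion path below also supports a callback style: when B_CALL is set, biodone() invokes b_iodone instead of waking a sleeper, so the submitter never blocks. A hedged kernel-context sketch of that pattern; the function names are hypothetical, the buffer is assumed to be already set up for the transfer, and the usual headers are assumed.

static void
example_iodone(struct buf *bp)
{
        if (bp->b_flags & B_ERROR)
                printf("async write failed: %d\n", bp->b_error);
        brelse(bp);                     /* the callback owns the buffer now */
}

static void
example_submit_async(struct vnode *vp, struct buf *bp)
{
        bp->b_flags |= B_ASYNC | B_CALL;
        bp->b_iodone = example_iodone;
        VOP_STRATEGY(vp, bp);           /* biodone() will run example_iodone */
}

Synchronous callers instead leave B_CALL clear and sleep in biowait(), as bread() and bwrite() above do.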
+ */ +void +biodone(register struct buf * bp) +{ + int s; + + s = splbio(); + +#if !defined(MAX_PERF) + if (!(bp->b_flags & B_BUSY)) + panic("biodone: buffer not busy"); +#endif + + if (bp->b_flags & B_DONE) { + splx(s); +#if !defined(MAX_PERF) + printf("biodone: buffer already done\n"); +#endif + return; + } + bp->b_flags |= B_DONE; + + if (bp->b_flags & B_FREEBUF) { + brelse(bp); + splx(s); + return; + } + + if ((bp->b_flags & B_READ) == 0) { + vwakeup(bp); + } + + /* call optional completion function if requested */ + if (bp->b_flags & B_CALL) { + bp->b_flags &= ~B_CALL; + (*bp->b_iodone) (bp); + splx(s); + return; + } + if (LIST_FIRST(&bp->b_dep) != NULL && bioops.io_complete) + (*bioops.io_complete)(bp); + + if (bp->b_flags & B_VMIO) { + int i, resid; + vm_ooffset_t foff; + vm_page_t m; + vm_object_t obj; + int iosize; + struct vnode *vp = bp->b_vp; + + obj = vp->v_object; + +#if defined(VFS_BIO_DEBUG) + if (vp->v_usecount == 0) { + panic("biodone: zero vnode ref count"); + } + + if (vp->v_object == NULL) { + panic("biodone: missing VM object"); + } + + if ((vp->v_flag & VOBJBUF) == 0) { + panic("biodone: vnode is not setup for merged cache"); + } +#endif + + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("biodone: no buffer offset")); + +#if !defined(MAX_PERF) + if (!obj) { + panic("biodone: no object"); + } +#endif +#if defined(VFS_BIO_DEBUG) + if (obj->paging_in_progress < bp->b_npages) { + printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", + obj->paging_in_progress, bp->b_npages); + } +#endif + iosize = bp->b_bufsize; + for (i = 0; i < bp->b_npages; i++) { + int bogusflag = 0; + m = bp->b_pages[i]; + if (m == bogus_page) { + bogusflag = 1; + m = vm_page_lookup(obj, OFF_TO_IDX(foff)); + if (!m) { +#if defined(VFS_BIO_DEBUG) + printf("biodone: page disappeared\n"); +#endif + vm_object_pip_subtract(obj, 1); + continue; + } + bp->b_pages[i] = m; + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } +#if defined(VFS_BIO_DEBUG) + if (OFF_TO_IDX(foff) != m->pindex) { + printf( +"biodone: foff(%lu)/m->pindex(%d) mismatch\n", + (unsigned long)foff, m->pindex); + } +#endif + resid = IDX_TO_OFF(m->pindex + 1) - foff; + if (resid > iosize) + resid = iosize; + + /* + * In the write case, the valid and clean bits are + * already changed correctly, so we only need to do this + * here in the read case. + */ + if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { + vfs_page_set_valid(bp, foff, i, m); + } + vm_page_flag_clear(m, PG_ZERO); + + /* + * when debugging new filesystems or buffer I/O methods, this + * is the most common error that pops up. if you see this, you + * have not set the page busy flag correctly!!! 
+ */ + if (m->busy == 0) { +#if !defined(MAX_PERF) + printf("biodone: page busy < 0, " + "pindex: %d, foff: 0x(%x,%x), " + "resid: %d, index: %d\n", + (int) m->pindex, (int)(foff >> 32), + (int) foff & 0xffffffff, resid, i); +#endif + if (vp->v_type != VBLK) +#if !defined(MAX_PERF) + printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n", + bp->b_vp->v_mount->mnt_stat.f_iosize, + (int) bp->b_lblkno, + bp->b_flags, bp->b_npages); + else + printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n", + (int) bp->b_lblkno, + bp->b_flags, bp->b_npages); + printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", + m->valid, m->dirty, m->wire_count); +#endif + panic("biodone: page busy < 0\n"); + } + vm_page_io_finish(m); + vm_object_pip_subtract(obj, 1); + foff += resid; + iosize -= resid; + } + if (obj) + vm_object_pip_wakeupn(obj, 0); + } + /* + * For asynchronous completions, release the buffer now. The brelse + * checks for B_WANTED and will do the wakeup there if necessary - so + * no need to do a wakeup here in the async case. + */ + + if (bp->b_flags & B_ASYNC) { + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) + brelse(bp); + else + bqrelse(bp); + } else { + bp->b_flags &= ~B_WANTED; + wakeup(bp); + } + splx(s); +} + +#if 0 /* not with kirks code */ +static int vfs_update_interval = 30; + +static void +vfs_update() +{ + while (1) { + tsleep(&vfs_update_wakeup, PUSER, "update", + hz * vfs_update_interval); + vfs_update_wakeup = 0; + sync(curproc, NULL); + } +} + +static int +sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS +{ + int error = sysctl_handle_int(oidp, + oidp->oid_arg1, oidp->oid_arg2, req); + if (!error) + wakeup(&vfs_update_wakeup); + return error; +} + +SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW, + &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", ""); + +#endif + + +/* + * This routine is called in lieu of iodone in the case of + * incomplete I/O. This keeps the busy status for pages + * consistant. + */ +void +vfs_unbusy_pages(struct buf * bp) +{ + int i; + + if (bp->b_flags & B_VMIO) { + struct vnode *vp = bp->b_vp; + vm_object_t obj = vp->v_object; + + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + + if (m == bogus_page) { + m = vm_page_lookup(obj, OFF_TO_IDX(bp->b_offset) + i); +#if !defined(MAX_PERF) + if (!m) { + panic("vfs_unbusy_pages: page missing\n"); + } +#endif + bp->b_pages[i] = m; + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } + vm_object_pip_subtract(obj, 1); + vm_page_flag_clear(m, PG_ZERO); + vm_page_io_finish(m); + } + vm_object_pip_wakeupn(obj, 0); + } +} + +/* + * Set NFS' b_validoff and b_validend fields from the valid bits + * of a page. If the consumer is not NFS, and the page is not + * valid for the entire range, clear the B_CACHE flag to force + * the consumer to re-read the page. + * + * B_CACHE interaction is especially tricky. + */ +static void +vfs_buf_set_valid(struct buf *bp, + vm_ooffset_t foff, vm_offset_t off, vm_offset_t size, + vm_page_t m) +{ + if (bp->b_vp->v_tag == VT_NFS && bp->b_vp->v_type != VBLK) { + vm_offset_t svalid, evalid; + int validbits = m->valid >> (((foff+off)&PAGE_MASK)/DEV_BSIZE); + + /* + * This only bothers with the first valid range in the + * page. 
+ */ + svalid = off; + while (validbits && !(validbits & 1)) { + svalid += DEV_BSIZE; + validbits >>= 1; + } + evalid = svalid; + while (validbits & 1) { + evalid += DEV_BSIZE; + validbits >>= 1; + } + evalid = min(evalid, off + size); + /* + * We can only set b_validoff/end if this range is contiguous + * with the range built up already. If we cannot set + * b_validoff/end, we must clear B_CACHE to force an update + * to clean the bp up. + */ + if (svalid == bp->b_validend) { + bp->b_validoff = min(bp->b_validoff, svalid); + bp->b_validend = max(bp->b_validend, evalid); + } else { + bp->b_flags &= ~B_CACHE; + } + } else if (!vm_page_is_valid(m, + (vm_offset_t) ((foff + off) & PAGE_MASK), + size)) { + bp->b_flags &= ~B_CACHE; + } +} + +/* + * Set the valid bits in a page, taking care of the b_validoff, + * b_validend fields which NFS uses to optimise small reads. Off is + * the offset within the file and pageno is the page index within the buf. + * + * XXX we have to set the valid & clean bits for all page fragments + * touched by b_validoff/validend, even if the page fragment goes somewhat + * beyond b_validoff/validend due to alignment. + */ +static void +vfs_page_set_valid(struct buf *bp, vm_ooffset_t off, int pageno, vm_page_t m) +{ + struct vnode *vp = bp->b_vp; + vm_ooffset_t soff, eoff; + + soff = off; + eoff = (off + PAGE_SIZE) & ~PAGE_MASK; + if (eoff > bp->b_offset + bp->b_bufsize) + eoff = bp->b_offset + bp->b_bufsize; + if (vp->v_tag == VT_NFS && vp->v_type != VBLK) { + vm_ooffset_t sv, ev; + vm_page_set_invalid(m, + (vm_offset_t) (soff & PAGE_MASK), + (vm_offset_t) (eoff - soff)); + sv = (bp->b_offset + bp->b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + ev = (bp->b_offset + bp->b_validend + (DEV_BSIZE - 1)) & + ~(DEV_BSIZE - 1); + soff = qmax(sv, soff); + eoff = qmin(ev, eoff); + } + if (eoff > soff) + vm_page_set_validclean(m, + (vm_offset_t) (soff & PAGE_MASK), + (vm_offset_t) (eoff - soff)); +} + +/* + * This routine is called before a device strategy routine. + * It is used to tell the VM system that paging I/O is in + * progress, and treat the pages associated with the buffer + * almost as being PG_BUSY. Also the object paging_in_progress + * flag is handled to make sure that the object doesn't become + * inconsistant. + */ +void +vfs_busy_pages(struct buf * bp, int clear_modify) +{ + int i, bogus; + + if (bp->b_flags & B_VMIO) { + struct vnode *vp = bp->b_vp; + vm_object_t obj = vp->v_object; + vm_ooffset_t foff; + + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_busy_pages: no buffer offset")); + vfs_setdirty(bp); + +retry: + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + if (vm_page_sleep_busy(m, FALSE, "vbpage")) + goto retry; + } + + bogus = 0; + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + + vm_page_flag_clear(m, PG_ZERO); + if ((bp->b_flags & B_CLUSTER) == 0) { + vm_object_pip_add(obj, 1); + vm_page_io_start(m); + } + + vm_page_protect(m, VM_PROT_NONE); + if (clear_modify) + vfs_page_set_valid(bp, foff, i, m); + else if (m->valid == VM_PAGE_BITS_ALL && + (bp->b_flags & B_CACHE) == 0) { + bp->b_pages[i] = bogus_page; + bogus++; + } + foff = (foff + PAGE_SIZE) & ~PAGE_MASK; + } + if (bogus) + pmap_qenter(trunc_page((vm_offset_t)bp->b_data), bp->b_pages, bp->b_npages); + } +} + +/* + * Tell the VM system that the pages associated with this buffer + * are clean. This is used for delayed writes where the data is + * going to go to disk eventually without additional VM intevention. 
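The sv/ev computation in vfs_page_set_valid() above simply rounds both byte offsets up to the next DEV_BSIZE boundary before clipping the range handed to the VM page. A minimal user-space sketch of that rounding with illustrative offsets:

#include <stdio.h>

#define DEV_BSIZE       512

int
main(void)
{
        long b_offset = 8192, b_validoff = 700, b_validend = 3000;      /* illustrative */
        long sv, ev;

        sv = (b_offset + b_validoff + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
        ev = (b_offset + b_validend + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);

        printf("valid bytes [%ld, %ld) round to [%ld, %ld)\n",
            b_offset + b_validoff, b_offset + b_validend, sv, ev);
        return (0);
}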
+ */ +void +vfs_clean_pages(struct buf * bp) +{ + int i; + + if (bp->b_flags & B_VMIO) { + vm_ooffset_t foff; + foff = bp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, + ("vfs_clean_pages: no buffer offset")); + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + vfs_page_set_valid(bp, foff, i, m); + foff = (foff + PAGE_SIZE) & ~PAGE_MASK; + } + } +} + +void +vfs_bio_clrbuf(struct buf *bp) { + int i, mask = 0; + caddr_t sa, ea; + if ((bp->b_flags & (B_VMIO | B_MALLOC)) == B_VMIO) { + if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE) && + (bp->b_offset & PAGE_MASK) == 0) { + mask = (1 << (bp->b_bufsize / DEV_BSIZE)) - 1; + if (((bp->b_pages[0]->flags & PG_ZERO) == 0) && + ((bp->b_pages[0]->valid & mask) != mask)) { + bzero(bp->b_data, bp->b_bufsize); + } + bp->b_pages[0]->valid |= mask; + bp->b_resid = 0; + return; + } + ea = sa = bp->b_data; + for(i=0;i<bp->b_npages;i++,sa=ea) { + int j = ((u_long)sa & PAGE_MASK) / DEV_BSIZE; + ea = (caddr_t)trunc_page((vm_offset_t)sa + PAGE_SIZE); + ea = (caddr_t)ulmin((u_long)ea, + (u_long)bp->b_data + bp->b_bufsize); + mask = ((1 << ((ea - sa) / DEV_BSIZE)) - 1) << j; + if ((bp->b_pages[i]->valid & mask) == mask) + continue; + if ((bp->b_pages[i]->valid & mask) == 0) { + if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { + bzero(sa, ea - sa); + } + } else { + for (; sa < ea; sa += DEV_BSIZE, j++) { + if (((bp->b_pages[i]->flags & PG_ZERO) == 0) && + (bp->b_pages[i]->valid & (1<<j)) == 0) + bzero(sa, DEV_BSIZE); + } + } + bp->b_pages[i]->valid |= mask; + vm_page_flag_clear(bp->b_pages[i], PG_ZERO); + } + bp->b_resid = 0; + } else { + clrbuf(bp); + } +} + +/* + * vm_hold_load_pages and vm_hold_unload pages get pages into + * a buffers address space. The pages are anonymous and are + * not associated with a file object. 
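Before the anonymous-page helpers below, a worked example of the per-DEV_BSIZE valid mask that vfs_bio_clrbuf() above builds for a small single-page buffer: one bit per 512-byte chunk of the buffer. A minimal user-space sketch with an illustrative 2K buffer:

#include <stdio.h>

#define DEV_BSIZE       512

int
main(void)
{
        int b_bufsize = 2048;           /* illustrative sub-page buffer */
        int mask;

        mask = (1 << (b_bufsize / DEV_BSIZE)) - 1;      /* 4 chunks -> 0xf */
        printf("bufsize %d -> valid mask 0x%x\n", b_bufsize, mask);
        return (0);
}

If the page's valid bits already cover that mask nothing needs zeroing; otherwise vfs_bio_clrbuf() clears the uncovered bytes and then ORs the mask into m->valid.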
+ */ +void +vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index; + + to = round_page(to); + from = round_page(from); + index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { + +tryagain: + + p = vm_page_alloc(kernel_object, + ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), + VM_ALLOC_NORMAL); + if (!p) { + vm_pageout_deficit += (to - from) >> PAGE_SHIFT; + VM_WAIT; + goto tryagain; + } + vm_page_wire(p); + p->valid = VM_PAGE_BITS_ALL; + vm_page_flag_clear(p, PG_ZERO); + pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); + bp->b_pages[index] = p; + vm_page_wakeup(p); + } + bp->b_npages = index; +} + +void +vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index, newnpages; + + from = round_page(from); + to = round_page(to); + newnpages = index = (from - trunc_page((vm_offset_t)bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { + p = bp->b_pages[index]; + if (p && (index < bp->b_npages)) { +#if !defined(MAX_PERF) + if (p->busy) { + printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", + bp->b_blkno, bp->b_lblkno); + } +#endif + bp->b_pages[index] = NULL; + pmap_kremove(pg); + vm_page_busy(p); + vm_page_unwire(p, 0); + vm_page_free(p); + } + } + bp->b_npages = newnpages; +} + + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(buffer, db_show_buffer) +{ + /* get args */ + struct buf *bp = (struct buf *)addr; + + if (!have_addr) { + db_printf("usage: show buffer <addr>\n"); + return; + } + + db_printf("b_proc = %p,\nb_flags = 0x%b\n", (void *)bp->b_proc, + (u_int)bp->b_flags, PRINT_BUF_FLAGS); + db_printf("b_error = %d, b_bufsize = %ld, b_bcount = %ld, " + "b_resid = %ld\nb_dev = 0x%x, b_data = %p, " + "b_blkno = %d, b_pblkno = %d\n", + bp->b_error, bp->b_bufsize, bp->b_bcount, bp->b_resid, + bp->b_dev, bp->b_data, bp->b_blkno, bp->b_pblkno); + if (bp->b_npages) { + int i; + db_printf("b_npages = %d, pages(OBJ, IDX, PA): ", bp->b_npages); + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m; + m = bp->b_pages[i]; + db_printf("(%p, 0x%lx, 0x%lx)", (void *)m->object, + (u_long)m->pindex, (u_long)VM_PAGE_TO_PHYS(m)); + if ((i + 1) < bp->b_npages) + db_printf(","); + } + db_printf("\n"); + } +} +#endif /* DDB */ diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c new file mode 100644 index 0000000..a8ac5e7 --- /dev/null +++ b/sys/kern/vfs_cache.c @@ -0,0 +1,450 @@ +/* + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Poul-Henning Kamp of the FreeBSD Project. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 + * $Id: vfs_cache.c,v 1.37 1997/12/19 23:18:37 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> +#include <sys/mount.h> +#include <sys/vnode.h> +#include <sys/namei.h> +#include <sys/malloc.h> + + +/* + * Name caching works as follows: + * + * Names found by directory scans are retained in a cache + * for future reference. It is managed LRU, so frequently + * used names will hang around. Cache is indexed by hash value + * obtained from (vp, name) where vp refers to the directory + * containing name. + * + * If it is a "negative" entry, (i.e. for a name that is known NOT to + * exist) the vnode pointer will be NULL. + * + * Upon reaching the last segment of a path, if the reference + * is for DELETE, or NOCACHE is set (rewrite), and the + * name is located in the cache, it will be dropped. + */ + +/* + * Structures associated with name cacheing. 
+ */ +#define NCHHASH(dvp, cnp) \ + (&nchashtbl[((dvp)->v_id + (cnp)->cn_hash) & nchash]) +static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ +static TAILQ_HEAD(, namecache) ncneg; /* Hash Table */ +static u_long nchash; /* size of hash table */ +SYSCTL_INT(_debug, OID_AUTO, nchash, CTLFLAG_RD, &nchash, 0, ""); +static u_long ncnegfactor = 16; /* ratio of negative entries */ +SYSCTL_INT(_debug, OID_AUTO, ncnegfactor, CTLFLAG_RW, &ncnegfactor, 0, ""); +static u_long numneg; /* number of cache entries allocated */ +SYSCTL_INT(_debug, OID_AUTO, numneg, CTLFLAG_RD, &numneg, 0, ""); +static u_long numcache; /* number of cache entries allocated */ +SYSCTL_INT(_debug, OID_AUTO, numcache, CTLFLAG_RD, &numcache, 0, ""); +struct nchstats nchstats; /* cache effectiveness statistics */ + +static int doingcache = 1; /* 1 => enable the cache */ +SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, ""); +SYSCTL_INT(_debug, OID_AUTO, vnsize, CTLFLAG_RD, 0, sizeof(struct vnode), ""); +SYSCTL_INT(_debug, OID_AUTO, ncsize, CTLFLAG_RD, 0, sizeof(struct namecache), ""); + +/* + * The new name cache statistics + */ +SYSCTL_NODE(_vfs, OID_AUTO, cache, CTLFLAG_RW, 0, "Name cache statistics"); +#define STATNODE(mode, name, var) \ + SYSCTL_INT(_vfs_cache, OID_AUTO, name, mode, var, 0, ""); +STATNODE(CTLFLAG_RD, numneg, &numneg); +STATNODE(CTLFLAG_RD, numcache, &numcache); +static u_long numcalls; STATNODE(CTLFLAG_RD, numcalls, &numcalls); +static u_long dothits; STATNODE(CTLFLAG_RD, dothits, &dothits); +static u_long dotdothits; STATNODE(CTLFLAG_RD, dotdothits, &dotdothits); +static u_long numchecks; STATNODE(CTLFLAG_RD, numchecks, &numchecks); +static u_long nummiss; STATNODE(CTLFLAG_RD, nummiss, &nummiss); +static u_long nummisszap; STATNODE(CTLFLAG_RD, nummisszap, &nummisszap); +static u_long numposzaps; STATNODE(CTLFLAG_RD, numposzaps, &numposzaps); +static u_long numposhits; STATNODE(CTLFLAG_RD, numposhits, &numposhits); +static u_long numnegzaps; STATNODE(CTLFLAG_RD, numnegzaps, &numnegzaps); +static u_long numneghits; STATNODE(CTLFLAG_RD, numneghits, &numneghits); + + +static void cache_zap __P((struct namecache *ncp)); + +/* + * Flags in namecache.nc_flag + */ +#define NCF_WHITE 1 +/* + * Delete an entry from its hash list and move it to the front + * of the LRU list for immediate reuse. + */ +static void +cache_zap(ncp) + struct namecache *ncp; +{ + LIST_REMOVE(ncp, nc_hash); + LIST_REMOVE(ncp, nc_src); + if (LIST_EMPTY(&ncp->nc_dvp->v_cache_src)) + vdrop(ncp->nc_dvp); + if (ncp->nc_vp) { + TAILQ_REMOVE(&ncp->nc_vp->v_cache_dst, ncp, nc_dst); + } else { + TAILQ_REMOVE(&ncneg, ncp, nc_dst); + numneg--; + } + numcache--; + free(ncp, M_CACHE); +} + +/* + * Lookup an entry in the cache + * + * We don't do this if the segment name is long, simply so the cache + * can avoid holding long names (which would either waste space, or + * add greatly to the complexity). + * + * Lookup is called with dvp pointing to the directory to search, + * cnp pointing to the name of the entry being sought. If the lookup + * succeeds, the vnode is returned in *vpp, and a status of -1 is + * returned. If the lookup determines that the name does not exist + * (negative cacheing), a status of ENOENT is returned. If the lookup + * fails, a status of zero is returned. 
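The contract just described is worth seeing from the caller's side: -1 means a positive hit with *vpp filled in, ENOENT means a cached negative entry, and 0 means the cache has no answer. A hedged kernel-context sketch of a consumer (a simplified variant of vfs_cache_lookup() further down; locking, vget() of the result, and the v_id capability check are deliberately omitted, and the helper name is hypothetical):

static int
example_lookup(struct vnode *dvp, struct vnode **vpp, struct componentname *cnp)
{
        int error = cache_lookup(dvp, vpp, cnp);

        if (error == -1)
                return (0);             /* hit: *vpp is the answer */
        if (error == ENOENT)
                return (ENOENT);        /* cached "name does not exist" */
        return (VOP_CACHEDLOOKUP(dvp, vpp, cnp));       /* miss: scan the directory */
}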
+ */ + +int +cache_lookup(dvp, vpp, cnp) + struct vnode *dvp; + struct vnode **vpp; + struct componentname *cnp; +{ + register struct namecache *ncp; + + if (!doingcache) { + cnp->cn_flags &= ~MAKEENTRY; + return (0); + } + + numcalls++; + + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) { + *vpp = dvp; + dothits++; + return (-1); + } + if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + dotdothits++; + if (dvp->v_dd->v_id != dvp->v_ddid || + (cnp->cn_flags & MAKEENTRY) == 0) { + dvp->v_ddid = 0; + return (0); + } + *vpp = dvp->v_dd; + return (-1); + } + } + + LIST_FOREACH(ncp, (NCHHASH(dvp, cnp)), nc_hash) { + numchecks++; + if (ncp->nc_dvp == dvp && ncp->nc_nlen == cnp->cn_namelen && + !bcmp(ncp->nc_name, cnp->cn_nameptr, ncp->nc_nlen)) + break; + } + + /* We failed to find an entry */ + if (ncp == 0) { + if ((cnp->cn_flags & MAKEENTRY) == 0) { + nummisszap++; + } else { + nummiss++; + } + nchstats.ncs_miss++; + return (0); + } + + /* We don't want to have an entry, so dump it */ + if ((cnp->cn_flags & MAKEENTRY) == 0) { + numposzaps++; + nchstats.ncs_badhits++; + cache_zap(ncp); + return (0); + } + + /* We found a "positive" match, return the vnode */ + if (ncp->nc_vp) { + numposhits++; + nchstats.ncs_goodhits++; + *vpp = ncp->nc_vp; + return (-1); + } + + /* We found a negative match, and want to create it, so purge */ + if (cnp->cn_nameiop == CREATE) { + numnegzaps++; + nchstats.ncs_badhits++; + cache_zap(ncp); + return (0); + } + + numneghits++; + /* + * We found a "negative" match, ENOENT notifies client of this match. + * The nc_vpid field records whether this is a whiteout. + */ + TAILQ_REMOVE(&ncneg, ncp, nc_dst); + TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); + nchstats.ncs_neghits++; + if (ncp->nc_flag & NCF_WHITE) + cnp->cn_flags |= ISWHITEOUT; + return (ENOENT); +} + +/* + * Add an entry to the cache. + */ +void +cache_enter(dvp, vp, cnp) + struct vnode *dvp; + struct vnode *vp; + struct componentname *cnp; +{ + register struct namecache *ncp; + register struct nchashhead *ncpp; + + if (!doingcache) + return; + + if (cnp->cn_nameptr[0] == '.') { + if (cnp->cn_namelen == 1) { + return; + } + if (cnp->cn_namelen == 2 && cnp->cn_nameptr[1] == '.') { + if (vp) { + dvp->v_dd = vp; + dvp->v_ddid = vp->v_id; + } else { + dvp->v_dd = dvp; + dvp->v_ddid = 0; + } + return; + } + } + + ncp = (struct namecache *) + malloc(sizeof *ncp + cnp->cn_namelen, M_CACHE, M_WAITOK); + bzero((char *)ncp, sizeof *ncp); + numcache++; + if (!vp) { + numneg++; + ncp->nc_flag = cnp->cn_flags & ISWHITEOUT ? NCF_WHITE : 0; + } else if (vp->v_type == VDIR) { + vp->v_dd = dvp; + vp->v_ddid = dvp->v_id; + } + + /* + * Fill in cache info, if vp is NULL this is a "negative" cache entry. + * For negative entries, we have to record whether it is a whiteout. + * the whiteout flag is stored in the nc_vpid field which is + * otherwise unused. 
+ */ + ncp->nc_vp = vp; + ncp->nc_dvp = dvp; + ncp->nc_nlen = cnp->cn_namelen; + bcopy(cnp->cn_nameptr, ncp->nc_name, ncp->nc_nlen); + ncpp = NCHHASH(dvp, cnp); + LIST_INSERT_HEAD(ncpp, ncp, nc_hash); + if (LIST_EMPTY(&dvp->v_cache_src)) + vhold(dvp); + LIST_INSERT_HEAD(&dvp->v_cache_src, ncp, nc_src); + if (vp) { + TAILQ_INSERT_HEAD(&vp->v_cache_dst, ncp, nc_dst); + } else { + TAILQ_INSERT_TAIL(&ncneg, ncp, nc_dst); + } + if (numneg*ncnegfactor > numcache) { + ncp = TAILQ_FIRST(&ncneg); + cache_zap(ncp); + } +} + +/* + * Name cache initialization, from vfs_init() when we are booting + */ +void +nchinit() +{ + + TAILQ_INIT(&ncneg); + nchashtbl = hashinit(desiredvnodes*2, M_CACHE, &nchash); +} + +/* + * Invalidate all entries to particular vnode. + * + * We actually just increment the v_id, that will do it. The stale entries + * will be purged by lookup as they get found. If the v_id wraps around, we + * need to ditch the entire cache, to avoid confusion. No valid vnode will + * ever have (v_id == 0). + */ +void +cache_purge(vp) + struct vnode *vp; +{ + static u_long nextid; + + while (!LIST_EMPTY(&vp->v_cache_src)) + cache_zap(LIST_FIRST(&vp->v_cache_src)); + while (!TAILQ_EMPTY(&vp->v_cache_dst)) + cache_zap(TAILQ_FIRST(&vp->v_cache_dst)); + + nextid++; + while (nextid == vp->v_id || !nextid) + continue; + vp->v_id = nextid; + vp->v_dd = vp; + vp->v_ddid = 0; +} + +/* + * Flush all entries referencing a particular filesystem. + * + * Since we need to check it anyway, we will flush all the invalid + * entries at the same time. + */ +void +cache_purgevfs(mp) + struct mount *mp; +{ + struct nchashhead *ncpp; + struct namecache *ncp, *nnp; + + /* Scan hash tables for applicable entries */ + for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { + for (ncp = LIST_FIRST(ncpp); ncp != 0; ncp = nnp) { + nnp = LIST_NEXT(ncp, nc_hash); + if (ncp->nc_dvp->v_mount == mp) { + cache_zap(ncp); + } + } + } +} + +/* + * Perform canonical checks and cache lookup and pass on to filesystem + * through the vop_cachedlookup only if needed. + */ + +int +vfs_cache_lookup(ap) + struct vop_lookup_args /* { + struct vnode *a_dvp; + struct vnode **a_vpp; + struct componentname *a_cnp; + } */ *ap; +{ + struct vnode *vdp; + struct vnode *pdp; + int lockparent; + int error; + struct vnode **vpp = ap->a_vpp; + struct componentname *cnp = ap->a_cnp; + struct ucred *cred = cnp->cn_cred; + int flags = cnp->cn_flags; + struct proc *p = cnp->cn_proc; + u_long vpid; /* capability number of vnode */ + + *vpp = NULL; + vdp = ap->a_dvp; + lockparent = flags & LOCKPARENT; + + if (vdp->v_type != VDIR) + return (ENOTDIR); + + if ((flags & ISLASTCN) && (vdp->v_mount->mnt_flag & MNT_RDONLY) && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) + return (EROFS); + + error = VOP_ACCESS(vdp, VEXEC, cred, cnp->cn_proc); + + if (error) + return (error); + + error = cache_lookup(vdp, vpp, cnp); + + if (!error) + return (VOP_CACHEDLOOKUP(ap->a_dvp, ap->a_vpp, ap->a_cnp)); + + if (error == ENOENT) + return (error); + + pdp = vdp; + vdp = *vpp; + vpid = vdp->v_id; + if (pdp == vdp) { /* lookup on "." 
*/ + VREF(vdp); + error = 0; + } else if (flags & ISDOTDOT) { + VOP_UNLOCK(pdp, 0, p); + error = vget(vdp, LK_EXCLUSIVE, p); + if (!error && lockparent && (flags & ISLASTCN)) + error = vn_lock(pdp, LK_EXCLUSIVE, p); + } else { + error = vget(vdp, LK_EXCLUSIVE, p); + if (!lockparent || error || !(flags & ISLASTCN)) + VOP_UNLOCK(pdp, 0, p); + } + /* + * Check that the capability number did not change + * while we were waiting for the lock. + */ + if (!error) { + if (vpid == vdp->v_id) + return (0); + vput(vdp); + if (lockparent && pdp != vdp && (flags & ISLASTCN)) + VOP_UNLOCK(pdp, 0, p); + } + error = vn_lock(pdp, LK_EXCLUSIVE, p); + if (error) + return (error); + return (VOP_CACHEDLOOKUP(ap->a_dvp, ap->a_vpp, ap->a_cnp)); +} diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c new file mode 100644 index 0000000..781508e --- /dev/null +++ b/sys/kern/vfs_cluster.c @@ -0,0 +1,840 @@ +/*- + * Copyright (c) 1993 + * The Regents of the University of California. All rights reserved. + * Modifications/enhancements: + * Copyright (c) 1995 John S. Dyson. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 + * $Id: vfs_cluster.c,v 1.77 1999/01/10 01:58:25 eivind Exp $ + */ + +#include "opt_debug_cluster.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/buf.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/resourcevar.h> +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> + +#if defined(CLUSTERDEBUG) +#include <sys/sysctl.h> +static int rcluster= 0; +SYSCTL_INT(_debug, OID_AUTO, rcluster, CTLFLAG_RW, &rcluster, 0, ""); +#endif + +static MALLOC_DEFINE(M_SEGMENT, "cluster_save buffer", "cluster_save buffer"); + +static struct cluster_save * + cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp)); +static struct buf * + cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn, + daddr_t blkno, long size, int run, struct buf *fbp)); + +extern vm_page_t bogus_page; + +extern int cluster_pbuf_freecnt; + +/* + * Maximum number of blocks for read-ahead. + */ +#define MAXRA 32 + +/* + * This replaces bread. + */ +int +cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) + struct vnode *vp; + u_quad_t filesize; + daddr_t lblkno; + long size; + struct ucred *cred; + long totread; + int seqcount; + struct buf **bpp; +{ + struct buf *bp, *rbp, *reqbp; + daddr_t blkno, origblkno; + int error, num_ra; + int i; + int maxra, racluster; + long origtotread; + + error = 0; + if (vp->v_maxio == 0) + vp->v_maxio = DFLTPHYS; + + /* + * Try to limit the amount of read-ahead by a few + * ad-hoc parameters. This needs work!!! + */ + racluster = vp->v_maxio/size; + maxra = 2 * racluster + (totread / size); + if (maxra > MAXRA) + maxra = MAXRA; + if (maxra > nbuf/8) + maxra = nbuf/8; + + /* + * get the requested block + */ + *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0); + origblkno = lblkno; + origtotread = totread; + + /* + * if it is in the cache, then check to see if the reads have been + * sequential. If they have, then try some read-ahead, otherwise + * back-off on prospective read-aheads. + */ + if (bp->b_flags & B_CACHE) { + if (!seqcount) { + return 0; + } else if ((bp->b_flags & B_RAM) == 0) { + return 0; + } else { + int s; + struct buf *tbp; + bp->b_flags &= ~B_RAM; + /* + * We do the spl here so that there is no window + * between the incore and the b_usecount increment + * below. We opt to keep the spl out of the loop + * for efficiency. + */ + s = splbio(); + for(i=1;i<maxra;i++) { + + if (!(tbp = incore(vp, lblkno+i))) { + break; + } + + /* + * Set another read-ahead mark so we know to check + * again. 
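 *
 * Worked example (editorial addition): with racluster = 4 and
 * maxra = 10, the test below marks the buffers at i = 3, 7 and 9
 * with B_RAM, i.e. the last block of each prospective cluster and
 * the final read-ahead block, so that a later hit on one of them
 * re-arms this read-ahead check.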
+ */ + if (((i % racluster) == (racluster - 1)) || + (i == (maxra - 1))) + tbp->b_flags |= B_RAM; + + if ((tbp->b_usecount < 1) && + ((tbp->b_flags & B_BUSY) == 0) && + (tbp->b_qindex == QUEUE_LRU)) { + TAILQ_REMOVE(&bufqueues[QUEUE_LRU], tbp, b_freelist); + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], tbp, b_freelist); + } + } + splx(s); + if (i >= maxra) { + return 0; + } + lblkno += i; + } + reqbp = bp = NULL; + } else { + off_t firstread = bp->b_offset; + + KASSERT(bp->b_offset != NOOFFSET, + ("cluster_read: no buffer offset")); + if (firstread + totread > filesize) + totread = filesize - firstread; + if (totread > size) { + int nblks = 0; + int ncontigafter; + while (totread > 0) { + nblks++; + totread -= size; + } + if (nblks == 1) + goto single_block_read; + if (nblks > racluster) + nblks = racluster; + + error = VOP_BMAP(vp, lblkno, NULL, + &blkno, &ncontigafter, NULL); + if (error) + goto single_block_read; + if (blkno == -1) + goto single_block_read; + if (ncontigafter == 0) + goto single_block_read; + if (ncontigafter + 1 < nblks) + nblks = ncontigafter + 1; + + bp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, nblks, bp); + lblkno += (bp->b_bufsize / size); + } else { +single_block_read: + /* + * if it isn't in the cache, then get a chunk from + * disk if sequential, otherwise just get the block. + */ + bp->b_flags |= B_READ | B_RAM; + lblkno += 1; + } + } + + /* + * if we have been doing sequential I/O, then do some read-ahead + */ + rbp = NULL; + if (seqcount && (lblkno < (origblkno + seqcount))) { + /* + * we now build the read-ahead buffer if it is desirable. + */ + if (((u_quad_t)(lblkno + 1) * size) <= filesize && + !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) && + blkno != -1) { + int nblksread; + int ntoread = num_ra + 1; + nblksread = (origtotread + size - 1) / size; + if (seqcount < nblksread) + seqcount = nblksread; + if (seqcount < ntoread) + ntoread = seqcount; + if (num_ra) { + rbp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, ntoread, NULL); + } else { + rbp = getblk(vp, lblkno, size, 0, 0); + rbp->b_flags |= B_READ | B_ASYNC | B_RAM; + rbp->b_blkno = blkno; + } + } + } + + /* + * handle the synchronous read + */ + if (bp) { +#if defined(CLUSTERDEBUG) + if (rcluster) + printf("S(%ld,%ld,%d) ", + (long)bp->b_lblkno, bp->b_bcount, seqcount); +#endif + if ((bp->b_flags & B_CLUSTER) == 0) + vfs_busy_pages(bp, 0); + error = VOP_STRATEGY(vp, bp); + curproc->p_stats->p_ru.ru_inblock++; + } + + /* + * and if we have read-aheads, do them too + */ + if (rbp) { + if (error) { + rbp->b_flags &= ~(B_ASYNC | B_READ); + brelse(rbp); + } else if (rbp->b_flags & B_CACHE) { + rbp->b_flags &= ~(B_ASYNC | B_READ); + bqrelse(rbp); + } else { +#if defined(CLUSTERDEBUG) + if (rcluster) { + if (bp) + printf("A+(%ld,%ld,%ld,%d) ", + (long)rbp->b_lblkno, rbp->b_bcount, + (long)(rbp->b_lblkno - origblkno), + seqcount); + else + printf("A(%ld,%ld,%ld,%d) ", + (long)rbp->b_lblkno, rbp->b_bcount, + (long)(rbp->b_lblkno - origblkno), + seqcount); + } +#endif + + if ((rbp->b_flags & B_CLUSTER) == 0) + vfs_busy_pages(rbp, 0); + (void) VOP_STRATEGY(vp, rbp); + curproc->p_stats->p_ru.ru_inblock++; + } + } + if (reqbp) + return (biowait(reqbp)); + else + return (error); +} + +/* + * If blocks are contiguous on disk, use this to provide clustered + * read ahead. We will read as many blocks as possible sequentially + * and then parcel them up into logical blocks in the buffer hash table. 
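 *
 * Worked example (editorial addition) of the EOF clamp at the top of
 * this routine: with size = 8192, filesize = 100000 and lbn = 10, a
 * requested run of 8 is trimmed until size * (lbn + run) <= filesize,
 * leaving run = 2, so only blocks 10 and 11, which lie entirely
 * within the file, are built into the cluster.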
+ */ +static struct buf * +cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) + struct vnode *vp; + u_quad_t filesize; + daddr_t lbn; + daddr_t blkno; + long size; + int run; + struct buf *fbp; +{ + struct buf *bp, *tbp; + daddr_t bn; + int i, inc, j; + + KASSERT(size == vp->v_mount->mnt_stat.f_iosize, + ("cluster_rbuild: size %ld != filesize %ld\n", + size, vp->v_mount->mnt_stat.f_iosize)); + + /* + * avoid a division + */ + while ((u_quad_t) size * (lbn + run) > filesize) { + --run; + } + + if (fbp) { + tbp = fbp; + tbp->b_flags |= B_READ; + } else { + tbp = getblk(vp, lbn, size, 0, 0); + if (tbp->b_flags & B_CACHE) + return tbp; + tbp->b_flags |= B_ASYNC | B_READ | B_RAM; + } + + tbp->b_blkno = blkno; + if( (tbp->b_flags & B_MALLOC) || + ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) + return tbp; + + bp = trypbuf(&cluster_pbuf_freecnt); + if (bp == 0) + return tbp; + + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO; + bp->b_iodone = cluster_callback; + bp->b_blkno = blkno; + bp->b_lblkno = lbn; + bp->b_offset = tbp->b_offset; + KASSERT(bp->b_offset != NOOFFSET, ("cluster_rbuild: no buffer offset")); + pbgetvp(vp, bp); + + TAILQ_INIT(&bp->b_cluster.cluster_head); + + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + + if (vp->v_maxio == 0) + vp->v_maxio = DFLTPHYS; + inc = btodb(size); + for (bn = blkno, i = 0; i < run; ++i, bn += inc) { + if (i != 0) { + if ((bp->b_npages * PAGE_SIZE) + + round_page(size) > vp->v_maxio) + break; + + if (tbp = incore(vp, lbn + i)) { + if (tbp->b_flags & B_BUSY) + break; + + for (j = 0; j < tbp->b_npages; j++) + if (tbp->b_pages[j]->valid) + break; + + if (j != tbp->b_npages) + break; + + if (tbp->b_bcount != size) + break; + } + + tbp = getblk(vp, lbn + i, size, 0, 0); + + if ((tbp->b_flags & B_CACHE) || + (tbp->b_flags & B_VMIO) == 0) { + bqrelse(tbp); + break; + } + + for (j = 0;j < tbp->b_npages; j++) + if (tbp->b_pages[j]->valid) + break; + + if (j != tbp->b_npages) { + bqrelse(tbp); + break; + } + + if ((fbp && (i == 1)) || (i == (run - 1))) + tbp->b_flags |= B_RAM; + tbp->b_flags |= B_READ | B_ASYNC; + if (tbp->b_blkno == tbp->b_lblkno) { + tbp->b_blkno = bn; + } else if (tbp->b_blkno != bn) { + brelse(tbp); + break; + } + } + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + for (j = 0; j < tbp->b_npages; j += 1) { + vm_page_t m; + m = tbp->b_pages[j]; + vm_page_io_start(m); + vm_object_pip_add(m->object, 1); + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages-1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) + tbp->b_pages[j] = bogus_page; + } + bp->b_bcount += tbp->b_bcount; + bp->b_bufsize += tbp->b_bufsize; + } + + for(j=0;j<bp->b_npages;j++) { + if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == + VM_PAGE_BITS_ALL) + bp->b_pages[j] = bogus_page; + } + if (bp->b_bufsize > bp->b_kvasize) + panic("cluster_rbuild: b_bufsize(%ld) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + bp->b_kvasize = bp->b_bufsize; + + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + return (bp); +} + +/* + * Cleanup after a clustered read or write. + * This is complicated by the fact that any of the buffers might have + * extra memory (if there were no empty buffer headers at allocbuf time) + * that we will need to shift around. 
+ */ +void +cluster_callback(bp) + struct buf *bp; +{ + struct buf *nbp, *tbp; + int error = 0; + + /* + * Must propogate errors to all the components. + */ + if (bp->b_flags & B_ERROR) + error = bp->b_error; + + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + /* + * Move memory from the large cluster buffer into the component + * buffers and mark IO as done on these. + */ + for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); + tbp; tbp = nbp) { + nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); + if (error) { + tbp->b_flags |= B_ERROR; + tbp->b_error = error; + } else + tbp->b_dirtyoff = tbp->b_dirtyend = 0; + biodone(tbp); + } + relpbuf(bp, &cluster_pbuf_freecnt); +} + +/* + * Do clustered write for FFS. + * + * Three cases: + * 1. Write is not sequential (write asynchronously) + * Write is sequential: + * 2. beginning of cluster - begin cluster + * 3. middle of a cluster - add to cluster + * 4. end of a cluster - asynchronously write cluster + */ +void +cluster_write(bp, filesize) + struct buf *bp; + u_quad_t filesize; +{ + struct vnode *vp; + daddr_t lbn; + int maxclen, cursize; + int lblocksize; + int async; + + vp = bp->b_vp; + if (vp->v_maxio == 0) + vp->v_maxio = DFLTPHYS; + if (vp->v_type == VREG) { + async = vp->v_mount->mnt_flag & MNT_ASYNC; + lblocksize = vp->v_mount->mnt_stat.f_iosize; + } else { + async = 0; + lblocksize = bp->b_bufsize; + } + lbn = bp->b_lblkno; + KASSERT(bp->b_offset != NOOFFSET, ("cluster_write: no buffer offset")); + + /* Initialize vnode to beginning of file. */ + if (lbn == 0) + vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; + + if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || + (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { + maxclen = vp->v_maxio / lblocksize - 1; + if (vp->v_clen != 0) { + /* + * Next block is not sequential. + * + * If we are not writing at end of file, the process + * seeked to another point in the file since its last + * write, or we have reached our maximum cluster size, + * then push the previous cluster. Otherwise try + * reallocating to make it sequential. + */ + cursize = vp->v_lastw - vp->v_cstart + 1; + if (((u_quad_t) bp->b_offset + lblocksize) != filesize || + lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { + if (!async) + cluster_wbuild(vp, lblocksize, + vp->v_cstart, cursize); + } else { + struct buf **bpp, **endbp; + struct cluster_save *buflist; + + buflist = cluster_collectbufs(vp, bp); + endbp = &buflist->bs_children + [buflist->bs_nchildren - 1]; + if (VOP_REALLOCBLKS(vp, buflist)) { + /* + * Failed, push the previous cluster. + */ + for (bpp = buflist->bs_children; + bpp < endbp; bpp++) + brelse(*bpp); + free(buflist, M_SEGMENT); + cluster_wbuild(vp, lblocksize, + vp->v_cstart, cursize); + } else { + /* + * Succeeded, keep building cluster. + */ + for (bpp = buflist->bs_children; + bpp <= endbp; bpp++) + bdwrite(*bpp); + free(buflist, M_SEGMENT); + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; + return; + } + } + } + /* + * Consider beginning a cluster. If at end of file, make + * cluster as large as possible, otherwise find size of + * existing cluster. 
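 *
 * Worked example (editorial addition) for the test that guards this
 * branch: with an 8192-byte block size, btodb(lblocksize) is 16, so
 * after a write of logical block 41 that was mapped to disk block
 * 5000 (v_lastw = 41, v_lasta = 5000), only a write of lbn 42 mapped
 * to disk block 5016 keeps extending the pending cluster (assuming
 * v_clen is non-zero); anything else lands in this branch to push or
 * restart the cluster.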
+ */ + if ((vp->v_type == VREG) && + ((u_quad_t) bp->b_offset + lblocksize) != filesize && + (bp->b_blkno == bp->b_lblkno) && + (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || + bp->b_blkno == -1)) { + bawrite(bp); + vp->v_clen = 0; + vp->v_lasta = bp->b_blkno; + vp->v_cstart = lbn + 1; + vp->v_lastw = lbn; + return; + } + vp->v_clen = maxclen; + if (!async && maxclen == 0) { /* I/O not contiguous */ + vp->v_cstart = lbn + 1; + bawrite(bp); + } else { /* Wait for rest of cluster */ + vp->v_cstart = lbn; + bdwrite(bp); + } + } else if (lbn == vp->v_cstart + vp->v_clen) { + /* + * At end of cluster, write it out. + */ + bdwrite(bp); + cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); + vp->v_clen = 0; + vp->v_cstart = lbn + 1; + } else + /* + * In the middle of a cluster, so just delay the I/O for now. + */ + bdwrite(bp); + vp->v_lastw = lbn; + vp->v_lasta = bp->b_blkno; +} + + +/* + * This is an awful lot like cluster_rbuild...wish they could be combined. + * The last lbn argument is the current block on which I/O is being + * performed. Check to see that it doesn't fall in the middle of + * the current block (if last_bp == NULL). + */ +int +cluster_wbuild(vp, size, start_lbn, len) + struct vnode *vp; + long size; + daddr_t start_lbn; + int len; +{ + struct buf *bp, *tbp; + int i, j, s; + int totalwritten = 0; + int dbsize = btodb(size); + + if (vp->v_maxio == 0) + vp->v_maxio = DFLTPHYS; + while (len > 0) { + s = splbio(); + if (((tbp = gbincore(vp, start_lbn)) == NULL) || + ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) { + ++start_lbn; + --len; + splx(s); + continue; + } + bremfree(tbp); + tbp->b_flags |= B_BUSY; + tbp->b_flags &= ~B_DONE; + splx(s); + + /* + * Extra memory in the buffer, punt on this buffer. XXX we could + * handle this in most cases, but we would have to push the extra + * memory down to after our max possible cluster size and then + * potentially pull it back up if the cluster was terminated + * prematurely--too much hassle. + */ + if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) || + (tbp->b_bcount != tbp->b_bufsize) || + (tbp->b_bcount != size) || + (len == 1) || + ((bp = trypbuf(&cluster_pbuf_freecnt)) == NULL)) { + totalwritten += tbp->b_bufsize; + bawrite(tbp); + ++start_lbn; + --len; + continue; + } + + /* + * We got a pbuf to make the cluster in. + * so initialise it. + */ + TAILQ_INIT(&bp->b_cluster.cluster_head); + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + if (tbp->b_wcred != NOCRED) { + bp->b_wcred = tbp->b_wcred; + crhold(bp->b_wcred); + } + + bp->b_blkno = tbp->b_blkno; + bp->b_lblkno = tbp->b_lblkno; + bp->b_offset = tbp->b_offset; + bp->b_data = (char *)((vm_offset_t)bp->b_data | + ((vm_offset_t)tbp->b_data & PAGE_MASK)); + bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | + (tbp->b_flags & (B_VMIO | B_NEEDCOMMIT)); + bp->b_iodone = cluster_callback; + pbgetvp(vp, bp); + /* + * From this location in the file, scan forward to see + * if there are buffers with adjacent data that need to + * be written as well. + */ + for (i = 0; i < len; ++i, ++start_lbn) { + if (i != 0) { /* If not the first buffer */ + s = splbio(); + /* + * If the adjacent data is not even in core it + * can't need to be written. + */ + if ((tbp = gbincore(vp, start_lbn)) == NULL) { + splx(s); + break; + } + + /* + * If it IS in core, but has different + * characteristics, don't cluster with it. 
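 *
 * Concrete illustration (editorial addition): for a cluster built
 * from B_VMIO buffers without B_NEEDCOMMIT, a candidate whose flags
 * include B_DELWRI, B_CLUSTEROK and B_VMIO (and none of B_INVAL,
 * B_BUSY or B_NEEDCOMMIT) passes the test below; a buffer that is
 * merely delayed-write, or that carries B_NEEDCOMMIT when the
 * cluster does not, terminates the scan instead.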
+ */ + if ((tbp->b_flags & + (B_VMIO | B_CLUSTEROK | B_INVAL | B_BUSY | + B_DELWRI | B_NEEDCOMMIT)) + != (B_DELWRI | B_CLUSTEROK | + (bp->b_flags & (B_VMIO | B_NEEDCOMMIT)))) { + splx(s); + break; + } + + if (tbp->b_wcred != bp->b_wcred) { + splx(s); + break; + } + + /* + * Check that the combined cluster + * would make sense with regard to pages + * and would not be too large + */ + if ((tbp->b_bcount != size) || + ((bp->b_blkno + (dbsize * i)) != + tbp->b_blkno) || + ((tbp->b_npages + bp->b_npages) > + (vp->v_maxio / PAGE_SIZE))) { + splx(s); + break; + } + /* + * Ok, it's passed all the tests, + * so remove it from the free list + * and mark it busy. We will use it. + */ + bremfree(tbp); + tbp->b_flags |= B_BUSY; + tbp->b_flags &= ~B_DONE; + splx(s); + } /* end of code for non-first buffers only */ + /* check for latent dependencies to be handled */ + if ((LIST_FIRST(&tbp->b_dep)) != NULL && + bioops.io_start) + (*bioops.io_start)(tbp); + /* + * If the IO is via the VM then we do some + * special VM hackery. (yuck) + */ + if (tbp->b_flags & B_VMIO) { + vm_page_t m; + + if (i != 0) { /* if not first buffer */ + for (j = 0; j < tbp->b_npages; j += 1) { + m = tbp->b_pages[j]; + if (m->flags & PG_BUSY) + goto finishcluster; + } + } + + for (j = 0; j < tbp->b_npages; j += 1) { + m = tbp->b_pages[j]; + vm_page_io_start(m); + vm_object_pip_add(m->object, 1); + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages - 1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + } + } + bp->b_bcount += size; + bp->b_bufsize += size; + + s = splbio(); + --numdirtybuffers; + tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); + tbp->b_flags |= B_ASYNC; + reassignbuf(tbp, tbp->b_vp); /* put on clean list */ + ++tbp->b_vp->v_numoutput; + splx(s); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + } + finishcluster: + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *) bp->b_pages, bp->b_npages); + if (bp->b_bufsize > bp->b_kvasize) + panic( + "cluster_wbuild: b_bufsize(%ld) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + bp->b_kvasize = bp->b_bufsize; + totalwritten += bp->b_bufsize; + bp->b_dirtyoff = 0; + bp->b_dirtyend = bp->b_bufsize; + bawrite(bp); + + len -= i; + } + return totalwritten; +} + +/* + * Collect together all the buffers in a cluster. + * Plus add one additional buffer. + */ +static struct cluster_save * +cluster_collectbufs(vp, last_bp) + struct vnode *vp; + struct buf *last_bp; +{ + struct cluster_save *buflist; + struct buf *bp; + daddr_t lbn; + int i, len; + + len = vp->v_lastw - vp->v_cstart + 1; + buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), + M_SEGMENT, M_WAITOK); + buflist->bs_nchildren = 0; + buflist->bs_children = (struct buf **) (buflist + 1); + for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) { + (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, &bp); + buflist->bs_children[i] = bp; + if (bp->b_blkno == bp->b_lblkno) + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL, NULL); + } + buflist->bs_children[i] = bp = last_bp; + if (bp->b_blkno == bp->b_lblkno) + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, + NULL, NULL); + buflist->bs_nchildren = i + 1; + return (buflist); +} diff --git a/sys/kern/vfs_conf.c b/sys/kern/vfs_conf.c new file mode 100644 index 0000000..a7a830f --- /dev/null +++ b/sys/kern/vfs_conf.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. 
+ * Copyright (c) 1995 Artisoft, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_conf.c 8.8 (Berkeley) 3/31/94 + * $Id: vfs_conf.c,v 1.25 1998/06/09 12:52:33 bde Exp $ + */ + +/* + * PURPOSE: This file abstracts the root mounting interface from + * the per file system semantics for handling mounts, + * the overall intent of which is to move the BSD + * internals dependence out of the FS code, both to + * make the FS code more portable and to free up some + * of the BSD internals so that they may more easily + * be changed. + * + * NOTE1: Code is single entry/single exit to aid debugging + * and conversion for kernel multithreading. + * + * NOTE2: Code notes lock state in headers on entry and exit + * as an aid to conversion for kernel multithreading + * on SMP reentrancy + */ +#include "opt_bootp.h" + +#include <sys/param.h> /* dev_t (types.h)*/ +#include <sys/kernel.h> +#include <sys/systm.h> /* rootvp*/ +#include <sys/proc.h> /* curproc*/ +#include <sys/vnode.h> /* NULLVP*/ +#include <sys/mount.h> /* struct mount*/ +#include <sys/malloc.h> /* M_MOUNT*/ + +/* + * GLOBALS + */ + +MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount struct"); + +/* + * These define the root filesystem, device, and root filesystem type. + */ +dev_t rootdevs[] = { NODEV, NODEV }; +char *rootdevnames[2]; +struct vnode *rootvnode; +char *mountrootfsname; +#ifdef BOOTP +extern void bootpc_init __P((void)); +#endif + +/* + * vfs_init() will set maxvfsconf + * to the highest defined type number. 
+ */ +int maxvfsconf; +struct vfsconf *vfsconf; + +/* + * Common root mount code shared by all filesystems + */ +#define ROOTNAME "root_device" + +/* + * vfs_mountrootfs + * + * Common entry point for root mounts + * + * PARAMETERS: + * NONE + * + * RETURNS: 0 Success + * !0 error number (errno.h) + * + * LOCK STATE: + * ENTRY + * <no locks held> + * EXIT + * <no locks held> + * + * NOTES: + * This code is currently supported only for use for + * the FFS file system type. This is a matter of + * fixing the other file systems, not this code! + */ +static void +vfs_mountrootfs(void *unused) +{ + struct mount *mp; + int i, err; + struct proc *p = curproc; /* XXX */ + dev_t orootdev; + +#ifdef BOOTP + bootpc_init(); +#endif + /* + * New root mount structure + */ + if ((err = vfs_rootmountalloc(mountrootfsname, ROOTNAME, &mp))) { + printf("error %d: ", err); + panic("cannot mount root\n"); + return ; + } + mp->mnt_flag |= MNT_ROOTFS; + + /* + * Attempt the mount + */ + err = ENXIO; + orootdev = rootdev; + if (rootdevs[0] == NODEV) + rootdevs[0] = rootdev; + for (i = 0; i < sizeof(rootdevs) / sizeof(rootdevs[0]); i++) { + if (rootdevs[i] == NODEV) + break; + rootdev = rootdevs[i]; + if (rootdev != orootdev) { + printf("changing root device to %s\n", rootdevnames[i]); + orootdev = rootdev; + } + strncpy(mp->mnt_stat.f_mntfromname, + rootdevnames[i] ? rootdevnames[i] : ROOTNAME, MNAMELEN - 1); + err = VFS_MOUNT(mp, NULL, NULL, NULL, p); + if (err != ENXIO) + break; + } + if (err) { + /* + * XXX should ask the user for the name in some cases. + * Why do we call vfs_unbusy() here and not after ENXIO + * is returned above? + */ + vfs_unbusy(mp, p); + /* + * free mount struct before failing + * (hardly worthwhile with the PANIC eh?) + */ + free( mp, M_MOUNT); + printf("error %d: ", err); + panic("cannot mount root (2)\n"); + return; + } + + simple_lock(&mountlist_slock); + + /* + * Add fs to list of mounted file systems + */ + CIRCLEQ_INSERT_HEAD(&mountlist, mp, mnt_list); + + simple_unlock(&mountlist_slock); + vfs_unbusy(mp, p); + + /* root mount, update system time from FS specific data*/ + inittodr(mp->mnt_time); + return; +} + +SYSINIT(mountroot, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, vfs_mountrootfs, NULL) + diff --git a/sys/kern/vfs_default.c b/sys/kern/vfs_default.c new file mode 100644 index 0000000..b73b126 --- /dev/null +++ b/sys/kern/vfs_default.c @@ -0,0 +1,477 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. 
Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/lock.h> +#include <sys/malloc.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/poll.h> + +static int vop_nostrategy __P((struct vop_strategy_args *)); + +/* + * This vnode table stores what we want to do if the filesystem doesn't + * implement a particular VOP. + * + * If there is no specific entry here, we will return EOPNOTSUPP. + * + */ + +vop_t **default_vnodeop_p; +static struct vnodeopv_entry_desc default_vnodeop_entries[] = { + { &vop_default_desc, (vop_t *) vop_eopnotsupp }, + { &vop_abortop_desc, (vop_t *) vop_null }, + { &vop_advlock_desc, (vop_t *) vop_einval }, + { &vop_bwrite_desc, (vop_t *) vop_stdbwrite }, + { &vop_close_desc, (vop_t *) vop_null }, + { &vop_fsync_desc, (vop_t *) vop_null }, + { &vop_ioctl_desc, (vop_t *) vop_enotty }, + { &vop_islocked_desc, (vop_t *) vop_noislocked }, + { &vop_lease_desc, (vop_t *) vop_null }, + { &vop_lock_desc, (vop_t *) vop_nolock }, + { &vop_mmap_desc, (vop_t *) vop_einval }, + { &vop_open_desc, (vop_t *) vop_null }, + { &vop_pathconf_desc, (vop_t *) vop_einval }, + { &vop_poll_desc, (vop_t *) vop_nopoll }, + { &vop_readlink_desc, (vop_t *) vop_einval }, + { &vop_reallocblks_desc, (vop_t *) vop_eopnotsupp }, + { &vop_revoke_desc, (vop_t *) vop_revoke }, + { &vop_strategy_desc, (vop_t *) vop_nostrategy }, + { &vop_unlock_desc, (vop_t *) vop_nounlock }, + { NULL, NULL } +}; + +static struct vnodeopv_desc default_vnodeop_opv_desc = + { &default_vnodeop_p, default_vnodeop_entries }; + +VNODEOP_SET(default_vnodeop_opv_desc); + +int +vop_eopnotsupp(struct vop_generic_args *ap) +{ + /* + printf("vop_notsupp[%s]\n", ap->a_desc->vdesc_name); + */ + + return (EOPNOTSUPP); +} + +int +vop_ebadf(struct vop_generic_args *ap) +{ + + return (EBADF); +} + +int +vop_enotty(struct vop_generic_args *ap) +{ + + return (ENOTTY); +} + +int +vop_einval(struct vop_generic_args *ap) +{ + + return (EINVAL); +} + +int +vop_null(struct vop_generic_args *ap) +{ + + return (0); +} + +int +vop_defaultop(struct vop_generic_args *ap) +{ + + return (VOCALL(default_vnodeop_p, ap->a_desc->vdesc_offset, ap)); +} + +int +vop_panic(struct vop_generic_args *ap) +{ + + panic("illegal vnode op called"); +} + +static int +vop_nostrategy (struct vop_strategy_args *ap) +{ + printf("No strategy for buffer at %p\n", ap->a_bp); + vprint("", ap->a_vp); + vprint("", ap->a_bp->b_vp); + ap->a_bp->b_flags |= B_ERROR; + ap->a_bp->b_error = EOPNOTSUPP; + biodone(ap->a_bp); + return (EOPNOTSUPP); +} + +int 
+vop_stdpathconf(ap) + struct vop_pathconf_args /* { + struct vnode *a_vp; + int a_name; + int *a_retval; + } */ *ap; +{ + + switch (ap->a_name) { + case _PC_LINK_MAX: + *ap->a_retval = LINK_MAX; + return (0); + case _PC_MAX_CANON: + *ap->a_retval = MAX_CANON; + return (0); + case _PC_MAX_INPUT: + *ap->a_retval = MAX_INPUT; + return (0); + case _PC_PIPE_BUF: + *ap->a_retval = PIPE_BUF; + return (0); + case _PC_CHOWN_RESTRICTED: + *ap->a_retval = 1; + return (0); + case _PC_VDISABLE: + *ap->a_retval = _POSIX_VDISABLE; + return (0); + default: + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Standard lock, unlock and islocked functions. + * + * These depend on the lock structure being the first element in the + * inode, ie: vp->v_data points to the the lock! + */ +int +vop_stdlock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + int a_flags; + struct proc *a_p; + } */ *ap; +{ + struct lock *l; + + if ((l = (struct lock *)ap->a_vp->v_data) == NULL) { + if (ap->a_flags & LK_INTERLOCK) + simple_unlock(&ap->a_vp->v_interlock); + return 0; + } + +#ifndef DEBUG_LOCKS + return (lockmgr(l, ap->a_flags, &ap->a_vp->v_interlock, ap->a_p)); +#else + return (debuglockmgr(l, ap->a_flags, &ap->a_vp->v_interlock, ap->a_p, + "vop_stdlock", ap->a_vp->filename, ap->a_vp->line)); +#endif +} + +int +vop_stdunlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + int a_flags; + struct proc *a_p; + } */ *ap; +{ + struct lock *l; + + if ((l = (struct lock *)ap->a_vp->v_data) == NULL) { + if (ap->a_flags & LK_INTERLOCK) + simple_unlock(&ap->a_vp->v_interlock); + return 0; + } + + return (lockmgr(l, ap->a_flags | LK_RELEASE, &ap->a_vp->v_interlock, + ap->a_p)); +} + +int +vop_stdislocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct lock *l; + + if ((l = (struct lock *)ap->a_vp->v_data) == NULL) + return 0; + + return (lockstatus(l)); +} + +/* + * Return true for select/poll. + */ +int +vop_nopoll(ap) + struct vop_poll_args /* { + struct vnode *a_vp; + int a_events; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + /* + * Return true for read/write. If the user asked for something + * special, return POLLNVAL, so that clients have a way of + * determining reliably whether or not the extended + * functionality is present without hard-coding knowledge + * of specific filesystem implementations. + */ + if (ap->a_events & ~POLLSTANDARD) + return (POLLNVAL); + + return (ap->a_events & (POLLIN | POLLOUT | POLLRDNORM | POLLWRNORM)); +} + +/* + * Implement poll for local filesystems that support it. + */ +int +vop_stdpoll(ap) + struct vop_poll_args /* { + struct vnode *a_vp; + int a_events; + struct ucred *a_cred; + struct proc *a_p; + } */ *ap; +{ + if ((ap->a_events & ~POLLSTANDARD) == 0) + return (ap->a_events & (POLLRDNORM|POLLWRNORM)); + return (vn_pollrecord(ap->a_vp, ap->a_p, ap->a_events)); +} + +int +vop_stdbwrite(ap) + struct vop_bwrite_args *ap; +{ + return (bwrite(ap->a_bp)); +} + +/* + * Stubs to use when there is no locking to be done on the underlying object. + * A minimal shared lock is necessary to ensure that the underlying object + * is not revoked while an operation is in progress. So, an active shared + * count is maintained in an auxillary vnode lock structure. + */ +int +vop_sharedlock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + int a_flags; + struct proc *a_p; + } */ *ap; +{ + /* + * This code cannot be used until all the non-locking filesystems + * (notably NFS) are converted to properly lock and release nodes. 
+ * Also, certain vnode operations change the locking state within + * the operation (create, mknod, remove, link, rename, mkdir, rmdir, + * and symlink). Ideally these operations should not change the + * lock state, but should be changed to let the caller of the + * function unlock them. Otherwise all intermediate vnode layers + * (such as union, umapfs, etc) must catch these functions to do + * the necessary locking at their layer. Note that the inactive + * and lookup operations also change their lock state, but this + * cannot be avoided, so these two operations will always need + * to be handled in intermediate layers. + */ + struct vnode *vp = ap->a_vp; + int vnflags, flags = ap->a_flags; + + if (vp->v_vnlock == NULL) { + if ((flags & LK_TYPE_MASK) == LK_DRAIN) + return (0); + MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock), + M_VNODE, M_WAITOK); + lockinit(vp->v_vnlock, PVFS, "vnlock", 0, LK_NOPAUSE); + } + switch (flags & LK_TYPE_MASK) { + case LK_DRAIN: + vnflags = LK_DRAIN; + break; + case LK_EXCLUSIVE: +#ifdef DEBUG_VFS_LOCKS + /* + * Normally, we use shared locks here, but that confuses + * the locking assertions. + */ + vnflags = LK_EXCLUSIVE; + break; +#endif + case LK_SHARED: + vnflags = LK_SHARED; + break; + case LK_UPGRADE: + case LK_EXCLUPGRADE: + case LK_DOWNGRADE: + return (0); + case LK_RELEASE: + default: + panic("vop_sharedlock: bad operation %d", flags & LK_TYPE_MASK); + } + if (flags & LK_INTERLOCK) + vnflags |= LK_INTERLOCK; +#ifndef DEBUG_LOCKS + return (lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p)); +#else + return (debuglockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p, + "vop_sharedlock", vp->filename, vp->line)); +#endif +} + +/* + * Stubs to use when there is no locking to be done on the underlying object. + * A minimal shared lock is necessary to ensure that the underlying object + * is not revoked while an operation is in progress. So, an active shared + * count is maintained in an auxillary vnode lock structure. + */ +int +vop_nolock(ap) + struct vop_lock_args /* { + struct vnode *a_vp; + int a_flags; + struct proc *a_p; + } */ *ap; +{ +#ifdef notyet + /* + * This code cannot be used until all the non-locking filesystems + * (notably NFS) are converted to properly lock and release nodes. + * Also, certain vnode operations change the locking state within + * the operation (create, mknod, remove, link, rename, mkdir, rmdir, + * and symlink). Ideally these operations should not change the + * lock state, but should be changed to let the caller of the + * function unlock them. Otherwise all intermediate vnode layers + * (such as union, umapfs, etc) must catch these functions to do + * the necessary locking at their layer. Note that the inactive + * and lookup operations also change their lock state, but this + * cannot be avoided, so these two operations will always need + * to be handled in intermediate layers. 
+ */ + struct vnode *vp = ap->a_vp; + int vnflags, flags = ap->a_flags; + + if (vp->v_vnlock == NULL) { + if ((flags & LK_TYPE_MASK) == LK_DRAIN) + return (0); + MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock), + M_VNODE, M_WAITOK); + lockinit(vp->v_vnlock, PVFS, "vnlock", 0, LK_NOPAUSE); + } + switch (flags & LK_TYPE_MASK) { + case LK_DRAIN: + vnflags = LK_DRAIN; + break; + case LK_EXCLUSIVE: + case LK_SHARED: + vnflags = LK_SHARED; + break; + case LK_UPGRADE: + case LK_EXCLUPGRADE: + case LK_DOWNGRADE: + return (0); + case LK_RELEASE: + default: + panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK); + } + if (flags & LK_INTERLOCK) + vnflags |= LK_INTERLOCK; + return(lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p)); +#else /* for now */ + /* + * Since we are not using the lock manager, we must clear + * the interlock here. + */ + if (ap->a_flags & LK_INTERLOCK) + simple_unlock(&ap->a_vp->v_interlock); + return (0); +#endif +} + +/* + * Do the inverse of vop_nolock, handling the interlock in a compatible way. + */ +int +vop_nounlock(ap) + struct vop_unlock_args /* { + struct vnode *a_vp; + int a_flags; + struct proc *a_p; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + if (vp->v_vnlock == NULL) { + if (ap->a_flags & LK_INTERLOCK) + simple_unlock(&ap->a_vp->v_interlock); + return (0); + } + return (lockmgr(vp->v_vnlock, LK_RELEASE | ap->a_flags, + &ap->a_vp->v_interlock, ap->a_p)); +} + +/* + * Return whether or not the node is in use. + */ +int +vop_noislocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + if (vp->v_vnlock == NULL) + return (0); + return (lockstatus(vp->v_vnlock)); +} + diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c new file mode 100644 index 0000000..44b1698 --- /dev/null +++ b/sys/kern/vfs_export.c @@ -0,0 +1,2872 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + * $Id: vfs_subr.c,v 1.182 1999/01/10 01:58:26 eivind Exp $ + */ + +/* + * External virtual filesystem routines + */ +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/buf.h> +#include <sys/domain.h> +#include <sys/dirent.h> +#include <sys/vmmeter.h> + +#include <machine/limits.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vnode_pager.h> +#include <vm/vm_zone.h> +#include <sys/sysctl.h> + +#include <miscfs/specfs/specdev.h> + +static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); + +static void insmntque __P((struct vnode *vp, struct mount *mp)); +static void vclean __P((struct vnode *vp, int flags, struct proc *p)); +static void vfree __P((struct vnode *)); +static void vgonel __P((struct vnode *vp, struct proc *p)); +static unsigned long numvnodes; +SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); + +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, +}; +int vttoif_tab[9] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, +}; + +static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ +struct tobefreelist vnode_tobefree_list; /* vnode free list */ + +static u_long wantfreevnodes = 25; +SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); +static u_long freevnodes = 0; +SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); + +int vfs_ioopt = 0; +#ifdef ENABLE_VFS_IOOPT +SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); +#endif + +struct mntlist mountlist; /* mounted filesystem list */ +struct simplelock mountlist_slock; +struct simplelock mntvnode_slock; +int nfs_mount_type = -1; +#ifndef NULL_SIMPLELOCKS +static struct simplelock mntid_slock; +static struct simplelock vnode_free_list_slock; +static struct simplelock spechash_slock; +#endif +struct nfs_public nfs_pub; /* publicly exported FS */ +static vm_zone_t vnode_zone; + +/* + * The workitem queue. 
+ */ +#define SYNCER_MAXDELAY 32 +static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ +time_t syncdelay = 30; +int rushjob; /* number of slots to run ASAP */ + +static int syncer_delayno = 0; +static long syncer_mask; +LIST_HEAD(synclist, vnode); +static struct synclist *syncer_workitem_pending; + +int desiredvnodes; +SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, ""); + +static void vfs_free_addrlist __P((struct netexport *nep)); +static int vfs_free_netcred __P((struct radix_node *rn, void *w)); +static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, + struct export_args *argp)); + +/* + * Initialize the vnode management data structures. + */ +void +vntblinit() +{ + + desiredvnodes = maxproc + cnt.v_page_count / 4; + simple_lock_init(&mntvnode_slock); + simple_lock_init(&mntid_slock); + simple_lock_init(&spechash_slock); + TAILQ_INIT(&vnode_free_list); + TAILQ_INIT(&vnode_tobefree_list); + simple_lock_init(&vnode_free_list_slock); + CIRCLEQ_INIT(&mountlist); + vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); + /* + * Initialize the filesystem syncer. + */ + syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, + &syncer_mask); + syncer_maxdelay = syncer_mask + 1; +} + +/* + * Mark a mount point as busy. Used to synchronize access and to delay + * unmounting. Interlock is not released on failure. + */ +int +vfs_busy(mp, flags, interlkp, p) + struct mount *mp; + int flags; + struct simplelock *interlkp; + struct proc *p; +{ + int lkflags; + + if (mp->mnt_kern_flag & MNTK_UNMOUNT) { + if (flags & LK_NOWAIT) + return (ENOENT); + mp->mnt_kern_flag |= MNTK_MWAIT; + if (interlkp) { + simple_unlock(interlkp); + } + /* + * Since all busy locks are shared except the exclusive + * lock granted when unmounting, the only place that a + * wakeup needs to be done is at the release of the + * exclusive lock at the end of dounmount. + */ + tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); + if (interlkp) { + simple_lock(interlkp); + } + return (ENOENT); + } + lkflags = LK_SHARED | LK_NOPAUSE; + if (interlkp) + lkflags |= LK_INTERLOCK; + if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) + panic("vfs_busy: unexpected lock failure"); + return (0); +} + +/* + * Free a busy filesystem. + */ +void +vfs_unbusy(mp, p) + struct mount *mp; + struct proc *p; +{ + + lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); +} + +/* + * Lookup a filesystem type, and if found allocate and initialize + * a mount structure for it. + * + * Devname is usually updated by mount(8) after booting. 
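 *
 * For example (editorial note), the root mount path in vfs_conf.c
 * obtains its mount structure this way before attempting VFS_MOUNT():
 *
 *	if ((err = vfs_rootmountalloc(mountrootfsname, ROOTNAME, &mp)))
 *		panic("cannot mount root");
 *	mp->mnt_flag |= MNT_ROOTFS;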
+ */ +int +vfs_rootmountalloc(fstypename, devname, mpp) + char *fstypename; + char *devname; + struct mount **mpp; +{ + struct proc *p = curproc; /* XXX */ + struct vfsconf *vfsp; + struct mount *mp; + + if (fstypename == NULL) + return (ENODEV); + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) + return (ENODEV); + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, p); + LIST_INIT(&mp->mnt_vnodelist); + mp->mnt_vfc = vfsp; + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_flag = MNT_RDONLY; + mp->mnt_vnodecovered = NULLVP; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_stat.f_mntonname[0] = '/'; + mp->mnt_stat.f_mntonname[1] = 0; + (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); + *mpp = mp; + return (0); +} + +/* + * Find an appropriate filesystem to use for the root. If a filesystem + * has not been preselected, walk through the list of known filesystems + * trying those that have mountroot routines, and try them until one + * works or we have tried them all. + */ +#ifdef notdef /* XXX JH */ +int +lite2_vfs_mountroot() +{ + struct vfsconf *vfsp; + extern int (*lite2_mountroot) __P((void)); + int error; + + if (lite2_mountroot != NULL) + return ((*lite2_mountroot)()); + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + if (vfsp->vfc_mountroot == NULL) + continue; + if ((error = (*vfsp->vfc_mountroot)()) == 0) + return (0); + printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); + } + return (ENODEV); +} +#endif + +/* + * Lookup a mount point by filesystem identifier. 
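 *
 * Hypothetical caller sketch (editorial addition; fhp and its
 * fh_fsid field are assumed here, as in the NFS file-handle code):
 * map a file handle back to its mount point, failing with ESTALE
 * when the filesystem is no longer mounted:
 *
 *	struct mount *mp = vfs_getvfs(&fhp->fh_fsid);
 *	if (mp == NULL)
 *		return (ESTALE);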
+ */ +struct mount * +vfs_getvfs(fsid) + fsid_t *fsid; +{ + register struct mount *mp; + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; + mp = mp->mnt_list.cqe_next) { + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { + simple_unlock(&mountlist_slock); + return (mp); + } + } + simple_unlock(&mountlist_slock); + return ((struct mount *) 0); +} + +/* + * Get a new unique fsid + */ +void +vfs_getnewfsid(mp) + struct mount *mp; +{ + static u_short xxxfs_mntid; + + fsid_t tfsid; + int mtype; + + simple_lock(&mntid_slock); + mtype = mp->mnt_vfc->vfc_typenum; + mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0); + mp->mnt_stat.f_fsid.val[1] = mtype; + if (xxxfs_mntid == 0) + ++xxxfs_mntid; + tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid); + tfsid.val[1] = mtype; + if (mountlist.cqh_first != (void *)&mountlist) { + while (vfs_getvfs(&tfsid)) { + tfsid.val[0]++; + xxxfs_mntid++; + } + } + mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; + simple_unlock(&mntid_slock); +} + +/* + * Set vnode attributes to VNOVAL + */ +void +vattr_null(vap) + register struct vattr *vap; +{ + + vap->va_type = VNON; + vap->va_size = VNOVAL; + vap->va_bytes = VNOVAL; + vap->va_mode = VNOVAL; + vap->va_nlink = VNOVAL; + vap->va_uid = VNOVAL; + vap->va_gid = VNOVAL; + vap->va_fsid = VNOVAL; + vap->va_fileid = VNOVAL; + vap->va_blocksize = VNOVAL; + vap->va_rdev = VNOVAL; + vap->va_atime.tv_sec = VNOVAL; + vap->va_atime.tv_nsec = VNOVAL; + vap->va_mtime.tv_sec = VNOVAL; + vap->va_mtime.tv_nsec = VNOVAL; + vap->va_ctime.tv_sec = VNOVAL; + vap->va_ctime.tv_nsec = VNOVAL; + vap->va_flags = VNOVAL; + vap->va_gen = VNOVAL; + vap->va_vaflags = 0; +} + +/* + * Routines having to do with the management of the vnode table. + */ +extern vop_t **dead_vnodeop_p; + +/* + * Return the next vnode from the free list. + */ +int +getnewvnode(tag, mp, vops, vpp) + enum vtagtype tag; + struct mount *mp; + vop_t **vops; + struct vnode **vpp; +{ + int s; + struct proc *p = curproc; /* XXX */ + struct vnode *vp, *tvp, *nvp; + vm_object_t object; + TAILQ_HEAD(freelst, vnode) vnode_tmp_list; + + /* + * We take the least recently used vnode from the freelist + * if we can get it and it has no cached pages, and no + * namecache entries are relative to it. 
+ * Otherwise we allocate a new vnode + */ + + s = splbio(); + simple_lock(&vnode_free_list_slock); + TAILQ_INIT(&vnode_tmp_list); + + for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) { + nvp = TAILQ_NEXT(vp, v_freelist); + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + if (vp->v_flag & VAGE) { + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + vp->v_flag &= ~(VTBFREE|VAGE); + vp->v_flag |= VFREE; + if (vp->v_usecount) + panic("tobe free vnode isn't"); + freevnodes++; + } + + if (wantfreevnodes && freevnodes < wantfreevnodes) { + vp = NULL; + } else if (!wantfreevnodes && freevnodes <= desiredvnodes) { + /* + * XXX: this is only here to be backwards compatible + */ + vp = NULL; + } else { + for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) { + nvp = TAILQ_NEXT(vp, v_freelist); + if (!simple_lock_try(&vp->v_interlock)) + continue; + if (vp->v_usecount) + panic("free vnode isn't"); + + object = vp->v_object; + if (object && (object->resident_page_count || object->ref_count)) { + printf("object inconsistant state: RPC: %d, RC: %d\n", + object->resident_page_count, object->ref_count); + /* Don't recycle if it's caching some pages */ + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist); + continue; + } else if (LIST_FIRST(&vp->v_cache_src)) { + /* Don't recycle if active in the namecache */ + simple_unlock(&vp->v_interlock); + continue; + } else { + break; + } + } + } + + for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) { + nvp = TAILQ_NEXT(tvp, v_freelist); + TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist); + TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist); + simple_unlock(&tvp->v_interlock); + } + + if (vp) { + vp->v_flag |= VDOOMED; + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + freevnodes--; + simple_unlock(&vnode_free_list_slock); + cache_purge(vp); + vp->v_lease = NULL; + if (vp->v_type != VBAD) { + vgonel(vp, p); + } else { + simple_unlock(&vp->v_interlock); + } + +#ifdef INVARIANTS + { + int s; + + if (vp->v_data) + panic("cleaned vnode isn't"); + s = splbio(); + if (vp->v_numoutput) + panic("Clean vnode has pending I/O's"); + splx(s); + } +#endif + vp->v_flag = 0; + vp->v_lastr = 0; + vp->v_lastw = 0; + vp->v_lasta = 0; + vp->v_cstart = 0; + vp->v_clen = 0; + vp->v_socket = 0; + vp->v_writecount = 0; /* XXX */ + vp->v_maxio = 0; + } else { + simple_unlock(&vnode_free_list_slock); + vp = (struct vnode *) zalloc(vnode_zone); + bzero((char *) vp, sizeof *vp); + simple_lock_init(&vp->v_interlock); + vp->v_dd = vp; + cache_purge(vp); + LIST_INIT(&vp->v_cache_src); + TAILQ_INIT(&vp->v_cache_dst); + numvnodes++; + } + + TAILQ_INIT(&vp->v_cleanblkhd); + TAILQ_INIT(&vp->v_dirtyblkhd); + vp->v_type = VNON; + vp->v_tag = tag; + vp->v_op = vops; + insmntque(vp, mp); + *vpp = vp; + vp->v_usecount = 1; + vp->v_data = 0; + splx(s); + + vfs_object_create(vp, p, p->p_ucred); + return (0); +} + +/* + * Move a vnode from one mount queue to another. + */ +static void +insmntque(vp, mp) + register struct vnode *vp; + register struct mount *mp; +{ + + simple_lock(&mntvnode_slock); + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) + LIST_REMOVE(vp, v_mntvnodes); + /* + * Insert into list of vnodes for the new mount point, if available. 
+ */ + if ((vp->v_mount = mp) == NULL) { + simple_unlock(&mntvnode_slock); + return; + } + LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); + simple_unlock(&mntvnode_slock); +} + +/* + * Update outstanding I/O count and do wakeup if requested. + */ +void +vwakeup(bp) + register struct buf *bp; +{ + register struct vnode *vp; + + bp->b_flags &= ~B_WRITEINPROG; + if ((vp = bp->b_vp)) { + vp->v_numoutput--; + if (vp->v_numoutput < 0) + panic("vwakeup: neg numoutput"); + if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { + vp->v_flag &= ~VBWAIT; + wakeup((caddr_t) &vp->v_numoutput); + } + } +} + +/* + * Flush out and invalidate all buffers associated with a vnode. + * Called with the underlying object locked. + */ +int +vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct proc *p; + int slpflag, slptimeo; +{ + register struct buf *bp; + struct buf *nbp, *blist; + int s, error; + vm_object_t object; + + if (flags & V_SAVE) { + s = splbio(); + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + error = tsleep((caddr_t)&vp->v_numoutput, + slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); + if (error) { + splx(s); + return (error); + } + } + if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { + splx(s); + if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0) + return (error); + s = splbio(); + if (vp->v_numoutput > 0 || + !TAILQ_EMPTY(&vp->v_dirtyblkhd)) + panic("vinvalbuf: dirty bufs"); + } + splx(s); + } + s = splbio(); + for (;;) { + blist = TAILQ_FIRST(&vp->v_cleanblkhd); + if (!blist) + blist = TAILQ_FIRST(&vp->v_dirtyblkhd); + if (!blist) + break; + + for (bp = blist; bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + error = tsleep((caddr_t) bp, + slpflag | (PRIBIO + 4), "vinvalbuf", + slptimeo); + if (error) { + splx(s); + return (error); + } + break; + } + /* + * XXX Since there are no node locks for NFS, I + * believe there is a slight chance that a delayed + * write will occur while sleeping just above, so + * check for it. Note that vfs_bio_awrite expects + * buffers to reside on a queue, while VOP_BWRITE and + * brelse do not. + */ + if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && + (flags & V_SAVE)) { + + if (bp->b_vp == vp) { + if (bp->b_flags & B_CLUSTEROK) { + vfs_bio_awrite(bp); + } else { + bremfree(bp); + bp->b_flags |= (B_BUSY | B_ASYNC); + VOP_BWRITE(bp); + } + } else { + bremfree(bp); + bp->b_flags |= B_BUSY; + (void) VOP_BWRITE(bp); + } + break; + } + bremfree(bp); + bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF | B_BUSY); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + } + } + + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); + } + + splx(s); + + /* + * Destroy the copy in the VM cache, too. + */ + simple_lock(&vp->v_interlock); + object = vp->v_object; + if (object != NULL) { + vm_object_page_remove(object, 0, 0, + (flags & V_SAVE) ? TRUE : FALSE); + } + simple_unlock(&vp->v_interlock); + + if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) + panic("vinvalbuf: flush failed"); + return (0); +} + +/* + * Truncate a file's buffer and pages to a specified length. This + * is in lieu of the old vinvalbuf mechanism, which performed unneeded + * sync activity. 
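+ *
+ * For example, a filesystem's truncate routine might toss every buffer
+ * past the new end of file with something like (names illustrative):
+ *
+ *	error = vtruncbuf(vp, cred, p, new_length, fs_bsize);
+ *
+ * and only then release the corresponding on-disk blocks.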
+ */ +int +vtruncbuf(vp, cred, p, length, blksize) + register struct vnode *vp; + struct ucred *cred; + struct proc *p; + off_t length; + int blksize; +{ + register struct buf *bp; + struct buf *nbp; + int s, anyfreed; + int trunclbn; + + /* + * Round up to the *next* lbn. + */ + trunclbn = (length + blksize - 1) / blksize; + + s = splbio(); +restart: + anyfreed = 1; + for (;anyfreed;) { + anyfreed = 0; + for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_lblkno >= trunclbn) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep(bp, PRIBIO + 4, "vtrb1", 0); + goto restart; + } else { + bremfree(bp); + bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = 1; + } + if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)|| + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI))) { + goto restart; + } + } + } + + for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_lblkno >= trunclbn) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep(bp, PRIBIO + 4, "vtrb2", 0); + goto restart; + } else { + bremfree(bp); + bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = 1; + } + if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)|| + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI) == 0)) { + goto restart; + } + } + } + } + + if (length > 0) { +restartsync: + for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep(bp, PRIBIO, "vtrb3", 0); + } else { + bremfree(bp); + bp->b_flags |= B_BUSY; + if (bp->b_vp == vp) { + bp->b_flags |= B_ASYNC; + } else { + bp->b_flags &= ~B_ASYNC; + } + VOP_BWRITE(bp); + } + goto restartsync; + } + + } + } + + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); + } + + splx(s); + + vnode_pager_setsize(vp, length); + + return (0); +} + +/* + * Associate a buffer with a vnode. + */ +void +bgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + int s; + + KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); + + vhold(vp); + bp->b_vp = vp; + if (vp->v_type == VBLK || vp->v_type == VCHR) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; + /* + * Insert onto list for new vnode. + */ + s = splbio(); + bp->b_xflags |= B_VNCLEAN; + bp->b_xflags &= ~B_VNDIRTY; + TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); + splx(s); +} + +/* + * Disassociate a buffer from a vnode. + */ +void +brelvp(bp) + register struct buf *bp; +{ + struct vnode *vp; + struct buflists *listheadp; + int s; + + KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); + + /* + * Delete from old vnode list, if on one. + */ + vp = bp->b_vp; + s = splbio(); + if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { + if (bp->b_xflags & B_VNDIRTY) + listheadp = &vp->v_dirtyblkhd; + else + listheadp = &vp->v_cleanblkhd; + TAILQ_REMOVE(listheadp, bp, b_vnbufs); + bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); + } + if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { + vp->v_flag &= ~VONWORKLST; + LIST_REMOVE(vp, v_synclist); + } + splx(s); + bp->b_vp = (struct vnode *) 0; + vdrop(vp); +} + +/* + * The workitem queue. 
+ *
+ * It is useful to delay writes of file data and filesystem metadata
+ * for tens of seconds so that quickly created and deleted files need
+ * not waste disk bandwidth being created and removed. To realize this,
+ * we append vnodes to a "workitem" queue. When running with a soft
+ * updates implementation, most pending metadata dependencies should
+ * not wait for more than a few seconds. Thus, device vnodes that
+ * filesystems are mounted on are delayed only about half the time
+ * that file data is delayed.
+ * Similarly, directory updates are more critical, so they are delayed
+ * only about a third of the time that file data is delayed. Thus, there are
+ * SYNCER_MAXDELAY queues that are processed round-robin at a rate of
+ * one each second (driven off the filesystem syncer process). The
+ * syncer_delayno variable indicates the next queue that is to be processed.
+ * Items that need to be processed soon are placed in this queue:
+ *
+ *	syncer_workitem_pending[syncer_delayno]
+ *
+ * A delay of fifteen seconds is done by placing the request fifteen
+ * entries later in the queue:
+ *
+ *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
+ *
+ */
+
+/*
+ * Add an item to the syncer work queue.
+ */
+void
+vn_syncer_add_to_worklist(vp, delay)
+	struct vnode *vp;
+	int delay;
+{
+	int s, slot;
+
+	s = splbio();
+
+	if (vp->v_flag & VONWORKLST) {
+		LIST_REMOVE(vp, v_synclist);
+	}
+
+	if (delay > syncer_maxdelay - 2)
+		delay = syncer_maxdelay - 2;
+	slot = (syncer_delayno + delay) & syncer_mask;
+
+	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
+	vp->v_flag |= VONWORKLST;
+	splx(s);
+}
+
+static void sched_sync __P((void));
+static struct proc *updateproc;
+static struct kproc_desc up_kp = {
+	"syncer",
+	sched_sync,
+	&updateproc
+};
+SYSINIT_KT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
+
+/*
+ * System filesystem synchronizer daemon.
+ */
+void
+sched_sync(void)
+{
+	struct synclist *slp;
+	struct vnode *vp;
+	long starttime;
+	int s;
+	struct proc *p = updateproc;
+
+	for (;;) {
+		starttime = time_second;
+
+		/*
+		 * Push files whose dirty time has expired.
+		 */
+		s = splbio();
+		slp = &syncer_workitem_pending[syncer_delayno];
+		syncer_delayno += 1;
+		if (syncer_delayno == syncer_maxdelay)
+			syncer_delayno = 0;
+		splx(s);
+
+		while ((vp = LIST_FIRST(slp)) != NULL) {
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+			(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
+			VOP_UNLOCK(vp, 0, p);
+			if (LIST_FIRST(slp) == vp) {
+				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
+				    vp->v_type != VBLK)
+					panic("sched_sync: fsync failed");
+				/*
+				 * Move ourselves to the back of the sync list.
+				 */
+				LIST_REMOVE(vp, v_synclist);
+				vn_syncer_add_to_worklist(vp, syncdelay);
+			}
+		}
+
+		/*
+		 * Do soft update processing.
+		 */
+		if (bioops.io_sync)
+			(*bioops.io_sync)(NULL);
+
+		/*
+		 * The variable rushjob allows the kernel to speed up the
+		 * processing of the filesystem syncer process. A rushjob
+		 * value of N tells the filesystem syncer to process the next
+		 * N seconds worth of work on its queue ASAP. Currently rushjob
+		 * is used by the soft update code to speed up the filesystem
+		 * syncer process when the incore state is getting so far
+		 * ahead of the disk that the kernel memory pool is being
+		 * threatened with exhaustion.
+		 */
+		if (rushjob > 0) {
+			rushjob -= 1;
+			continue;
+		}
+		/*
+		 * If it has taken us less than a second to process the
+		 * current work, then wait. Otherwise start right over
+		 * again.
We can still lose time if any single round + * takes more than two seconds, but it does not really + * matter as we are just trying to generally pace the + * filesystem activity. + */ + if (time_second == starttime) + tsleep(&lbolt, PPAUSE, "syncer", 0); + } +} + +/* + * Associate a p-buffer with a vnode. + * + * Also sets B_PAGING flag to indicate that vnode is not fully associated + * with the buffer. i.e. the bp has not been linked into the vnode or + * ref-counted. + */ +void +pbgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + + KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); + + bp->b_vp = vp; + bp->b_flags |= B_PAGING; + if (vp->v_type == VBLK || vp->v_type == VCHR) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; +} + +/* + * Disassociate a p-buffer from a vnode. + */ +void +pbrelvp(bp) + register struct buf *bp; +{ + + KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); + +#if !defined(MAX_PERF) + /* XXX REMOVE ME */ + if (bp->b_vnbufs.tqe_next != NULL) { + panic( + "relpbuf(): b_vp was probably reassignbuf()d %p %x", + bp, + (int)bp->b_flags + ); + } +#endif + bp->b_vp = (struct vnode *) 0; + bp->b_flags &= ~B_PAGING; +} + +void +pbreassignbuf(bp, newvp) + struct buf *bp; + struct vnode *newvp; +{ +#if !defined(MAX_PERF) + if ((bp->b_flags & B_PAGING) == 0) { + panic( + "pbreassignbuf() on non phys bp %p", + bp + ); + } +#endif + bp->b_vp = newvp; +} + +/* + * Reassign a buffer from one vnode to another. + * Used to assign file specific control information + * (indirect blocks) to the vnode to which they belong. + */ +void +reassignbuf(bp, newvp) + register struct buf *bp; + register struct vnode *newvp; +{ + struct buflists *listheadp; + struct vnode *oldvp; + int delay; + int s; + + if (newvp == NULL) { + printf("reassignbuf: NULL"); + return; + } + +#if !defined(MAX_PERF) + /* + * B_PAGING flagged buffers cannot be reassigned because their vp + * is not fully linked in. + */ + if (bp->b_flags & B_PAGING) + panic("cannot reassign paging buffer"); +#endif + + s = splbio(); + /* + * Delete from old vnode list, if on one. + */ + if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { + oldvp = bp->b_vp; + if (bp->b_xflags & B_VNDIRTY) + listheadp = &oldvp->v_dirtyblkhd; + else + listheadp = &oldvp->v_cleanblkhd; + TAILQ_REMOVE(listheadp, bp, b_vnbufs); + bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); + vdrop(oldvp); + } + /* + * If dirty, put on list of dirty buffers; otherwise insert onto list + * of clean buffers. 
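+ * The vnode is also put on the syncer worklist if it is not there
+ * already; as the switch below shows, directories get roughly
+ * syncdelay/3, mounted block devices syncdelay/2, and everything
+ * else the full syncdelay.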
+ */ + if (bp->b_flags & B_DELWRI) { + struct buf *tbp; + + listheadp = &newvp->v_dirtyblkhd; + if ((newvp->v_flag & VONWORKLST) == 0) { + switch (newvp->v_type) { + case VDIR: + delay = syncdelay / 3; + break; + case VBLK: + if (newvp->v_specmountpoint != NULL) { + delay = syncdelay / 2; + break; + } + /* fall through */ + default: + delay = syncdelay; + } + vn_syncer_add_to_worklist(newvp, delay); + } + bp->b_xflags |= B_VNDIRTY; + tbp = TAILQ_FIRST(listheadp); + if (tbp == NULL || + (bp->b_lblkno >= 0 && tbp->b_lblkno > bp->b_lblkno)) { + TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); + } else { + if (bp->b_lblkno >= 0) { + struct buf *ttbp; + while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && + (ttbp->b_lblkno < bp->b_lblkno)) { + tbp = ttbp; + } + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + } else { + TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + } + } + } else { + bp->b_xflags |= B_VNCLEAN; + TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); + if ((newvp->v_flag & VONWORKLST) && + TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { + newvp->v_flag &= ~VONWORKLST; + LIST_REMOVE(newvp, v_synclist); + } + } + bp->b_vp = newvp; + vhold(bp->b_vp); + splx(s); +} + +/* + * Create a vnode for a block device. + * Used for mounting the root file system. + */ +int +bdevvp(dev, vpp) + dev_t dev; + struct vnode **vpp; +{ + register struct vnode *vp; + struct vnode *nvp; + int error; + + /* XXX 255 is for mfs. */ + if (dev == NODEV || (major(dev) != 255 && (major(dev) >= nblkdev || + bdevsw[major(dev)] == NULL))) { + *vpp = NULLVP; + return (ENXIO); + } + error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); + if (error) { + *vpp = NULLVP; + return (error); + } + vp = nvp; + vp->v_type = VBLK; + if ((nvp = checkalias(vp, dev, (struct mount *)0)) != NULL) { + vput(vp); + vp = nvp; + } + *vpp = vp; + return (0); +} + +/* + * Check to see if the new vnode represents a special device + * for which we already have a vnode (either because of + * bdevvp() or because of a different vnode representing + * the same block device). If such an alias exists, deallocate + * the existing contents and return the aliased vnode. The + * caller is responsible for filling it with its new contents. + */ +struct vnode * +checkalias(nvp, nvp_rdev, mp) + register struct vnode *nvp; + dev_t nvp_rdev; + struct mount *mp; +{ + struct proc *p = curproc; /* XXX */ + struct vnode *vp; + struct vnode **vpp; + + if (nvp->v_type != VBLK && nvp->v_type != VCHR) + return (NULLVP); + + vpp = &speclisth[SPECHASH(nvp_rdev)]; +loop: + simple_lock(&spechash_slock); + for (vp = *vpp; vp; vp = vp->v_specnext) { + if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) + continue; + /* + * Alias, but not in use, so flush it out. + * Only alias active device nodes. + * Not sure why we don't re-use this like we do below. + */ + simple_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + simple_unlock(&spechash_slock); + vgonel(vp, p); + goto loop; + } + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) { + /* + * It dissappeared, and we may have slept. + * Restart from the beginning + */ + simple_unlock(&spechash_slock); + goto loop; + } + break; + } + /* + * It would be a lot clearer what is going on here if + * this had been expressed as: + * if ( vp && (vp->v_tag == VT_NULL)) + * and the clauses had been swapped. + */ + if (vp == NULL || vp->v_tag != VT_NON) { + /* + * Put the new vnode into the hash chain. + * and if there was an alias, connect them. 
+		 */
+		MALLOC(nvp->v_specinfo, struct specinfo *,
+		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
+		nvp->v_rdev = nvp_rdev;
+		nvp->v_hashchain = vpp;
+		nvp->v_specnext = *vpp;
+		nvp->v_specmountpoint = NULL;
+		simple_unlock(&spechash_slock);
+		*vpp = nvp;
+		if (vp != NULLVP) {
+			nvp->v_flag |= VALIASED;
+			vp->v_flag |= VALIASED;
+			vput(vp);
+		}
+		return (NULLVP);
+	}
+	/*
+	 * if ( vp && (vp->v_tag == VT_NULL))
+	 * We have a vnode alias, but it is trashed.
+	 * Make it look like it was newly allocated (by getnewvnode()).
+	 * The caller should use this instead.
+	 */
+	simple_unlock(&spechash_slock);
+	VOP_UNLOCK(vp, 0, p);
+	simple_lock(&vp->v_interlock);
+	vclean(vp, 0, p);
+	vp->v_op = nvp->v_op;
+	vp->v_tag = nvp->v_tag;
+	nvp->v_type = VNON;
+	insmntque(vp, mp);
+	return (vp);
+}
+
+/*
+ * Grab a particular vnode from the free list, increment its
+ * reference count and lock it. The vnode lock bit is set if the
+ * vnode is being eliminated in vgone. The process is awakened
+ * when the transition is completed, and an error returned to
+ * indicate that the vnode is no longer usable (possibly having
+ * been changed to a new file system type).
+ */
+int
+vget(vp, flags, p)
+	register struct vnode *vp;
+	int flags;
+	struct proc *p;
+{
+	int error;
+
+	/*
+	 * If the vnode is in the process of being cleaned out for
+	 * another use, we wait for the cleaning to finish and then
+	 * return failure. Cleaning is determined by checking that
+	 * the VXLOCK flag is set.
+	 */
+	if ((flags & LK_INTERLOCK) == 0) {
+		simple_lock(&vp->v_interlock);
+	}
+	if (vp->v_flag & VXLOCK) {
+		vp->v_flag |= VXWANT;
+		simple_unlock(&vp->v_interlock);
+		tsleep((caddr_t)vp, PINOD, "vget", 0);
+		return (ENOENT);
+	}
+
+	vp->v_usecount++;
+
+	if (VSHOULDBUSY(vp))
+		vbusy(vp);
+	if (flags & LK_TYPE_MASK) {
+		if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) {
+			/*
+			 * must expand vrele here because we do not want
+			 * to call VOP_INACTIVE if the reference count
+			 * drops back to zero since it was never really
+			 * active. We must remove it from the free list
+			 * before sleeping so that multiple processes do
+			 * not try to recycle it.
+			 */
+			simple_lock(&vp->v_interlock);
+			vp->v_usecount--;
+			if (VSHOULDFREE(vp))
+				vfree(vp);
+			simple_unlock(&vp->v_interlock);
+		}
+		return (error);
+	}
+	simple_unlock(&vp->v_interlock);
+	return (0);
+}
+
+void
+vref(struct vnode *vp)
+{
+	simple_lock(&vp->v_interlock);
+	vp->v_usecount++;
+	simple_unlock(&vp->v_interlock);
+}
+
+/*
+ * Vnode put/release.
+ * If count drops to zero, call inactive routine and return to freelist.
+ */
+void
+vrele(vp)
+	struct vnode *vp;
+{
+	struct proc *p = curproc;	/* XXX */
+
+	KASSERT(vp != NULL, ("vrele: null vp"));
+
+	simple_lock(&vp->v_interlock);
+
+	if (vp->v_usecount > 1) {
+
+		vp->v_usecount--;
+		simple_unlock(&vp->v_interlock);
+
+		return;
+	}
+
+	if (vp->v_usecount == 1) {
+
+		vp->v_usecount--;
+		if (VSHOULDFREE(vp))
+			vfree(vp);
+		/*
+		 * If we are doing a vput, the node is already locked, and we must
+		 * call VOP_INACTIVE with the node locked. So, in the case of
+		 * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
+ */ + if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { + VOP_INACTIVE(vp, p); + } + + } else { +#ifdef DIAGNOSTIC + vprint("vrele: negative ref count", vp); + simple_unlock(&vp->v_interlock); +#endif + panic("vrele: negative ref cnt"); + } +} + +void +vput(vp) + struct vnode *vp; +{ + struct proc *p = curproc; /* XXX */ + + KASSERT(vp != NULL, ("vput: null vp")); + + simple_lock(&vp->v_interlock); + + if (vp->v_usecount > 1) { + + vp->v_usecount--; + VOP_UNLOCK(vp, LK_INTERLOCK, p); + return; + + } + + if (vp->v_usecount == 1) { + + vp->v_usecount--; + if (VSHOULDFREE(vp)) + vfree(vp); + /* + * If we are doing a vput, the node is already locked, and we must + * call VOP_INACTIVE with the node locked. So, in the case of + * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. + */ + simple_unlock(&vp->v_interlock); + VOP_INACTIVE(vp, p); + + } else { +#ifdef DIAGNOSTIC + vprint("vput: negative ref count", vp); +#endif + panic("vput: negative ref cnt"); + } +} + +/* + * Somebody doesn't want the vnode recycled. + */ +void +vhold(vp) + register struct vnode *vp; +{ + int s; + + s = splbio(); + vp->v_holdcnt++; + if (VSHOULDBUSY(vp)) + vbusy(vp); + splx(s); +} + +/* + * One less who cares about this vnode. + */ +void +vdrop(vp) + register struct vnode *vp; +{ + int s; + + s = splbio(); + if (vp->v_holdcnt <= 0) + panic("vdrop: holdcnt"); + vp->v_holdcnt--; + if (VSHOULDFREE(vp)) + vfree(vp); + splx(s); +} + +/* + * Remove any vnodes in the vnode table belonging to mount point mp. + * + * If MNT_NOFORCE is specified, there should not be any active ones, + * return error if any are found (nb: this is a user error, not a + * system error). If MNT_FORCE is specified, detach any active vnodes + * that are found. + */ +#ifdef DIAGNOSTIC +static int busyprt = 0; /* print out busy vnodes */ +SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); +#endif + +int +vflush(mp, skipvp, flags) + struct mount *mp; + struct vnode *skipvp; + int flags; +{ + struct proc *p = curproc; /* XXX */ + struct vnode *vp, *nvp; + int busy = 0; + + simple_lock(&mntvnode_slock); +loop: + for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { + /* + * Make sure this vnode wasn't reclaimed in getnewvnode(). + * Start over if it has (it won't be on the list anymore). + */ + if (vp->v_mount != mp) + goto loop; + nvp = vp->v_mntvnodes.le_next; + /* + * Skip over a selected vnode. + */ + if (vp == skipvp) + continue; + + simple_lock(&vp->v_interlock); + /* + * Skip over a vnodes marked VSYSTEM. + */ + if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { + simple_unlock(&vp->v_interlock); + continue; + } + /* + * If WRITECLOSE is set, only flush out regular file vnodes + * open for writing. + */ + if ((flags & WRITECLOSE) && + (vp->v_writecount == 0 || vp->v_type != VREG)) { + simple_unlock(&vp->v_interlock); + continue; + } + + /* + * With v_usecount == 0, all we need to do is clear out the + * vnode data structures and we are done. + */ + if (vp->v_usecount == 0) { + simple_unlock(&mntvnode_slock); + vgonel(vp, p); + simple_lock(&mntvnode_slock); + continue; + } + + /* + * If FORCECLOSE is set, forcibly close the vnode. For block + * or character devices, revert to an anonymous device. For + * all other files, just kill them. 
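+ * (For example, a filesystem's unmount path might call
+ * vflush(mp, NULLVP, FORCECLOSE) when MNT_FORCE was requested;
+ * the exact flags are up to the caller.)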
+ */ + if (flags & FORCECLOSE) { + simple_unlock(&mntvnode_slock); + if (vp->v_type != VBLK && vp->v_type != VCHR) { + vgonel(vp, p); + } else { + vclean(vp, 0, p); + vp->v_op = spec_vnodeop_p; + insmntque(vp, (struct mount *) 0); + } + simple_lock(&mntvnode_slock); + continue; + } +#ifdef DIAGNOSTIC + if (busyprt) + vprint("vflush: busy vnode", vp); +#endif + simple_unlock(&vp->v_interlock); + busy++; + } + simple_unlock(&mntvnode_slock); + if (busy) + return (EBUSY); + return (0); +} + +/* + * Disassociate the underlying file system from a vnode. + */ +static void +vclean(vp, flags, p) + struct vnode *vp; + int flags; + struct proc *p; +{ + int active; + vm_object_t obj; + + /* + * Check to see if the vnode is in use. If so we have to reference it + * before we clean it out so that its count cannot fall to zero and + * generate a race against ourselves to recycle it. + */ + if ((active = vp->v_usecount)) + vp->v_usecount++; + + /* + * Prevent the vnode from being recycled or brought into use while we + * clean it out. + */ + if (vp->v_flag & VXLOCK) + panic("vclean: deadlock"); + vp->v_flag |= VXLOCK; + /* + * Even if the count is zero, the VOP_INACTIVE routine may still + * have the object locked while it cleans it out. The VOP_LOCK + * ensures that the VOP_INACTIVE routine is done with its work. + * For active vnodes, it ensures that no other activity can + * occur while the underlying object is being cleaned out. + */ + VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); + + /* + * Clean out any buffers associated with the vnode. + */ + vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); + if (obj = vp->v_object) { + if (obj->ref_count == 0) { + /* + * This is a normal way of shutting down the object/vnode + * association. + */ + vm_object_terminate(obj); + } else { + /* + * Woe to the process that tries to page now :-). + */ + vm_pager_deallocate(obj); + } + } + + /* + * If purging an active vnode, it must be closed and + * deactivated before being reclaimed. Note that the + * VOP_INACTIVE will unlock the vnode. + */ + if (active) { + if (flags & DOCLOSE) + VOP_CLOSE(vp, FNONBLOCK, NOCRED, p); + VOP_INACTIVE(vp, p); + } else { + /* + * Any other processes trying to obtain this lock must first + * wait for VXLOCK to clear, then call the new lock operation. + */ + VOP_UNLOCK(vp, 0, p); + } + /* + * Reclaim the vnode. + */ + if (VOP_RECLAIM(vp, p)) + panic("vclean: cannot reclaim"); + + if (active) + vrele(vp); + + cache_purge(vp); + if (vp->v_vnlock) { +#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */ +#ifdef DIAGNOSTIC + if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0) + vprint("vclean: lock not drained", vp); +#endif +#endif + FREE(vp->v_vnlock, M_VNODE); + vp->v_vnlock = NULL; + } + + if (VSHOULDFREE(vp)) + vfree(vp); + + /* + * Done with purge, notify sleepers of the grim news. + */ + vp->v_op = dead_vnodeop_p; + vn_pollgone(vp); + vp->v_tag = VT_NON; + vp->v_flag &= ~VXLOCK; + if (vp->v_flag & VXWANT) { + vp->v_flag &= ~VXWANT; + wakeup((caddr_t) vp); + } +} + +/* + * Eliminate all activity associated with the requested vnode + * and with all vnodes aliased to the requested vnode. 
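+ * This is normally reached as VOP_REVOKE(vp, REVOKEALL), e.g. from
+ * the revoke(2) system call.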
+ */ +int +vop_revoke(ap) + struct vop_revoke_args /* { + struct vnode *a_vp; + int a_flags; + } */ *ap; +{ + struct vnode *vp, *vq; + struct proc *p = curproc; /* XXX */ + + KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); + + vp = ap->a_vp; + simple_lock(&vp->v_interlock); + + if (vp->v_flag & VALIASED) { + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0); + return (0); + } + /* + * Ensure that vp will not be vgone'd while we + * are eliminating its aliases. + */ + vp->v_flag |= VXLOCK; + simple_unlock(&vp->v_interlock); + while (vp->v_flag & VALIASED) { + simple_lock(&spechash_slock); + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type || vp == vq) + continue; + simple_unlock(&spechash_slock); + vgone(vq); + break; + } + if (vq == NULLVP) { + simple_unlock(&spechash_slock); + } + } + /* + * Remove the lock so that vgone below will + * really eliminate the vnode after which time + * vgone will awaken any sleepers. + */ + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VXLOCK; + if (vp->v_flag & VXWANT) { + vp->v_flag &= ~VXWANT; + wakeup(vp); + } + } + vgonel(vp, p); + return (0); +} + +/* + * Recycle an unused vnode to the front of the free list. + * Release the passed interlock if the vnode will be recycled. + */ +int +vrecycle(vp, inter_lkp, p) + struct vnode *vp; + struct simplelock *inter_lkp; + struct proc *p; +{ + + simple_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + if (inter_lkp) { + simple_unlock(inter_lkp); + } + vgonel(vp, p); + return (1); + } + simple_unlock(&vp->v_interlock); + return (0); +} + +/* + * Eliminate all activity associated with a vnode + * in preparation for reuse. + */ +void +vgone(vp) + register struct vnode *vp; +{ + struct proc *p = curproc; /* XXX */ + + simple_lock(&vp->v_interlock); + vgonel(vp, p); +} + +/* + * vgone, with the vp interlock held. + */ +static void +vgonel(vp, p) + struct vnode *vp; + struct proc *p; +{ + int s; + struct vnode *vq; + struct vnode *vx; + + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vgone", 0); + return; + } + + /* + * Clean out the filesystem specific data. + */ + vclean(vp, DOCLOSE, p); + simple_lock(&vp->v_interlock); + + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) + insmntque(vp, (struct mount *)0); + /* + * If special device, remove it from special device alias list + * if it is on one. 
+ */ + if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) { + simple_lock(&spechash_slock); + if (*vp->v_hashchain == vp) { + *vp->v_hashchain = vp->v_specnext; + } else { + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_specnext != vp) + continue; + vq->v_specnext = vp->v_specnext; + break; + } + if (vq == NULL) + panic("missing bdev"); + } + if (vp->v_flag & VALIASED) { + vx = NULL; + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vx) + break; + vx = vq; + } + if (vx == NULL) + panic("missing alias"); + if (vq == NULL) + vx->v_flag &= ~VALIASED; + vp->v_flag &= ~VALIASED; + } + simple_unlock(&spechash_slock); + FREE(vp->v_specinfo, M_VNODE); + vp->v_specinfo = NULL; + } + + /* + * If it is on the freelist and not already at the head, + * move it to the head of the list. The test of the back + * pointer and the reference count of zero is because + * it will be removed from the free list by getnewvnode, + * but will not have its reference count incremented until + * after calling vgone. If the reference count were + * incremented first, vgone would (incorrectly) try to + * close the previous instance of the underlying object. + */ + if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { + s = splbio(); + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VFREE) { + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + } else if (vp->v_flag & VTBFREE) { + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + vp->v_flag &= ~VTBFREE; + freevnodes++; + } else + freevnodes++; + vp->v_flag |= VFREE; + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + simple_unlock(&vnode_free_list_slock); + splx(s); + } + + vp->v_type = VBAD; + simple_unlock(&vp->v_interlock); +} + +/* + * Lookup a vnode by device number. + */ +int +vfinddev(dev, type, vpp) + dev_t dev; + enum vtype type; + struct vnode **vpp; +{ + register struct vnode *vp; + int rc = 0; + + simple_lock(&spechash_slock); + for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { + if (dev != vp->v_rdev || type != vp->v_type) + continue; + *vpp = vp; + rc = 1; + break; + } + simple_unlock(&spechash_slock); + return (rc); +} + +/* + * Calculate the total number of references to a special device. + */ +int +vcount(vp) + register struct vnode *vp; +{ + struct vnode *vq, *vnext; + int count; + +loop: + if ((vp->v_flag & VALIASED) == 0) + return (vp->v_usecount); + simple_lock(&spechash_slock); + for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) { + vnext = vq->v_specnext; + if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) + continue; + /* + * Alias, but not in use, so flush it out. + */ + if (vq->v_usecount == 0 && vq != vp) { + simple_unlock(&spechash_slock); + vgone(vq); + goto loop; + } + count += vq->v_usecount; + } + simple_unlock(&spechash_slock); + return (count); +} +/* + * Print out a description of a vnode. 
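+ * (Used, for instance, by the DDB ``show lockedvnodes'' command and
+ * by the DIAGNOSTIC busyprt code above.)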
+ */ +static char *typename[] = +{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; + +void +vprint(label, vp) + char *label; + register struct vnode *vp; +{ + char buf[96]; + + if (label != NULL) + printf("%s: %p: ", label, (void *)vp); + else + printf("%p: ", (void *)vp); + printf("type %s, usecount %d, writecount %d, refcount %d,", + typename[vp->v_type], vp->v_usecount, vp->v_writecount, + vp->v_holdcnt); + buf[0] = '\0'; + if (vp->v_flag & VROOT) + strcat(buf, "|VROOT"); + if (vp->v_flag & VTEXT) + strcat(buf, "|VTEXT"); + if (vp->v_flag & VSYSTEM) + strcat(buf, "|VSYSTEM"); + if (vp->v_flag & VXLOCK) + strcat(buf, "|VXLOCK"); + if (vp->v_flag & VXWANT) + strcat(buf, "|VXWANT"); + if (vp->v_flag & VBWAIT) + strcat(buf, "|VBWAIT"); + if (vp->v_flag & VALIASED) + strcat(buf, "|VALIASED"); + if (vp->v_flag & VDOOMED) + strcat(buf, "|VDOOMED"); + if (vp->v_flag & VFREE) + strcat(buf, "|VFREE"); + if (vp->v_flag & VOBJBUF) + strcat(buf, "|VOBJBUF"); + if (buf[0] != '\0') + printf(" flags (%s)", &buf[1]); + if (vp->v_data == NULL) { + printf("\n"); + } else { + printf("\n\t"); + VOP_PRINT(vp); + } +} + +#ifdef DDB +#include <ddb/ddb.h> +/* + * List all of the locked vnodes in the system. + * Called when debugging the kernel. + */ +DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) +{ + struct proc *p = curproc; /* XXX */ + struct mount *mp, *nmp; + struct vnode *vp; + + printf("Locked vnodes\n"); + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) { + if (VOP_ISLOCKED(vp)) + vprint((char *)0, vp); + } + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); +} +#endif + +/* + * Top level filesystem related information gathering. + */ +static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); + +static int +vfs_sysctl SYSCTL_HANDLER_ARGS +{ + int *name = (int *)arg1 - 1; /* XXX */ + u_int namelen = arg2 + 1; /* XXX */ + struct vfsconf *vfsp; + +#if 1 || defined(COMPAT_PRELITE2) + /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ + if (namelen == 1) + return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); +#endif + +#ifdef notyet + /* all sysctl names at this level are at least name and field */ + if (namelen < 2) + return (ENOTDIR); /* overloaded */ + if (name[0] != VFS_GENERIC) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[0]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, + oldp, oldlenp, newp, newlen, p)); + } +#endif + switch (name[1]) { + case VFS_MAXTYPENUM: + if (namelen != 2) + return (ENOTDIR); + return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); + case VFS_CONF: + if (namelen != 3) + return (ENOTDIR); /* overloaded */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[2]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); + } + return (EOPNOTSUPP); +} + +SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, + "Generic filesystem"); + +#if 1 || defined(COMPAT_PRELITE2) + +static int +sysctl_ovfs_conf SYSCTL_HANDLER_ARGS +{ + int error; + struct vfsconf *vfsp; + struct ovfsconf ovfs; + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ + strcpy(ovfs.vfc_name, vfsp->vfc_name); + ovfs.vfc_index = vfsp->vfc_typenum; + ovfs.vfc_refcount = vfsp->vfc_refcount; + ovfs.vfc_flags = vfsp->vfc_flags; + error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); + if (error) + return error; + } + return 0; +} + +#endif /* 1 || COMPAT_PRELITE2 */ + +#if 0 +#define KINFO_VNODESLOP 10 +/* + * Dump vnode list (via sysctl). + * Copyout address of vnode followed by vnode. + */ +/* ARGSUSED */ +static int +sysctl_vnode SYSCTL_HANDLER_ARGS +{ + struct proc *p = curproc; /* XXX */ + struct mount *mp, *nmp; + struct vnode *nvp, *vp; + int error; + +#define VPTRSZ sizeof (struct vnode *) +#define VNODESZ sizeof (struct vnode) + + req->lock = 0; + if (!req->oldptr) /* Make an estimate */ + return (SYSCTL_OUT(req, 0, + (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } +again: + simple_lock(&mntvnode_slock); + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = nvp) { + /* + * Check that the vp is still associated with + * this filesystem. RACE: could have been + * recycled onto the same filesystem. + */ + if (vp->v_mount != mp) { + simple_unlock(&mntvnode_slock); + goto again; + } + nvp = vp->v_mntvnodes.le_next; + simple_unlock(&mntvnode_slock); + if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || + (error = SYSCTL_OUT(req, vp, VNODESZ))) + return (error); + simple_lock(&mntvnode_slock); + } + simple_unlock(&mntvnode_slock); + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + + return (0); +} +#endif + +/* + * XXX + * Exporting the vnode list on large systems causes them to crash. + * Exporting the vnode list on medium systems causes sysctl to coredump. + */ +#if 0 +SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_vnode, "S,vnode", ""); +#endif + +/* + * Check to see if a filesystem is mounted on a block device. 
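+ * For example, a disk filesystem's mount code can call
+ * vfs_mountedon(devvp) and refuse (EBUSY) to mount a device vnode
+ * that already backs a mounted filesystem.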
+ */ +int +vfs_mountedon(vp) + struct vnode *vp; +{ + struct vnode *vq; + int error = 0; + + if (vp->v_specmountpoint != NULL) + return (EBUSY); + if (vp->v_flag & VALIASED) { + simple_lock(&spechash_slock); + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vq->v_specmountpoint != NULL) { + error = EBUSY; + break; + } + } + simple_unlock(&spechash_slock); + } + return (error); +} + +/* + * Unmount all filesystems. The list is traversed in reverse order + * of mounting to avoid dependencies. + */ +void +vfs_unmountall() +{ + struct mount *mp, *nmp; + struct proc *p; + int error; + + if (curproc != NULL) + p = curproc; + else + p = initproc; /* XXX XXX should this be proc0? */ + /* + * Since this only runs when rebooting, it is not interlocked. + */ + for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { + nmp = mp->mnt_list.cqe_prev; + error = dounmount(mp, MNT_FORCE, p); + if (error) { + printf("unmount of %s failed (", + mp->mnt_stat.f_mntonname); + if (error == EBUSY) + printf("BUSY)\n"); + else + printf("%d)\n", error); + } + } +} + +/* + * Build hash lists of net addresses and hang them off the mount point. + * Called by ufs_mount() to set up the lists of export addresses. + */ +static int +vfs_hang_addrlist(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + register int i; + struct radix_node *rn; + struct sockaddr *saddr, *smask = 0; + struct domain *dom; + int error; + + if (argp->ex_addrlen == 0) { + if (mp->mnt_flag & MNT_DEFEXPORTED) + return (EPERM); + np = &nep->ne_defexported; + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + mp->mnt_flag |= MNT_DEFEXPORTED; + return (0); + } + i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; + np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); + bzero((caddr_t) np, i); + saddr = (struct sockaddr *) (np + 1); + if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) + goto out; + if (saddr->sa_len > argp->ex_addrlen) + saddr->sa_len = argp->ex_addrlen; + if (argp->ex_masklen) { + smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); + error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); + if (error) + goto out; + if (smask->sa_len > argp->ex_masklen) + smask->sa_len = argp->ex_masklen; + } + i = saddr->sa_family; + if ((rnh = nep->ne_rtable[i]) == 0) { + /* + * Seems silly to initialize every AF when most are not used, + * do so on demand here + */ + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_family == i && dom->dom_rtattach) { + dom->dom_rtattach((void **) &nep->ne_rtable[i], + dom->dom_rtoffset); + break; + } + if ((rnh = nep->ne_rtable[i]) == 0) { + error = ENOBUFS; + goto out; + } + } + rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, + np->netc_rnodes); + if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ + error = EPERM; + goto out; + } + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + return (0); +out: + free(np, M_NETADDR); + return (error); +} + +/* ARGSUSED */ +static int +vfs_free_netcred(rn, w) + struct radix_node *rn; + void *w; +{ + register struct radix_node_head *rnh = (struct radix_node_head *) w; + + (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); + free((caddr_t) rn, M_NETADDR); + return (0); +} + +/* 
+ * Free the net address hash lists that are hanging off the mount points. + */ +static void +vfs_free_addrlist(nep) + struct netexport *nep; +{ + register int i; + register struct radix_node_head *rnh; + + for (i = 0; i <= AF_MAX; i++) + if ((rnh = nep->ne_rtable[i])) { + (*rnh->rnh_walktree) (rnh, vfs_free_netcred, + (caddr_t) rnh); + free((caddr_t) rnh, M_RTABLE); + nep->ne_rtable[i] = 0; + } +} + +int +vfs_export(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + + if (argp->ex_flags & MNT_DELEXPORT) { + if (mp->mnt_flag & MNT_EXPUBLIC) { + vfs_setpublicfs(NULL, NULL, NULL); + mp->mnt_flag &= ~MNT_EXPUBLIC; + } + vfs_free_addrlist(nep); + mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); + } + if (argp->ex_flags & MNT_EXPORTED) { + if (argp->ex_flags & MNT_EXPUBLIC) { + if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) + return (error); + mp->mnt_flag |= MNT_EXPUBLIC; + } + if ((error = vfs_hang_addrlist(mp, nep, argp))) + return (error); + mp->mnt_flag |= MNT_EXPORTED; + } + return (0); +} + + +/* + * Set the publicly exported filesystem (WebNFS). Currently, only + * one public filesystem is possible in the spec (RFC 2054 and 2055) + */ +int +vfs_setpublicfs(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + struct vnode *rvp; + char *cp; + + /* + * mp == NULL -> invalidate the current info, the FS is + * no longer exported. May be called from either vfs_export + * or unmount, so check if it hasn't already been done. + */ + if (mp == NULL) { + if (nfs_pub.np_valid) { + nfs_pub.np_valid = 0; + if (nfs_pub.np_index != NULL) { + FREE(nfs_pub.np_index, M_TEMP); + nfs_pub.np_index = NULL; + } + } + return (0); + } + + /* + * Only one allowed at a time. + */ + if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) + return (EBUSY); + + /* + * Get real filehandle for root of exported FS. + */ + bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); + nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; + + if ((error = VFS_ROOT(mp, &rvp))) + return (error); + + if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) + return (error); + + vput(rvp); + + /* + * If an indexfile was specified, pull it in. + */ + if (argp->ex_indexfile != NULL) { + MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, + M_WAITOK); + error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, + MAXNAMLEN, (size_t *)0); + if (!error) { + /* + * Check for illegal filenames. + */ + for (cp = nfs_pub.np_index; *cp; cp++) { + if (*cp == '/') { + error = EINVAL; + break; + } + } + } + if (error) { + FREE(nfs_pub.np_index, M_TEMP); + return (error); + } + } + + nfs_pub.np_mount = mp; + nfs_pub.np_valid = 1; + return (0); +} + +struct netcred * +vfs_export_lookup(mp, nep, nam) + register struct mount *mp; + struct netexport *nep; + struct sockaddr *nam; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + struct sockaddr *saddr; + + np = NULL; + if (mp->mnt_flag & MNT_EXPORTED) { + /* + * Lookup in the export list first. + */ + if (nam != NULL) { + saddr = nam; + rnh = nep->ne_rtable[saddr->sa_family]; + if (rnh != NULL) { + np = (struct netcred *) + (*rnh->rnh_matchaddr)((caddr_t)saddr, + rnh); + if (np && np->netc_rnodes->rn_flags & RNF_ROOT) + np = NULL; + } + } + /* + * If no address match, use the default if it exists. 
+ */ + if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) + np = &nep->ne_defexported; + } + return (np); +} + +/* + * perform msync on all vnodes under a mount point + * the mount point must be locked. + */ +void +vfs_msync(struct mount *mp, int flags) { + struct vnode *vp, *nvp; + struct vm_object *obj; + int anyio, tries; + + tries = 5; +loop: + anyio = 0; + for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { + + nvp = vp->v_mntvnodes.le_next; + + if (vp->v_mount != mp) { + goto loop; + } + + if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ + continue; + + if (flags != MNT_WAIT) { + obj = vp->v_object; + if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0) + continue; + if (VOP_ISLOCKED(vp)) + continue; + } + + simple_lock(&vp->v_interlock); + if (vp->v_object && + (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { + if (!vget(vp, + LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { + if (vp->v_object) { + vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0); + anyio = 1; + } + vput(vp); + } + } else { + simple_unlock(&vp->v_interlock); + } + } + if (anyio && (--tries > 0)) + goto loop; +} + +/* + * Create the VM object needed for VMIO and mmap support. This + * is done for all VREG files in the system. Some filesystems might + * afford the additional metadata buffering capability of the + * VMIO code by making the device node be VMIO mode also. + * + * vp must be locked when vfs_object_create is called. + */ +int +vfs_object_create(vp, p, cred) + struct vnode *vp; + struct proc *p; + struct ucred *cred; +{ + struct vattr vat; + vm_object_t object; + int error = 0; + + if ((vp->v_type != VREG) && (vp->v_type != VBLK)) + return 0; + +retry: + if ((object = vp->v_object) == NULL) { + if (vp->v_type == VREG) { + if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0) + goto retn; + object = vnode_pager_alloc(vp, vat.va_size, 0, 0); + } else if (major(vp->v_rdev) < nblkdev && + bdevsw[major(vp->v_rdev)] != NULL) { + /* + * This simply allocates the biggest object possible + * for a VBLK vnode. This should be fixed, but doesn't + * cause any problems (yet). + */ + object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0); + } + object->ref_count--; + vp->v_usecount--; + } else { + if (object->flags & OBJ_DEAD) { + VOP_UNLOCK(vp, 0, p); + tsleep(object, PVM, "vodead", 0); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + goto retry; + } + } + + if (vp->v_object) + vp->v_flag |= VOBJBUF; + +retn: + return error; +} + +static void +vfree(vp) + struct vnode *vp; +{ + int s; + + s = splbio(); + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VTBFREE) { + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + vp->v_flag &= ~VTBFREE; + } + if (vp->v_flag & VAGE) { + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + freevnodes++; + simple_unlock(&vnode_free_list_slock); + vp->v_flag &= ~VAGE; + vp->v_flag |= VFREE; + splx(s); +} + +void +vbusy(vp) + struct vnode *vp; +{ + int s; + + s = splbio(); + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VTBFREE) { + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + vp->v_flag &= ~VTBFREE; + } else { + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + freevnodes--; + } + simple_unlock(&vnode_free_list_slock); + vp->v_flag &= ~(VFREE|VAGE); + splx(s); +} + +/* + * Record a process's interest in events which might happen to + * a vnode. 
Because poll uses the historic select-style interface
+ * internally, this routine serves as both the ``check for any
+ * pending events'' and the ``record my interest in future events''
+ * functions. (These are done together, while the lock is held,
+ * to avoid race conditions.)
+ */
+int
+vn_pollrecord(vp, p, events)
+	struct vnode *vp;
+	struct proc *p;
+	short events;
+{
+	simple_lock(&vp->v_pollinfo.vpi_lock);
+	if (vp->v_pollinfo.vpi_revents & events) {
+		/*
+		 * This leaves events we are not interested
+		 * in available for the other process which
+		 * presumably had requested them
+		 * (otherwise they would never have been
+		 * recorded).
+		 */
+		events &= vp->v_pollinfo.vpi_revents;
+		vp->v_pollinfo.vpi_revents &= ~events;
+
+		simple_unlock(&vp->v_pollinfo.vpi_lock);
+		return events;
+	}
+	vp->v_pollinfo.vpi_events |= events;
+	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
+	simple_unlock(&vp->v_pollinfo.vpi_lock);
+	return 0;
+}
+
+/*
+ * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
+ * it is possible for us to miss an event due to race conditions, but
+ * that condition is expected to be rare, so for the moment it is the
+ * preferred interface.
+ */
+void
+vn_pollevent(vp, events)
+	struct vnode *vp;
+	short events;
+{
+	simple_lock(&vp->v_pollinfo.vpi_lock);
+	if (vp->v_pollinfo.vpi_events & events) {
+		/*
+		 * We clear vpi_events so that we don't
+		 * call selwakeup() twice if two events are
+		 * posted before the polling process(es) is
+		 * awakened. This also ensures that we take at
+		 * most one selwakeup() if the polling process
+		 * is no longer interested. However, it does
+		 * mean that only one event can be noticed at
+		 * a time. (Perhaps we should only clear those
+		 * event bits which we note?) XXX
+		 */
+		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
+		vp->v_pollinfo.vpi_revents |= events;
+		selwakeup(&vp->v_pollinfo.vpi_selinfo);
+	}
+	simple_unlock(&vp->v_pollinfo.vpi_lock);
+}
+
+/*
+ * Wake up anyone polling on vp because it is being revoked.
+ * This depends on dead_poll() returning POLLHUP for correct
+ * behavior.
+ */
+void
+vn_pollgone(vp)
+	struct vnode *vp;
+{
+	simple_lock(&vp->v_pollinfo.vpi_lock);
+	if (vp->v_pollinfo.vpi_events) {
+		vp->v_pollinfo.vpi_events = 0;
+		selwakeup(&vp->v_pollinfo.vpi_selinfo);
+	}
+	simple_unlock(&vp->v_pollinfo.vpi_lock);
+}
+
+
+
+/*
+ * Routine to create and manage a filesystem syncer vnode.
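+ *
+ * Each mounted filesystem gets one of these via vfs_allocate_syncvnode()
+ * below; sched_sync() above then fsyncs it once per pass with MNT_LAZY,
+ * which is what ends up calling sync_fsync().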
+ */
+#define sync_close ((int (*) __P((struct vop_close_args *)))nullop)
+static int	sync_fsync __P((struct vop_fsync_args *));
+static int	sync_inactive __P((struct vop_inactive_args *));
+static int	sync_reclaim __P((struct vop_reclaim_args *));
+#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock)
+#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock)
+static int	sync_print __P((struct vop_print_args *));
+#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked)
+
+static vop_t **sync_vnodeop_p;
+static struct vnodeopv_entry_desc sync_vnodeop_entries[] = {
+	{ &vop_default_desc,	(vop_t *) vop_eopnotsupp },
+	{ &vop_close_desc,	(vop_t *) sync_close },		/* close */
+	{ &vop_fsync_desc,	(vop_t *) sync_fsync },		/* fsync */
+	{ &vop_inactive_desc,	(vop_t *) sync_inactive },	/* inactive */
+	{ &vop_reclaim_desc,	(vop_t *) sync_reclaim },	/* reclaim */
+	{ &vop_lock_desc,	(vop_t *) sync_lock },		/* lock */
+	{ &vop_unlock_desc,	(vop_t *) sync_unlock },	/* unlock */
+	{ &vop_print_desc,	(vop_t *) sync_print },		/* print */
+	{ &vop_islocked_desc,	(vop_t *) sync_islocked },	/* islocked */
+	{ NULL, NULL }
+};
+static struct vnodeopv_desc sync_vnodeop_opv_desc =
+	{ &sync_vnodeop_p, sync_vnodeop_entries };
+
+VNODEOP_SET(sync_vnodeop_opv_desc);
+
+/*
+ * Create a new filesystem syncer vnode for the specified mount point.
+ */
+int
+vfs_allocate_syncvnode(mp)
+	struct mount *mp;
+{
+	struct vnode *vp;
+	static long start, incr, next;
+	int error;
+
+	/* Allocate a new vnode */
+	if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) {
+		mp->mnt_syncer = NULL;
+		return (error);
+	}
+	vp->v_type = VNON;
+	/*
+	 * Place the vnode onto the syncer worklist. We attempt to
+	 * scatter them about on the list so that they will go off
+	 * at evenly distributed times even if all the filesystems
+	 * are mounted at once.
+	 */
+	next += incr;
+	if (next == 0 || next > syncer_maxdelay) {
+		start /= 2;
+		incr /= 2;
+		if (start == 0) {
+			start = syncer_maxdelay / 2;
+			incr = syncer_maxdelay;
+		}
+		next = start;
+	}
+	vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0);
+	mp->mnt_syncer = vp;
+	return (0);
+}
+
+/*
+ * Do a lazy sync of the filesystem.
+ */
+static int
+sync_fsync(ap)
+	struct vop_fsync_args /* {
+		struct vnode *a_vp;
+		struct ucred *a_cred;
+		int a_waitfor;
+		struct proc *a_p;
+	} */ *ap;
+{
+	struct vnode *syncvp = ap->a_vp;
+	struct mount *mp = syncvp->v_mount;
+	struct proc *p = ap->a_p;
+	int asyncflag;
+
+	/*
+	 * We only need to do something if this is a lazy evaluation.
+	 */
+	if (ap->a_waitfor != MNT_LAZY)
+		return (0);
+
+	/*
+	 * Move ourselves to the back of the sync list.
+	 */
+	vn_syncer_add_to_worklist(syncvp, syncdelay);
+
+	/*
+	 * Walk the list of vnodes pushing all that are dirty and
+	 * not already on the sync list.
+	 */
+	simple_lock(&mountlist_slock);
+	if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) {
+		simple_unlock(&mountlist_slock);
+		return (0);
+	}
+	asyncflag = mp->mnt_flag & MNT_ASYNC;
+	mp->mnt_flag &= ~MNT_ASYNC;
+	vfs_msync(mp, MNT_NOWAIT);
+	VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p);
+	if (asyncflag)
+		mp->mnt_flag |= MNT_ASYNC;
+	vfs_unbusy(mp, p);
+	return (0);
+}
+
+/*
+ * The syncer vnode is no longer referenced.
+ */
+static int
+sync_inactive(ap)
+	struct vop_inactive_args /* {
+		struct vnode *a_vp;
+		struct proc *a_p;
+	} */ *ap;
+{
+
+	vgone(ap->a_vp);
+	return (0);
+}
+
+/*
+ * The syncer vnode is no longer needed and is being decommissioned.
+ */ +static int +sync_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + vp->v_mount->mnt_syncer = NULL; + if (vp->v_flag & VONWORKLST) { + LIST_REMOVE(vp, v_synclist); + vp->v_flag &= ~VONWORKLST; + } + + return (0); +} + +/* + * Print out a syncer vnode. + */ +static int +sync_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + printf("syncer vnode"); + if (vp->v_vnlock != NULL) + lockmgr_printinfo(vp->v_vnlock); + printf("\n"); + return (0); +} diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c new file mode 100644 index 0000000..18e39d6 --- /dev/null +++ b/sys/kern/vfs_extattr.c @@ -0,0 +1,3034 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + * $Id: vfs_syscalls.c,v 1.111 1998/12/12 21:07:09 dillon Exp $ + */ + +/* For 4.3 integer FS ID compatibility */ +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/namei.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/linker.h> +#include <sys/stat.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/dirent.h> + +#include <miscfs/union/union.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_zone.h> +#include <sys/sysctl.h> + +static int change_dir __P((struct nameidata *ndp, struct proc *p)); +static void checkdirs __P((struct vnode *olddp)); +static int setfown __P((struct proc *, struct vnode *, uid_t, gid_t)); +static int setfmode __P((struct proc *, struct vnode *, int)); +static int setfflags __P((struct proc *, struct vnode *, int)); +static int setutimes __P((struct proc *, struct vnode *, struct timeval *, int)); +static int usermount = 0; /* if 1, non-root can mount fs. */ + +int (*union_dircheckp) __P((struct proc *, struct vnode **, struct file *)); + +SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, ""); + +/* + * Virtual File System System Calls + */ + +/* + * Mount a file system. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +mount(p, uap) + struct proc *p; + register struct mount_args /* { + syscallarg(char *) type; + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(caddr_t) data; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + int error, flag = 0, flag2 = 0; + struct vattr va; + u_long fstypenum; + struct nameidata nd; + char fstypename[MFSNAMELEN]; + + if (usermount == 0 && (error = suser(p->p_ucred, &p->p_acflag))) + return (error); + + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (SCARG(uap, flags) & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + flag = mp->mnt_flag; + flag2 = mp->mnt_kern_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + if ((SCARG(uap, flags) & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + mp->mnt_flag |= + SCARG(uap, flags) & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (vfs_busy(mp, LK_NOWAIT, 0, p)) { + vput(vp); + return (EBUSY); + } + VOP_UNLOCK(vp, 0, p); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. 
+ */ + if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) || + (va.va_uid != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag)))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (error = vinvalbuf(vp, V_SAVE, p->p_ucred, p, 0, 0)) + return (error); + if (vp->v_type != VDIR) { + vput(vp); + return (ENOTDIR); + } +#ifdef COMPAT_43 + /* + * Historically filesystem types were identified by number. If we + * get an integer for the filesystem type instead of a string, we + * check to see if it matches one of the historic filesystem types. + */ + fstypenum = (uintptr_t)SCARG(uap, type); + if (fstypenum < maxvfsconf) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == fstypenum) + break; + if (vfsp == NULL) { + vput(vp); + return (ENODEV); + } + strncpy(fstypename, vfsp->vfc_name, MFSNAMELEN); + } else +#endif /* COMPAT_43 */ + if (error = copyinstr(SCARG(uap, type), fstypename, MFSNAMELEN, NULL)) { + vput(vp); + return (error); + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) { + linker_file_t lf; + + /* Refuse to load modules if securelevel raised */ + if (securelevel > 0) { + vput(vp); + return EPERM; + } + /* Only load modules for root (very important!) */ + if (error = suser(p->p_ucred, &p->p_acflag)) { + vput(vp); + return error; + } + error = linker_load_file(fstypename, &lf); + if (error || lf == NULL) { + vput(vp); + if (lf == NULL) + error = ENODEV; + return error; + } + lf->userrefs++; + /* lookup again, see if the VFS was loaded */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) { + lf->userrefs--; + linker_file_unload(lf); + vput(vp); + return (ENODEV); + } + } + simple_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || + vp->v_mountedhere != NULL) { + simple_unlock(&vp->v_interlock); + vput(vp); + return (EBUSY); + } + vp->v_flag |= VMOUNT; + simple_unlock(&vp->v_interlock); + + /* + * Allocate and initialize the filesystem. + */ + mp = (struct mount *)malloc((u_long)sizeof(struct mount), + M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, p); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = p->p_ucred->cr_uid; + VOP_UNLOCK(vp, 0, p); +update: + /* + * Set the mount level flags. + */ + if (SCARG(uap, flags) & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_kern_flag |= MNTK_WANTRDWR; + mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME | + MNT_NOSYMFOLLOW | + MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR); + mp->mnt_flag |= SCARG(uap, flags) & (MNT_NOSUID | MNT_NOEXEC | + MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE | + MNT_NOSYMFOLLOW | + MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR); + /* + * Mount the filesystem. 
+ */ + error = VFS_MOUNT(mp, SCARG(uap, path), SCARG(uap, data), &nd, p); + if (mp->mnt_flag & MNT_UPDATE) { + vrele(vp); + if (mp->mnt_kern_flag & MNTK_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE); + mp->mnt_kern_flag &=~ MNTK_WANTRDWR; + if (error) { + mp->mnt_flag = flag; + mp->mnt_kern_flag = flag2; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if (mp->mnt_syncer == NULL) + error = vfs_allocate_syncvnode(mp); + } else { + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + mp->mnt_syncer = NULL; + } + vfs_unbusy(mp, p); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + /* + * Put the new filesystem on the mount list after root. + */ + cache_purge(vp); + if (!error) { + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + vp->v_mountedhere = mp; + simple_unlock(&vp->v_interlock); + simple_lock(&mountlist_slock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + simple_unlock(&mountlist_slock); + checkdirs(vp); + VOP_UNLOCK(vp, 0, p); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + error = vfs_allocate_syncvnode(mp); + vfs_unbusy(mp, p); + if (error = VFS_START(mp, 0, p)) + vrele(vp); + } else { + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + simple_unlock(&vp->v_interlock); + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, p); + free((caddr_t)mp, M_MOUNT); + vput(vp); + } + return (error); +} + +/* + * Scan all active processes to see if any of them have a current + * or root directory onto which the new filesystem has just been + * mounted. If so, replace them with the new mount point. + */ +static void +checkdirs(olddp) + struct vnode *olddp; +{ + struct filedesc *fdp; + struct vnode *newdp; + struct proc *p; + + if (olddp->v_usecount == 1) + return; + if (VFS_ROOT(olddp->v_mountedhere, &newdp)) + panic("mount: lost mount"); + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + fdp = p->p_fd; + if (fdp->fd_cdir == olddp) { + vrele(fdp->fd_cdir); + VREF(newdp); + fdp->fd_cdir = newdp; + } + if (fdp->fd_rdir == olddp) { + vrele(fdp->fd_rdir); + VREF(newdp); + fdp->fd_rdir = newdp; + } + } + if (rootvnode == olddp) { + vrele(rootvnode); + VREF(newdp); + rootvnode = newdp; + } + vput(newdp); +} + +/* + * Unmount a file system. + * + * Note: unmount takes a path to the vnode mounted on as argument, + * not special file (as before). + */ +#ifndef _SYS_SYSPROTO_H_ +struct unmount_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +unmount(p, uap) + struct proc *p; + register struct unmount_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + register struct vnode *vp; + struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + mp = vp->v_mount; + + /* + * Only root, or the user that did the original mount is + * permitted to unmount this filesystem. + */ + if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + + /* + * Don't allow unmounting the root file system. + */ + if (mp->mnt_flag & MNT_ROOTFS) { + vput(vp); + return (EINVAL); + } + + /* + * Must be the root of the filesystem + */ + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + vput(vp); + return (dounmount(mp, SCARG(uap, flags), p)); +} + +/* + * Do the actual file system unmount. 
+ */ +int +dounmount(mp, flags, p) + register struct mount *mp; + int flags; + struct proc *p; +{ + struct vnode *coveredvp; + int error; + int async_flag; + + simple_lock(&mountlist_slock); + mp->mnt_kern_flag |= MNTK_UNMOUNT; + lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p); + + if (mp->mnt_flag & MNT_EXPUBLIC) + vfs_setpublicfs(NULL, NULL, NULL); + + vfs_msync(mp, MNT_WAIT); + async_flag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &=~ MNT_ASYNC; + cache_purgevfs(mp); /* remove cache entries for this file sys */ + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + if (((mp->mnt_flag & MNT_RDONLY) || + (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) || + (flags & MNT_FORCE)) + error = VFS_UNMOUNT(mp, flags, p); + simple_lock(&mountlist_slock); + if (error) { + if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) + (void) vfs_allocate_syncvnode(mp); + mp->mnt_kern_flag &= ~MNTK_UNMOUNT; + mp->mnt_flag |= async_flag; + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK | LK_REENABLE, + &mountlist_slock, p); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup((caddr_t)mp); + return (error); + } + CIRCLEQ_REMOVE(&mountlist, mp, mnt_list); + if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) { + coveredvp->v_mountedhere = (struct mount *)0; + vrele(coveredvp); + } + mp->mnt_vfc->vfc_refcount--; + if (mp->mnt_vnodelist.lh_first != NULL) + panic("unmount: dangling vnode"); + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_slock, p); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup((caddr_t)mp); + free((caddr_t)mp, M_MOUNT); + return (0); +} + +/* + * Sync each mounted filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct sync_args { + int dummy; +}; +#endif + +#ifdef DEBUG +static int syncprt = 0; +SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); +#endif + +/* ARGSUSED */ +int +sync(p, uap) + struct proc *p; + struct sync_args *uap; +{ + register struct mount *mp, *nmp; + int asyncflag; + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_NOWAIT, + ((p != NULL) ? p->p_ucred : NOCRED), p); + mp->mnt_flag |= asyncflag; + } + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); +#if 0 +/* + * XXX don't call vfs_bufstats() yet because that routine + * was not imported in the Lite2 merge. + */ +#ifdef DIAGNOSTIC + if (syncprt) + vfs_bufstats(); +#endif /* DIAGNOSTIC */ +#endif + return (0); +} + +/* + * Change filesystem quotas. + */ +#ifndef _SYS_SYSPROTO_H_ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +#endif +/* ARGSUSED */ +int +quotactl(p, uap) + struct proc *p; + register struct quotactl_args /* { + syscallarg(char *) path; + syscallarg(int) cmd; + syscallarg(int) uid; + syscallarg(caddr_t) arg; + } */ *uap; +{ + register struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + vrele(nd.ni_vp); + return (VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), + SCARG(uap, arg), p)); +} + +/* + * Get filesystem statistics. 
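+ *
+ * For illustration only (a hypothetical userland sketch, not part of this
+ * file), the call implemented below is typically used as:
+ *
+ *	struct statfs fs;
+ *	if (statfs("/usr", &fs) == 0)
+ *		printf("%ld blocks free\n", (long)fs.f_bfree);
+ *
+ * Note that non-root callers get a copy with the filesystem id zeroed.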
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +statfs(p, uap) + struct proc *p; + register struct statfs_args /* { + syscallarg(char *) path; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + register struct mount *mp; + register struct statfs *sp; + int error; + struct nameidata nd; + struct statfs sb; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + vrele(nd.ni_vp); + error = VFS_STATFS(mp, sp, p); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +fstatfs(p, uap) + struct proc *p; + register struct fstatfs_args /* { + syscallarg(int) fd; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + struct file *fp; + struct mount *mp; + register struct statfs *sp; + int error; + struct statfs sb; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + mp = ((struct vnode *)fp->f_data)->v_mount; + sp = &mp->mnt_stat; + error = VFS_STATFS(mp, sp, p); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get statistics on all filesystems. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int flags; +}; +#endif +int +getfsstat(p, uap) + struct proc *p; + register struct getfsstat_args /* { + syscallarg(struct statfs *) buf; + syscallarg(long) bufsize; + syscallarg(int) flags; + } */ *uap; +{ + register struct mount *mp, *nmp; + register struct statfs *sp; + caddr_t sfsp; + long count, maxcount, error; + + maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); + sfsp = (caddr_t)SCARG(uap, buf); + count = 0; + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + if (sfsp && count < maxcount) { + sp = &mp->mnt_stat; + /* + * If MNT_NOWAIT or MNT_LAZY is specified, do not + * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY + * overrides MNT_WAIT. + */ + if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 || + (SCARG(uap, flags) & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, p))) { + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + continue; + } + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = copyout((caddr_t)sp, sfsp, sizeof(*sp)); + if (error) { + vfs_unbusy(mp, p); + return (error); + } + sfsp += sizeof(*sp); + } + count++; + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + if (sfsp && count > maxcount) + p->p_retval[0] = maxcount; + else + p->p_retval[0] = count; + return (0); +} + +/* + * Change current working directory to a given file descriptor. 
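+ *
+ * A minimal sketch of the intended use (hypothetical, for illustration
+ * only): save the current directory, wander off, and return to it:
+ *
+ *	int fd = open(".", O_RDONLY);
+ *	chdir("/tmp");
+ *	fchdir(fd);
+ *	close(fd);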
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fchdir(p, uap) + struct proc *p; + struct fchdir_args /* { + syscallarg(int) fd; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + struct vnode *vp, *tdp; + struct mount *mp; + struct file *fp; + int error; + + if (error = getvnode(fdp, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VREF(vp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + while (!error && (mp = vp->v_mountedhere) != NULL) { + if (vfs_busy(mp, 0, 0, p)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, p); + if (error) + break; + vput(vp); + vp = tdp; + } + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, p); + vrele(fdp->fd_cdir); + fdp->fd_cdir = vp; + return (0); +} + +/* + * Change current working directory (``.''). + */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chdir(p, uap) + struct proc *p; + struct chdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + vrele(fdp->fd_cdir); + fdp->fd_cdir = nd.ni_vp; + return (0); +} + +/* + * Change notion of root (``/'') directory. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chroot(p, uap) + struct proc *p; + struct chroot_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + vrele(fdp->fd_rdir); + fdp->fd_rdir = nd.ni_vp; + return (0); +} + +/* + * Common routine for chroot and chdir. + */ +static int +change_dir(ndp, p) + register struct nameidata *ndp; + struct proc *p; +{ + struct vnode *vp; + int error; + + error = namei(ndp); + if (error) + return (error); + vp = ndp->ni_vp; + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + if (error) + vput(vp); + else + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Check permissions, allocate an open file structure, + * and call the device open routine if any. 
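+ *
+ * As a rough illustration (hypothetical userland usage, not part of this
+ * file), the O_EXLOCK handling below lets a caller open and lock a file
+ * in one step:
+ *
+ *	int fd = open("/var/run/example.pid", O_RDWR | O_CREAT | O_EXLOCK, 0644);
+ *
+ * where the path name is made up for the example.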
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif +int +open(p, uap) + struct proc *p; + register struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + register struct vnode *vp; + int cmode, flags, oflags; + struct file *nfp; + int type, indx, error; + struct flock lf; + struct nameidata nd; + + oflags = SCARG(uap, flags); + if ((oflags & O_ACCMODE) == O_ACCMODE) + return (EINVAL); + flags = FFLAGS(oflags); + error = falloc(p, &nfp, &indx); + if (error) + return (error); + fp = nfp; + cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + p->p_dupfd = -indx - 1; /* XXX check for fdopen */ + error = vn_open(&nd, flags, cmode); + if (error) { + ffree(fp); + if ((error == ENODEV || error == ENXIO) && + p->p_dupfd >= 0 && /* XXX from fdopen */ + (error = + dupfdopen(fdp, indx, p->p_dupfd, flags, error)) == 0) { + p->p_retval[0] = indx; + return (0); + } + if (error == ERESTART) + error = EINTR; + fdp->fd_ofiles[indx] = NULL; + return (error); + } + p->p_dupfd = 0; + vp = nd.ni_vp; + + fp->f_flag = flags & FMASK; + fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + fp->f_ops = &vnops; + fp->f_data = (caddr_t)vp; + if (flags & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (flags & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((flags & FNONBLOCK) == 0) + type |= F_WAIT; + VOP_UNLOCK(vp, 0, p); + if (error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) { + (void) vn_close(vp, fp->f_flag, fp->f_cred, p); + ffree(fp); + fdp->fd_ofiles[indx] = NULL; + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + fp->f_flag |= FHASLOCK; + } + if ((vp->v_type == VREG) && (vp->v_object == NULL)) + vfs_object_create(vp, p, p->p_ucred); + VOP_UNLOCK(vp, 0, p); + p->p_retval[0] = indx; + return (0); +} + +#ifdef COMPAT_43 +/* + * Create a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif +int +ocreat(p, uap) + struct proc *p; + register struct ocreat_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, mode) = SCARG(uap, mode); + SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; + return (open(p, &nuap)); +} +#endif /* COMPAT_43 */ + +/* + * Create a special file. 
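+ *
+ * For illustration only (a hypothetical device node; the major number is
+ * made up):
+ *
+ *	mknod("/dev/example", S_IFCHR | 0600, makedev(12, 0));
+ *
+ * Only the superuser may create nodes, as enforced below.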
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mknod_args { + char *path; + int mode; + int dev; +}; +#endif +/* ARGSUSED */ +int +mknod(p, uap) + struct proc *p; + register struct mknod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + syscallarg(int) dev; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + int whiteout = 0; + struct nameidata nd; + + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) + error = EEXIST; + else { + VATTR_NULL(&vattr); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + vattr.va_rdev = SCARG(uap, dev); + whiteout = 0; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFMT: /* used by badsect to flag bad sectors */ + vattr.va_type = VBAD; + break; + case S_IFCHR: + vattr.va_type = VCHR; + break; + case S_IFBLK: + vattr.va_type = VBLK; + break; + case S_IFWHT: + whiteout = 1; + break; + default: + error = EINVAL; + break; + } + } + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (whiteout) { + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); + if (error) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + } else { + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, + &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + } + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp) + vrele(vp); + } + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); + return (error); +} + +/* + * Create a named pipe. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkfifo(p, uap) + struct proc *p; + register struct mkfifo_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + if (nd.ni_vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VFIFO; + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + return (error); +} + +/* + * Make a hard file link. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +link(p, uap) + struct proc *p; + register struct link_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + register struct vnode *vp; + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); + error = namei(&nd); + if (!error) { + if (nd.ni_vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_vp) + vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, + LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + } + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + } + } + vrele(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); + return (error); +} + +/* + * Make a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +symlink(p, uap) + struct proc *p; + register struct symlink_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + struct vattr vattr; + char *path; + int error; + struct nameidata nd; + + path = zalloc(namei_zone); + if (error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) + goto out; + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); + if (error = namei(&nd)) + goto out; + if (nd.ni_vp) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + error = EEXIST; + goto out; + } + VATTR_NULL(&vattr); + vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); + vput(nd.ni_dvp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); +out: + zfree(namei_zone, path); + return (error); +} + +/* + * Delete a whiteout from the filesystem. + */ +/* ARGSUSED */ +int +undelete(p, uap) + struct proc *p; + register struct undelete_args /* { + syscallarg(char *) path; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, + SCARG(uap, path), p); + error = namei(&nd); + if (error) + return (error); + + if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + return (EEXIST); + } + + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); + return (error); +} + +/* + * Delete a name from the filesystem. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +unlink(p, uap) + struct proc *p; + struct unlink_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + /* + * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? + */ + if (vp->v_flag & VROOT) + error = EBUSY; + } + + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + } + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULLVP) + vput(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); + return (error); +} + +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif +int +lseek(p, uap) + struct proc *p; + register struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct ucred *cred = p->p_ucred; + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vattr vattr; + int error; + + if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (ESPIPE); + switch (SCARG(uap, whence)) { + case L_INCR: + fp->f_offset += SCARG(uap, offset); + break; + case L_XTND: + error=VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p); + if (error) + return (error); + fp->f_offset = SCARG(uap, offset) + vattr.va_size; + break; + case L_SET: + fp->f_offset = SCARG(uap, offset); + break; + default: + return (EINVAL); + } + *(off_t *)(p->p_retval) = fp->f_offset; + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif +int +olseek(p, uap) + struct proc *p; + register struct olseek_args /* { + syscallarg(int) fd; + syscallarg(long) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ nuap; + int error; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, offset) = SCARG(uap, offset); + SCARG(&nuap, whence) = SCARG(uap, whence); + error = lseek(p, &nuap); + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Check access permissions. 
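+ *
+ * The check is made against the real rather than the effective ids, which
+ * is why the credentials are temporarily swapped below. A hypothetical
+ * userland sketch, for illustration only:
+ *
+ *	if (access("/path/to/spoolfile", R_OK | W_OK) == 0)
+ *		proceed();
+ *
+ * where both the path and proceed() are made up for the example.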
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int flags; +}; +#endif +int +access(p, uap) + struct proc *p; + register struct access_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + register struct ucred *cred = p->p_ucred; + register struct vnode *vp; + int error, flags, t_gid, t_uid; + struct nameidata nd; + + t_uid = cred->cr_uid; + t_gid = cred->cr_groups[0]; + cred->cr_uid = p->p_cred->p_ruid; + cred->cr_groups[0] = p->p_cred->p_rgid; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + goto out1; + vp = nd.ni_vp; + + /* Flags == 0 means only check for existence. */ + if (SCARG(uap, flags)) { + flags = 0; + if (SCARG(uap, flags) & R_OK) + flags |= VREAD; + if (SCARG(uap, flags) & W_OK) + flags |= VWRITE; + if (SCARG(uap, flags) & X_OK) + flags |= VEXEC; + if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, flags, cred, p); + } + vput(vp); +out1: + cred->cr_uid = t_uid; + cred->cr_groups[0] = t_gid; + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +ostat(p, uap) + struct proc *p; + register struct ostat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +olstat(p, uap) + struct proc *p; + register struct olstat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct vnode *vp; + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Convert from an old to a new stat structure. + */ +void +cvtstat(st, ost) + struct stat *st; + struct ostat *ost; +{ + + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (st->st_size < (quad_t)1 << 32) + ost->st_size = st->st_size; + else + ost->st_size = -2; + ost->st_atime = st->st_atime; + ost->st_mtime = st->st_mtime; + ost->st_ctime = st->st_ctime; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Get file status; this version follows links. 
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct stat_args {
+	char	*path;
+	struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+stat(p, uap)
+	struct proc *p;
+	register struct stat_args /* {
+		syscallarg(char *) path;
+		syscallarg(struct stat *) ub;
+	} */ *uap;
+{
+	struct stat sb;
+	int error;
+	struct nameidata nd;
+
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+	    SCARG(uap, path), p);
+	if (error = namei(&nd))
+		return (error);
+	error = vn_stat(nd.ni_vp, &sb, p);
+	vput(nd.ni_vp);
+	if (error)
+		return (error);
+	error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb));
+	return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lstat_args {
+	char	*path;
+	struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+lstat(p, uap)
+	struct proc *p;
+	register struct lstat_args /* {
+		syscallarg(char *) path;
+		syscallarg(struct stat *) ub;
+	} */ *uap;
+{
+	int error;
+	struct vnode *vp;
+	struct stat sb;
+	struct nameidata nd;
+
+	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+	    SCARG(uap, path), p);
+	if (error = namei(&nd))
+		return (error);
+	vp = nd.ni_vp;
+	error = vn_stat(vp, &sb, p);
+	vput(vp);
+	if (error)
+		return (error);
+	error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb));
+	return (error);
+}
+
+void
+cvtnstat(sb, nsb)
+	struct stat *sb;
+	struct nstat *nsb;
+{
+	nsb->st_dev = sb->st_dev;
+	nsb->st_ino = sb->st_ino;
+	nsb->st_mode = sb->st_mode;
+	nsb->st_nlink = sb->st_nlink;
+	nsb->st_uid = sb->st_uid;
+	nsb->st_gid = sb->st_gid;
+	nsb->st_rdev = sb->st_rdev;
+	nsb->st_atimespec = sb->st_atimespec;
+	nsb->st_mtimespec = sb->st_mtimespec;
+	nsb->st_ctimespec = sb->st_ctimespec;
+	nsb->st_size = sb->st_size;
+	nsb->st_blocks = sb->st_blocks;
+	nsb->st_blksize = sb->st_blksize;
+	nsb->st_flags = sb->st_flags;
+	nsb->st_gen = sb->st_gen;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct nstat_args {
+	char	*path;
+	struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nstat(p, uap)
+	struct proc *p;
+	register struct nstat_args /* {
+		syscallarg(char *) path;
+		syscallarg(struct nstat *) ub;
+	} */ *uap;
+{
+	struct stat sb;
+	struct nstat nsb;
+	int error;
+	struct nameidata nd;
+
+	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+	    SCARG(uap, path), p);
+	if (error = namei(&nd))
+		return (error);
+	error = vn_stat(nd.ni_vp, &sb, p);
+	vput(nd.ni_vp);
+	if (error)
+		return (error);
+	cvtnstat(&sb, &nsb);
+	error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb));
+	return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct nlstat_args {
+	char	*path;
+	struct nstat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+nlstat(p, uap)
+	struct proc *p;
+	register struct nlstat_args /* {
+		syscallarg(char *) path;
+		syscallarg(struct nstat *) ub;
+	} */ *uap;
+{
+	int error;
+	struct vnode *vp;
+	struct stat sb;
+	struct nstat nsb;
+	struct nameidata nd;
+
+	NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE,
+	    SCARG(uap, path), p);
+	if (error = namei(&nd))
+		return (error);
+	vp = nd.ni_vp;
+	error = vn_stat(vp, &sb, p);
+	vput(vp);
+	if (error)
+		return (error);
+	cvtnstat(&sb, &nsb);
+	error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb));
+	return (error);
+}
+
+/*
+ * Get configurable pathname variables.
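+ *
+ * A hypothetical userland sketch, for illustration only:
+ *
+ *	long max = pathconf("/usr", _PC_NAME_MAX);
+ *
+ * which reports the longest file name the underlying filesystem accepts,
+ * or -1 on error.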
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct pathconf_args { + char *path; + int name; +}; +#endif +/* ARGSUSED */ +int +pathconf(p, uap) + struct proc *p; + register struct pathconf_args /* { + syscallarg(char *) path; + syscallarg(int) name; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), p->p_retval); + vput(nd.ni_vp); + return (error); +} + +/* + * Return target name of a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct readlink_args { + char *path; + char *buf; + int count; +}; +#endif +/* ARGSUSED */ +int +readlink(p, uap) + struct proc *p; + register struct readlink_args /* { + syscallarg(char *) path; + syscallarg(char *) buf; + syscallarg(int) count; + } */ *uap; +{ + register struct vnode *vp; + struct iovec aiov; + struct uio auio; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VLNK) + error = EINVAL; + else { + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + error = VOP_READLINK(vp, &auio, p->p_ucred); + } + vput(vp); + p->p_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} + +static int +setfflags(p, vp, flags) + struct proc *p; + struct vnode *vp; + int flags; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_flags = flags; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Change flags of a file given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +chflags(p, uap) + struct proc *p; + register struct chflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfflags(p, nd.ni_vp, SCARG(uap, flags)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change flags of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + int flags; +}; +#endif +/* ARGSUSED */ +int +fchflags(p, uap) + struct proc *p; + register struct fchflags_args /* { + syscallarg(int) fd; + syscallarg(int) flags; + } */ *uap; +{ + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + return setfflags(p, (struct vnode *) fp->f_data, SCARG(uap, flags)); +} + +static int +setfmode(p, vp, mode) + struct proc *p; + struct vnode *vp; + int mode; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_mode = mode & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Change mode of a file given path name. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +chmod(p, uap) + struct proc *p; + register struct chmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfmode(p, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given path name (don't follow links.) + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +lchmod(p, uap) + struct proc *p; + register struct lchmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfmode(p, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif +/* ARGSUSED */ +int +fchmod(p, uap) + struct proc *p; + register struct fchmod_args /* { + syscallarg(int) fd; + syscallarg(int) mode; + } */ *uap; +{ + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + return setfmode(p, (struct vnode *)fp->f_data, SCARG(uap, mode)); +} + +static int +setfown(p, vp, uid, gid) + struct proc *p; + struct vnode *vp; + uid_t uid; + gid_t gid; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_uid = uid; + vattr.va_gid = gid; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Set ownership given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +chown(p, uap) + struct proc *p; + register struct chown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfown(p, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + + return (error); +} + +/* + * Set ownership given a path name, do not cross symlinks. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +lchown(p, uap) + struct proc *p; + register struct lchown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfown(p, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a file descriptor. 
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchown_args {
+	int	fd;
+	int	uid;
+	int	gid;
+};
+#endif
+/* ARGSUSED */
+int
+fchown(p, uap)
+	struct proc *p;
+	register struct fchown_args /* {
+		syscallarg(int) fd;
+		syscallarg(int) uid;
+		syscallarg(int) gid;
+	} */ *uap;
+{
+	struct file *fp;
+	int error;
+
+	if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+		return (error);
+	return setfown(p, (struct vnode *)fp->f_data,
+	    SCARG(uap, uid), SCARG(uap, gid));
+}
+
+static int
+setutimes(p, vp, tv, nullflag)
+	struct proc *p;
+	struct vnode *vp;
+	struct timeval *tv;
+	int nullflag;
+{
+	int error;
+	struct vattr vattr;
+
+	VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+	VATTR_NULL(&vattr);
+	vattr.va_atime.tv_sec = tv[0].tv_sec;
+	vattr.va_atime.tv_nsec = tv[0].tv_usec * 1000;
+	vattr.va_mtime.tv_sec = tv[1].tv_sec;
+	vattr.va_mtime.tv_nsec = tv[1].tv_usec * 1000;
+	if (nullflag)
+		vattr.va_vaflags |= VA_UTIMES_NULL;
+	error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+	VOP_UNLOCK(vp, 0, p);
+	return error;
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct utimes_args {
+	char	*path;
+	struct	timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+utimes(p, uap)
+	struct proc *p;
+	register struct utimes_args /* {
+		syscallarg(char *) path;
+		syscallarg(struct timeval *) tptr;
+	} */ *uap;
+{
+	struct timeval tv[2];
+	int error;
+	struct nameidata nd;
+	int nullflag;
+
+	nullflag = 0;
+	if (SCARG(uap, tptr) == NULL) {
+		microtime(&tv[0]);
+		tv[1] = tv[0];
+		nullflag = 1;
+	} else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv,
+	    sizeof (tv)))
+		return (error);
+	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+	if (error = namei(&nd))
+		return (error);
+	error = setutimes(p, nd.ni_vp, tv, nullflag);
+	vrele(nd.ni_vp);
+	return (error);
+}
+
+/*
+ * Set the access and modification times of a file; this version does
+ * not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lutimes_args {
+	char	*path;
+	struct	timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+lutimes(p, uap)
+	struct proc *p;
+	register struct lutimes_args /* {
+		syscallarg(char *) path;
+		syscallarg(struct timeval *) tptr;
+	} */ *uap;
+{
+	struct timeval tv[2];
+	int error;
+	struct nameidata nd;
+	int nullflag;
+
+	nullflag = 0;
+	if (SCARG(uap, tptr) == NULL) {
+		microtime(&tv[0]);
+		tv[1] = tv[0];
+		nullflag = 1;
+	} else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv,
+	    sizeof (tv)))
+		return (error);
+	NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+	if (error = namei(&nd))
+		return (error);
+
+	error = setutimes(p, nd.ni_vp, tv, nullflag);
+	vrele(nd.ni_vp);
+	return (error);
+}
+
+/*
+ * Set the access and modification times of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct futimes_args {
+	int	fd;
+	struct	timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+futimes(p, uap)
+	struct proc *p;
+	register struct futimes_args /* {
+		syscallarg(int) fd;
+		syscallarg(struct timeval *) tptr;
+	} */ *uap;
+{
+	struct timeval tv[2];
+	struct file *fp;
+	int error;
+	int nullflag;
+
+	nullflag = 0;
+	if (SCARG(uap, tptr) == NULL) {
+		microtime(&tv[0]);
+		tv[1] = tv[0];
+		nullflag = 1;
+	} else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv,
+	    sizeof (tv)))
+		return (error);
+
+	if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+		return (error);
+	return setutimes(p, (struct vnode *)fp->f_data, tv, nullflag);
+}
+
+/*
+ * Truncate a file given its path name.
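+ *
+ * For illustration only (hypothetical usage, path made up):
+ *
+ *	truncate("/var/tmp/scratch", 0);
+ *
+ * Negative lengths are rejected with EINVAL below.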
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +truncate(p, uap) + struct proc *p; + register struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + if (uap->length < 0) + return(EINVAL); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + vput(vp); + return (error); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +ftruncate(p, uap) + struct proc *p; + register struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (uap->length < 0) + return(EINVAL); + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FWRITE) == 0) + return (EINVAL); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, fp->f_cred, p); + } + VOP_UNLOCK(vp, 0, p); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Truncate a file given its path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif +/* ARGSUSED */ +int +otruncate(p, uap) + struct proc *p; + register struct otruncate_args /* { + syscallarg(char *) path; + syscallarg(long) length; + } */ *uap; +{ + struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, length) = SCARG(uap, length); + return (truncate(p, &nuap)); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct oftruncate_args { + int fd; + long length; +}; +#endif +/* ARGSUSED */ +int +oftruncate(p, uap) + struct proc *p; + register struct oftruncate_args /* { + syscallarg(int) fd; + syscallarg(long) length; + } */ *uap; +{ + struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, length) = SCARG(uap, length); + return (ftruncate(p, &nuap)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Sync an open file. 
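+ *
+ * A minimal sketch of typical use (hypothetical, for illustration only):
+ *
+ *	write(fd, buf, len);
+ *	if (fsync(fd) != 0)
+ *		err(1, "fsync");
+ *
+ * The call returns once the vnode's dirty pages and buffers have been
+ * flushed (MNT_WAIT below).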
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fsync(p, uap) + struct proc *p; + struct fsync_args /* { + syscallarg(int) fd; + } */ *uap; +{ + register struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_object) + vm_object_page_clean(vp->v_object, 0, 0, 0); + if ((error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p)) == 0 && + vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP) && + bioops.io_fsync) + error = (*bioops.io_fsync)(vp); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif +/* ARGSUSED */ +int +rename(p, uap) + struct proc *p; + register struct rename_args /* { + syscallarg(char *) from; + syscallarg(char *) to; + } */ *uap; +{ + register struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + + NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, + SCARG(uap, from), p); + if (error = namei(&fromnd)) + return (error); + fvp = fromnd.ni_vp; + NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, + UIO_USERSPACE, SCARG(uap, to), p); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + if (error = namei(&tond)) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same inode number with the same name in the same directory), + * then there is nothing to do. + */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + VOP_LEASE(tdvp, p, p->p_ucred, LEASE_WRITE); + if (fromnd.ni_dvp != tdvp) { + VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + } + if (tvp) { + VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE); + } + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + } else { + VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); + zfree(namei_zone, tond.ni_cnd.cn_pnbuf); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + zfree(namei_zone, fromnd.ni_cnd.cn_pnbuf); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. 
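+ *
+ * For illustration only (hypothetical usage, path made up):
+ *
+ *	if (mkdir("/tmp/work", 0755) != 0 && errno != EEXIST)
+ *		err(1, "mkdir");
+ *
+ * The requested mode is masked with the process umask (fd_cmask) below.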
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkdir(p, uap) + struct proc *p; + register struct mkdir_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + nd.ni_cnd.cn_flags |= WILLBEDIR; + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + vattr.va_mode = (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + if (!error) + vput(nd.ni_vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); + return (error); +} + +/* + * Remove a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +rmdir(p, uap) + struct proc *p; + struct rmdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) + error = EBUSY; +out: + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + } + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULLVP) + vput(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); + return (error); +} + +#ifdef COMPAT_43 +/* + * Read a block of directory entries in a file system independent format. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +ogetdirentries(p, uap) + struct proc *p; + register struct ogetdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio, kuio; + struct iovec aiov, kiov; + struct dirent *dp, *edp; + caddr_t dirbuf; + int error, eofflag, readcnt; + long loff; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; +# if (BYTE_ORDER != LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen <= 0) { + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = auio.uio_offset; + } else +# endif + { + kuio = auio; + kuio.uio_iov = &kiov; + kuio.uio_segflg = UIO_SYSSPACE; + kiov.iov_len = SCARG(uap, count); + MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); + kiov.iov_base = dirbuf; + error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = kuio.uio_offset; + if (error == 0) { + readcnt = SCARG(uap, count) - kuio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of + * dp->d_namlen is our dp->d_type. + * The high MBZ byte of dp->d_namlen + * is our dp->d_namlen. + */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +# else + /* + * The dp->d_type is the high byte + * of the expected dp->d_namlen, + * so must be zero'ed. + */ + dp->d_type = 0; +# endif + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, &auio); + } + FREE(dirbuf, M_TEMP); + } + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + if (union_dircheckp && SCARG(uap, count) == auio.uio_resid) { + error = union_dircheckp(p, &vp, fp); + if (error == -1) + goto unionread; + if (error) + return (error); + } + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + p->p_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Read a block of directory entries in a file system independent format. 
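+ *
+ * A hedged sketch of typical userland use (hypothetical, for illustration
+ * only):
+ *
+ *	char buf[4096];
+ *	long base;
+ *	int n = getdirentries(fd, buf, sizeof(buf), &base);
+ *
+ * after which buf holds n bytes of packed struct dirent records and base
+ * holds the offset of the block just read.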
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +getdirentries(p, uap) + struct proc *p; + register struct getdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + long loff; + int error, eofflag; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + /* vn_lock(vp, LK_SHARED | LK_RETRY, p); */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); + fp->f_offset = auio.uio_offset; + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + if (union_dircheckp && SCARG(uap, count) == auio.uio_resid) { + error = union_dircheckp(p, &vp, fp); + if (error == -1) + goto unionread; + if (error) + return (error); + } + if (SCARG(uap, basep) != NULL) { + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + } + p->p_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} +#ifndef _SYS_SYSPROTO_H_ +struct getdents_args { + int fd; + char *buf; + size_t count; +}; +#endif +int +getdents(p, uap) + struct proc *p; + register struct getdents_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + } */ *uap; +{ + struct getdirentries_args ap; + ap.fd = uap->fd; + ap.buf = uap->buf; + ap.count = uap->count; + ap.basep = NULL; + return getdirentries(p, &ap); +} + +/* + * Set the mode mask for creation of filesystem nodes. + */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif +int +umask(p, uap) + struct proc *p; + struct umask_args /* { + syscallarg(int) newmask; + } */ *uap; +{ + register struct filedesc *fdp; + + fdp = p->p_fd; + p->p_retval[0] = fdp->fd_cmask; + fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem + * away from vnode. + */ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +revoke(p, uap) + struct proc *p; + register struct revoke_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p)) + goto out; + if (p->p_ucred->cr_uid != vattr.va_uid && + (error = suser(p->p_ucred, &p->p_acflag))) + goto out; + if (vp->v_usecount > 1 || (vp->v_flag & VALIASED)) + VOP_REVOKE(vp, REVOKEALL); +out: + vrele(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry. 
+ */ +int +getvnode(fdp, fd, fpp) + struct filedesc *fdp; + int fd; + struct file **fpp; +{ + struct file *fp; + + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) + return (EINVAL); + *fpp = fp; + return (0); +} +#ifndef _SYS_SYSPROTO_H_ +struct __getcwd_args { + u_char *buf; + u_int buflen; +}; +#endif +#define STATNODE(mode, name, var) \ + SYSCTL_INT(_vfs_cache, OID_AUTO, name, mode, var, 0, ""); + +static int disablecwd; +SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, ""); + +static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls); +static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1); +static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2); +static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3); +static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4); +static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound); +int +__getcwd(p, uap) + struct proc *p; + struct __getcwd_args *uap; +{ + char *bp, *buf; + int error, i, slash_prefixed; + struct filedesc *fdp; + struct namecache *ncp; + struct vnode *vp; + + numcwdcalls++; + if (disablecwd) + return (ENODEV); + if (uap->buflen < 2) + return (EINVAL); + if (uap->buflen > MAXPATHLEN) + uap->buflen = MAXPATHLEN; + buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK); + bp += uap->buflen - 1; + *bp = '\0'; + fdp = p->p_fd; + slash_prefixed = 0; + for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) { + if (vp->v_flag & VROOT) { + vp = vp->v_mount->mnt_vnodecovered; + continue; + } + if (vp->v_dd->v_id != vp->v_ddid) { + numcwdfail1++; + free(buf, M_TEMP); + return (ENOTDIR); + } + ncp = TAILQ_FIRST(&vp->v_cache_dst); + if (!ncp) { + numcwdfail2++; + free(buf, M_TEMP); + return (ENOENT); + } + if (ncp->nc_dvp != vp->v_dd) { + numcwdfail3++; + free(buf, M_TEMP); + return (EBADF); + } + for (i = ncp->nc_nlen - 1; i >= 0; i--) { + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = ncp->nc_name[i]; + } + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + slash_prefixed = 1; + vp = vp->v_dd; + } + if (!slash_prefixed) { + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + } + numcwdfound++; + error = copyout(bp, uap->buf, strlen(bp) + 1); + free(buf, M_TEMP); + return (error); +} diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c new file mode 100644 index 0000000..43589c74 --- /dev/null +++ b/sys/kern/vfs_init.c @@ -0,0 +1,461 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed + * to Berkeley by John Heidemann of the UCLA Ficus project. + * + * Source: * @(#)i405_init.c 2.10 92/04/27 UCLA Ficus project + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94 + * $Id: vfs_init.c,v 1.40 1998/11/15 15:18:30 bde Exp $ + */ + + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <vm/vm_zone.h> + + +MALLOC_DEFINE(M_VNODE, "vnodes", "Dynamically allocated vnodes"); + +/* + * XXX this bloat just exands the sysctl__vfs linker set a little so that + * we can attach sysctls for VFS modules without expanding the linker set. + * Currently (1998/09/06), only one VFS uses sysctls, so 2 extra linker + * set slots are more than sufficient. + */ +extern struct linker_set sysctl__vfs; +static int mod_xx; +SYSCTL_INT(_vfs, OID_AUTO, mod0, CTLFLAG_RD, &mod_xx, 0, ""); +SYSCTL_INT(_vfs, OID_AUTO, mod1, CTLFLAG_RD, &mod_xx, 0, ""); + +/* + * Zone for namei + */ +struct vm_zone *namei_zone; + +/* + * vfs_init.c + * + * Allocate and fill in operations vectors. + * + * An undocumented feature of this approach to defining operations is that + * there can be multiple entries in vfs_opv_descs for the same operations + * vector. This allows third parties to extend the set of operations + * supported by another layer in a binary compatibile way. For example, + * assume that NFS needed to be modified to support Ficus. NFS has an entry + * (probably nfs_vnopdeop_decls) declaring all the operations NFS supports by + * default. Ficus could add another entry (ficus_nfs_vnodeop_decl_entensions) + * listing those new operations Ficus adds to NFS, all without modifying the + * NFS code. (Of couse, the OTW NFS protocol still needs to be munged, but + * that is a(whole)nother story.) This is a feature. 
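A rough sketch of the registration idiom that comment describes, as a filesystem of this vintage would spell it; the samplefs_* names are made up, while vop_default_desc, vop_defaultop and VNODEOP_SET() are the stock vnode-interface pieces:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/errno.h>
#include <sys/vnode.h>

static vop_t **samplefs_vnodeop_p;	/* vector filled in by vfs_opv_recalc() */

static int
samplefs_lookup(ap)
	struct vop_lookup_args *ap;
{
	*ap->a_vpp = NULL;		/* placeholder implementation */
	return (EOPNOTSUPP);
}

static struct vnodeopv_entry_desc samplefs_vnodeop_entries[] = {
	{ &vop_default_desc,	(vop_t *) vop_defaultop },	/* slot 1 fallback */
	{ &vop_lookup_desc,	(vop_t *) samplefs_lookup },
	{ NULL, NULL }
};
static struct vnodeopv_desc samplefs_vnodeop_opv_desc =
	{ &samplefs_vnodeop_p, samplefs_vnodeop_entries };

/* Added to the linker set consumed by vfs_add_vnodeops()/vfs_opv_recalc(). */
VNODEOP_SET(samplefs_vnodeop_opv_desc);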
+ */ + +/* Table of known vnodeop vectors (list of VFS vnode vectors) */ +static struct vnodeopv_desc **vnodeopv_descs; +static int vnodeopv_num; + +/* Table of known descs (list of vnode op handlers "vop_access_desc") */ +static struct vnodeop_desc **vfs_op_descs; +static int *vfs_op_desc_refs; /* reference counts */ +static int num_op_descs; +static int vfs_opv_numops; + +static void +vfs_opv_recalc(void) +{ + int i, j; + vop_t ***opv_desc_vector_p; + vop_t **opv_desc_vector; + struct vnodeopv_entry_desc *opve_descp; + struct vnodeopv_desc *opv; + + if (vfs_op_descs == NULL) + panic("vfs_opv_recalc called with null vfs_op_descs"); + + /* + * Run through and make sure all known descs have an offset + * + * vop_default_desc is hardwired at offset 1, and offset 0 + * is a panic sanity check. + */ + vfs_opv_numops = 0; + for (i = 0; i < num_op_descs; i++) + if (vfs_opv_numops < (vfs_op_descs[i]->vdesc_offset + 1)) + vfs_opv_numops = vfs_op_descs[i]->vdesc_offset + 1; + for (i = 0; i < num_op_descs; i++) + if (vfs_op_descs[i]->vdesc_offset == 0) + vfs_op_descs[i]->vdesc_offset = vfs_opv_numops++; + /* + * Allocate and fill in the vectors + */ + for (i = 0; i < vnodeopv_num; i++) { + opv = vnodeopv_descs[i]; + opv_desc_vector_p = opv->opv_desc_vector_p; + if (*opv_desc_vector_p) + FREE(*opv_desc_vector_p, M_VNODE); + MALLOC(*opv_desc_vector_p, vop_t **, + vfs_opv_numops * sizeof(vop_t *), M_VNODE, M_WAITOK); + if (*opv_desc_vector_p == NULL) + panic("no memory for vop_t ** vector"); + bzero(*opv_desc_vector_p, vfs_opv_numops * sizeof(vop_t *)); + + /* Fill in, with slot 0 being panic */ + opv_desc_vector = *opv_desc_vector_p; + opv_desc_vector[0] = (vop_t *)vop_panic; + for (j = 0; opv->opv_desc_ops[j].opve_op; j++) { + opve_descp = &(opv->opv_desc_ops[j]); + opv_desc_vector[opve_descp->opve_op->vdesc_offset] = + opve_descp->opve_impl; + } + + /* Replace unfilled routines with their default (slot 1). 
*/ + opv_desc_vector = *(opv->opv_desc_vector_p); + if (opv_desc_vector[1] == NULL) + panic("vfs_opv_recalc: vector without a default."); + for (j = 0; j < vfs_opv_numops; j++) + if (opv_desc_vector[j] == NULL) + opv_desc_vector[j] = opv_desc_vector[1]; + } +} + +void +vfs_add_vnodeops(void *data) +{ + struct vnodeopv_desc *opv; + struct vnodeopv_desc **newopv; + struct vnodeop_desc **newop; + int *newref; + vop_t **opv_desc_vector; + struct vnodeop_desc *desc; + int i, j; + + opv = (struct vnodeopv_desc *)data; + MALLOC(newopv, struct vnodeopv_desc **, + (vnodeopv_num + 1) * sizeof(*newopv), M_VNODE, M_WAITOK); + if (newopv == NULL) + panic("vfs_add_vnodeops: no memory"); + if (vnodeopv_descs) { + bcopy(vnodeopv_descs, newopv, vnodeopv_num * sizeof(*newopv)); + FREE(vnodeopv_descs, M_VNODE); + } + newopv[vnodeopv_num] = opv; + vnodeopv_descs = newopv; + vnodeopv_num++; + + /* See if we have turned up a new vnode op desc */ + opv_desc_vector = *(opv->opv_desc_vector_p); + for (i = 0; (desc = opv->opv_desc_ops[i].opve_op); i++) { + for (j = 0; j < num_op_descs; j++) { + if (desc == vfs_op_descs[j]) { + /* found it, increase reference count */ + vfs_op_desc_refs[j]++; + break; + } + } + if (j == num_op_descs) { + /* not found, new entry */ + MALLOC(newop, struct vnodeop_desc **, + (num_op_descs + 1) * sizeof(*newop), + M_VNODE, M_WAITOK); + if (newop == NULL) + panic("vfs_add_vnodeops: no memory for desc"); + /* new reference count (for unload) */ + MALLOC(newref, int *, + (num_op_descs + 1) * sizeof(*newref), + M_VNODE, M_WAITOK); + if (newref == NULL) + panic("vfs_add_vnodeops: no memory for refs"); + if (vfs_op_descs) { + bcopy(vfs_op_descs, newop, + num_op_descs * sizeof(*newop)); + FREE(vfs_op_descs, M_VNODE); + } + if (vfs_op_desc_refs) { + bcopy(vfs_op_desc_refs, newref, + num_op_descs * sizeof(*newref)); + FREE(vfs_op_desc_refs, M_VNODE); + } + newop[num_op_descs] = desc; + newref[num_op_descs] = 1; + vfs_op_descs = newop; + vfs_op_desc_refs = newref; + num_op_descs++; + } + } + vfs_opv_recalc(); +} + +void +vfs_rm_vnodeops(void *data) +{ + struct vnodeopv_desc *opv; + struct vnodeopv_desc **newopv; + struct vnodeop_desc **newop; + int *newref; + vop_t **opv_desc_vector; + struct vnodeop_desc *desc; + int i, j, k; + + opv = (struct vnodeopv_desc *)data; + /* Lower ref counts on descs in the table and release if zero */ + opv_desc_vector = *(opv->opv_desc_vector_p); + for (i = 0; (desc = opv->opv_desc_ops[i].opve_op); i++) { + for (j = 0; j < num_op_descs; j++) { + if (desc == vfs_op_descs[j]) { + /* found it, decrease reference count */ + vfs_op_desc_refs[j]--; + break; + } + } + for (j = 0; j < num_op_descs; j++) { + if (vfs_op_desc_refs[j] > 0) + continue; + if (vfs_op_desc_refs[j] < 0) + panic("vfs_remove_vnodeops: negative refcnt"); + MALLOC(newop, struct vnodeop_desc **, + (num_op_descs - 1) * sizeof(*newop), + M_VNODE, M_WAITOK); + if (newop == NULL) + panic("vfs_remove_vnodeops: no memory for desc"); + /* new reference count (for unload) */ + MALLOC(newref, int *, + (num_op_descs - 1) * sizeof(*newref), + M_VNODE, M_WAITOK); + if (newref == NULL) + panic("vfs_remove_vnodeops: no memory for refs"); + for (k = j; k < (num_op_descs - 1); k++) { + vfs_op_descs[k] = vfs_op_descs[k + 1]; + vfs_op_desc_refs[k] = vfs_op_desc_refs[k + 1]; + } + bcopy(vfs_op_descs, newop, + (num_op_descs - 1) * sizeof(*newop)); + bcopy(vfs_op_desc_refs, newref, + (num_op_descs - 1) * sizeof(*newref)); + FREE(vfs_op_descs, M_VNODE); + FREE(vfs_op_desc_refs, M_VNODE); + vfs_op_descs = newop; + 
vfs_op_desc_refs = newref; + num_op_descs--; + } + } + + for (i = 0; i < vnodeopv_num; i++) { + if (vnodeopv_descs[i] == opv) { + for (j = i; j < (vnodeopv_num - 1); j++) + vnodeopv_descs[j] = vnodeopv_descs[j + 1]; + break; + } + } + if (i == vnodeopv_num) + panic("vfs_remove_vnodeops: opv not found"); + MALLOC(newopv, struct vnodeopv_desc **, + (vnodeopv_num - 1) * sizeof(*newopv), M_VNODE, M_WAITOK); + if (newopv == NULL) + panic("vfs_remove_vnodeops: no memory"); + bcopy(vnodeopv_descs, newopv, (vnodeopv_num - 1) * sizeof(*newopv)); + FREE(vnodeopv_descs, M_VNODE); + vnodeopv_descs = newopv; + vnodeopv_num--; + + vfs_opv_recalc(); +} + +/* + * Routines having to do with the management of the vnode table. + */ +struct vattr va_null; + +/* + * Initialize the vnode structures and initialize each file system type. + */ +/* ARGSUSED*/ +static void +vfsinit(void *dummy) +{ + + namei_zone = zinit("NAMEI", MAXPATHLEN, 0, 0, 2); + + /* + * Initialize the vnode table + */ + vntblinit(); + /* + * Initialize the vnode name cache + */ + nchinit(); + /* + * Initialize each file system type. + * Vfs type numbers must be distinct from VFS_GENERIC (and VFS_VFSCONF). + */ + vattr_null(&va_null); + maxvfsconf = VFS_GENERIC + 1; +} +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vfsinit, NULL) + +int +vfs_register(struct vfsconf *vfc) +{ + struct linker_set *l; + struct sysctl_oid **oidpp; + struct vfsconf *vfsp; + int i, exists; + + vfsp = NULL; + l = &sysctl__vfs; + if (vfsconf) + for (vfsp = vfsconf; vfsp->vfc_next; vfsp = vfsp->vfc_next) + if (!strcmp(vfc->vfc_name, vfsp->vfc_name)) + return EEXIST; + + vfc->vfc_typenum = maxvfsconf++; + if (vfc->vfc_vfsops->vfs_oid != NULL) { + /* + * Attach the oid to the "vfs" node of the sysctl tree if + * it isn't already there (it will be there for statically + * configured vfs's). + */ + exists = 0; + for (i = l->ls_length, + oidpp = (struct sysctl_oid **)l->ls_items; + i-- != 0; oidpp++) + if (*oidpp == vfc->vfc_vfsops->vfs_oid) { + exists = 1; + break; + } + if (exists == 0) + for (i = l->ls_length, + oidpp = (struct sysctl_oid **)l->ls_items; + i-- != 0; oidpp++) { + if (*oidpp == NULL || + *oidpp == &sysctl___vfs_mod0 || + *oidpp == &sysctl___vfs_mod1) { + *oidpp = vfc->vfc_vfsops->vfs_oid; + break; + } + } + + vfc->vfc_vfsops->vfs_oid->oid_number = vfc->vfc_typenum; + sysctl_order_all(); + } + if (vfsp) + vfsp->vfc_next = vfc; + else + vfsconf = vfc; + vfc->vfc_next = NULL; + + /* + * Call init function for this VFS... 
+ */ + (*(vfc->vfc_vfsops->vfs_init))(vfc); + + return 0; +} + + +int +vfs_unregister(struct vfsconf *vfc) +{ + struct linker_set *l; + struct sysctl_oid **oidpp; + struct vfsconf *vfsp, *prev_vfsp; + int error, i, maxtypenum; + + i = vfc->vfc_typenum; + + prev_vfsp = NULL; + for (vfsp = vfsconf; vfsp; + prev_vfsp = vfsp, vfsp = vfsp->vfc_next) { + if (!strcmp(vfc->vfc_name, vfsp->vfc_name)) + break; + } + if (vfsp == NULL) + return EINVAL; + if (vfsp->vfc_refcount) + return EBUSY; + if (vfc->vfc_vfsops->vfs_uninit != NULL) { + error = (*vfc->vfc_vfsops->vfs_uninit)(vfsp); + if (error) + return (error); + } + if (prev_vfsp) + prev_vfsp->vfc_next = vfsp->vfc_next; + else + vfsconf = vfsp->vfc_next; + if (vfsp->vfc_vfsops->vfs_oid != NULL) { + l = &sysctl__vfs; + for (i = l->ls_length, + oidpp = (struct sysctl_oid **)l->ls_items; + i--; oidpp++) { + if (*oidpp == vfsp->vfc_vfsops->vfs_oid) { + *oidpp = NULL; + sysctl_order_all(); + break; + } + } + } + maxtypenum = VFS_GENERIC; + for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next) + if (maxtypenum < vfsp->vfc_typenum) + maxtypenum = vfsp->vfc_typenum; + maxvfsconf = maxtypenum + 1; + return 0; +} + +int +vfs_modevent(module_t mod, int type, void *data) +{ + struct vfsconf *vfc; + int error = 0; + + vfc = (struct vfsconf *)data; + + switch (type) { + case MOD_LOAD: + if (vfc) + error = vfs_register(vfc); + break; + + case MOD_UNLOAD: + if (vfc) + error = vfs_unregister(vfc); + break; + default: /* including MOD_SHUTDOWN */ + break; + } + return (error); +} diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c new file mode 100644 index 0000000..67efd52 --- /dev/null +++ b/sys/kern/vfs_lookup.c @@ -0,0 +1,706 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 + * $Id: vfs_lookup.c,v 1.30 1999/01/08 17:31:16 eivind Exp $ + */ + +#include "opt_ktrace.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/filedesc.h> +#include <sys/proc.h> + +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <vm/vm_zone.h> + +/* + * Convert a pathname into a pointer to a locked inode. + * + * The FOLLOW flag is set when symbolic links are to be followed + * when they occur at the end of the name translation process. + * Symbolic links are always followed for all other pathname + * components other than the last. + * + * The segflg defines whether the name is to be copied from user + * space or kernel space. + * + * Overall outline of namei: + * + * copy in name + * get starting directory + * while (!done && !error) { + * call lookup to search path. + * if symbolic link, massage name in buffer and continue + * } + */ +int +namei(ndp) + register struct nameidata *ndp; +{ + register struct filedesc *fdp; /* pointer to file descriptor state */ + register char *cp; /* pointer into pathname argument */ + register struct vnode *dp; /* the directory we are searching */ + struct iovec aiov; /* uio for reading symbolic links */ + struct uio auio; + int error, linklen; + struct componentname *cnp = &ndp->ni_cnd; + struct proc *p = cnp->cn_proc; + + ndp->ni_cnd.cn_cred = ndp->ni_cnd.cn_proc->p_ucred; + KASSERT(cnp->cn_cred && cnp->cn_proc, ("namei: bad cred/proc")); + KASSERT((cnp->cn_nameiop & (~OPMASK)) == 0, + ("namei: nameiop contaminated with flags")); + KASSERT((cnp->cn_flags & OPMASK) == 0, + ("namei: flags contaminated with nameiops")); + fdp = cnp->cn_proc->p_fd; + + /* + * Get a buffer for the name to be translated, and copy the + * name into the buffer. + */ + if ((cnp->cn_flags & HASBUF) == 0) + cnp->cn_pnbuf = zalloc(namei_zone); + if (ndp->ni_segflg == UIO_SYSSPACE) + error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, + MAXPATHLEN, (size_t *)&ndp->ni_pathlen); + else + error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, + MAXPATHLEN, (size_t *)&ndp->ni_pathlen); + + /* + * Don't allow empty pathnames. + */ + if (!error && *cnp->cn_pnbuf == '\0') + error = ENOENT; + + if (error) { + zfree(namei_zone, cnp->cn_pnbuf); + ndp->ni_vp = NULL; + return (error); + } + ndp->ni_loopcnt = 0; +#ifdef KTRACE + if (KTRPOINT(cnp->cn_proc, KTR_NAMEI)) + ktrnamei(cnp->cn_proc->p_tracep, cnp->cn_pnbuf); +#endif + + /* + * Get starting point for the translation. + */ + ndp->ni_rootdir = fdp->fd_rdir; + + dp = fdp->fd_cdir; + VREF(dp); + for (;;) { + /* + * Check if root directory should replace current directory. + * Done at start of translation and after symbolic link. 
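As a reference point for the translation loop that follows, this is roughly how in-kernel callers drive namei(); the samplefs_stat_path() wrapper is hypothetical, while NDINIT(), namei(), VOP_GETATTR() and vput() are the interfaces used throughout this commit:

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/namei.h>
#include <sys/proc.h>
#include <sys/vnode.h>

static int
samplefs_stat_path(p, upath, vap)
	struct proc *p;
	char *upath;			/* user-space pathname */
	struct vattr *vap;
{
	struct nameidata nd;
	int error;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, upath, p);
	if ((error = namei(&nd)) != 0)
		return (error);		/* pathname buffer already released */
	error = VOP_GETATTR(nd.ni_vp, vap, p->p_ucred, p);
	vput(nd.ni_vp);			/* LOCKLEAF: unlock and release */
	return (error);
}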
+ */ + cnp->cn_nameptr = cnp->cn_pnbuf; + if (*(cnp->cn_nameptr) == '/') { + vrele(dp); + while (*(cnp->cn_nameptr) == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + dp = ndp->ni_rootdir; + VREF(dp); + } + ndp->ni_startdir = dp; + error = lookup(ndp); + if (error) { + zfree(namei_zone, cnp->cn_pnbuf); + return (error); + } + /* + * Check for symbolic link + */ + if ((cnp->cn_flags & ISSYMLINK) == 0) { + if ((cnp->cn_flags & (SAVENAME | SAVESTART)) == 0) + zfree(namei_zone, cnp->cn_pnbuf); + else + cnp->cn_flags |= HASBUF; + + if (ndp->ni_vp && ndp->ni_vp->v_type == VREG && + (cnp->cn_nameiop != DELETE) && + ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == + LOCKLEAF)) + vfs_object_create(ndp->ni_vp, + ndp->ni_cnd.cn_proc, + ndp->ni_cnd.cn_cred); + + return (0); + } + if ((cnp->cn_flags & LOCKPARENT) && ndp->ni_pathlen == 1) + VOP_UNLOCK(ndp->ni_dvp, 0, p); + if (ndp->ni_loopcnt++ >= MAXSYMLINKS) { + error = ELOOP; + break; + } + if (ndp->ni_pathlen > 1) + cp = zalloc(namei_zone); + else + cp = cnp->cn_pnbuf; + aiov.iov_base = cp; + aiov.iov_len = MAXPATHLEN; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_procp = (struct proc *)0; + auio.uio_resid = MAXPATHLEN; + error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); + if (error) { + if (ndp->ni_pathlen > 1) + zfree(namei_zone, cp); + break; + } + linklen = MAXPATHLEN - auio.uio_resid; + if (linklen + ndp->ni_pathlen >= MAXPATHLEN) { + if (ndp->ni_pathlen > 1) + zfree(namei_zone, cp); + error = ENAMETOOLONG; + break; + } + if (ndp->ni_pathlen > 1) { + bcopy(ndp->ni_next, cp + linklen, ndp->ni_pathlen); + zfree(namei_zone, cnp->cn_pnbuf); + cnp->cn_pnbuf = cp; + } else + cnp->cn_pnbuf[linklen] = '\0'; + ndp->ni_pathlen += linklen; + vput(ndp->ni_vp); + dp = ndp->ni_dvp; + } + zfree(namei_zone, cnp->cn_pnbuf); + vrele(ndp->ni_dvp); + vput(ndp->ni_vp); + ndp->ni_vp = NULL; + return (error); +} + +/* + * Search a pathname. + * This is a very central and rather complicated routine. + * + * The pathname is pointed to by ni_ptr and is of length ni_pathlen. + * The starting directory is taken from ni_startdir. The pathname is + * descended until done, or a symbolic link is encountered. The variable + * ni_more is clear if the path is completed; it is set to one if a + * symbolic link needing interpretation is encountered. + * + * The flag argument is LOOKUP, CREATE, RENAME, or DELETE depending on + * whether the name is to be looked up, created, renamed, or deleted. + * When CREATE, RENAME, or DELETE is specified, information usable in + * creating, renaming, or deleting a directory entry may be calculated. + * If flag has LOCKPARENT or'ed into it, the parent directory is returned + * locked. If flag has WANTPARENT or'ed into it, the parent directory is + * returned unlocked. Otherwise the parent directory is not returned. If + * the target of the pathname exists and LOCKLEAF is or'ed into the flag + * the target is returned locked, otherwise it is returned unlocked. + * When creating or renaming and LOCKPARENT is specified, the target may not + * be ".". When deleting and LOCKPARENT is specified, the target may be ".". + * + * Overall outline of lookup: + * + * dirloop: + * identify next component of name at ndp->ni_ptr + * handle degenerate case where name is null string + * if .. 
and crossing mount points and on mounted filesys, find parent + * call VOP_LOOKUP routine for next component name + * directory vnode returned in ni_dvp, unlocked unless LOCKPARENT set + * component vnode returned in ni_vp (if it exists), locked. + * if result vnode is mounted on and crossing mount points, + * find mounted on vnode + * if more components of name, do next level at dirloop + * return the answer in ni_vp, locked if LOCKLEAF set + * if LOCKPARENT set, return locked parent in ni_dvp + * if WANTPARENT set, return unlocked parent in ni_dvp + */ +int +lookup(ndp) + register struct nameidata *ndp; +{ + register char *cp; /* pointer into pathname argument */ + register struct vnode *dp = 0; /* the directory we are searching */ + struct vnode *tdp; /* saved dp */ + struct mount *mp; /* mount table entry */ + int docache; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int trailing_slash; + int error = 0; + struct componentname *cnp = &ndp->ni_cnd; + struct proc *p = cnp->cn_proc; + + /* + * Setup: break out flag bits into variables. + */ + wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || + (wantparent && cnp->cn_nameiop != CREATE && + cnp->cn_nameiop != LOOKUP)) + docache = 0; + rdonly = cnp->cn_flags & RDONLY; + ndp->ni_dvp = NULL; + cnp->cn_flags &= ~ISSYMLINK; + dp = ndp->ni_startdir; + ndp->ni_startdir = NULLVP; + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + +dirloop: + /* + * Search a new directory. + * + * The cn_hash value is for use by vfs_cache. + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ + cnp->cn_consume = 0; + cnp->cn_hash = 0; + for (cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) + cnp->cn_hash += (unsigned char)*cp; + cnp->cn_namelen = cp - cnp->cn_nameptr; + if (cnp->cn_namelen > NAME_MAX) { + error = ENAMETOOLONG; + goto bad; + } +#ifdef NAMEI_DIAGNOSTIC + { char c = *cp; + *cp = '\0'; + printf("{%s}: ", cnp->cn_nameptr); + *cp = c; } +#endif + ndp->ni_pathlen -= cnp->cn_namelen; + ndp->ni_next = cp; + + /* + * Replace multiple slashes by a single slash and trailing slashes + * by a null. This must be done before VOP_LOOKUP() because some + * fs's don't know about trailing slashes. Remember if there were + * trailing slashes to handle symlinks, existing non-directories + * and non-existing files that won't be directories specially later. + */ + trailing_slash = 0; + while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { + cp++; + ndp->ni_pathlen--; + if (*cp == '\0') { + trailing_slash = 1; + *ndp->ni_next = '\0'; /* XXX for direnter() ... */ + } + } + ndp->ni_next = cp; + + cnp->cn_flags |= MAKEENTRY; + if (*cp == '\0' && docache == 0) + cnp->cn_flags &= ~MAKEENTRY; + if (cnp->cn_namelen == 2 && + cnp->cn_nameptr[1] == '.' && cnp->cn_nameptr[0] == '.') + cnp->cn_flags |= ISDOTDOT; + else + cnp->cn_flags &= ~ISDOTDOT; + if (*ndp->ni_next == 0) + cnp->cn_flags |= ISLASTCN; + else + cnp->cn_flags &= ~ISLASTCN; + + + /* + * Check for degenerate name (e.g. / or "") + * which is a way of talking about a directory, + * e.g. like "/." or ".". 
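The per-component scan at the top of dirloop can be mimicked in user space; this stand-alone sketch (component_hash() is a made-up name) shows that the additive cn_hash covers only the bytes up to the next slash:

#include <stdio.h>

static unsigned int
component_hash(const char *cp)
{
	unsigned int hash = 0;

	while (*cp != '\0' && *cp != '/') {
		hash += (unsigned char)*cp;
		cp++;
	}
	return (hash);
}

int
main(void)
{
	/* Both calls hash only "local", since the scan stops at '/'. */
	printf("%u\n", component_hash("local"));
	printf("%u\n", component_hash("local/bin"));
	return (0);
}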
+ */ + if (cnp->cn_nameptr[0] == '\0') { + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (cnp->cn_nameiop != LOOKUP) { + error = EISDIR; + goto bad; + } + if (wantparent) { + ndp->ni_dvp = dp; + VREF(dp); + } + ndp->ni_vp = dp; + if (!(cnp->cn_flags & (LOCKPARENT | LOCKLEAF))) + VOP_UNLOCK(dp, 0, p); + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + return (0); + } + + /* + * Handle "..": two special cases. + * 1. If at root directory (e.g. after chroot) + * or at absolute root directory + * then ignore it so can't get out. + * 2. If this vnode is the root of a mounted + * filesystem, then replace it with the + * vnode which was mounted on so we take the + * .. in the other file system. + */ + if (cnp->cn_flags & ISDOTDOT) { + for (;;) { + if (dp == ndp->ni_rootdir || dp == rootvnode) { + ndp->ni_dvp = dp; + ndp->ni_vp = dp; + VREF(dp); + goto nextname; + } + if ((dp->v_flag & VROOT) == 0 || + (cnp->cn_flags & NOCROSSMOUNT)) + break; + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + vput(tdp); + VREF(dp); + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + } + } + + /* + * We now have a segment name to search for, and a directory to search. + */ +unionlookup: + ndp->ni_dvp = dp; + ndp->ni_vp = NULL; + ASSERT_VOP_LOCKED(dp, "lookup"); + if (error = VOP_LOOKUP(dp, &ndp->ni_vp, cnp)) { + KASSERT(ndp->ni_vp == NULL, ("leaf should be empty")); +#ifdef NAMEI_DIAGNOSTIC + printf("not found\n"); +#endif + if ((error == ENOENT) && + (dp->v_flag & VROOT) && + (dp->v_mount->mnt_flag & MNT_UNION)) { + tdp = dp; + dp = dp->v_mount->mnt_vnodecovered; + vput(tdp); + VREF(dp); + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + goto unionlookup; + } + + if (error != EJUSTRETURN) + goto bad; + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. + */ + if (rdonly) { + error = EROFS; + goto bad; + } + if (*cp == '\0' && trailing_slash && + !(cnp->cn_flags & WILLBEDIR)) { + error = ENOENT; + goto bad; + } + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory inode in ndp->ni_dvp. + */ + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + return (0); + } +#ifdef NAMEI_DIAGNOSTIC + printf("found\n"); +#endif + + ASSERT_VOP_LOCKED(ndp->ni_vp, "lookup"); + + /* + * Take into account any additional components consumed by + * the underlying filesystem. + */ + if (cnp->cn_consume > 0) { + cnp->cn_nameptr += cnp->cn_consume; + ndp->ni_next += cnp->cn_consume; + ndp->ni_pathlen -= cnp->cn_consume; + cnp->cn_consume = 0; + } + + dp = ndp->ni_vp; + + /* + * Check to see if the vnode has been mounted on; + * if so find the root of the mounted file system. + */ + while (dp->v_type == VDIR && (mp = dp->v_mountedhere) && + (cnp->cn_flags & NOCROSSMOUNT) == 0) { + if (vfs_busy(mp, 0, 0, p)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, p); + if (error) + goto bad2; + vput(dp); + ndp->ni_vp = dp = tdp; + } + + /* + * Check for symbolic link + */ + if ((dp->v_type == VLNK) && + ((cnp->cn_flags & FOLLOW) || trailing_slash || + *ndp->ni_next == '/')) { + cnp->cn_flags |= ISSYMLINK; + if (dp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) { + error = EACCES; + goto bad2; + } + return (0); + } + + /* + * Check for bogus trailing slashes. + */ + if (trailing_slash && dp->v_type != VDIR) { + error = ENOTDIR; + goto bad2; + } + +nextname: + /* + * Not a symbolic link. 
If more pathname, + * continue at next component, else return. + */ + if (*ndp->ni_next == '/') { + cnp->cn_nameptr = ndp->ni_next; + while (*cnp->cn_nameptr == '/') { + cnp->cn_nameptr++; + ndp->ni_pathlen--; + } + if (ndp->ni_dvp != ndp->ni_vp) { + ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "lookup"); + } + vrele(ndp->ni_dvp); + goto dirloop; + } + /* + * Disallow directory write attempts on read-only file systems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto bad2; + } + if (cnp->cn_flags & SAVESTART) { + ndp->ni_startdir = ndp->ni_dvp; + VREF(ndp->ni_startdir); + } + if (!wantparent) + vrele(ndp->ni_dvp); + + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0, p); + return (0); + +bad2: + if ((cnp->cn_flags & LOCKPARENT) && *ndp->ni_next == '\0') + VOP_UNLOCK(ndp->ni_dvp, 0, p); + vrele(ndp->ni_dvp); +bad: + vput(dp); + ndp->ni_vp = NULL; + return (error); +} + +/* + * relookup - lookup a path name component + * Used by lookup to re-aquire things. + */ +int +relookup(dvp, vpp, cnp) + struct vnode *dvp, **vpp; + struct componentname *cnp; +{ + struct proc *p = cnp->cn_proc; + struct vnode *dp = 0; /* the directory we are searching */ + int docache; /* == 0 do not cache last component */ + int wantparent; /* 1 => wantparent or lockparent flag */ + int rdonly; /* lookup read-only flag bit */ + int error = 0; +#ifdef NAMEI_DIAGNOSTIC + int newhash; /* DEBUG: check name hash */ + char *cp; /* DEBUG: check name ptr/len */ +#endif + + /* + * Setup: break out flag bits into variables. + */ + wantparent = cnp->cn_flags & (LOCKPARENT|WANTPARENT); + docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; + if (cnp->cn_nameiop == DELETE || + (wantparent && cnp->cn_nameiop != CREATE)) + docache = 0; + rdonly = cnp->cn_flags & RDONLY; + cnp->cn_flags &= ~ISSYMLINK; + dp = dvp; + vn_lock(dp, LK_EXCLUSIVE | LK_RETRY, p); + +/* dirloop: */ + /* + * Search a new directory. + * + * The cn_hash value is for use by vfs_cache. + * The last component of the filename is left accessible via + * cnp->cn_nameptr for callers that need the name. Callers needing + * the name set the SAVENAME flag. When done, they assume + * responsibility for freeing the pathname buffer. + */ +#ifdef NAMEI_DIAGNOSTIC + for (newhash = 0, cp = cnp->cn_nameptr; *cp != 0 && *cp != '/'; cp++) + newhash += (unsigned char)*cp; + if (newhash != cnp->cn_hash) + panic("relookup: bad hash"); + if (cnp->cn_namelen != cp - cnp->cn_nameptr) + panic ("relookup: bad len"); + if (*cp != 0) + panic("relookup: not last component"); + printf("{%s}: ", cnp->cn_nameptr); +#endif + + /* + * Check for degenerate name (e.g. / or "") + * which is a way of talking about a directory, + * e.g. like "/." or ".". + */ + if (cnp->cn_nameptr[0] == '\0') { + if (cnp->cn_nameiop != LOOKUP || wantparent) { + error = EISDIR; + goto bad; + } + if (dp->v_type != VDIR) { + error = ENOTDIR; + goto bad; + } + if (!(cnp->cn_flags & LOCKLEAF)) + VOP_UNLOCK(dp, 0, p); + *vpp = dp; + if (cnp->cn_flags & SAVESTART) + panic("lookup: SAVESTART"); + return (0); + } + + if (cnp->cn_flags & ISDOTDOT) + panic ("relookup: lookup on dot-dot"); + + /* + * We now have a segment name to search for, and a directory to search. + */ + if (error = VOP_LOOKUP(dp, vpp, cnp)) { + KASSERT(*vpp == NULL, ("leaf should be empty")); + if (error != EJUSTRETURN) + goto bad; + /* + * If creating and at end of pathname, then can consider + * allowing file to be created. 
+ */ + if (rdonly) { + error = EROFS; + goto bad; + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + /* + * We return with ni_vp NULL to indicate that the entry + * doesn't currently exist, leaving a pointer to the + * (possibly locked) directory inode in ndp->ni_dvp. + */ + return (0); + } + dp = *vpp; + + /* + * Check for symbolic link + */ + KASSERT(dp->v_type != VLNK || !(cnp->cn_flags & FOLLOW), + ("relookup: symlink found.\n")); + + /* + * Disallow directory write attempts on read-only file systems. + */ + if (rdonly && + (cnp->cn_nameiop == DELETE || cnp->cn_nameiop == RENAME)) { + error = EROFS; + goto bad2; + } + /* ASSERT(dvp == ndp->ni_startdir) */ + if (cnp->cn_flags & SAVESTART) + VREF(dvp); + + if (!wantparent) + vrele(dvp); + + if (dp->v_type == VREG && + ((cnp->cn_flags & (NOOBJ|LOCKLEAF)) == LOCKLEAF)) + vfs_object_create(dp, cnp->cn_proc, cnp->cn_cred); + + if ((cnp->cn_flags & LOCKLEAF) == 0) + VOP_UNLOCK(dp, 0, p); + return (0); + +bad2: + if ((cnp->cn_flags & LOCKPARENT) && (cnp->cn_flags & ISLASTCN)) + VOP_UNLOCK(dvp, 0, p); + vrele(dvp); +bad: + vput(dp); + *vpp = NULL; + return (error); +} diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c new file mode 100644 index 0000000..a7a830f --- /dev/null +++ b/sys/kern/vfs_mount.c @@ -0,0 +1,190 @@ +/* + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 1995 Artisoft, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_conf.c 8.8 (Berkeley) 3/31/94 + * $Id: vfs_conf.c,v 1.25 1998/06/09 12:52:33 bde Exp $ + */ + +/* + * PURPOSE: This file abstracts the root mounting interface from + * the per file system semantics for handling mounts, + * the overall intent of which is to move the BSD + * internals dependence out of the FS code, both to + * make the FS code more portable and to free up some + * of the BSD internals so that they may more easily + * be changed. + * + * NOTE1: Code is single entry/single exit to aid debugging + * and conversion for kernel multithreading. + * + * NOTE2: Code notes lock state in headers on entry and exit + * as an aid to conversion for kernel multithreading + * on SMP reentrancy + */ +#include "opt_bootp.h" + +#include <sys/param.h> /* dev_t (types.h)*/ +#include <sys/kernel.h> +#include <sys/systm.h> /* rootvp*/ +#include <sys/proc.h> /* curproc*/ +#include <sys/vnode.h> /* NULLVP*/ +#include <sys/mount.h> /* struct mount*/ +#include <sys/malloc.h> /* M_MOUNT*/ + +/* + * GLOBALS + */ + +MALLOC_DEFINE(M_MOUNT, "mount", "vfs mount struct"); + +/* + * These define the root filesystem, device, and root filesystem type. + */ +dev_t rootdevs[] = { NODEV, NODEV }; +char *rootdevnames[2]; +struct vnode *rootvnode; +char *mountrootfsname; +#ifdef BOOTP +extern void bootpc_init __P((void)); +#endif + +/* + * vfs_init() will set maxvfsconf + * to the highest defined type number. + */ +int maxvfsconf; +struct vfsconf *vfsconf; + +/* + * Common root mount code shared by all filesystems + */ +#define ROOTNAME "root_device" + +/* + * vfs_mountrootfs + * + * Common entry point for root mounts + * + * PARAMETERS: + * NONE + * + * RETURNS: 0 Success + * !0 error number (errno.h) + * + * LOCK STATE: + * ENTRY + * <no locks held> + * EXIT + * <no locks held> + * + * NOTES: + * This code is currently supported only for use for + * the FFS file system type. This is a matter of + * fixing the other file systems, not this code! + */ +static void +vfs_mountrootfs(void *unused) +{ + struct mount *mp; + int i, err; + struct proc *p = curproc; /* XXX */ + dev_t orootdev; + +#ifdef BOOTP + bootpc_init(); +#endif + /* + * New root mount structure + */ + if ((err = vfs_rootmountalloc(mountrootfsname, ROOTNAME, &mp))) { + printf("error %d: ", err); + panic("cannot mount root\n"); + return ; + } + mp->mnt_flag |= MNT_ROOTFS; + + /* + * Attempt the mount + */ + err = ENXIO; + orootdev = rootdev; + if (rootdevs[0] == NODEV) + rootdevs[0] = rootdev; + for (i = 0; i < sizeof(rootdevs) / sizeof(rootdevs[0]); i++) { + if (rootdevs[i] == NODEV) + break; + rootdev = rootdevs[i]; + if (rootdev != orootdev) { + printf("changing root device to %s\n", rootdevnames[i]); + orootdev = rootdev; + } + strncpy(mp->mnt_stat.f_mntfromname, + rootdevnames[i] ? rootdevnames[i] : ROOTNAME, MNAMELEN - 1); + err = VFS_MOUNT(mp, NULL, NULL, NULL, p); + if (err != ENXIO) + break; + } + if (err) { + /* + * XXX should ask the user for the name in some cases. + * Why do we call vfs_unbusy() here and not after ENXIO + * is returned above? + */ + vfs_unbusy(mp, p); + /* + * free mount struct before failing + * (hardly worthwhile with the PANIC eh?) 
+ */ + free( mp, M_MOUNT); + printf("error %d: ", err); + panic("cannot mount root (2)\n"); + return; + } + + simple_lock(&mountlist_slock); + + /* + * Add fs to list of mounted file systems + */ + CIRCLEQ_INSERT_HEAD(&mountlist, mp, mnt_list); + + simple_unlock(&mountlist_slock); + vfs_unbusy(mp, p); + + /* root mount, update system time from FS specific data*/ + inittodr(mp->mnt_time); + return; +} + +SYSINIT(mountroot, SI_SUB_MOUNT_ROOT, SI_ORDER_FIRST, vfs_mountrootfs, NULL) + diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c new file mode 100644 index 0000000..44b1698 --- /dev/null +++ b/sys/kern/vfs_subr.c @@ -0,0 +1,2872 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + * $Id: vfs_subr.c,v 1.182 1999/01/10 01:58:26 eivind Exp $ + */ + +/* + * External virtual filesystem routines + */ +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/socket.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/buf.h> +#include <sys/domain.h> +#include <sys/dirent.h> +#include <sys/vmmeter.h> + +#include <machine/limits.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_pager.h> +#include <vm/vnode_pager.h> +#include <vm/vm_zone.h> +#include <sys/sysctl.h> + +#include <miscfs/specfs/specdev.h> + +static MALLOC_DEFINE(M_NETADDR, "Export Host", "Export host address structure"); + +static void insmntque __P((struct vnode *vp, struct mount *mp)); +static void vclean __P((struct vnode *vp, int flags, struct proc *p)); +static void vfree __P((struct vnode *)); +static void vgonel __P((struct vnode *vp, struct proc *p)); +static unsigned long numvnodes; +SYSCTL_INT(_debug, OID_AUTO, numvnodes, CTLFLAG_RD, &numvnodes, 0, ""); + +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, +}; +int vttoif_tab[9] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, +}; + +static TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ +struct tobefreelist vnode_tobefree_list; /* vnode free list */ + +static u_long wantfreevnodes = 25; +SYSCTL_INT(_debug, OID_AUTO, wantfreevnodes, CTLFLAG_RW, &wantfreevnodes, 0, ""); +static u_long freevnodes = 0; +SYSCTL_INT(_debug, OID_AUTO, freevnodes, CTLFLAG_RD, &freevnodes, 0, ""); + +int vfs_ioopt = 0; +#ifdef ENABLE_VFS_IOOPT +SYSCTL_INT(_vfs, OID_AUTO, ioopt, CTLFLAG_RW, &vfs_ioopt, 0, ""); +#endif + +struct mntlist mountlist; /* mounted filesystem list */ +struct simplelock mountlist_slock; +struct simplelock mntvnode_slock; +int nfs_mount_type = -1; +#ifndef NULL_SIMPLELOCKS +static struct simplelock mntid_slock; +static struct simplelock vnode_free_list_slock; +static struct simplelock spechash_slock; +#endif +struct nfs_public nfs_pub; /* publicly exported FS */ +static vm_zone_t vnode_zone; + +/* + * The workitem queue. + */ +#define SYNCER_MAXDELAY 32 +static int syncer_maxdelay = SYNCER_MAXDELAY; /* maximum delay time */ +time_t syncdelay = 30; +int rushjob; /* number of slots to run ASAP */ + +static int syncer_delayno = 0; +static long syncer_mask; +LIST_HEAD(synclist, vnode); +static struct synclist *syncer_workitem_pending; + +int desiredvnodes; +SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, ""); + +static void vfs_free_addrlist __P((struct netexport *nep)); +static int vfs_free_netcred __P((struct radix_node *rn, void *w)); +static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, + struct export_args *argp)); + +/* + * Initialize the vnode management data structures. 
+ */ +void +vntblinit() +{ + + desiredvnodes = maxproc + cnt.v_page_count / 4; + simple_lock_init(&mntvnode_slock); + simple_lock_init(&mntid_slock); + simple_lock_init(&spechash_slock); + TAILQ_INIT(&vnode_free_list); + TAILQ_INIT(&vnode_tobefree_list); + simple_lock_init(&vnode_free_list_slock); + CIRCLEQ_INIT(&mountlist); + vnode_zone = zinit("VNODE", sizeof (struct vnode), 0, 0, 5); + /* + * Initialize the filesystem syncer. + */ + syncer_workitem_pending = hashinit(syncer_maxdelay, M_VNODE, + &syncer_mask); + syncer_maxdelay = syncer_mask + 1; +} + +/* + * Mark a mount point as busy. Used to synchronize access and to delay + * unmounting. Interlock is not released on failure. + */ +int +vfs_busy(mp, flags, interlkp, p) + struct mount *mp; + int flags; + struct simplelock *interlkp; + struct proc *p; +{ + int lkflags; + + if (mp->mnt_kern_flag & MNTK_UNMOUNT) { + if (flags & LK_NOWAIT) + return (ENOENT); + mp->mnt_kern_flag |= MNTK_MWAIT; + if (interlkp) { + simple_unlock(interlkp); + } + /* + * Since all busy locks are shared except the exclusive + * lock granted when unmounting, the only place that a + * wakeup needs to be done is at the release of the + * exclusive lock at the end of dounmount. + */ + tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); + if (interlkp) { + simple_lock(interlkp); + } + return (ENOENT); + } + lkflags = LK_SHARED | LK_NOPAUSE; + if (interlkp) + lkflags |= LK_INTERLOCK; + if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) + panic("vfs_busy: unexpected lock failure"); + return (0); +} + +/* + * Free a busy filesystem. + */ +void +vfs_unbusy(mp, p) + struct mount *mp; + struct proc *p; +{ + + lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); +} + +/* + * Lookup a filesystem type, and if found allocate and initialize + * a mount structure for it. + * + * Devname is usually updated by mount(8) after booting. + */ +int +vfs_rootmountalloc(fstypename, devname, mpp) + char *fstypename; + char *devname; + struct mount **mpp; +{ + struct proc *p = curproc; /* XXX */ + struct vfsconf *vfsp; + struct mount *mp; + + if (fstypename == NULL) + return (ENODEV); + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) + return (ENODEV); + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, p); + LIST_INIT(&mp->mnt_vnodelist); + mp->mnt_vfc = vfsp; + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_flag = MNT_RDONLY; + mp->mnt_vnodecovered = NULLVP; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_stat.f_mntonname[0] = '/'; + mp->mnt_stat.f_mntonname[1] = 0; + (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); + *mpp = mp; + return (0); +} + +/* + * Find an appropriate filesystem to use for the root. If a filesystem + * has not been preselected, walk through the list of known filesystems + * trying those that have mountroot routines, and try them until one + * works or we have tried them all. 
+ */ +#ifdef notdef /* XXX JH */ +int +lite2_vfs_mountroot() +{ + struct vfsconf *vfsp; + extern int (*lite2_mountroot) __P((void)); + int error; + + if (lite2_mountroot != NULL) + return ((*lite2_mountroot)()); + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + if (vfsp->vfc_mountroot == NULL) + continue; + if ((error = (*vfsp->vfc_mountroot)()) == 0) + return (0); + printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error); + } + return (ENODEV); +} +#endif + +/* + * Lookup a mount point by filesystem identifier. + */ +struct mount * +vfs_getvfs(fsid) + fsid_t *fsid; +{ + register struct mount *mp; + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; + mp = mp->mnt_list.cqe_next) { + if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && + mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { + simple_unlock(&mountlist_slock); + return (mp); + } + } + simple_unlock(&mountlist_slock); + return ((struct mount *) 0); +} + +/* + * Get a new unique fsid + */ +void +vfs_getnewfsid(mp) + struct mount *mp; +{ + static u_short xxxfs_mntid; + + fsid_t tfsid; + int mtype; + + simple_lock(&mntid_slock); + mtype = mp->mnt_vfc->vfc_typenum; + mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0); + mp->mnt_stat.f_fsid.val[1] = mtype; + if (xxxfs_mntid == 0) + ++xxxfs_mntid; + tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid); + tfsid.val[1] = mtype; + if (mountlist.cqh_first != (void *)&mountlist) { + while (vfs_getvfs(&tfsid)) { + tfsid.val[0]++; + xxxfs_mntid++; + } + } + mp->mnt_stat.f_fsid.val[0] = tfsid.val[0]; + simple_unlock(&mntid_slock); +} + +/* + * Set vnode attributes to VNOVAL + */ +void +vattr_null(vap) + register struct vattr *vap; +{ + + vap->va_type = VNON; + vap->va_size = VNOVAL; + vap->va_bytes = VNOVAL; + vap->va_mode = VNOVAL; + vap->va_nlink = VNOVAL; + vap->va_uid = VNOVAL; + vap->va_gid = VNOVAL; + vap->va_fsid = VNOVAL; + vap->va_fileid = VNOVAL; + vap->va_blocksize = VNOVAL; + vap->va_rdev = VNOVAL; + vap->va_atime.tv_sec = VNOVAL; + vap->va_atime.tv_nsec = VNOVAL; + vap->va_mtime.tv_sec = VNOVAL; + vap->va_mtime.tv_nsec = VNOVAL; + vap->va_ctime.tv_sec = VNOVAL; + vap->va_ctime.tv_nsec = VNOVAL; + vap->va_flags = VNOVAL; + vap->va_gen = VNOVAL; + vap->va_vaflags = 0; +} + +/* + * Routines having to do with the management of the vnode table. + */ +extern vop_t **dead_vnodeop_p; + +/* + * Return the next vnode from the free list. + */ +int +getnewvnode(tag, mp, vops, vpp) + enum vtagtype tag; + struct mount *mp; + vop_t **vops; + struct vnode **vpp; +{ + int s; + struct proc *p = curproc; /* XXX */ + struct vnode *vp, *tvp, *nvp; + vm_object_t object; + TAILQ_HEAD(freelst, vnode) vnode_tmp_list; + + /* + * We take the least recently used vnode from the freelist + * if we can get it and it has no cached pages, and no + * namecache entries are relative to it. 
+ * Otherwise we allocate a new vnode + */ + + s = splbio(); + simple_lock(&vnode_free_list_slock); + TAILQ_INIT(&vnode_tmp_list); + + for (vp = TAILQ_FIRST(&vnode_tobefree_list); vp; vp = nvp) { + nvp = TAILQ_NEXT(vp, v_freelist); + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + if (vp->v_flag & VAGE) { + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + vp->v_flag &= ~(VTBFREE|VAGE); + vp->v_flag |= VFREE; + if (vp->v_usecount) + panic("tobe free vnode isn't"); + freevnodes++; + } + + if (wantfreevnodes && freevnodes < wantfreevnodes) { + vp = NULL; + } else if (!wantfreevnodes && freevnodes <= desiredvnodes) { + /* + * XXX: this is only here to be backwards compatible + */ + vp = NULL; + } else { + for (vp = TAILQ_FIRST(&vnode_free_list); vp; vp = nvp) { + nvp = TAILQ_NEXT(vp, v_freelist); + if (!simple_lock_try(&vp->v_interlock)) + continue; + if (vp->v_usecount) + panic("free vnode isn't"); + + object = vp->v_object; + if (object && (object->resident_page_count || object->ref_count)) { + printf("object inconsistant state: RPC: %d, RC: %d\n", + object->resident_page_count, object->ref_count); + /* Don't recycle if it's caching some pages */ + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + TAILQ_INSERT_TAIL(&vnode_tmp_list, vp, v_freelist); + continue; + } else if (LIST_FIRST(&vp->v_cache_src)) { + /* Don't recycle if active in the namecache */ + simple_unlock(&vp->v_interlock); + continue; + } else { + break; + } + } + } + + for (tvp = TAILQ_FIRST(&vnode_tmp_list); tvp; tvp = nvp) { + nvp = TAILQ_NEXT(tvp, v_freelist); + TAILQ_REMOVE(&vnode_tmp_list, tvp, v_freelist); + TAILQ_INSERT_TAIL(&vnode_free_list, tvp, v_freelist); + simple_unlock(&tvp->v_interlock); + } + + if (vp) { + vp->v_flag |= VDOOMED; + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + freevnodes--; + simple_unlock(&vnode_free_list_slock); + cache_purge(vp); + vp->v_lease = NULL; + if (vp->v_type != VBAD) { + vgonel(vp, p); + } else { + simple_unlock(&vp->v_interlock); + } + +#ifdef INVARIANTS + { + int s; + + if (vp->v_data) + panic("cleaned vnode isn't"); + s = splbio(); + if (vp->v_numoutput) + panic("Clean vnode has pending I/O's"); + splx(s); + } +#endif + vp->v_flag = 0; + vp->v_lastr = 0; + vp->v_lastw = 0; + vp->v_lasta = 0; + vp->v_cstart = 0; + vp->v_clen = 0; + vp->v_socket = 0; + vp->v_writecount = 0; /* XXX */ + vp->v_maxio = 0; + } else { + simple_unlock(&vnode_free_list_slock); + vp = (struct vnode *) zalloc(vnode_zone); + bzero((char *) vp, sizeof *vp); + simple_lock_init(&vp->v_interlock); + vp->v_dd = vp; + cache_purge(vp); + LIST_INIT(&vp->v_cache_src); + TAILQ_INIT(&vp->v_cache_dst); + numvnodes++; + } + + TAILQ_INIT(&vp->v_cleanblkhd); + TAILQ_INIT(&vp->v_dirtyblkhd); + vp->v_type = VNON; + vp->v_tag = tag; + vp->v_op = vops; + insmntque(vp, mp); + *vpp = vp; + vp->v_usecount = 1; + vp->v_data = 0; + splx(s); + + vfs_object_create(vp, p, p->p_ucred); + return (0); +} + +/* + * Move a vnode from one mount queue to another. + */ +static void +insmntque(vp, mp) + register struct vnode *vp; + register struct mount *mp; +{ + + simple_lock(&mntvnode_slock); + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) + LIST_REMOVE(vp, v_mntvnodes); + /* + * Insert into list of vnodes for the new mount point, if available. 
+ */ + if ((vp->v_mount = mp) == NULL) { + simple_unlock(&mntvnode_slock); + return; + } + LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); + simple_unlock(&mntvnode_slock); +} + +/* + * Update outstanding I/O count and do wakeup if requested. + */ +void +vwakeup(bp) + register struct buf *bp; +{ + register struct vnode *vp; + + bp->b_flags &= ~B_WRITEINPROG; + if ((vp = bp->b_vp)) { + vp->v_numoutput--; + if (vp->v_numoutput < 0) + panic("vwakeup: neg numoutput"); + if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { + vp->v_flag &= ~VBWAIT; + wakeup((caddr_t) &vp->v_numoutput); + } + } +} + +/* + * Flush out and invalidate all buffers associated with a vnode. + * Called with the underlying object locked. + */ +int +vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct proc *p; + int slpflag, slptimeo; +{ + register struct buf *bp; + struct buf *nbp, *blist; + int s, error; + vm_object_t object; + + if (flags & V_SAVE) { + s = splbio(); + while (vp->v_numoutput) { + vp->v_flag |= VBWAIT; + error = tsleep((caddr_t)&vp->v_numoutput, + slpflag | (PRIBIO + 1), "vinvlbuf", slptimeo); + if (error) { + splx(s); + return (error); + } + } + if (!TAILQ_EMPTY(&vp->v_dirtyblkhd)) { + splx(s); + if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) != 0) + return (error); + s = splbio(); + if (vp->v_numoutput > 0 || + !TAILQ_EMPTY(&vp->v_dirtyblkhd)) + panic("vinvalbuf: dirty bufs"); + } + splx(s); + } + s = splbio(); + for (;;) { + blist = TAILQ_FIRST(&vp->v_cleanblkhd); + if (!blist) + blist = TAILQ_FIRST(&vp->v_dirtyblkhd); + if (!blist) + break; + + for (bp = blist; bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + error = tsleep((caddr_t) bp, + slpflag | (PRIBIO + 4), "vinvalbuf", + slptimeo); + if (error) { + splx(s); + return (error); + } + break; + } + /* + * XXX Since there are no node locks for NFS, I + * believe there is a slight chance that a delayed + * write will occur while sleeping just above, so + * check for it. Note that vfs_bio_awrite expects + * buffers to reside on a queue, while VOP_BWRITE and + * brelse do not. + */ + if (((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) && + (flags & V_SAVE)) { + + if (bp->b_vp == vp) { + if (bp->b_flags & B_CLUSTEROK) { + vfs_bio_awrite(bp); + } else { + bremfree(bp); + bp->b_flags |= (B_BUSY | B_ASYNC); + VOP_BWRITE(bp); + } + } else { + bremfree(bp); + bp->b_flags |= B_BUSY; + (void) VOP_BWRITE(bp); + } + break; + } + bremfree(bp); + bp->b_flags |= (B_INVAL | B_NOCACHE | B_RELBUF | B_BUSY); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + } + } + + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); + } + + splx(s); + + /* + * Destroy the copy in the VM cache, too. + */ + simple_lock(&vp->v_interlock); + object = vp->v_object; + if (object != NULL) { + vm_object_page_remove(object, 0, 0, + (flags & V_SAVE) ? TRUE : FALSE); + } + simple_unlock(&vp->v_interlock); + + if (!TAILQ_EMPTY(&vp->v_dirtyblkhd) || !TAILQ_EMPTY(&vp->v_cleanblkhd)) + panic("vinvalbuf: flush failed"); + return (0); +} + +/* + * Truncate a file's buffer and pages to a specified length. This + * is in lieu of the old vinvalbuf mechanism, which performed unneeded + * sync activity. 
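A hedged sketch of a typical consumer: a filesystem's truncate path choosing between vinvalbuf() above and vtruncbuf() below (the samplefs_trim_buffers() wrapper is hypothetical; the two signatures are taken from this file):

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>

static int
samplefs_trim_buffers(vp, length, cred, p, blksize)
	struct vnode *vp;
	off_t length;
	struct ucred *cred;
	struct proc *p;
	int blksize;
{

	if (length == 0)
		/* Dropping the whole file: flush dirty data, then invalidate. */
		return (vinvalbuf(vp, V_SAVE, cred, p, 0, 0));
	/* Otherwise throw away only buffers beyond the new end of file. */
	return (vtruncbuf(vp, cred, p, length, blksize));
}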
+ */ +int +vtruncbuf(vp, cred, p, length, blksize) + register struct vnode *vp; + struct ucred *cred; + struct proc *p; + off_t length; + int blksize; +{ + register struct buf *bp; + struct buf *nbp; + int s, anyfreed; + int trunclbn; + + /* + * Round up to the *next* lbn. + */ + trunclbn = (length + blksize - 1) / blksize; + + s = splbio(); +restart: + anyfreed = 1; + for (;anyfreed;) { + anyfreed = 0; + for (bp = TAILQ_FIRST(&vp->v_cleanblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_lblkno >= trunclbn) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep(bp, PRIBIO + 4, "vtrb1", 0); + goto restart; + } else { + bremfree(bp); + bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = 1; + } + if (nbp && (((nbp->b_xflags & B_VNCLEAN) == 0)|| + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI))) { + goto restart; + } + } + } + + for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if (bp->b_lblkno >= trunclbn) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep(bp, PRIBIO + 4, "vtrb2", 0); + goto restart; + } else { + bremfree(bp); + bp->b_flags |= (B_BUSY | B_INVAL | B_RELBUF); + bp->b_flags &= ~B_ASYNC; + brelse(bp); + anyfreed = 1; + } + if (nbp && (((nbp->b_xflags & B_VNDIRTY) == 0)|| + (nbp->b_vp != vp) || + (nbp->b_flags & B_DELWRI) == 0)) { + goto restart; + } + } + } + } + + if (length > 0) { +restartsync: + for (bp = TAILQ_FIRST(&vp->v_dirtyblkhd); bp; bp = nbp) { + nbp = TAILQ_NEXT(bp, b_vnbufs); + if ((bp->b_flags & B_DELWRI) && (bp->b_lblkno < 0)) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep(bp, PRIBIO, "vtrb3", 0); + } else { + bremfree(bp); + bp->b_flags |= B_BUSY; + if (bp->b_vp == vp) { + bp->b_flags |= B_ASYNC; + } else { + bp->b_flags &= ~B_ASYNC; + } + VOP_BWRITE(bp); + } + goto restartsync; + } + + } + } + + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vbtrunc", 0); + } + + splx(s); + + vnode_pager_setsize(vp, length); + + return (0); +} + +/* + * Associate a buffer with a vnode. + */ +void +bgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + int s; + + KASSERT(bp->b_vp == NULL, ("bgetvp: not free")); + + vhold(vp); + bp->b_vp = vp; + if (vp->v_type == VBLK || vp->v_type == VCHR) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; + /* + * Insert onto list for new vnode. + */ + s = splbio(); + bp->b_xflags |= B_VNCLEAN; + bp->b_xflags &= ~B_VNDIRTY; + TAILQ_INSERT_TAIL(&vp->v_cleanblkhd, bp, b_vnbufs); + splx(s); +} + +/* + * Disassociate a buffer from a vnode. + */ +void +brelvp(bp) + register struct buf *bp; +{ + struct vnode *vp; + struct buflists *listheadp; + int s; + + KASSERT(bp->b_vp != NULL, ("brelvp: NULL")); + + /* + * Delete from old vnode list, if on one. + */ + vp = bp->b_vp; + s = splbio(); + if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { + if (bp->b_xflags & B_VNDIRTY) + listheadp = &vp->v_dirtyblkhd; + else + listheadp = &vp->v_cleanblkhd; + TAILQ_REMOVE(listheadp, bp, b_vnbufs); + bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); + } + if ((vp->v_flag & VONWORKLST) && TAILQ_EMPTY(&vp->v_dirtyblkhd)) { + vp->v_flag &= ~VONWORKLST; + LIST_REMOVE(vp, v_synclist); + } + splx(s); + bp->b_vp = (struct vnode *) 0; + vdrop(vp); +} + +/* + * The workitem queue. 
+ *
+ * It is useful to delay writes of file data and filesystem metadata
+ * for tens of seconds so that quickly created and deleted files need
+ * not waste disk bandwidth being created and removed. To realize this,
+ * we append vnodes to a "workitem" queue. When running with a soft
+ * updates implementation, most pending metadata dependencies should
+ * not wait for more than a few seconds. Thus, filesystems mounted on
+ * block devices are delayed only about half the time that file data is
+ * delayed. Similarly, directory updates are more critical, so are only
+ * delayed about a third the time that file data is delayed. Thus, there
+ * are SYNCER_MAXDELAY queues that are processed round-robin at a rate
+ * of one each second (driven off the filesystem syncer process). The
+ * syncer_delayno variable indicates the next queue that is to be
+ * processed. Items that need to be processed soon are placed in this
+ * queue:
+ *
+ *	syncer_workitem_pending[syncer_delayno]
+ *
+ * A delay of fifteen seconds is done by placing the request fifteen
+ * entries later in the queue:
+ *
+ *	syncer_workitem_pending[(syncer_delayno + 15) & syncer_mask]
+ *
+ */
+
+/*
+ * Add an item to the syncer work queue.
+ */
+void
+vn_syncer_add_to_worklist(vp, delay)
+	struct vnode *vp;
+	int delay;
+{
+	int s, slot;
+
+	s = splbio();
+
+	if (vp->v_flag & VONWORKLST) {
+		LIST_REMOVE(vp, v_synclist);
+	}
+
+	if (delay > syncer_maxdelay - 2)
+		delay = syncer_maxdelay - 2;
+	slot = (syncer_delayno + delay) & syncer_mask;
+
+	LIST_INSERT_HEAD(&syncer_workitem_pending[slot], vp, v_synclist);
+	vp->v_flag |= VONWORKLST;
+	splx(s);
+}
+
+static void sched_sync __P((void));
+static struct proc *updateproc;
+static struct kproc_desc up_kp = {
+	"syncer",
+	sched_sync,
+	&updateproc
+};
+SYSINIT_KT(syncer, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
+
+/*
+ * System filesystem synchronizer daemon.
+ */
+void
+sched_sync(void)
+{
+	struct synclist *slp;
+	struct vnode *vp;
+	long starttime;
+	int s;
+	struct proc *p = updateproc;
+
+	for (;;) {
+		starttime = time_second;
+
+		/*
+		 * Push files whose dirty time has expired.
+		 */
+		s = splbio();
+		slp = &syncer_workitem_pending[syncer_delayno];
+		syncer_delayno += 1;
+		if (syncer_delayno == syncer_maxdelay)
+			syncer_delayno = 0;
+		splx(s);
+
+		while ((vp = LIST_FIRST(slp)) != NULL) {
+			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+			(void) VOP_FSYNC(vp, p->p_ucred, MNT_LAZY, p);
+			VOP_UNLOCK(vp, 0, p);
+			if (LIST_FIRST(slp) == vp) {
+				if (TAILQ_EMPTY(&vp->v_dirtyblkhd) &&
+				    vp->v_type != VBLK)
+					panic("sched_sync: fsync failed");
+				/*
+				 * Move ourselves to the back of the sync list.
+				 */
+				LIST_REMOVE(vp, v_synclist);
+				vn_syncer_add_to_worklist(vp, syncdelay);
+			}
+		}
+
+		/*
+		 * Do soft update processing.
+		 */
+		if (bioops.io_sync)
+			(*bioops.io_sync)(NULL);
+
+		/*
+		 * The variable rushjob allows the kernel to speed up the
+		 * processing of the filesystem syncer process. A rushjob
+		 * value of N tells the filesystem syncer to process the next
+		 * N seconds worth of work on its queue ASAP. Currently rushjob
+		 * is used by the soft update code to speed up the filesystem
+		 * syncer process when the incore state is getting so far
+		 * ahead of the disk that the kernel memory pool is being
+		 * threatened with exhaustion.
+		 */
+		if (rushjob > 0) {
+			rushjob -= 1;
+			continue;
+		}
+		/*
+		 * If it has taken us less than a second to process the
+		 * current work, then wait. Otherwise start right over
+		 * again.
We can still lose time if any single round + * takes more than two seconds, but it does not really + * matter as we are just trying to generally pace the + * filesystem activity. + */ + if (time_second == starttime) + tsleep(&lbolt, PPAUSE, "syncer", 0); + } +} + +/* + * Associate a p-buffer with a vnode. + * + * Also sets B_PAGING flag to indicate that vnode is not fully associated + * with the buffer. i.e. the bp has not been linked into the vnode or + * ref-counted. + */ +void +pbgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ + + KASSERT(bp->b_vp == NULL, ("pbgetvp: not free")); + + bp->b_vp = vp; + bp->b_flags |= B_PAGING; + if (vp->v_type == VBLK || vp->v_type == VCHR) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; +} + +/* + * Disassociate a p-buffer from a vnode. + */ +void +pbrelvp(bp) + register struct buf *bp; +{ + + KASSERT(bp->b_vp != NULL, ("pbrelvp: NULL")); + +#if !defined(MAX_PERF) + /* XXX REMOVE ME */ + if (bp->b_vnbufs.tqe_next != NULL) { + panic( + "relpbuf(): b_vp was probably reassignbuf()d %p %x", + bp, + (int)bp->b_flags + ); + } +#endif + bp->b_vp = (struct vnode *) 0; + bp->b_flags &= ~B_PAGING; +} + +void +pbreassignbuf(bp, newvp) + struct buf *bp; + struct vnode *newvp; +{ +#if !defined(MAX_PERF) + if ((bp->b_flags & B_PAGING) == 0) { + panic( + "pbreassignbuf() on non phys bp %p", + bp + ); + } +#endif + bp->b_vp = newvp; +} + +/* + * Reassign a buffer from one vnode to another. + * Used to assign file specific control information + * (indirect blocks) to the vnode to which they belong. + */ +void +reassignbuf(bp, newvp) + register struct buf *bp; + register struct vnode *newvp; +{ + struct buflists *listheadp; + struct vnode *oldvp; + int delay; + int s; + + if (newvp == NULL) { + printf("reassignbuf: NULL"); + return; + } + +#if !defined(MAX_PERF) + /* + * B_PAGING flagged buffers cannot be reassigned because their vp + * is not fully linked in. + */ + if (bp->b_flags & B_PAGING) + panic("cannot reassign paging buffer"); +#endif + + s = splbio(); + /* + * Delete from old vnode list, if on one. + */ + if (bp->b_xflags & (B_VNDIRTY|B_VNCLEAN)) { + oldvp = bp->b_vp; + if (bp->b_xflags & B_VNDIRTY) + listheadp = &oldvp->v_dirtyblkhd; + else + listheadp = &oldvp->v_cleanblkhd; + TAILQ_REMOVE(listheadp, bp, b_vnbufs); + bp->b_xflags &= ~(B_VNDIRTY|B_VNCLEAN); + vdrop(oldvp); + } + /* + * If dirty, put on list of dirty buffers; otherwise insert onto list + * of clean buffers. 
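+	 *
+	 * The dirty list is kept roughly sorted by logical block number so
+	 * that writeback tends to reach the disk sequentially; an insert
+	 * walks forward while the existing entries have smaller b_lblkno
+	 * and then uses TAILQ_INSERT_AFTER, so (illustrative) a buffer for
+	 * block 7 lands between existing buffers for blocks 5 and 9.
+	 * Buffers with negative block numbers (indirect blocks) simply go
+	 * to the tail.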
+ */ + if (bp->b_flags & B_DELWRI) { + struct buf *tbp; + + listheadp = &newvp->v_dirtyblkhd; + if ((newvp->v_flag & VONWORKLST) == 0) { + switch (newvp->v_type) { + case VDIR: + delay = syncdelay / 3; + break; + case VBLK: + if (newvp->v_specmountpoint != NULL) { + delay = syncdelay / 2; + break; + } + /* fall through */ + default: + delay = syncdelay; + } + vn_syncer_add_to_worklist(newvp, delay); + } + bp->b_xflags |= B_VNDIRTY; + tbp = TAILQ_FIRST(listheadp); + if (tbp == NULL || + (bp->b_lblkno >= 0 && tbp->b_lblkno > bp->b_lblkno)) { + TAILQ_INSERT_HEAD(listheadp, bp, b_vnbufs); + } else { + if (bp->b_lblkno >= 0) { + struct buf *ttbp; + while ((ttbp = TAILQ_NEXT(tbp, b_vnbufs)) && + (ttbp->b_lblkno < bp->b_lblkno)) { + tbp = ttbp; + } + TAILQ_INSERT_AFTER(listheadp, tbp, bp, b_vnbufs); + } else { + TAILQ_INSERT_TAIL(listheadp, bp, b_vnbufs); + } + } + } else { + bp->b_xflags |= B_VNCLEAN; + TAILQ_INSERT_TAIL(&newvp->v_cleanblkhd, bp, b_vnbufs); + if ((newvp->v_flag & VONWORKLST) && + TAILQ_EMPTY(&newvp->v_dirtyblkhd)) { + newvp->v_flag &= ~VONWORKLST; + LIST_REMOVE(newvp, v_synclist); + } + } + bp->b_vp = newvp; + vhold(bp->b_vp); + splx(s); +} + +/* + * Create a vnode for a block device. + * Used for mounting the root file system. + */ +int +bdevvp(dev, vpp) + dev_t dev; + struct vnode **vpp; +{ + register struct vnode *vp; + struct vnode *nvp; + int error; + + /* XXX 255 is for mfs. */ + if (dev == NODEV || (major(dev) != 255 && (major(dev) >= nblkdev || + bdevsw[major(dev)] == NULL))) { + *vpp = NULLVP; + return (ENXIO); + } + error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); + if (error) { + *vpp = NULLVP; + return (error); + } + vp = nvp; + vp->v_type = VBLK; + if ((nvp = checkalias(vp, dev, (struct mount *)0)) != NULL) { + vput(vp); + vp = nvp; + } + *vpp = vp; + return (0); +} + +/* + * Check to see if the new vnode represents a special device + * for which we already have a vnode (either because of + * bdevvp() or because of a different vnode representing + * the same block device). If such an alias exists, deallocate + * the existing contents and return the aliased vnode. The + * caller is responsible for filling it with its new contents. + */ +struct vnode * +checkalias(nvp, nvp_rdev, mp) + register struct vnode *nvp; + dev_t nvp_rdev; + struct mount *mp; +{ + struct proc *p = curproc; /* XXX */ + struct vnode *vp; + struct vnode **vpp; + + if (nvp->v_type != VBLK && nvp->v_type != VCHR) + return (NULLVP); + + vpp = &speclisth[SPECHASH(nvp_rdev)]; +loop: + simple_lock(&spechash_slock); + for (vp = *vpp; vp; vp = vp->v_specnext) { + if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type) + continue; + /* + * Alias, but not in use, so flush it out. + * Only alias active device nodes. + * Not sure why we don't re-use this like we do below. + */ + simple_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + simple_unlock(&spechash_slock); + vgonel(vp, p); + goto loop; + } + if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) { + /* + * It dissappeared, and we may have slept. + * Restart from the beginning + */ + simple_unlock(&spechash_slock); + goto loop; + } + break; + } + /* + * It would be a lot clearer what is going on here if + * this had been expressed as: + * if ( vp && (vp->v_tag == VT_NULL)) + * and the clauses had been swapped. + */ + if (vp == NULL || vp->v_tag != VT_NON) { + /* + * Put the new vnode into the hash chain. + * and if there was an alias, connect them. 
+ */ + MALLOC(nvp->v_specinfo, struct specinfo *, + sizeof(struct specinfo), M_VNODE, M_WAITOK); + nvp->v_rdev = nvp_rdev; + nvp->v_hashchain = vpp; + nvp->v_specnext = *vpp; + nvp->v_specmountpoint = NULL; + simple_unlock(&spechash_slock); + *vpp = nvp; + if (vp != NULLVP) { + nvp->v_flag |= VALIASED; + vp->v_flag |= VALIASED; + vput(vp); + } + return (NULLVP); + } + /* + * if ( vp && (vp->v_tag == VT_NULL)) + * We have a vnode alias, but it is a trashed. + * Make it look like it's newley allocated. (by getnewvnode()) + * The caller should use this instead. + */ + simple_unlock(&spechash_slock); + VOP_UNLOCK(vp, 0, p); + simple_lock(&vp->v_interlock); + vclean(vp, 0, p); + vp->v_op = nvp->v_op; + vp->v_tag = nvp->v_tag; + nvp->v_type = VNON; + insmntque(vp, mp); + return (vp); +} + +/* + * Grab a particular vnode from the free list, increment its + * reference count and lock it. The vnode lock bit is set the + * vnode is being eliminated in vgone. The process is awakened + * when the transition is completed, and an error returned to + * indicate that the vnode is no longer usable (possibly having + * been changed to a new file system type). + */ +int +vget(vp, flags, p) + register struct vnode *vp; + int flags; + struct proc *p; +{ + int error; + + /* + * If the vnode is in the process of being cleaned out for + * another use, we wait for the cleaning to finish and then + * return failure. Cleaning is determined by checking that + * the VXLOCK flag is set. + */ + if ((flags & LK_INTERLOCK) == 0) { + simple_lock(&vp->v_interlock); + } + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vget", 0); + return (ENOENT); + } + + vp->v_usecount++; + + if (VSHOULDBUSY(vp)) + vbusy(vp); + if (flags & LK_TYPE_MASK) { + if ((error = vn_lock(vp, flags | LK_INTERLOCK, p)) != 0) { + /* + * must expand vrele here because we do not want + * to call VOP_INACTIVE if the reference count + * drops back to zero since it was never really + * active. We must remove it from the free list + * before sleeping so that multiple processes do + * not try to recycle it. + */ + simple_lock(&vp->v_interlock); + vp->v_usecount--; + if (VSHOULDFREE(vp)) + vfree(vp); + simple_unlock(&vp->v_interlock); + } + return (error); + } + simple_unlock(&vp->v_interlock); + return (0); +} + +void +vref(struct vnode *vp) +{ + simple_lock(&vp->v_interlock); + vp->v_usecount++; + simple_unlock(&vp->v_interlock); +} + +/* + * Vnode put/release. + * If count drops to zero, call inactive routine and return to freelist. + */ +void +vrele(vp) + struct vnode *vp; +{ + struct proc *p = curproc; /* XXX */ + + KASSERT(vp != NULL, ("vrele: null vp")); + + simple_lock(&vp->v_interlock); + + if (vp->v_usecount > 1) { + + vp->v_usecount--; + simple_unlock(&vp->v_interlock); + + return; + } + + if (vp->v_usecount == 1) { + + vp->v_usecount--; + if (VSHOULDFREE(vp)) + vfree(vp); + /* + * If we are doing a vput, the node is already locked, and we must + * call VOP_INACTIVE with the node locked. So, in the case of + * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. 
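+	 *
+	 * Illustrative caller patterns (not part of this file): code that
+	 * holds only a reference drops it with
+	 *
+	 *	vrele(vp);
+	 *
+	 * while code that still holds the vnode lock uses
+	 *
+	 *	vput(vp);
+	 *
+	 * and either path ends in VOP_INACTIVE() once the use count
+	 * reaches zero.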
+ */ + if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { + VOP_INACTIVE(vp, p); + } + + } else { +#ifdef DIAGNOSTIC + vprint("vrele: negative ref count", vp); + simple_unlock(&vp->v_interlock); +#endif + panic("vrele: negative ref cnt"); + } +} + +void +vput(vp) + struct vnode *vp; +{ + struct proc *p = curproc; /* XXX */ + + KASSERT(vp != NULL, ("vput: null vp")); + + simple_lock(&vp->v_interlock); + + if (vp->v_usecount > 1) { + + vp->v_usecount--; + VOP_UNLOCK(vp, LK_INTERLOCK, p); + return; + + } + + if (vp->v_usecount == 1) { + + vp->v_usecount--; + if (VSHOULDFREE(vp)) + vfree(vp); + /* + * If we are doing a vput, the node is already locked, and we must + * call VOP_INACTIVE with the node locked. So, in the case of + * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. + */ + simple_unlock(&vp->v_interlock); + VOP_INACTIVE(vp, p); + + } else { +#ifdef DIAGNOSTIC + vprint("vput: negative ref count", vp); +#endif + panic("vput: negative ref cnt"); + } +} + +/* + * Somebody doesn't want the vnode recycled. + */ +void +vhold(vp) + register struct vnode *vp; +{ + int s; + + s = splbio(); + vp->v_holdcnt++; + if (VSHOULDBUSY(vp)) + vbusy(vp); + splx(s); +} + +/* + * One less who cares about this vnode. + */ +void +vdrop(vp) + register struct vnode *vp; +{ + int s; + + s = splbio(); + if (vp->v_holdcnt <= 0) + panic("vdrop: holdcnt"); + vp->v_holdcnt--; + if (VSHOULDFREE(vp)) + vfree(vp); + splx(s); +} + +/* + * Remove any vnodes in the vnode table belonging to mount point mp. + * + * If MNT_NOFORCE is specified, there should not be any active ones, + * return error if any are found (nb: this is a user error, not a + * system error). If MNT_FORCE is specified, detach any active vnodes + * that are found. + */ +#ifdef DIAGNOSTIC +static int busyprt = 0; /* print out busy vnodes */ +SYSCTL_INT(_debug, OID_AUTO, busyprt, CTLFLAG_RW, &busyprt, 0, ""); +#endif + +int +vflush(mp, skipvp, flags) + struct mount *mp; + struct vnode *skipvp; + int flags; +{ + struct proc *p = curproc; /* XXX */ + struct vnode *vp, *nvp; + int busy = 0; + + simple_lock(&mntvnode_slock); +loop: + for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { + /* + * Make sure this vnode wasn't reclaimed in getnewvnode(). + * Start over if it has (it won't be on the list anymore). + */ + if (vp->v_mount != mp) + goto loop; + nvp = vp->v_mntvnodes.le_next; + /* + * Skip over a selected vnode. + */ + if (vp == skipvp) + continue; + + simple_lock(&vp->v_interlock); + /* + * Skip over a vnodes marked VSYSTEM. + */ + if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) { + simple_unlock(&vp->v_interlock); + continue; + } + /* + * If WRITECLOSE is set, only flush out regular file vnodes + * open for writing. + */ + if ((flags & WRITECLOSE) && + (vp->v_writecount == 0 || vp->v_type != VREG)) { + simple_unlock(&vp->v_interlock); + continue; + } + + /* + * With v_usecount == 0, all we need to do is clear out the + * vnode data structures and we are done. + */ + if (vp->v_usecount == 0) { + simple_unlock(&mntvnode_slock); + vgonel(vp, p); + simple_lock(&mntvnode_slock); + continue; + } + + /* + * If FORCECLOSE is set, forcibly close the vnode. For block + * or character devices, revert to an anonymous device. For + * all other files, just kill them. 
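+		 *
+		 * Illustrative use (not part of this file): a filesystem's
+		 * unmount code would typically do something like
+		 *
+		 *	error = vflush(mp, NULLVP,
+		 *	    (mntflags & MNT_FORCE) ? FORCECLOSE : 0);
+		 *
+		 * where "mntflags" stands for the caller's unmount flags,
+		 * so a forced unmount reclaims even vnodes that are still
+		 * busy.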
+ */ + if (flags & FORCECLOSE) { + simple_unlock(&mntvnode_slock); + if (vp->v_type != VBLK && vp->v_type != VCHR) { + vgonel(vp, p); + } else { + vclean(vp, 0, p); + vp->v_op = spec_vnodeop_p; + insmntque(vp, (struct mount *) 0); + } + simple_lock(&mntvnode_slock); + continue; + } +#ifdef DIAGNOSTIC + if (busyprt) + vprint("vflush: busy vnode", vp); +#endif + simple_unlock(&vp->v_interlock); + busy++; + } + simple_unlock(&mntvnode_slock); + if (busy) + return (EBUSY); + return (0); +} + +/* + * Disassociate the underlying file system from a vnode. + */ +static void +vclean(vp, flags, p) + struct vnode *vp; + int flags; + struct proc *p; +{ + int active; + vm_object_t obj; + + /* + * Check to see if the vnode is in use. If so we have to reference it + * before we clean it out so that its count cannot fall to zero and + * generate a race against ourselves to recycle it. + */ + if ((active = vp->v_usecount)) + vp->v_usecount++; + + /* + * Prevent the vnode from being recycled or brought into use while we + * clean it out. + */ + if (vp->v_flag & VXLOCK) + panic("vclean: deadlock"); + vp->v_flag |= VXLOCK; + /* + * Even if the count is zero, the VOP_INACTIVE routine may still + * have the object locked while it cleans it out. The VOP_LOCK + * ensures that the VOP_INACTIVE routine is done with its work. + * For active vnodes, it ensures that no other activity can + * occur while the underlying object is being cleaned out. + */ + VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p); + + /* + * Clean out any buffers associated with the vnode. + */ + vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0); + if (obj = vp->v_object) { + if (obj->ref_count == 0) { + /* + * This is a normal way of shutting down the object/vnode + * association. + */ + vm_object_terminate(obj); + } else { + /* + * Woe to the process that tries to page now :-). + */ + vm_pager_deallocate(obj); + } + } + + /* + * If purging an active vnode, it must be closed and + * deactivated before being reclaimed. Note that the + * VOP_INACTIVE will unlock the vnode. + */ + if (active) { + if (flags & DOCLOSE) + VOP_CLOSE(vp, FNONBLOCK, NOCRED, p); + VOP_INACTIVE(vp, p); + } else { + /* + * Any other processes trying to obtain this lock must first + * wait for VXLOCK to clear, then call the new lock operation. + */ + VOP_UNLOCK(vp, 0, p); + } + /* + * Reclaim the vnode. + */ + if (VOP_RECLAIM(vp, p)) + panic("vclean: cannot reclaim"); + + if (active) + vrele(vp); + + cache_purge(vp); + if (vp->v_vnlock) { +#if 0 /* This is the only place we have LK_DRAINED in the entire kernel ??? */ +#ifdef DIAGNOSTIC + if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0) + vprint("vclean: lock not drained", vp); +#endif +#endif + FREE(vp->v_vnlock, M_VNODE); + vp->v_vnlock = NULL; + } + + if (VSHOULDFREE(vp)) + vfree(vp); + + /* + * Done with purge, notify sleepers of the grim news. + */ + vp->v_op = dead_vnodeop_p; + vn_pollgone(vp); + vp->v_tag = VT_NON; + vp->v_flag &= ~VXLOCK; + if (vp->v_flag & VXWANT) { + vp->v_flag &= ~VXWANT; + wakeup((caddr_t) vp); + } +} + +/* + * Eliminate all activity associated with the requested vnode + * and with all vnodes aliased to the requested vnode. 
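+ *
+ * This is the back end of revoke(2); a caller goes through the vnode
+ * interface, e.g. (illustrative)
+ *
+ *	VOP_REVOKE(vp, REVOKEALL);
+ *
+ * and ends up here for vnodes whose filesystems use this default
+ * implementation.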
+ */ +int +vop_revoke(ap) + struct vop_revoke_args /* { + struct vnode *a_vp; + int a_flags; + } */ *ap; +{ + struct vnode *vp, *vq; + struct proc *p = curproc; /* XXX */ + + KASSERT((ap->a_flags & REVOKEALL) != 0, ("vop_revoke")); + + vp = ap->a_vp; + simple_lock(&vp->v_interlock); + + if (vp->v_flag & VALIASED) { + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0); + return (0); + } + /* + * Ensure that vp will not be vgone'd while we + * are eliminating its aliases. + */ + vp->v_flag |= VXLOCK; + simple_unlock(&vp->v_interlock); + while (vp->v_flag & VALIASED) { + simple_lock(&spechash_slock); + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type || vp == vq) + continue; + simple_unlock(&spechash_slock); + vgone(vq); + break; + } + if (vq == NULLVP) { + simple_unlock(&spechash_slock); + } + } + /* + * Remove the lock so that vgone below will + * really eliminate the vnode after which time + * vgone will awaken any sleepers. + */ + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VXLOCK; + if (vp->v_flag & VXWANT) { + vp->v_flag &= ~VXWANT; + wakeup(vp); + } + } + vgonel(vp, p); + return (0); +} + +/* + * Recycle an unused vnode to the front of the free list. + * Release the passed interlock if the vnode will be recycled. + */ +int +vrecycle(vp, inter_lkp, p) + struct vnode *vp; + struct simplelock *inter_lkp; + struct proc *p; +{ + + simple_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + if (inter_lkp) { + simple_unlock(inter_lkp); + } + vgonel(vp, p); + return (1); + } + simple_unlock(&vp->v_interlock); + return (0); +} + +/* + * Eliminate all activity associated with a vnode + * in preparation for reuse. + */ +void +vgone(vp) + register struct vnode *vp; +{ + struct proc *p = curproc; /* XXX */ + + simple_lock(&vp->v_interlock); + vgonel(vp, p); +} + +/* + * vgone, with the vp interlock held. + */ +static void +vgonel(vp, p) + struct vnode *vp; + struct proc *p; +{ + int s; + struct vnode *vq; + struct vnode *vx; + + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vgone", 0); + return; + } + + /* + * Clean out the filesystem specific data. + */ + vclean(vp, DOCLOSE, p); + simple_lock(&vp->v_interlock); + + /* + * Delete from old mount point vnode list, if on one. + */ + if (vp->v_mount != NULL) + insmntque(vp, (struct mount *)0); + /* + * If special device, remove it from special device alias list + * if it is on one. 
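+	 *
+	 * The alias chain is singly linked through v_specnext, so the code
+	 * below must walk from *v_hashchain to find the predecessor of vp
+	 * before it can splice vp out; removing C from a chain A -> B -> C
+	 * (illustrative) means stopping at B, where vq->v_specnext == vp,
+	 * and then setting vq->v_specnext = vp->v_specnext.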
+ */ + if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) { + simple_lock(&spechash_slock); + if (*vp->v_hashchain == vp) { + *vp->v_hashchain = vp->v_specnext; + } else { + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_specnext != vp) + continue; + vq->v_specnext = vp->v_specnext; + break; + } + if (vq == NULL) + panic("missing bdev"); + } + if (vp->v_flag & VALIASED) { + vx = NULL; + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vx) + break; + vx = vq; + } + if (vx == NULL) + panic("missing alias"); + if (vq == NULL) + vx->v_flag &= ~VALIASED; + vp->v_flag &= ~VALIASED; + } + simple_unlock(&spechash_slock); + FREE(vp->v_specinfo, M_VNODE); + vp->v_specinfo = NULL; + } + + /* + * If it is on the freelist and not already at the head, + * move it to the head of the list. The test of the back + * pointer and the reference count of zero is because + * it will be removed from the free list by getnewvnode, + * but will not have its reference count incremented until + * after calling vgone. If the reference count were + * incremented first, vgone would (incorrectly) try to + * close the previous instance of the underlying object. + */ + if (vp->v_usecount == 0 && !(vp->v_flag & VDOOMED)) { + s = splbio(); + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VFREE) { + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + } else if (vp->v_flag & VTBFREE) { + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + vp->v_flag &= ~VTBFREE; + freevnodes++; + } else + freevnodes++; + vp->v_flag |= VFREE; + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + simple_unlock(&vnode_free_list_slock); + splx(s); + } + + vp->v_type = VBAD; + simple_unlock(&vp->v_interlock); +} + +/* + * Lookup a vnode by device number. + */ +int +vfinddev(dev, type, vpp) + dev_t dev; + enum vtype type; + struct vnode **vpp; +{ + register struct vnode *vp; + int rc = 0; + + simple_lock(&spechash_slock); + for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) { + if (dev != vp->v_rdev || type != vp->v_type) + continue; + *vpp = vp; + rc = 1; + break; + } + simple_unlock(&spechash_slock); + return (rc); +} + +/* + * Calculate the total number of references to a special device. + */ +int +vcount(vp) + register struct vnode *vp; +{ + struct vnode *vq, *vnext; + int count; + +loop: + if ((vp->v_flag & VALIASED) == 0) + return (vp->v_usecount); + simple_lock(&spechash_slock); + for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) { + vnext = vq->v_specnext; + if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type) + continue; + /* + * Alias, but not in use, so flush it out. + */ + if (vq->v_usecount == 0 && vq != vp) { + simple_unlock(&spechash_slock); + vgone(vq); + goto loop; + } + count += vq->v_usecount; + } + simple_unlock(&spechash_slock); + return (count); +} +/* + * Print out a description of a vnode. 
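+ *
+ * The result is a one-line summary plus an optional filesystem-specific
+ * dump from VOP_PRINT(); with made-up values it looks roughly like:
+ *
+ *	0xf0153400: type VREG, usecount 1, writecount 0, refcount 2, flags (VOBJBUF)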
+ */ +static char *typename[] = +{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; + +void +vprint(label, vp) + char *label; + register struct vnode *vp; +{ + char buf[96]; + + if (label != NULL) + printf("%s: %p: ", label, (void *)vp); + else + printf("%p: ", (void *)vp); + printf("type %s, usecount %d, writecount %d, refcount %d,", + typename[vp->v_type], vp->v_usecount, vp->v_writecount, + vp->v_holdcnt); + buf[0] = '\0'; + if (vp->v_flag & VROOT) + strcat(buf, "|VROOT"); + if (vp->v_flag & VTEXT) + strcat(buf, "|VTEXT"); + if (vp->v_flag & VSYSTEM) + strcat(buf, "|VSYSTEM"); + if (vp->v_flag & VXLOCK) + strcat(buf, "|VXLOCK"); + if (vp->v_flag & VXWANT) + strcat(buf, "|VXWANT"); + if (vp->v_flag & VBWAIT) + strcat(buf, "|VBWAIT"); + if (vp->v_flag & VALIASED) + strcat(buf, "|VALIASED"); + if (vp->v_flag & VDOOMED) + strcat(buf, "|VDOOMED"); + if (vp->v_flag & VFREE) + strcat(buf, "|VFREE"); + if (vp->v_flag & VOBJBUF) + strcat(buf, "|VOBJBUF"); + if (buf[0] != '\0') + printf(" flags (%s)", &buf[1]); + if (vp->v_data == NULL) { + printf("\n"); + } else { + printf("\n\t"); + VOP_PRINT(vp); + } +} + +#ifdef DDB +#include <ddb/ddb.h> +/* + * List all of the locked vnodes in the system. + * Called when debugging the kernel. + */ +DB_SHOW_COMMAND(lockedvnodes, lockedvnodes) +{ + struct proc *p = curproc; /* XXX */ + struct mount *mp, *nmp; + struct vnode *vp; + + printf("Locked vnodes\n"); + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = vp->v_mntvnodes.le_next) { + if (VOP_ISLOCKED(vp)) + vprint((char *)0, vp); + } + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); +} +#endif + +/* + * Top level filesystem related information gathering. + */ +static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); + +static int +vfs_sysctl SYSCTL_HANDLER_ARGS +{ + int *name = (int *)arg1 - 1; /* XXX */ + u_int namelen = arg2 + 1; /* XXX */ + struct vfsconf *vfsp; + +#if 1 || defined(COMPAT_PRELITE2) + /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ + if (namelen == 1) + return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); +#endif + +#ifdef notyet + /* all sysctl names at this level are at least name and field */ + if (namelen < 2) + return (ENOTDIR); /* overloaded */ + if (name[0] != VFS_GENERIC) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[0]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, + oldp, oldlenp, newp, newlen, p)); + } +#endif + switch (name[1]) { + case VFS_MAXTYPENUM: + if (namelen != 2) + return (ENOTDIR); + return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); + case VFS_CONF: + if (namelen != 3) + return (ENOTDIR); /* overloaded */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[2]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); + } + return (EOPNOTSUPP); +} + +SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, + "Generic filesystem"); + +#if 1 || defined(COMPAT_PRELITE2) + +static int +sysctl_ovfs_conf SYSCTL_HANDLER_ARGS +{ + int error; + struct vfsconf *vfsp; + struct ovfsconf ovfs; + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ + strcpy(ovfs.vfc_name, vfsp->vfc_name); + ovfs.vfc_index = vfsp->vfc_typenum; + ovfs.vfc_refcount = vfsp->vfc_refcount; + ovfs.vfc_flags = vfsp->vfc_flags; + error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); + if (error) + return error; + } + return 0; +} + +#endif /* 1 || COMPAT_PRELITE2 */ + +#if 0 +#define KINFO_VNODESLOP 10 +/* + * Dump vnode list (via sysctl). + * Copyout address of vnode followed by vnode. + */ +/* ARGSUSED */ +static int +sysctl_vnode SYSCTL_HANDLER_ARGS +{ + struct proc *p = curproc; /* XXX */ + struct mount *mp, *nmp; + struct vnode *nvp, *vp; + int error; + +#define VPTRSZ sizeof (struct vnode *) +#define VNODESZ sizeof (struct vnode) + + req->lock = 0; + if (!req->oldptr) /* Make an estimate */ + return (SYSCTL_OUT(req, 0, + (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } +again: + simple_lock(&mntvnode_slock); + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = nvp) { + /* + * Check that the vp is still associated with + * this filesystem. RACE: could have been + * recycled onto the same filesystem. + */ + if (vp->v_mount != mp) { + simple_unlock(&mntvnode_slock); + goto again; + } + nvp = vp->v_mntvnodes.le_next; + simple_unlock(&mntvnode_slock); + if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || + (error = SYSCTL_OUT(req, vp, VNODESZ))) + return (error); + simple_lock(&mntvnode_slock); + } + simple_unlock(&mntvnode_slock); + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + + return (0); +} +#endif + +/* + * XXX + * Exporting the vnode list on large systems causes them to crash. + * Exporting the vnode list on medium systems causes sysctl to coredump. + */ +#if 0 +SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_vnode, "S,vnode", ""); +#endif + +/* + * Check to see if a filesystem is mounted on a block device. 
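+ *
+ * Mount code uses this to refuse a device vnode that already has a
+ * filesystem on it, e.g. (illustrative, "devvp" being the device vnode
+ * the caller is about to mount):
+ *
+ *	if ((error = vfs_mountedon(devvp)) != 0)
+ *		return (error);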
+ */ +int +vfs_mountedon(vp) + struct vnode *vp; +{ + struct vnode *vq; + int error = 0; + + if (vp->v_specmountpoint != NULL) + return (EBUSY); + if (vp->v_flag & VALIASED) { + simple_lock(&spechash_slock); + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type) + continue; + if (vq->v_specmountpoint != NULL) { + error = EBUSY; + break; + } + } + simple_unlock(&spechash_slock); + } + return (error); +} + +/* + * Unmount all filesystems. The list is traversed in reverse order + * of mounting to avoid dependencies. + */ +void +vfs_unmountall() +{ + struct mount *mp, *nmp; + struct proc *p; + int error; + + if (curproc != NULL) + p = curproc; + else + p = initproc; /* XXX XXX should this be proc0? */ + /* + * Since this only runs when rebooting, it is not interlocked. + */ + for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { + nmp = mp->mnt_list.cqe_prev; + error = dounmount(mp, MNT_FORCE, p); + if (error) { + printf("unmount of %s failed (", + mp->mnt_stat.f_mntonname); + if (error == EBUSY) + printf("BUSY)\n"); + else + printf("%d)\n", error); + } + } +} + +/* + * Build hash lists of net addresses and hang them off the mount point. + * Called by ufs_mount() to set up the lists of export addresses. + */ +static int +vfs_hang_addrlist(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + register int i; + struct radix_node *rn; + struct sockaddr *saddr, *smask = 0; + struct domain *dom; + int error; + + if (argp->ex_addrlen == 0) { + if (mp->mnt_flag & MNT_DEFEXPORTED) + return (EPERM); + np = &nep->ne_defexported; + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + mp->mnt_flag |= MNT_DEFEXPORTED; + return (0); + } + i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; + np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); + bzero((caddr_t) np, i); + saddr = (struct sockaddr *) (np + 1); + if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) + goto out; + if (saddr->sa_len > argp->ex_addrlen) + saddr->sa_len = argp->ex_addrlen; + if (argp->ex_masklen) { + smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); + error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen); + if (error) + goto out; + if (smask->sa_len > argp->ex_masklen) + smask->sa_len = argp->ex_masklen; + } + i = saddr->sa_family; + if ((rnh = nep->ne_rtable[i]) == 0) { + /* + * Seems silly to initialize every AF when most are not used, + * do so on demand here + */ + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_family == i && dom->dom_rtattach) { + dom->dom_rtattach((void **) &nep->ne_rtable[i], + dom->dom_rtoffset); + break; + } + if ((rnh = nep->ne_rtable[i]) == 0) { + error = ENOBUFS; + goto out; + } + } + rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, + np->netc_rnodes); + if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ + error = EPERM; + goto out; + } + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + return (0); +out: + free(np, M_NETADDR); + return (error); +} + +/* ARGSUSED */ +static int +vfs_free_netcred(rn, w) + struct radix_node *rn; + void *w; +{ + register struct radix_node_head *rnh = (struct radix_node_head *) w; + + (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); + free((caddr_t) rn, M_NETADDR); + return (0); +} + +/* 
+ * Free the net address hash lists that are hanging off the mount points. + */ +static void +vfs_free_addrlist(nep) + struct netexport *nep; +{ + register int i; + register struct radix_node_head *rnh; + + for (i = 0; i <= AF_MAX; i++) + if ((rnh = nep->ne_rtable[i])) { + (*rnh->rnh_walktree) (rnh, vfs_free_netcred, + (caddr_t) rnh); + free((caddr_t) rnh, M_RTABLE); + nep->ne_rtable[i] = 0; + } +} + +int +vfs_export(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + + if (argp->ex_flags & MNT_DELEXPORT) { + if (mp->mnt_flag & MNT_EXPUBLIC) { + vfs_setpublicfs(NULL, NULL, NULL); + mp->mnt_flag &= ~MNT_EXPUBLIC; + } + vfs_free_addrlist(nep); + mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); + } + if (argp->ex_flags & MNT_EXPORTED) { + if (argp->ex_flags & MNT_EXPUBLIC) { + if ((error = vfs_setpublicfs(mp, nep, argp)) != 0) + return (error); + mp->mnt_flag |= MNT_EXPUBLIC; + } + if ((error = vfs_hang_addrlist(mp, nep, argp))) + return (error); + mp->mnt_flag |= MNT_EXPORTED; + } + return (0); +} + + +/* + * Set the publicly exported filesystem (WebNFS). Currently, only + * one public filesystem is possible in the spec (RFC 2054 and 2055) + */ +int +vfs_setpublicfs(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + struct vnode *rvp; + char *cp; + + /* + * mp == NULL -> invalidate the current info, the FS is + * no longer exported. May be called from either vfs_export + * or unmount, so check if it hasn't already been done. + */ + if (mp == NULL) { + if (nfs_pub.np_valid) { + nfs_pub.np_valid = 0; + if (nfs_pub.np_index != NULL) { + FREE(nfs_pub.np_index, M_TEMP); + nfs_pub.np_index = NULL; + } + } + return (0); + } + + /* + * Only one allowed at a time. + */ + if (nfs_pub.np_valid != 0 && mp != nfs_pub.np_mount) + return (EBUSY); + + /* + * Get real filehandle for root of exported FS. + */ + bzero((caddr_t)&nfs_pub.np_handle, sizeof(nfs_pub.np_handle)); + nfs_pub.np_handle.fh_fsid = mp->mnt_stat.f_fsid; + + if ((error = VFS_ROOT(mp, &rvp))) + return (error); + + if ((error = VFS_VPTOFH(rvp, &nfs_pub.np_handle.fh_fid))) + return (error); + + vput(rvp); + + /* + * If an indexfile was specified, pull it in. + */ + if (argp->ex_indexfile != NULL) { + MALLOC(nfs_pub.np_index, char *, MAXNAMLEN + 1, M_TEMP, + M_WAITOK); + error = copyinstr(argp->ex_indexfile, nfs_pub.np_index, + MAXNAMLEN, (size_t *)0); + if (!error) { + /* + * Check for illegal filenames. + */ + for (cp = nfs_pub.np_index; *cp; cp++) { + if (*cp == '/') { + error = EINVAL; + break; + } + } + } + if (error) { + FREE(nfs_pub.np_index, M_TEMP); + return (error); + } + } + + nfs_pub.np_mount = mp; + nfs_pub.np_valid = 1; + return (0); +} + +struct netcred * +vfs_export_lookup(mp, nep, nam) + register struct mount *mp; + struct netexport *nep; + struct sockaddr *nam; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + struct sockaddr *saddr; + + np = NULL; + if (mp->mnt_flag & MNT_EXPORTED) { + /* + * Lookup in the export list first. + */ + if (nam != NULL) { + saddr = nam; + rnh = nep->ne_rtable[saddr->sa_family]; + if (rnh != NULL) { + np = (struct netcred *) + (*rnh->rnh_matchaddr)((caddr_t)saddr, + rnh); + if (np && np->netc_rnodes->rn_flags & RNF_ROOT) + np = NULL; + } + } + /* + * If no address match, use the default if it exists. 
+ */ + if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) + np = &nep->ne_defexported; + } + return (np); +} + +/* + * perform msync on all vnodes under a mount point + * the mount point must be locked. + */ +void +vfs_msync(struct mount *mp, int flags) { + struct vnode *vp, *nvp; + struct vm_object *obj; + int anyio, tries; + + tries = 5; +loop: + anyio = 0; + for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { + + nvp = vp->v_mntvnodes.le_next; + + if (vp->v_mount != mp) { + goto loop; + } + + if (vp->v_flag & VXLOCK) /* XXX: what if MNT_WAIT? */ + continue; + + if (flags != MNT_WAIT) { + obj = vp->v_object; + if (obj == NULL || (obj->flags & OBJ_MIGHTBEDIRTY) == 0) + continue; + if (VOP_ISLOCKED(vp)) + continue; + } + + simple_lock(&vp->v_interlock); + if (vp->v_object && + (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { + if (!vget(vp, + LK_INTERLOCK | LK_EXCLUSIVE | LK_RETRY | LK_NOOBJ, curproc)) { + if (vp->v_object) { + vm_object_page_clean(vp->v_object, 0, 0, flags == MNT_WAIT ? OBJPC_SYNC : 0); + anyio = 1; + } + vput(vp); + } + } else { + simple_unlock(&vp->v_interlock); + } + } + if (anyio && (--tries > 0)) + goto loop; +} + +/* + * Create the VM object needed for VMIO and mmap support. This + * is done for all VREG files in the system. Some filesystems might + * afford the additional metadata buffering capability of the + * VMIO code by making the device node be VMIO mode also. + * + * vp must be locked when vfs_object_create is called. + */ +int +vfs_object_create(vp, p, cred) + struct vnode *vp; + struct proc *p; + struct ucred *cred; +{ + struct vattr vat; + vm_object_t object; + int error = 0; + + if ((vp->v_type != VREG) && (vp->v_type != VBLK)) + return 0; + +retry: + if ((object = vp->v_object) == NULL) { + if (vp->v_type == VREG) { + if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0) + goto retn; + object = vnode_pager_alloc(vp, vat.va_size, 0, 0); + } else if (major(vp->v_rdev) < nblkdev && + bdevsw[major(vp->v_rdev)] != NULL) { + /* + * This simply allocates the biggest object possible + * for a VBLK vnode. This should be fixed, but doesn't + * cause any problems (yet). + */ + object = vnode_pager_alloc(vp, IDX_TO_OFF(INT_MAX), 0, 0); + } + object->ref_count--; + vp->v_usecount--; + } else { + if (object->flags & OBJ_DEAD) { + VOP_UNLOCK(vp, 0, p); + tsleep(object, PVM, "vodead", 0); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + goto retry; + } + } + + if (vp->v_object) + vp->v_flag |= VOBJBUF; + +retn: + return error; +} + +static void +vfree(vp) + struct vnode *vp; +{ + int s; + + s = splbio(); + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VTBFREE) { + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + vp->v_flag &= ~VTBFREE; + } + if (vp->v_flag & VAGE) { + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + freevnodes++; + simple_unlock(&vnode_free_list_slock); + vp->v_flag &= ~VAGE; + vp->v_flag |= VFREE; + splx(s); +} + +void +vbusy(vp) + struct vnode *vp; +{ + int s; + + s = splbio(); + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VTBFREE) { + TAILQ_REMOVE(&vnode_tobefree_list, vp, v_freelist); + vp->v_flag &= ~VTBFREE; + } else { + TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + freevnodes--; + } + simple_unlock(&vnode_free_list_slock); + vp->v_flag &= ~(VFREE|VAGE); + splx(s); +} + +/* + * Record a process's interest in events which might happen to + * a vnode. 
Because poll uses the historic select-style interface
+ * internally, this routine serves as both the ``check for any
+ * pending events'' and the ``record my interest in future events''
+ * functions. (These are done together, while the lock is held,
+ * to avoid race conditions.)
+ */
+int
+vn_pollrecord(vp, p, events)
+	struct vnode *vp;
+	struct proc *p;
+	short events;
+{
+	simple_lock(&vp->v_pollinfo.vpi_lock);
+	if (vp->v_pollinfo.vpi_revents & events) {
+		/*
+		 * This leaves events we are not interested
+		 * in available for the other process which
+		 * presumably had requested them
+		 * (otherwise they would never have been
+		 * recorded).
+		 */
+		events &= vp->v_pollinfo.vpi_revents;
+		vp->v_pollinfo.vpi_revents &= ~events;
+
+		simple_unlock(&vp->v_pollinfo.vpi_lock);
+		return events;
+	}
+	vp->v_pollinfo.vpi_events |= events;
+	selrecord(p, &vp->v_pollinfo.vpi_selinfo);
+	simple_unlock(&vp->v_pollinfo.vpi_lock);
+	return 0;
+}
+
+/*
+ * Note the occurrence of an event. If the VN_POLLEVENT macro is used,
+ * it is possible for us to miss an event due to race conditions, but
+ * that condition is expected to be rare, so for the moment it is the
+ * preferred interface.
+ */
+void
+vn_pollevent(vp, events)
+	struct vnode *vp;
+	short events;
+{
+	simple_lock(&vp->v_pollinfo.vpi_lock);
+	if (vp->v_pollinfo.vpi_events & events) {
+		/*
+		 * We clear vpi_events so that we don't
+		 * call selwakeup() twice if two events are
+		 * posted before the polling process(es) is
+		 * awakened. This also ensures that we take at
+		 * most one selwakeup() if the polling process
+		 * is no longer interested. However, it does
+		 * mean that only one event can be noticed at
+		 * a time. (Perhaps we should only clear those
+		 * event bits which we note?) XXX
+		 */
+		vp->v_pollinfo.vpi_events = 0;	/* &= ~events ??? */
+		vp->v_pollinfo.vpi_revents |= events;
+		selwakeup(&vp->v_pollinfo.vpi_selinfo);
+	}
+	simple_unlock(&vp->v_pollinfo.vpi_lock);
+}
+
+/*
+ * Wake up anyone polling on vp because it is being revoked.
+ * This depends on dead_poll() returning POLLHUP for correct
+ * behavior.
+ */
+void
+vn_pollgone(vp)
+	struct vnode *vp;
+{
+	simple_lock(&vp->v_pollinfo.vpi_lock);
+	if (vp->v_pollinfo.vpi_events) {
+		vp->v_pollinfo.vpi_events = 0;
+		selwakeup(&vp->v_pollinfo.vpi_selinfo);
+	}
+	simple_unlock(&vp->v_pollinfo.vpi_lock);
+}
+
+
+
+/*
+ * Routine to create and manage a filesystem syncer vnode.
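+ *
+ * One of these is created for every read-write mount; the mount and
+ * unmount paths do, e.g.
+ *
+ *	if ((mp->mnt_flag & MNT_RDONLY) == 0)
+ *		error = vfs_allocate_syncvnode(mp);
+ *
+ * and the syncer daemon above then fsyncs it with MNT_LAZY once per
+ * pass, which is what drives sync_fsync() below.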
+ */ +#define sync_close ((int (*) __P((struct vop_close_args *)))nullop) +static int sync_fsync __P((struct vop_fsync_args *)); +static int sync_inactive __P((struct vop_inactive_args *)); +static int sync_reclaim __P((struct vop_reclaim_args *)); +#define sync_lock ((int (*) __P((struct vop_lock_args *)))vop_nolock) +#define sync_unlock ((int (*) __P((struct vop_unlock_args *)))vop_nounlock) +static int sync_print __P((struct vop_print_args *)); +#define sync_islocked ((int(*) __P((struct vop_islocked_args *)))vop_noislocked) + +static vop_t **sync_vnodeop_p; +static struct vnodeopv_entry_desc sync_vnodeop_entries[] = { + { &vop_default_desc, (vop_t *) vop_eopnotsupp }, + { &vop_close_desc, (vop_t *) sync_close }, /* close */ + { &vop_fsync_desc, (vop_t *) sync_fsync }, /* fsync */ + { &vop_inactive_desc, (vop_t *) sync_inactive }, /* inactive */ + { &vop_reclaim_desc, (vop_t *) sync_reclaim }, /* reclaim */ + { &vop_lock_desc, (vop_t *) sync_lock }, /* lock */ + { &vop_unlock_desc, (vop_t *) sync_unlock }, /* unlock */ + { &vop_print_desc, (vop_t *) sync_print }, /* print */ + { &vop_islocked_desc, (vop_t *) sync_islocked }, /* islocked */ + { NULL, NULL } +}; +static struct vnodeopv_desc sync_vnodeop_opv_desc = + { &sync_vnodeop_p, sync_vnodeop_entries }; + +VNODEOP_SET(sync_vnodeop_opv_desc); + +/* + * Create a new filesystem syncer vnode for the specified mount point. + */ +int +vfs_allocate_syncvnode(mp) + struct mount *mp; +{ + struct vnode *vp; + static long start, incr, next; + int error; + + /* Allocate a new vnode */ + if ((error = getnewvnode(VT_VFS, mp, sync_vnodeop_p, &vp)) != 0) { + mp->mnt_syncer = NULL; + return (error); + } + vp->v_type = VNON; + /* + * Place the vnode onto the syncer worklist. We attempt to + * scatter them about on the list so that they will go off + * at evenly distributed times even if all the filesystems + * are mounted at once. + */ + next += incr; + if (next == 0 || next > syncer_maxdelay) { + start /= 2; + incr /= 2; + if (start == 0) { + start = syncer_maxdelay / 2; + incr = syncer_maxdelay; + } + next = start; + } + vn_syncer_add_to_worklist(vp, syncdelay > 0 ? next % syncdelay : 0); + mp->mnt_syncer = vp; + return (0); +} + +/* + * Do a lazy sync of the filesystem. + */ +static int +sync_fsync(ap) + struct vop_fsync_args /* { + struct vnode *a_vp; + struct ucred *a_cred; + int a_waitfor; + struct proc *a_p; + } */ *ap; +{ + struct vnode *syncvp = ap->a_vp; + struct mount *mp = syncvp->v_mount; + struct proc *p = ap->a_p; + int asyncflag; + + /* + * We only need to do something if this is a lazy evaluation. + */ + if (ap->a_waitfor != MNT_LAZY) + return (0); + + /* + * Move ourselves to the back of the sync list. + */ + vn_syncer_add_to_worklist(syncvp, syncdelay); + + /* + * Walk the list of vnodes pushing all that are dirty and + * not already on the sync list. + */ + simple_lock(&mountlist_slock); + if (vfs_busy(mp, LK_EXCLUSIVE | LK_NOWAIT, &mountlist_slock, p) != 0) { + simple_unlock(&mountlist_slock); + return (0); + } + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_LAZY, ap->a_cred, p); + if (asyncflag) + mp->mnt_flag |= MNT_ASYNC; + vfs_unbusy(mp, p); + return (0); +} + +/* + * The syncer vnode is no referenced. + */ +static int +sync_inactive(ap) + struct vop_inactive_args /* { + struct vnode *a_vp; + struct proc *a_p; + } */ *ap; +{ + + vgone(ap->a_vp); + return (0); +} + +/* + * The syncer vnode is no longer needed and is being decommissioned. 
+ */ +static int +sync_reclaim(ap) + struct vop_reclaim_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + vp->v_mount->mnt_syncer = NULL; + if (vp->v_flag & VONWORKLST) { + LIST_REMOVE(vp, v_synclist); + vp->v_flag &= ~VONWORKLST; + } + + return (0); +} + +/* + * Print out a syncer vnode. + */ +static int +sync_print(ap) + struct vop_print_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + printf("syncer vnode"); + if (vp->v_vnlock != NULL) + lockmgr_printinfo(vp->v_vnlock); + printf("\n"); + return (0); +} diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c new file mode 100644 index 0000000..18e39d6 --- /dev/null +++ b/sys/kern/vfs_syscalls.c @@ -0,0 +1,3034 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + * $Id: vfs_syscalls.c,v 1.111 1998/12/12 21:07:09 dillon Exp $ + */ + +/* For 4.3 integer FS ID compatibility */ +#include "opt_compat.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/namei.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/linker.h> +#include <sys/stat.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/dirent.h> + +#include <miscfs/union/union.h> + +#include <vm/vm.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_zone.h> +#include <sys/sysctl.h> + +static int change_dir __P((struct nameidata *ndp, struct proc *p)); +static void checkdirs __P((struct vnode *olddp)); +static int setfown __P((struct proc *, struct vnode *, uid_t, gid_t)); +static int setfmode __P((struct proc *, struct vnode *, int)); +static int setfflags __P((struct proc *, struct vnode *, int)); +static int setutimes __P((struct proc *, struct vnode *, struct timeval *, int)); +static int usermount = 0; /* if 1, non-root can mount fs. */ + +int (*union_dircheckp) __P((struct proc *, struct vnode **, struct file *)); + +SYSCTL_INT(_vfs, OID_AUTO, usermount, CTLFLAG_RW, &usermount, 0, ""); + +/* + * Virtual File System System Calls + */ + +/* + * Mount a file system. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +mount(p, uap) + struct proc *p; + register struct mount_args /* { + syscallarg(char *) type; + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(caddr_t) data; + } */ *uap; +{ + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + int error, flag = 0, flag2 = 0; + struct vattr va; + u_long fstypenum; + struct nameidata nd; + char fstypename[MFSNAMELEN]; + + if (usermount == 0 && (error = suser(p->p_ucred, &p->p_acflag))) + return (error); + + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (SCARG(uap, flags) & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + flag = mp->mnt_flag; + flag2 = mp->mnt_kern_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + if ((SCARG(uap, flags) & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + mp->mnt_flag |= + SCARG(uap, flags) & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (vfs_busy(mp, LK_NOWAIT, 0, p)) { + vput(vp); + return (EBUSY); + } + VOP_UNLOCK(vp, 0, p); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. 
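+	 *
+	 * For example (illustrative): with the vfs.usermount sysctl set
+	 * to 1, an unprivileged user may mount on a directory that user
+	 * owns, but mounting on a root-owned directory still fails the
+	 * va_uid check below and returns the error from suser().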
+ */ + if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) || + (va.va_uid != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag)))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (error = vinvalbuf(vp, V_SAVE, p->p_ucred, p, 0, 0)) + return (error); + if (vp->v_type != VDIR) { + vput(vp); + return (ENOTDIR); + } +#ifdef COMPAT_43 + /* + * Historically filesystem types were identified by number. If we + * get an integer for the filesystem type instead of a string, we + * check to see if it matches one of the historic filesystem types. + */ + fstypenum = (uintptr_t)SCARG(uap, type); + if (fstypenum < maxvfsconf) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == fstypenum) + break; + if (vfsp == NULL) { + vput(vp); + return (ENODEV); + } + strncpy(fstypename, vfsp->vfc_name, MFSNAMELEN); + } else +#endif /* COMPAT_43 */ + if (error = copyinstr(SCARG(uap, type), fstypename, MFSNAMELEN, NULL)) { + vput(vp); + return (error); + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) { + linker_file_t lf; + + /* Refuse to load modules if securelevel raised */ + if (securelevel > 0) { + vput(vp); + return EPERM; + } + /* Only load modules for root (very important!) */ + if (error = suser(p->p_ucred, &p->p_acflag)) { + vput(vp); + return error; + } + error = linker_load_file(fstypename, &lf); + if (error || lf == NULL) { + vput(vp); + if (lf == NULL) + error = ENODEV; + return error; + } + lf->userrefs++; + /* lookup again, see if the VFS was loaded */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) { + lf->userrefs--; + linker_file_unload(lf); + vput(vp); + return (ENODEV); + } + } + simple_lock(&vp->v_interlock); + if ((vp->v_flag & VMOUNT) != 0 || + vp->v_mountedhere != NULL) { + simple_unlock(&vp->v_interlock); + vput(vp); + return (EBUSY); + } + vp->v_flag |= VMOUNT; + simple_unlock(&vp->v_interlock); + + /* + * Allocate and initialize the filesystem. + */ + mp = (struct mount *)malloc((u_long)sizeof(struct mount), + M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, LK_NOPAUSE); + (void)vfs_busy(mp, LK_NOWAIT, 0, p); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = p->p_ucred->cr_uid; + VOP_UNLOCK(vp, 0, p); +update: + /* + * Set the mount level flags. + */ + if (SCARG(uap, flags) & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_kern_flag |= MNTK_WANTRDWR; + mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME | + MNT_NOSYMFOLLOW | + MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR); + mp->mnt_flag |= SCARG(uap, flags) & (MNT_NOSUID | MNT_NOEXEC | + MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE | + MNT_NOSYMFOLLOW | + MNT_NOATIME | MNT_NOCLUSTERR | MNT_NOCLUSTERW | MNT_SUIDDIR); + /* + * Mount the filesystem. 
+ */ + error = VFS_MOUNT(mp, SCARG(uap, path), SCARG(uap, data), &nd, p); + if (mp->mnt_flag & MNT_UPDATE) { + vrele(vp); + if (mp->mnt_kern_flag & MNTK_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE); + mp->mnt_kern_flag &=~ MNTK_WANTRDWR; + if (error) { + mp->mnt_flag = flag; + mp->mnt_kern_flag = flag2; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + if (mp->mnt_syncer == NULL) + error = vfs_allocate_syncvnode(mp); + } else { + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + mp->mnt_syncer = NULL; + } + vfs_unbusy(mp, p); + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + /* + * Put the new filesystem on the mount list after root. + */ + cache_purge(vp); + if (!error) { + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + vp->v_mountedhere = mp; + simple_unlock(&vp->v_interlock); + simple_lock(&mountlist_slock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + simple_unlock(&mountlist_slock); + checkdirs(vp); + VOP_UNLOCK(vp, 0, p); + if ((mp->mnt_flag & MNT_RDONLY) == 0) + error = vfs_allocate_syncvnode(mp); + vfs_unbusy(mp, p); + if (error = VFS_START(mp, 0, p)) + vrele(vp); + } else { + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VMOUNT; + simple_unlock(&vp->v_interlock); + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, p); + free((caddr_t)mp, M_MOUNT); + vput(vp); + } + return (error); +} + +/* + * Scan all active processes to see if any of them have a current + * or root directory onto which the new filesystem has just been + * mounted. If so, replace them with the new mount point. + */ +static void +checkdirs(olddp) + struct vnode *olddp; +{ + struct filedesc *fdp; + struct vnode *newdp; + struct proc *p; + + if (olddp->v_usecount == 1) + return; + if (VFS_ROOT(olddp->v_mountedhere, &newdp)) + panic("mount: lost mount"); + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + fdp = p->p_fd; + if (fdp->fd_cdir == olddp) { + vrele(fdp->fd_cdir); + VREF(newdp); + fdp->fd_cdir = newdp; + } + if (fdp->fd_rdir == olddp) { + vrele(fdp->fd_rdir); + VREF(newdp); + fdp->fd_rdir = newdp; + } + } + if (rootvnode == olddp) { + vrele(rootvnode); + VREF(newdp); + rootvnode = newdp; + } + vput(newdp); +} + +/* + * Unmount a file system. + * + * Note: unmount takes a path to the vnode mounted on as argument, + * not special file (as before). + */ +#ifndef _SYS_SYSPROTO_H_ +struct unmount_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +unmount(p, uap) + struct proc *p; + register struct unmount_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + register struct vnode *vp; + struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + mp = vp->v_mount; + + /* + * Only root, or the user that did the original mount is + * permitted to unmount this filesystem. + */ + if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + + /* + * Don't allow unmounting the root file system. + */ + if (mp->mnt_flag & MNT_ROOTFS) { + vput(vp); + return (EINVAL); + } + + /* + * Must be the root of the filesystem + */ + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + vput(vp); + return (dounmount(mp, SCARG(uap, flags), p)); +} + +/* + * Do the actual file system unmount. 
+ */ +int +dounmount(mp, flags, p) + register struct mount *mp; + int flags; + struct proc *p; +{ + struct vnode *coveredvp; + int error; + int async_flag; + + simple_lock(&mountlist_slock); + mp->mnt_kern_flag |= MNTK_UNMOUNT; + lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p); + + if (mp->mnt_flag & MNT_EXPUBLIC) + vfs_setpublicfs(NULL, NULL, NULL); + + vfs_msync(mp, MNT_WAIT); + async_flag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &=~ MNT_ASYNC; + cache_purgevfs(mp); /* remove cache entries for this file sys */ + if (mp->mnt_syncer != NULL) + vrele(mp->mnt_syncer); + if (((mp->mnt_flag & MNT_RDONLY) || + (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) || + (flags & MNT_FORCE)) + error = VFS_UNMOUNT(mp, flags, p); + simple_lock(&mountlist_slock); + if (error) { + if ((mp->mnt_flag & MNT_RDONLY) == 0 && mp->mnt_syncer == NULL) + (void) vfs_allocate_syncvnode(mp); + mp->mnt_kern_flag &= ~MNTK_UNMOUNT; + mp->mnt_flag |= async_flag; + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK | LK_REENABLE, + &mountlist_slock, p); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup((caddr_t)mp); + return (error); + } + CIRCLEQ_REMOVE(&mountlist, mp, mnt_list); + if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) { + coveredvp->v_mountedhere = (struct mount *)0; + vrele(coveredvp); + } + mp->mnt_vfc->vfc_refcount--; + if (mp->mnt_vnodelist.lh_first != NULL) + panic("unmount: dangling vnode"); + lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_slock, p); + if (mp->mnt_kern_flag & MNTK_MWAIT) + wakeup((caddr_t)mp); + free((caddr_t)mp, M_MOUNT); + return (0); +} + +/* + * Sync each mounted filesystem. + */ +#ifndef _SYS_SYSPROTO_H_ +struct sync_args { + int dummy; +}; +#endif + +#ifdef DEBUG +static int syncprt = 0; +SYSCTL_INT(_debug, OID_AUTO, syncprt, CTLFLAG_RW, &syncprt, 0, ""); +#endif + +/* ARGSUSED */ +int +sync(p, uap) + struct proc *p; + struct sync_args *uap; +{ + register struct mount *mp, *nmp; + int asyncflag; + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + if ((mp->mnt_flag & MNT_RDONLY) == 0) { + asyncflag = mp->mnt_flag & MNT_ASYNC; + mp->mnt_flag &= ~MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_NOWAIT, + ((p != NULL) ? p->p_ucred : NOCRED), p); + mp->mnt_flag |= asyncflag; + } + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); +#if 0 +/* + * XXX don't call vfs_bufstats() yet because that routine + * was not imported in the Lite2 merge. + */ +#ifdef DIAGNOSTIC + if (syncprt) + vfs_bufstats(); +#endif /* DIAGNOSTIC */ +#endif + return (0); +} + +/* + * Change filesystem quotas. + */ +#ifndef _SYS_SYSPROTO_H_ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +#endif +/* ARGSUSED */ +int +quotactl(p, uap) + struct proc *p; + register struct quotactl_args /* { + syscallarg(char *) path; + syscallarg(int) cmd; + syscallarg(int) uid; + syscallarg(caddr_t) arg; + } */ *uap; +{ + register struct mount *mp; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + vrele(nd.ni_vp); + return (VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid), + SCARG(uap, arg), p)); +} + +/* + * Get filesystem statistics. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +statfs(p, uap) + struct proc *p; + register struct statfs_args /* { + syscallarg(char *) path; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + register struct mount *mp; + register struct statfs *sp; + int error; + struct nameidata nd; + struct statfs sb; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + vrele(nd.ni_vp); + error = VFS_STATFS(mp, sp, p); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +fstatfs(p, uap) + struct proc *p; + register struct fstatfs_args /* { + syscallarg(int) fd; + syscallarg(struct statfs *) buf; + } */ *uap; +{ + struct file *fp; + struct mount *mp; + register struct statfs *sp; + int error; + struct statfs sb; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + mp = ((struct vnode *)fp->f_data)->v_mount; + sp = &mp->mnt_stat; + error = VFS_STATFS(mp, sp, p); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get statistics on all filesystems. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int flags; +}; +#endif +int +getfsstat(p, uap) + struct proc *p; + register struct getfsstat_args /* { + syscallarg(struct statfs *) buf; + syscallarg(long) bufsize; + syscallarg(int) flags; + } */ *uap; +{ + register struct mount *mp, *nmp; + register struct statfs *sp; + caddr_t sfsp; + long count, maxcount, error; + + maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); + sfsp = (caddr_t)SCARG(uap, buf); + count = 0; + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + if (sfsp && count < maxcount) { + sp = &mp->mnt_stat; + /* + * If MNT_NOWAIT or MNT_LAZY is specified, do not + * refresh the fsstat cache. MNT_NOWAIT or MNT_LAZY + * overrides MNT_WAIT. + */ + if (((SCARG(uap, flags) & (MNT_LAZY|MNT_NOWAIT)) == 0 || + (SCARG(uap, flags) & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, p))) { + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + continue; + } + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = copyout((caddr_t)sp, sfsp, sizeof(*sp)); + if (error) { + vfs_unbusy(mp, p); + return (error); + } + sfsp += sizeof(*sp); + } + count++; + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + if (sfsp && count > maxcount) + p->p_retval[0] = maxcount; + else + p->p_retval[0] = count; + return (0); +} + +/* + * Change current working directory to a given file descriptor. 
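+ *
+ * When the directory referenced by the descriptor is itself covered by a
+ * mount, the loop over v_mountedhere below steps down to the root of the
+ * mounted filesystem, so a sequence sketched as
+ *
+ *	fd = open("/mnt", O_RDONLY);
+ *	fchdir(fd);
+ *
+ * (the path is only an illustration) leaves the working directory on the
+ * mounted filesystem's root vnode rather than on the covered vnode.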
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fchdir(p, uap) + struct proc *p; + struct fchdir_args /* { + syscallarg(int) fd; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + struct vnode *vp, *tdp; + struct mount *mp; + struct file *fp; + int error; + + if (error = getvnode(fdp, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VREF(vp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + while (!error && (mp = vp->v_mountedhere) != NULL) { + if (vfs_busy(mp, 0, 0, p)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, p); + if (error) + break; + vput(vp); + vp = tdp; + } + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, p); + vrele(fdp->fd_cdir); + fdp->fd_cdir = vp; + return (0); +} + +/* + * Change current working directory (``.''). + */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chdir(p, uap) + struct proc *p; + struct chdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + vrele(fdp->fd_cdir); + fdp->fd_cdir = nd.ni_vp; + return (0); +} + +/* + * Change notion of root (``/'') directory. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chroot(p, uap) + struct proc *p; + struct chroot_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + vrele(fdp->fd_rdir); + fdp->fd_rdir = nd.ni_vp; + return (0); +} + +/* + * Common routine for chroot and chdir. + */ +static int +change_dir(ndp, p) + register struct nameidata *ndp; + struct proc *p; +{ + struct vnode *vp; + int error; + + error = namei(ndp); + if (error) + return (error); + vp = ndp->ni_vp; + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + if (error) + vput(vp); + else + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Check permissions, allocate an open file structure, + * and call the device open routine if any. 
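+ *
+ * The user's O_RDONLY/O_WRONLY/O_RDWR value is turned into the kernel's
+ * FREAD/FWRITE bits with FFLAGS() before vn_open() is called, so a call
+ * sketched as
+ *
+ *	open("/etc/motd", O_RDONLY)
+ *
+ * (an illustrative path) reaches vn_open() with FREAD set and FWRITE
+ * clear; O_EXLOCK and O_SHLOCK are handled afterwards via VOP_ADVLOCK().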
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif +int +open(p, uap) + struct proc *p; + register struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ *uap; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + register struct vnode *vp; + int cmode, flags, oflags; + struct file *nfp; + int type, indx, error; + struct flock lf; + struct nameidata nd; + + oflags = SCARG(uap, flags); + if ((oflags & O_ACCMODE) == O_ACCMODE) + return (EINVAL); + flags = FFLAGS(oflags); + error = falloc(p, &nfp, &indx); + if (error) + return (error); + fp = nfp; + cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + p->p_dupfd = -indx - 1; /* XXX check for fdopen */ + error = vn_open(&nd, flags, cmode); + if (error) { + ffree(fp); + if ((error == ENODEV || error == ENXIO) && + p->p_dupfd >= 0 && /* XXX from fdopen */ + (error = + dupfdopen(fdp, indx, p->p_dupfd, flags, error)) == 0) { + p->p_retval[0] = indx; + return (0); + } + if (error == ERESTART) + error = EINTR; + fdp->fd_ofiles[indx] = NULL; + return (error); + } + p->p_dupfd = 0; + vp = nd.ni_vp; + + fp->f_flag = flags & FMASK; + fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + fp->f_ops = &vnops; + fp->f_data = (caddr_t)vp; + if (flags & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (flags & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((flags & FNONBLOCK) == 0) + type |= F_WAIT; + VOP_UNLOCK(vp, 0, p); + if (error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) { + (void) vn_close(vp, fp->f_flag, fp->f_cred, p); + ffree(fp); + fdp->fd_ofiles[indx] = NULL; + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + fp->f_flag |= FHASLOCK; + } + if ((vp->v_type == VREG) && (vp->v_object == NULL)) + vfs_object_create(vp, p, p->p_ucred); + VOP_UNLOCK(vp, 0, p); + p->p_retval[0] = indx; + return (0); +} + +#ifdef COMPAT_43 +/* + * Create a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif +int +ocreat(p, uap) + struct proc *p; + register struct ocreat_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, mode) = SCARG(uap, mode); + SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; + return (open(p, &nuap)); +} +#endif /* COMPAT_43 */ + +/* + * Create a special file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mknod_args { + char *path; + int mode; + int dev; +}; +#endif +/* ARGSUSED */ +int +mknod(p, uap) + struct proc *p; + register struct mknod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + syscallarg(int) dev; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + int whiteout = 0; + struct nameidata nd; + + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) + error = EEXIST; + else { + VATTR_NULL(&vattr); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + vattr.va_rdev = SCARG(uap, dev); + whiteout = 0; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFMT: /* used by badsect to flag bad sectors */ + vattr.va_type = VBAD; + break; + case S_IFCHR: + vattr.va_type = VCHR; + break; + case S_IFBLK: + vattr.va_type = VBLK; + break; + case S_IFWHT: + whiteout = 1; + break; + default: + error = EINVAL; + break; + } + } + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (whiteout) { + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); + if (error) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + } else { + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, + &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + } + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp) + vrele(vp); + } + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mknod"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mknod"); + return (error); +} + +/* + * Create a named pipe. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkfifo(p, uap) + struct proc *p; + register struct mkfifo_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + if (nd.ni_vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VFIFO; + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + return (error); +} + +/* + * Make a hard file link. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +link(p, uap) + struct proc *p; + register struct link_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + register struct vnode *vp; + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW|NOOBJ, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); + error = namei(&nd); + if (!error) { + if (nd.ni_vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_vp) + vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, + LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + } + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + } + } + vrele(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "link"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "link"); + return (error); +} + +/* + * Make a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +symlink(p, uap) + struct proc *p; + register struct symlink_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; +{ + struct vattr vattr; + char *path; + int error; + struct nameidata nd; + + path = zalloc(namei_zone); + if (error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) + goto out; + NDINIT(&nd, CREATE, LOCKPARENT|NOOBJ, UIO_USERSPACE, SCARG(uap, link), p); + if (error = namei(&nd)) + goto out; + if (nd.ni_vp) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + error = EEXIST; + goto out; + } + VATTR_NULL(&vattr); + vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); + vput(nd.ni_dvp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "symlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "symlink"); +out: + zfree(namei_zone, path); + return (error); +} + +/* + * Delete a whiteout from the filesystem. + */ +/* ARGSUSED */ +int +undelete(p, uap) + struct proc *p; + register struct undelete_args /* { + syscallarg(char *) path; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, + SCARG(uap, path), p); + error = namei(&nd); + if (error) + return (error); + + if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + return (EEXIST); + } + + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "undelete"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "undelete"); + return (error); +} + +/* + * Delete a name from the filesystem. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +unlink(p, uap) + struct proc *p; + struct unlink_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + /* + * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? + */ + if (vp->v_flag & VROOT) + error = EBUSY; + } + + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + } + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULLVP) + vput(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "unlink"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "unlink"); + return (error); +} + +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif +int +lseek(p, uap) + struct proc *p; + register struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct ucred *cred = p->p_ucred; + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vattr vattr; + int error; + + if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (ESPIPE); + switch (SCARG(uap, whence)) { + case L_INCR: + fp->f_offset += SCARG(uap, offset); + break; + case L_XTND: + error=VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p); + if (error) + return (error); + fp->f_offset = SCARG(uap, offset) + vattr.va_size; + break; + case L_SET: + fp->f_offset = SCARG(uap, offset); + break; + default: + return (EINVAL); + } + *(off_t *)(p->p_retval) = fp->f_offset; + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif +int +olseek(p, uap) + struct proc *p; + register struct olseek_args /* { + syscallarg(int) fd; + syscallarg(long) offset; + syscallarg(int) whence; + } */ *uap; +{ + struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ nuap; + int error; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, offset) = SCARG(uap, offset); + SCARG(&nuap, whence) = SCARG(uap, whence); + error = lseek(p, &nuap); + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Check access permissions. 
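+ *
+ * The check is made with the process's real uid and gid, which are
+ * temporarily substituted into the credential below, so a set-uid
+ * program can ask what its invoking user is permitted to do.  In a
+ * sketch such as
+ *
+ *	if (access("/var/log/messages", R_OK) == 0)
+ *		can_read = 1;
+ *
+ * (path and variable are illustrative) R_OK, W_OK and X_OK map onto
+ * VREAD, VWRITE and VEXEC respectively.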
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int flags; +}; +#endif +int +access(p, uap) + struct proc *p; + register struct access_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + register struct ucred *cred = p->p_ucred; + register struct vnode *vp; + int error, flags, t_gid, t_uid; + struct nameidata nd; + + t_uid = cred->cr_uid; + t_gid = cred->cr_groups[0]; + cred->cr_uid = p->p_cred->p_ruid; + cred->cr_groups[0] = p->p_cred->p_rgid; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + goto out1; + vp = nd.ni_vp; + + /* Flags == 0 means only check for existence. */ + if (SCARG(uap, flags)) { + flags = 0; + if (SCARG(uap, flags) & R_OK) + flags |= VREAD; + if (SCARG(uap, flags) & W_OK) + flags |= VWRITE; + if (SCARG(uap, flags) & X_OK) + flags |= VEXEC; + if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, flags, cred, p); + } + vput(vp); +out1: + cred->cr_uid = t_uid; + cred->cr_groups[0] = t_gid; + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +ostat(p, uap) + struct proc *p; + register struct ostat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +olstat(p, uap) + struct proc *p; + register struct olstat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; +{ + struct vnode *vp; + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Convert from an old to a new stat structure. + */ +void +cvtstat(st, ost) + struct stat *st; + struct ostat *ost; +{ + + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (st->st_size < (quad_t)1 << 32) + ost->st_size = st->st_size; + else + ost->st_size = -2; + ost->st_atime = st->st_atime; + ost->st_mtime = st->st_mtime; + ost->st_ctime = st->st_ctime; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Get file status; this version follows links. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct stat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +stat(p, uap) + struct proc *p; + register struct stat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; +{ + struct stat sb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +lstat(p, uap) + struct proc *p; + register struct lstat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; +{ + int error; + struct vnode *vp; + struct stat sb; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); + return (error); +} + +void +cvtnstat(sb, nsb) + struct stat *sb; + struct nstat *nsb; +{ + nsb->st_dev = sb->st_dev; + nsb->st_ino = sb->st_ino; + nsb->st_mode = sb->st_mode; + nsb->st_nlink = sb->st_nlink; + nsb->st_uid = sb->st_uid; + nsb->st_gid = sb->st_gid; + nsb->st_rdev = sb->st_rdev; + nsb->st_atimespec = sb->st_atimespec; + nsb->st_mtimespec = sb->st_mtimespec; + nsb->st_ctimespec = sb->st_ctimespec; + nsb->st_size = sb->st_size; + nsb->st_blocks = sb->st_blocks; + nsb->st_blksize = sb->st_blksize; + nsb->st_flags = sb->st_flags; + nsb->st_gen = sb->st_gen; +} + +#ifndef _SYS_SYSPROTO_H_ +struct nstat_args { + char *path; + struct nstat *ub; +}; +#endif +/* ARGSUSED */ +int +nstat(p, uap) + struct proc *p; + register struct nstat_args /* { + syscallarg(char *) path; + syscallarg(struct nstat *) ub; + } */ *uap; +{ + struct stat sb; + struct nstat nsb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + cvtnstat(&sb, &nsb); + error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb)); + return (error); +} + +/* + * Get file status; this version does not follow links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +nlstat(p, uap) + struct proc *p; + register struct nlstat_args /* { + syscallarg(char *) path; + syscallarg(struct nstat *) ub; + } */ *uap; +{ + int error; + struct vnode *vp; + struct stat sb; + struct nstat nsb; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + cvtnstat(&sb, &nsb); + error = copyout((caddr_t)&nsb, (caddr_t)SCARG(uap, ub), sizeof (nsb)); + return (error); +} + +/* + * Get configurable pathname variables. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct pathconf_args { + char *path; + int name; +}; +#endif +/* ARGSUSED */ +int +pathconf(p, uap) + struct proc *p; + register struct pathconf_args /* { + syscallarg(char *) path; + syscallarg(int) name; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), p->p_retval); + vput(nd.ni_vp); + return (error); +} + +/* + * Return target name of a symbolic link. + */ +#ifndef _SYS_SYSPROTO_H_ +struct readlink_args { + char *path; + char *buf; + int count; +}; +#endif +/* ARGSUSED */ +int +readlink(p, uap) + struct proc *p; + register struct readlink_args /* { + syscallarg(char *) path; + syscallarg(char *) buf; + syscallarg(int) count; + } */ *uap; +{ + register struct vnode *vp; + struct iovec aiov; + struct uio auio; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | NOOBJ, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VLNK) + error = EINVAL; + else { + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + error = VOP_READLINK(vp, &auio, p->p_ucred); + } + vput(vp); + p->p_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} + +static int +setfflags(p, vp, flags) + struct proc *p; + struct vnode *vp; + int flags; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_flags = flags; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Change flags of a file given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +chflags(p, uap) + struct proc *p; + register struct chflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfflags(p, nd.ni_vp, SCARG(uap, flags)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change flags of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + int flags; +}; +#endif +/* ARGSUSED */ +int +fchflags(p, uap) + struct proc *p; + register struct fchflags_args /* { + syscallarg(int) fd; + syscallarg(int) flags; + } */ *uap; +{ + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + return setfflags(p, (struct vnode *) fp->f_data, SCARG(uap, flags)); +} + +static int +setfmode(p, vp, mode) + struct proc *p; + struct vnode *vp; + int mode; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_mode = mode & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Change mode of a file given path name. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +chmod(p, uap) + struct proc *p; + register struct chmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfmode(p, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given path name (don't follow links.) + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +lchmod(p, uap) + struct proc *p; + register struct lchmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfmode(p, nd.ni_vp, SCARG(uap, mode)); + vrele(nd.ni_vp); + return error; +} + +/* + * Change mode of a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif +/* ARGSUSED */ +int +fchmod(p, uap) + struct proc *p; + register struct fchmod_args /* { + syscallarg(int) fd; + syscallarg(int) mode; + } */ *uap; +{ + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + return setfmode(p, (struct vnode *)fp->f_data, SCARG(uap, mode)); +} + +static int +setfown(p, vp, uid, gid) + struct proc *p; + struct vnode *vp; + uid_t uid; + gid_t gid; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_uid = uid; + vattr.va_gid = gid; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Set ownership given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +chown(p, uap) + struct proc *p; + register struct chown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfown(p, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + + return (error); +} + +/* + * Set ownership given a path name, do not cross symlinks. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lchown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +lchown(p, uap) + struct proc *p; + register struct lchown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setfown(p, nd.ni_vp, SCARG(uap, uid), SCARG(uap, gid)); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set ownership given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchown_args { + int fd; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +fchown(p, uap) + struct proc *p; + register struct fchown_args /* { + syscallarg(int) fd; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; +{ + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + return setfown(p, (struct vnode *)fp->f_data, + SCARG(uap, uid), SCARG(uap, gid)); +} + +static int +setutimes(p, vp, tv, nullflag) + struct proc *p; + struct vnode *vp; + struct timeval *tv; + int nullflag; +{ + int error; + struct vattr vattr; + + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_atime.tv_sec = tv[0].tv_sec; + vattr.va_atime.tv_nsec = tv[0].tv_usec * 1000; + vattr.va_mtime.tv_sec = tv[1].tv_sec; + vattr.va_mtime.tv_nsec = tv[1].tv_usec * 1000; + if (nullflag) + vattr.va_vaflags |= VA_UTIMES_NULL; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return error; +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct utimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +utimes(p, uap) + struct proc *p; + register struct utimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timeval tv[2]; + int error; + struct nameidata nd; + int nullflag; + + nullflag = 0; + if (SCARG(uap, tptr) == NULL) { + microtime(&tv[0]); + tv[1] = tv[0]; + nullflag = 1; + } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv, + sizeof (tv))) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = setutimes(p, nd.ni_vp, tv, nullflag); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct lutimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +lutimes(p, uap) + struct proc *p; + register struct lutimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timeval tv[2]; + int error; + struct nameidata nd; + int nullflag; + + nullflag = 0; + if (SCARG(uap, tptr) == NULL) { + microtime(&tv[0]); + tv[1] = tv[0]; + nullflag = 1; + } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv, + sizeof (tv))) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + + error = setutimes(p, nd.ni_vp, tv, nullflag); + vrele(nd.ni_vp); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct futimes_args { + int fd; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +futimes(p, uap) + struct proc *p; + register struct futimes_args /* { + syscallarg(int ) fd; + syscallarg(struct timeval *) tptr; + } */ *uap; +{ + struct timeval tv[2]; + struct file *fp; + int error; + int nullflag; + + nullflag = 0; + if (SCARG(uap, tptr) == NULL) { + microtime(&tv[0]); + tv[1] = tv[0]; + nullflag = 1; + } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv, + sizeof (tv))) + return (error); + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + return setutimes(p, (struct vnode *)fp->f_data, tv, nullflag); +} + +/* + * Truncate a file given its path name. 
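+ *
+ * Both shrinking and extending go through VOP_SETATTR() with va_size set;
+ * a negative length is rejected with EINVAL before any name lookup, so,
+ * for instance, truncate(path, -1) fails without touching the filesystem.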
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +truncate(p, uap) + struct proc *p; + register struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + if (uap->length < 0) + return(EINVAL); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + vput(vp); + return (error); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +ftruncate(p, uap) + struct proc *p; + register struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (uap->length < 0) + return(EINVAL); + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FWRITE) == 0) + return (EINVAL); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, fp->f_cred, p); + } + VOP_UNLOCK(vp, 0, p); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Truncate a file given its path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif +/* ARGSUSED */ +int +otruncate(p, uap) + struct proc *p; + register struct otruncate_args /* { + syscallarg(char *) path; + syscallarg(long) length; + } */ *uap; +{ + struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, length) = SCARG(uap, length); + return (truncate(p, &nuap)); +} + +/* + * Truncate a file given a file descriptor. + */ +#ifndef _SYS_SYSPROTO_H_ +struct oftruncate_args { + int fd; + long length; +}; +#endif +/* ARGSUSED */ +int +oftruncate(p, uap) + struct proc *p; + register struct oftruncate_args /* { + syscallarg(int) fd; + syscallarg(long) length; + } */ *uap; +{ + struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, length) = SCARG(uap, length); + return (ftruncate(p, &nuap)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Sync an open file. 
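+ *
+ * The descriptor's vnode has its dirty VM pages cleaned and is then
+ * synced with VOP_FSYNC(..., MNT_WAIT, ...), so a sequence sketched as
+ *
+ *	write(fd, buf, len);
+ *	fsync(fd);
+ *
+ * does not return from fsync() until the data has been pushed to the
+ * underlying store (with an extra soft-updates flush hook when
+ * MNT_SOFTDEP is set).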
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fsync(p, uap) + struct proc *p; + struct fsync_args /* { + syscallarg(int) fd; + } */ *uap; +{ + register struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_object) + vm_object_page_clean(vp->v_object, 0, 0, 0); + if ((error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p)) == 0 && + vp->v_mount && (vp->v_mount->mnt_flag & MNT_SOFTDEP) && + bioops.io_fsync) + error = (*bioops.io_fsync)(vp); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif +/* ARGSUSED */ +int +rename(p, uap) + struct proc *p; + register struct rename_args /* { + syscallarg(char *) from; + syscallarg(char *) to; + } */ *uap; +{ + register struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + + NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, + SCARG(uap, from), p); + if (error = namei(&fromnd)) + return (error); + fvp = fromnd.ni_vp; + NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART | NOOBJ, + UIO_USERSPACE, SCARG(uap, to), p); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + if (error = namei(&tond)) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same inode number with the same name in the same directory), + * then there is nothing to do. + */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + VOP_LEASE(tdvp, p, p->p_ucred, LEASE_WRITE); + if (fromnd.ni_dvp != tdvp) { + VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + } + if (tvp) { + VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE); + } + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + } else { + VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + ASSERT_VOP_UNLOCKED(fromnd.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(fromnd.ni_vp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_dvp, "rename"); + ASSERT_VOP_UNLOCKED(tond.ni_vp, "rename"); + zfree(namei_zone, tond.ni_cnd.cn_pnbuf); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + zfree(namei_zone, fromnd.ni_cnd.cn_pnbuf); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkdir(p, uap) + struct proc *p; + register struct mkdir_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + nd.ni_cnd.cn_flags |= WILLBEDIR; + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + vattr.va_mode = (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + vput(nd.ni_dvp); + if (!error) + vput(nd.ni_vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "mkdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "mkdir"); + return (error); +} + +/* + * Remove a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +rmdir(p, uap) + struct proc *p; + struct rmdir_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. + */ + if (vp->v_flag & VROOT) + error = EBUSY; +out: + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + } + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULLVP) + vput(vp); + ASSERT_VOP_UNLOCKED(nd.ni_dvp, "rmdir"); + ASSERT_VOP_UNLOCKED(nd.ni_vp, "rmdir"); + return (error); +} + +#ifdef COMPAT_43 +/* + * Read a block of directory entries in a file system independent format. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +ogetdirentries(p, uap) + struct proc *p; + register struct ogetdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio, kuio; + struct iovec aiov, kiov; + struct dirent *dp, *edp; + caddr_t dirbuf; + int error, eofflag, readcnt; + long loff; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; +# if (BYTE_ORDER != LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen <= 0) { + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = auio.uio_offset; + } else +# endif + { + kuio = auio; + kuio.uio_iov = &kiov; + kuio.uio_segflg = UIO_SYSSPACE; + kiov.iov_len = SCARG(uap, count); + MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); + kiov.iov_base = dirbuf; + error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = kuio.uio_offset; + if (error == 0) { + readcnt = SCARG(uap, count) - kuio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of + * dp->d_namlen is our dp->d_type. + * The high MBZ byte of dp->d_namlen + * is our dp->d_namlen. + */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +# else + /* + * The dp->d_type is the high byte + * of the expected dp->d_namlen, + * so must be zero'ed. + */ + dp->d_type = 0; +# endif + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, &auio); + } + FREE(dirbuf, M_TEMP); + } + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + if (union_dircheckp && SCARG(uap, count) == auio.uio_resid) { + error = union_dircheckp(p, &vp, fp); + if (error == -1) + goto unionread; + if (error) + return (error); + } + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + p->p_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Read a block of directory entries in a file system independent format. 
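+ *
+ * A typical consumer calls this in a loop, roughly as in the following
+ * sketch (buffer size and names are assumptions, not part of this file):
+ *
+ *	char buf[4096];
+ *	long base;
+ *	int n;
+ *
+ *	n = getdirentries(fd, buf, sizeof(buf), &base);
+ *
+ * Each successful call copies out whole struct dirent records, stores the
+ * offset of the block just read through basep, and returns the number of
+ * bytes transferred; a return of 0 means end of directory.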
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +getdirentries(p, uap) + struct proc *p; + register struct getdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; +{ + struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + long loff; + int error, eofflag; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + /* vn_lock(vp, LK_SHARED | LK_RETRY, p); */ + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); + fp->f_offset = auio.uio_offset; + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + if (union_dircheckp && SCARG(uap, count) == auio.uio_resid) { + error = union_dircheckp(p, &vp, fp); + if (error == -1) + goto unionread; + if (error) + return (error); + } + if (SCARG(uap, basep) != NULL) { + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + } + p->p_retval[0] = SCARG(uap, count) - auio.uio_resid; + return (error); +} +#ifndef _SYS_SYSPROTO_H_ +struct getdents_args { + int fd; + char *buf; + size_t count; +}; +#endif +int +getdents(p, uap) + struct proc *p; + register struct getdents_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + } */ *uap; +{ + struct getdirentries_args ap; + ap.fd = uap->fd; + ap.buf = uap->buf; + ap.count = uap->count; + ap.basep = NULL; + return getdirentries(p, &ap); +} + +/* + * Set the mode mask for creation of filesystem nodes. + */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif +int +umask(p, uap) + struct proc *p; + struct umask_args /* { + syscallarg(int) newmask; + } */ *uap; +{ + register struct filedesc *fdp; + + fdp = p->p_fd; + p->p_retval[0] = fdp->fd_cmask; + fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem + * away from vnode. + */ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +revoke(p, uap) + struct proc *p; + register struct revoke_args /* { + syscallarg(char *) path; + } */ *uap; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p)) + goto out; + if (p->p_ucred->cr_uid != vattr.va_uid && + (error = suser(p->p_ucred, &p->p_acflag))) + goto out; + if (vp->v_usecount > 1 || (vp->v_flag & VALIASED)) + VOP_REVOKE(vp, REVOKEALL); +out: + vrele(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry. 
+ */ +int +getvnode(fdp, fd, fpp) + struct filedesc *fdp; + int fd; + struct file **fpp; +{ + struct file *fp; + + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) + return (EINVAL); + *fpp = fp; + return (0); +} +#ifndef _SYS_SYSPROTO_H_ +struct __getcwd_args { + u_char *buf; + u_int buflen; +}; +#endif +#define STATNODE(mode, name, var) \ + SYSCTL_INT(_vfs_cache, OID_AUTO, name, mode, var, 0, ""); + +static int disablecwd; +SYSCTL_INT(_debug, OID_AUTO, disablecwd, CTLFLAG_RW, &disablecwd, 0, ""); + +static u_long numcwdcalls; STATNODE(CTLFLAG_RD, numcwdcalls, &numcwdcalls); +static u_long numcwdfail1; STATNODE(CTLFLAG_RD, numcwdfail1, &numcwdfail1); +static u_long numcwdfail2; STATNODE(CTLFLAG_RD, numcwdfail2, &numcwdfail2); +static u_long numcwdfail3; STATNODE(CTLFLAG_RD, numcwdfail3, &numcwdfail3); +static u_long numcwdfail4; STATNODE(CTLFLAG_RD, numcwdfail4, &numcwdfail4); +static u_long numcwdfound; STATNODE(CTLFLAG_RD, numcwdfound, &numcwdfound); +int +__getcwd(p, uap) + struct proc *p; + struct __getcwd_args *uap; +{ + char *bp, *buf; + int error, i, slash_prefixed; + struct filedesc *fdp; + struct namecache *ncp; + struct vnode *vp; + + numcwdcalls++; + if (disablecwd) + return (ENODEV); + if (uap->buflen < 2) + return (EINVAL); + if (uap->buflen > MAXPATHLEN) + uap->buflen = MAXPATHLEN; + buf = bp = malloc(uap->buflen, M_TEMP, M_WAITOK); + bp += uap->buflen - 1; + *bp = '\0'; + fdp = p->p_fd; + slash_prefixed = 0; + for (vp = fdp->fd_cdir; vp != fdp->fd_rdir && vp != rootvnode;) { + if (vp->v_flag & VROOT) { + vp = vp->v_mount->mnt_vnodecovered; + continue; + } + if (vp->v_dd->v_id != vp->v_ddid) { + numcwdfail1++; + free(buf, M_TEMP); + return (ENOTDIR); + } + ncp = TAILQ_FIRST(&vp->v_cache_dst); + if (!ncp) { + numcwdfail2++; + free(buf, M_TEMP); + return (ENOENT); + } + if (ncp->nc_dvp != vp->v_dd) { + numcwdfail3++; + free(buf, M_TEMP); + return (EBADF); + } + for (i = ncp->nc_nlen - 1; i >= 0; i--) { + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = ncp->nc_name[i]; + } + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + slash_prefixed = 1; + vp = vp->v_dd; + } + if (!slash_prefixed) { + if (bp == buf) { + numcwdfail4++; + free(buf, M_TEMP); + return (ENOMEM); + } + *--bp = '/'; + } + numcwdfound++; + error = copyout(bp, uap->buf, strlen(bp) + 1); + free(buf, M_TEMP); + return (error); +} diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c new file mode 100644 index 0000000..0b32a7d --- /dev/null +++ b/sys/kern/vfs_vnops.c @@ -0,0 +1,562 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 + * $Id: vfs_vnops.c,v 1.61 1999/01/05 18:49:56 eivind Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/proc.h> +#include <sys/mount.h> +#include <sys/namei.h> +#include <sys/vnode.h> +#include <sys/filio.h> +#include <sys/ttycom.h> + +static int vn_closefile __P((struct file *fp, struct proc *p)); +static int vn_ioctl __P((struct file *fp, u_long com, caddr_t data, + struct proc *p)); +static int vn_read __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int vn_poll __P((struct file *fp, int events, struct ucred *cred, + struct proc *p)); +static int vn_write __P((struct file *fp, struct uio *uio, + struct ucred *cred)); + +struct fileops vnops = + { vn_read, vn_write, vn_ioctl, vn_poll, vn_closefile }; + +/* + * Common code for vnode open operations. + * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. 
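+ *
+ * Illustrative call sequence (hypothetical locals; a sketch of how the
+ * open(2) path reaches this routine, not part of the original comment):
+ *
+ *	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, path, p);
+ *	error = vn_open(&nd, FREAD, 0);
+ *
+ * On success ndp->ni_vp is returned locked and referenced.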
+ */ +int +vn_open(ndp, fmode, cmode) + register struct nameidata *ndp; + int fmode, cmode; +{ + register struct vnode *vp; + register struct proc *p = ndp->ni_cnd.cn_proc; + register struct ucred *cred = p->p_ucred; + struct vattr vat; + struct vattr *vap = &vat; + int mode, error; + + if (fmode & O_CREAT) { + ndp->ni_cnd.cn_nameiop = CREATE; + ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; + if ((fmode & O_EXCL) == 0 && (fmode & O_NOFOLLOW) == 0) + ndp->ni_cnd.cn_flags |= FOLLOW; + error = namei(ndp); + if (error) + return (error); + if (ndp->ni_vp == NULL) { + VATTR_NULL(vap); + vap->va_type = VREG; + vap->va_mode = cmode; + if (fmode & O_EXCL) + vap->va_vaflags |= VA_EXCLUSIVE; + VOP_LEASE(ndp->ni_dvp, p, cred, LEASE_WRITE); + error = VOP_CREATE(ndp->ni_dvp, &ndp->ni_vp, + &ndp->ni_cnd, vap); + vput(ndp->ni_dvp); + if (error) + return (error); + ASSERT_VOP_UNLOCKED(ndp->ni_dvp, "create"); + ASSERT_VOP_LOCKED(ndp->ni_vp, "create"); + fmode &= ~O_TRUNC; + vp = ndp->ni_vp; + } else { + VOP_ABORTOP(ndp->ni_dvp, &ndp->ni_cnd); + if (ndp->ni_dvp == ndp->ni_vp) + vrele(ndp->ni_dvp); + else + vput(ndp->ni_dvp); + ndp->ni_dvp = NULL; + vp = ndp->ni_vp; + if (fmode & O_EXCL) { + error = EEXIST; + goto bad; + } + fmode &= ~O_CREAT; + } + } else { + ndp->ni_cnd.cn_nameiop = LOOKUP; + ndp->ni_cnd.cn_flags = + ((fmode & O_NOFOLLOW) ? NOFOLLOW : FOLLOW) | LOCKLEAF; + error = namei(ndp); + if (error) + return (error); + vp = ndp->ni_vp; + } + if (vp->v_type == VLNK) { + error = EMLINK; + goto bad; + } + if (vp->v_type == VSOCK) { + error = EOPNOTSUPP; + goto bad; + } + if ((fmode & O_CREAT) == 0) { + mode = 0; + if (fmode & (FWRITE | O_TRUNC)) { + if (vp->v_type == VDIR) { + error = EISDIR; + goto bad; + } + error = vn_writechk(vp); + if (error) + goto bad; + mode |= VWRITE; + } + if (fmode & FREAD) + mode |= VREAD; + if (mode) { + error = VOP_ACCESS(vp, mode, cred, p); + if (error) + goto bad; + } + } + if (fmode & O_TRUNC) { + VOP_UNLOCK(vp, 0, p); /* XXX */ + VOP_LEASE(vp, p, cred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */ + VATTR_NULL(vap); + vap->va_size = 0; + error = VOP_SETATTR(vp, vap, cred, p); + if (error) + goto bad; + } + error = VOP_OPEN(vp, fmode, cred, p); + if (error) + goto bad; + /* + * Make sure that a VM object is created for VMIO support. + */ + if (vp->v_type == VREG) { + if ((error = vfs_object_create(vp, p, cred)) != 0) + goto bad; + } + + if (fmode & FWRITE) + vp->v_writecount++; + return (0); +bad: + vput(vp); + return (error); +} + +/* + * Check for write permissions on the specified vnode. + * Prototype text segments cannot be written. + */ +int +vn_writechk(vp) + register struct vnode *vp; +{ + + /* + * If there's shared text associated with + * the vnode, try to free it up once. If + * we fail, we can't allow writing. + */ + if (vp->v_flag & VTEXT) + return (ETXTBSY); + return (0); +} + +/* + * Vnode close call + */ +int +vn_close(vp, flags, cred, p) + register struct vnode *vp; + int flags; + struct ucred *cred; + struct proc *p; +{ + int error; + + if (flags & FWRITE) + vp->v_writecount--; + error = VOP_CLOSE(vp, flags, cred, p); + vrele(vp); + return (error); +} + +/* + * Package up an I/O request on a vnode into a uio and do it. 
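+ *
+ * Illustrative usage (hypothetical locals, not part of the original
+ * comment):
+ *
+ *	error = vn_rdwr(UIO_READ, vp, (caddr_t)buf, len, (off_t)0,
+ *	    UIO_SYSSPACE, 0, cred, &resid, p);
+ *
+ * reads len bytes from the start of the vnode into kernel memory and
+ * leaves any untransferred byte count in resid.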
+ */ +int +vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) + enum uio_rw rw; + struct vnode *vp; + caddr_t base; + int len; + off_t offset; + enum uio_seg segflg; + int ioflg; + struct ucred *cred; + int *aresid; + struct proc *p; +{ + struct uio auio; + struct iovec aiov; + int error; + + if ((ioflg & IO_NODELOCKED) == 0) + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + aiov.iov_base = base; + aiov.iov_len = len; + auio.uio_resid = len; + auio.uio_offset = offset; + auio.uio_segflg = segflg; + auio.uio_rw = rw; + auio.uio_procp = p; + if (rw == UIO_READ) { + error = VOP_READ(vp, &auio, ioflg, cred); + } else { + error = VOP_WRITE(vp, &auio, ioflg, cred); + } + if (aresid) + *aresid = auio.uio_resid; + else + if (auio.uio_resid && error == 0) + error = EIO; + if ((ioflg & IO_NODELOCKED) == 0) + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * File table vnode read routine. + */ +static int +vn_read(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + struct vnode *vp = (struct vnode *)fp->f_data; + struct proc *p = uio->uio_procp; + int count, error; + int flag; + + VOP_LEASE(vp, p, cred, LEASE_READ); + vn_lock(vp, LK_SHARED | LK_NOPAUSE | LK_RETRY, p); + if (uio->uio_offset == -1) + uio->uio_offset = fp->f_offset; + count = uio->uio_resid; + flag = 0; + if (fp->f_flag & FNONBLOCK) + flag |= IO_NDELAY; + + /* + * Sequential read heuristic. + * If we have been doing sequential input, + * a rewind operation doesn't turn off + * sequential input mode. + */ + if (((fp->f_offset == 0) && (fp->f_seqcount > 0)) || + (fp->f_offset == fp->f_nextread)) { + int tmpseq = fp->f_seqcount; + /* + * XXX we assume that the filesystem block size is + * the default. Not true, but still gives us a pretty + * good indicator of how sequential the read operations + * are. + */ + tmpseq += ((count + BKVASIZE - 1) / BKVASIZE); + if (tmpseq >= 127) + tmpseq = 127; + fp->f_seqcount = tmpseq; + flag |= (fp->f_seqcount << 16); + } else { + if (fp->f_seqcount > 1) + fp->f_seqcount = 1; + else + fp->f_seqcount = 0; + } + + error = VOP_READ(vp, uio, flag, cred); + fp->f_offset += count - uio->uio_resid; + fp->f_nextread = fp->f_offset; + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * File table vnode write routine. + */ +static int +vn_write(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + struct vnode *vp = (struct vnode *)fp->f_data; + struct proc *p = uio->uio_procp; + int count, error, ioflag = IO_UNIT; + + if (uio->uio_offset == -1 && vp->v_type == VREG && (fp->f_flag & O_APPEND)) + ioflag |= IO_APPEND; + if (fp->f_flag & FNONBLOCK) + ioflag |= IO_NDELAY; + if ((fp->f_flag & O_FSYNC) || + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_SYNCHRONOUS))) + ioflag |= IO_SYNC; + VOP_LEASE(vp, p, cred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + uio->uio_offset = fp->f_offset; + count = uio->uio_resid; + error = VOP_WRITE(vp, uio, ioflag, cred); + if (ioflag & IO_APPEND) + fp->f_offset = uio->uio_offset; + else + fp->f_offset += count - uio->uio_resid; + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * File table vnode stat routine. 
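+ *
+ * (Worked example: a regular file whose vattr reports va_mode 0644 comes
+ * out of the type switch below with st_mode == S_IFREG | 0644.)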
+ */ +int +vn_stat(vp, sb, p) + struct vnode *vp; + register struct stat *sb; + struct proc *p; +{ + struct vattr vattr; + register struct vattr *vap; + int error; + u_short mode; + + vap = &vattr; + error = VOP_GETATTR(vp, vap, p->p_ucred, p); + if (error) + return (error); + /* + * Copy from vattr table + */ + sb->st_dev = vap->va_fsid; + sb->st_ino = vap->va_fileid; + mode = vap->va_mode; + switch (vap->va_type) { + case VREG: + mode |= S_IFREG; + break; + case VDIR: + mode |= S_IFDIR; + break; + case VBLK: + mode |= S_IFBLK; + break; + case VCHR: + mode |= S_IFCHR; + break; + case VLNK: + mode |= S_IFLNK; + /* This is a cosmetic change, symlinks do not have a mode. */ + if (vp->v_mount->mnt_flag & MNT_NOSYMFOLLOW) + sb->st_mode &= ~ACCESSPERMS; /* 0000 */ + else + sb->st_mode |= ACCESSPERMS; /* 0777 */ + break; + case VSOCK: + mode |= S_IFSOCK; + break; + case VFIFO: + mode |= S_IFIFO; + break; + default: + return (EBADF); + }; + sb->st_mode = mode; + sb->st_nlink = vap->va_nlink; + sb->st_uid = vap->va_uid; + sb->st_gid = vap->va_gid; + sb->st_rdev = vap->va_rdev; + sb->st_size = vap->va_size; + sb->st_atimespec = vap->va_atime; + sb->st_mtimespec = vap->va_mtime; + sb->st_ctimespec = vap->va_ctime; + sb->st_blksize = vap->va_blocksize; + sb->st_flags = vap->va_flags; + if (p->p_ucred->cr_uid != 0) + sb->st_gen = 0; + else + sb->st_gen = vap->va_gen; + +#if (S_BLKSIZE == 512) + /* Optimize this case */ + sb->st_blocks = vap->va_bytes >> 9; +#else + sb->st_blocks = vap->va_bytes / S_BLKSIZE; +#endif + return (0); +} + +/* + * File table vnode ioctl routine. + */ +static int +vn_ioctl(fp, com, data, p) + struct file *fp; + u_long com; + caddr_t data; + struct proc *p; +{ + register struct vnode *vp = ((struct vnode *)fp->f_data); + struct vattr vattr; + int error; + + switch (vp->v_type) { + + case VREG: + case VDIR: + if (com == FIONREAD) { + error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); + if (error) + return (error); + *(int *)data = vattr.va_size - fp->f_offset; + return (0); + } + if (com == FIONBIO || com == FIOASYNC) /* XXX */ + return (0); /* XXX */ + /* fall into ... */ + + default: +#if 0 + return (ENOTTY); +#endif + case VFIFO: + case VCHR: + case VBLK: + error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p); + if (error == 0 && com == TIOCSCTTY) { + + /* Do nothing if reassigning same control tty */ + if (p->p_session->s_ttyvp == vp) + return (0); + + /* Get rid of reference to old control tty */ + if (p->p_session->s_ttyvp) + vrele(p->p_session->s_ttyvp); + + p->p_session->s_ttyvp = vp; + VREF(vp); + } + return (error); + } +} + +/* + * File table vnode poll routine. + */ +static int +vn_poll(fp, events, cred, p) + struct file *fp; + int events; + struct ucred *cred; + struct proc *p; +{ + + return (VOP_POLL(((struct vnode *)fp->f_data), events, cred, p)); +} + +/* + * Check that the vnode is still valid, and if so + * acquire requested lock. 
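+ *
+ * Illustrative pairing, as used by vn_open() and vn_write() above:
+ *
+ *	vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ *	... vnode operation ...
+ *	VOP_UNLOCK(vp, 0, p);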
+ */ +int +#ifndef DEBUG_LOCKS +vn_lock(vp, flags, p) +#else +debug_vn_lock(vp, flags, p, filename, line) +#endif + struct vnode *vp; + int flags; + struct proc *p; +#ifdef DEBUG_LOCKS + const char *filename; + int line; +#endif +{ + int error; + + do { + if ((flags & LK_INTERLOCK) == 0) + simple_lock(&vp->v_interlock); + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vn_lock", 0); + error = ENOENT; + } else { +#ifdef DEBUG_LOCKS + vp->filename = filename; + vp->line = line; +#endif + error = VOP_LOCK(vp, + flags | LK_NOPAUSE | LK_INTERLOCK, p); + if (error == 0) + return (error); + } + flags &= ~LK_INTERLOCK; + } while (flags & LK_RETRY); + return (error); +} + +/* + * File table vnode close routine. + */ +static int +vn_closefile(fp, p) + struct file *fp; + struct proc *p; +{ + + return (vn_close(((struct vnode *)fp->f_data), fp->f_flag, + fp->f_cred, p)); +} diff --git a/sys/kern/vnode_if.pl b/sys/kern/vnode_if.pl new file mode 100644 index 0000000..8193edb --- /dev/null +++ b/sys/kern/vnode_if.pl @@ -0,0 +1,402 @@ +#!/bin/sh - +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 +# $Id: vnode_if.sh,v 1.15 1998/07/04 20:45:32 julian Exp $ +# + +# Script to produce VFS front-end sugar. +# +# usage: vnode_if.sh srcfile +# (where srcfile is currently /sys/kern/vnode_if.src) +# +# These awk scripts are not particularly well written, specifically they +# don't use arrays well and figure out the same information repeatedly. +# Please rewrite them if you actually understand how to use awk. Note, +# they use nawk extensions and gawk's toupper. + +if [ $# -ne 1 ] ; then + echo 'usage: vnode_if.sh srcfile' + exit 1 +fi + +# Name of the source file. 
+SRC=$1 + +# Names of the created files. +CFILE=vnode_if.c +HEADER=vnode_if.h + +# Awk program (must support nawk extensions and gawk's "toupper") +# Use "awk" at Berkeley, "gawk" elsewhere. +AWK=awk + +# Print out header information for vnode_if.h. +cat << END_OF_LEADING_COMMENT > $HEADER +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +extern struct vnodeop_desc vop_default_desc; +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.h. +$AWK ' + NF == 0 || $0 ~ "^#" { + next; + } + { + # Get the function name. + name = $1; + uname = toupper(name); + + # Get the function arguments. + for (c1 = 0;; ++c1) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + a[c1] = $0; + } + + # Print out the vop_F_args structure. + printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n", + name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%sa_%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("};\n"); + + # Print out extern declaration. + printf("extern struct vnodeop_desc %s_desc;\n", name); + + # Print out prototype. + printf("static int %s __P((\n", uname); + sep = ",\n"; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = "));\n"; + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep); + } + + # Print out inline struct. + printf("static __inline int %s(", uname); + sep = ", "; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ")\n"; + c3 = split(a[c2], t); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s", substr(t[c3], beg, end - beg), sep); + } + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%s%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("{\n\tstruct %s_args a;\n\n", name); + printf("\ta.a_desc = VDESC(%s);\n", name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("a.a_%s = %s\n", + substr(t[c3], beg, end - beg), substr(t[c3], beg)); + } + c1 = split(a[0], t); + beg = match(t[c1], "[^*]"); + end = match(t[c1], ";"); + printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n", + substr(t[c1], beg, end - beg), name); + }' < $SRC >> $HEADER + +# Print out header information for vnode_if.c. +cat << END_OF_LEADING_COMMENT > $CFILE +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +#include <sys/param.h> +#include <sys/vnode.h> + +struct vnodeop_desc vop_default_desc = { + 1, /* special case, vop_default => 1 */ + "default", + 0, + NULL, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; + +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.c. 
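+#
+# For illustration only (generated output, not script input): the vop_close
+# entry in vnode_if.src comes out of the script below roughly as
+#
+#	static int vop_close_vp_offsets[] = {
+#		VOPARG_OFFSETOF(struct vop_close_args,a_vp),
+#		VDESC_NO_OFFSET
+#	};
+#	struct vnodeop_desc vop_close_desc = {
+#		0,
+#		"vop_close",
+#		0,
+#		vop_close_vp_offsets,
+#		VDESC_NO_OFFSET,
+#		VOPARG_OFFSETOF(struct vop_close_args,a_cred),
+#		VOPARG_OFFSETOF(struct vop_close_args,a_p),
+#		VDESC_NO_OFFSET,
+#		NULL,
+#	};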
+$AWK 'function kill_surrounding_ws (s) { + sub (/^[ \t]*/, "", s); + sub (/[ \t]*$/, "", s); + return s; + } + + function read_args() { + numargs = 0; + while (getline ln) { + if (ln ~ /}/) { + break; + }; + + # Delete comments, if any. + gsub (/\/\*.*\*\//, "", ln); + + # Delete leading/trailing space. + ln = kill_surrounding_ws(ln); + + # Pick off direction. + if (1 == sub(/^INOUT[ \t]+/, "", ln)) + dir = "INOUT"; + else if (1 == sub(/^IN[ \t]+/, "", ln)) + dir = "IN"; + else if (1 == sub(/^OUT[ \t]+/, "", ln)) + dir = "OUT"; + else + bail("No IN/OUT direction for \"" ln "\"."); + + # check for "WILLRELE" + if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) { + rele = "WILLRELE"; + } else { + rele = "WONTRELE"; + }; + + # kill trailing ; + if (1 != sub (/;$/, "", ln)) { + bail("Missing end-of-line ; in \"" ln "\"."); + }; + + # pick off variable name + if (!(i = match(ln, /[A-Za-z0-9_]+$/))) { + bail("Missing var name \"a_foo\" in \"" ln "\"."); + }; + arg = substr (ln, i); + # Want to <<substr(ln, i) = "";>>, but nawk cannot. + # Hack around this. + ln = substr(ln, 1, i-1); + + # what is left must be type + # (put clean it up some) + type = ln; + gsub (/[ \t]+/, " ", type); # condense whitespace + type = kill_surrounding_ws(type); + + # (boy this was easier in Perl) + + numargs++; + dirs[numargs] = dir; + reles[numargs] = rele; + types[numargs] = type; + args[numargs] = arg; + }; + } + + function generate_operation_vp_offsets() { + printf ("static int %s_vp_offsets[] = {\n", name); + # as a side effect, figure out the releflags + releflags = ""; + vpnum = 0; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode *") { + printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n", + name, args[i]); + if (reles[i] == "WILLRELE") { + releflags = releflags "|VDESC_VP" vpnum "_WILLRELE"; + }; + vpnum++; + }; + }; + sub (/^\|/, "", releflags); + print "\tVDESC_NO_OFFSET"; + print "};"; + } + + function find_arg_with_type (type) { + for (i=1; i<=numargs; i++) { + if (types[i] == type) { + return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")"; + }; + }; + return "VDESC_NO_OFFSET"; + } + + function generate_operation_desc() { + printf ("struct vnodeop_desc %s_desc = {\n", name); + # offset + printf ("\t0,\n"); + # printable name + printf ("\t\"%s\",\n", name); + # flags + vppwillrele = ""; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode **" && + (reles[i] == "WILLRELE")) { + vppwillrele = "|VDESC_VPP_WILLRELE"; + }; + }; + if (releflags == "") { + printf ("\t0%s,\n", vppwillrele); + } else { + printf ("\t%s%s,\n", releflags, vppwillrele); + }; + # vp offsets + printf ("\t%s_vp_offsets,\n", name); + # vpp (if any) + printf ("\t%s,\n", find_arg_with_type("struct vnode **")); + # cred (if any) + printf ("\t%s,\n", find_arg_with_type("struct ucred *")); + # proc (if any) + printf ("\t%s,\n", find_arg_with_type("struct proc *")); + # componentname + printf ("\t%s,\n", find_arg_with_type("struct componentname *")); + # transport layer information + printf ("\tNULL,\n};\n"); + } + + NF == 0 || $0 ~ "^#" { + next; + } + { + # get the function name + name = $1; + + # get the function arguments + read_args(); + + # Print out the vop_F_vp_offsets structure. This all depends + # on naming conventions and nothing else. + generate_operation_vp_offsets(); + + # Print out the vnodeop_desc structure. + generate_operation_desc(); + + printf "\n"; + + }' < $SRC >> $CFILE +# THINGS THAT DON'T WORK RIGHT YET. +# +# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as +# arguments. 
This means that these operations can't function successfully +# through a bypass routine. +# +# Bwrite and strategy will be replaced when the VM page/buffer cache +# integration happens. +# +# To get around this problem for now we handle these ops as special cases. + +cat << END_OF_SPECIAL_CASES >> $HEADER +#include <sys/buf.h> + +struct vop_bwrite_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_bwrite_desc; +static int VOP_BWRITE __P(( + struct buf *bp)); +static __inline int VOP_BWRITE(bp) + struct buf *bp; +{ + struct vop_bwrite_args a; + + a.a_desc = VDESC(vop_bwrite); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a)); +} + +extern int vfs_opv_numops; +END_OF_SPECIAL_CASES + +cat << END_OF_SPECIAL_CASES >> $CFILE +static int vop_bwrite_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_bwrite_desc = { + 0, + "vop_bwrite", + 0, + vop_bwrite_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +END_OF_SPECIAL_CASES diff --git a/sys/kern/vnode_if.sh b/sys/kern/vnode_if.sh new file mode 100644 index 0000000..8193edb --- /dev/null +++ b/sys/kern/vnode_if.sh @@ -0,0 +1,402 @@ +#!/bin/sh - +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 +# $Id: vnode_if.sh,v 1.15 1998/07/04 20:45:32 julian Exp $ +# + +# Script to produce VFS front-end sugar. +# +# usage: vnode_if.sh srcfile +# (where srcfile is currently /sys/kern/vnode_if.src) +# +# These awk scripts are not particularly well written, specifically they +# don't use arrays well and figure out the same information repeatedly. +# Please rewrite them if you actually understand how to use awk. Note, +# they use nawk extensions and gawk's toupper. 
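+#
+# Illustrative invocation (an example, not an original comment):
+#
+#	sh vnode_if.sh /sys/kern/vnode_if.src
+#
+# (re)creates vnode_if.c and vnode_if.h in the current directory.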
+ +if [ $# -ne 1 ] ; then + echo 'usage: vnode_if.sh srcfile' + exit 1 +fi + +# Name of the source file. +SRC=$1 + +# Names of the created files. +CFILE=vnode_if.c +HEADER=vnode_if.h + +# Awk program (must support nawk extensions and gawk's "toupper") +# Use "awk" at Berkeley, "gawk" elsewhere. +AWK=awk + +# Print out header information for vnode_if.h. +cat << END_OF_LEADING_COMMENT > $HEADER +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +extern struct vnodeop_desc vop_default_desc; +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.h. +$AWK ' + NF == 0 || $0 ~ "^#" { + next; + } + { + # Get the function name. + name = $1; + uname = toupper(name); + + # Get the function arguments. + for (c1 = 0;; ++c1) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + a[c1] = $0; + } + + # Print out the vop_F_args structure. + printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n", + name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%sa_%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("};\n"); + + # Print out extern declaration. + printf("extern struct vnodeop_desc %s_desc;\n", name); + + # Print out prototype. + printf("static int %s __P((\n", uname); + sep = ",\n"; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = "));\n"; + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep); + } + + # Print out inline struct. + printf("static __inline int %s(", uname); + sep = ", "; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ")\n"; + c3 = split(a[c2], t); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s", substr(t[c3], beg, end - beg), sep); + } + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%s%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("{\n\tstruct %s_args a;\n\n", name); + printf("\ta.a_desc = VDESC(%s);\n", name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("a.a_%s = %s\n", + substr(t[c3], beg, end - beg), substr(t[c3], beg)); + } + c1 = split(a[0], t); + beg = match(t[c1], "[^*]"); + end = match(t[c1], ";"); + printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n", + substr(t[c1], beg, end - beg), name); + }' < $SRC >> $HEADER + +# Print out header information for vnode_if.c. +cat << END_OF_LEADING_COMMENT > $CFILE +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +#include <sys/param.h> +#include <sys/vnode.h> + +struct vnodeop_desc vop_default_desc = { + 1, /* special case, vop_default => 1 */ + "default", + 0, + NULL, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; + +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.c. 
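+#
+# For illustration (not original commentary): an input entry such as
+#
+#	vop_close {
+#		IN struct vnode *vp;
+#		IN int fflag;
+#		IN struct ucred *cred;
+#		IN struct proc *p;
+#	};
+#
+# has already produced struct vop_close_args and the VOP_CLOSE() inline in
+# vnode_if.h by this point; the script below adds the matching
+# vop_close_vp_offsets[] and vop_close_desc to vnode_if.c.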
+$AWK 'function kill_surrounding_ws (s) { + sub (/^[ \t]*/, "", s); + sub (/[ \t]*$/, "", s); + return s; + } + + function read_args() { + numargs = 0; + while (getline ln) { + if (ln ~ /}/) { + break; + }; + + # Delete comments, if any. + gsub (/\/\*.*\*\//, "", ln); + + # Delete leading/trailing space. + ln = kill_surrounding_ws(ln); + + # Pick off direction. + if (1 == sub(/^INOUT[ \t]+/, "", ln)) + dir = "INOUT"; + else if (1 == sub(/^IN[ \t]+/, "", ln)) + dir = "IN"; + else if (1 == sub(/^OUT[ \t]+/, "", ln)) + dir = "OUT"; + else + bail("No IN/OUT direction for \"" ln "\"."); + + # check for "WILLRELE" + if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) { + rele = "WILLRELE"; + } else { + rele = "WONTRELE"; + }; + + # kill trailing ; + if (1 != sub (/;$/, "", ln)) { + bail("Missing end-of-line ; in \"" ln "\"."); + }; + + # pick off variable name + if (!(i = match(ln, /[A-Za-z0-9_]+$/))) { + bail("Missing var name \"a_foo\" in \"" ln "\"."); + }; + arg = substr (ln, i); + # Want to <<substr(ln, i) = "";>>, but nawk cannot. + # Hack around this. + ln = substr(ln, 1, i-1); + + # what is left must be type + # (put clean it up some) + type = ln; + gsub (/[ \t]+/, " ", type); # condense whitespace + type = kill_surrounding_ws(type); + + # (boy this was easier in Perl) + + numargs++; + dirs[numargs] = dir; + reles[numargs] = rele; + types[numargs] = type; + args[numargs] = arg; + }; + } + + function generate_operation_vp_offsets() { + printf ("static int %s_vp_offsets[] = {\n", name); + # as a side effect, figure out the releflags + releflags = ""; + vpnum = 0; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode *") { + printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n", + name, args[i]); + if (reles[i] == "WILLRELE") { + releflags = releflags "|VDESC_VP" vpnum "_WILLRELE"; + }; + vpnum++; + }; + }; + sub (/^\|/, "", releflags); + print "\tVDESC_NO_OFFSET"; + print "};"; + } + + function find_arg_with_type (type) { + for (i=1; i<=numargs; i++) { + if (types[i] == type) { + return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")"; + }; + }; + return "VDESC_NO_OFFSET"; + } + + function generate_operation_desc() { + printf ("struct vnodeop_desc %s_desc = {\n", name); + # offset + printf ("\t0,\n"); + # printable name + printf ("\t\"%s\",\n", name); + # flags + vppwillrele = ""; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode **" && + (reles[i] == "WILLRELE")) { + vppwillrele = "|VDESC_VPP_WILLRELE"; + }; + }; + if (releflags == "") { + printf ("\t0%s,\n", vppwillrele); + } else { + printf ("\t%s%s,\n", releflags, vppwillrele); + }; + # vp offsets + printf ("\t%s_vp_offsets,\n", name); + # vpp (if any) + printf ("\t%s,\n", find_arg_with_type("struct vnode **")); + # cred (if any) + printf ("\t%s,\n", find_arg_with_type("struct ucred *")); + # proc (if any) + printf ("\t%s,\n", find_arg_with_type("struct proc *")); + # componentname + printf ("\t%s,\n", find_arg_with_type("struct componentname *")); + # transport layer information + printf ("\tNULL,\n};\n"); + } + + NF == 0 || $0 ~ "^#" { + next; + } + { + # get the function name + name = $1; + + # get the function arguments + read_args(); + + # Print out the vop_F_vp_offsets structure. This all depends + # on naming conventions and nothing else. + generate_operation_vp_offsets(); + + # Print out the vnodeop_desc structure. + generate_operation_desc(); + + printf "\n"; + + }' < $SRC >> $CFILE +# THINGS THAT DON'T WORK RIGHT YET. +# +# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as +# arguments. 
This means that these operations can't function successfully +# through a bypass routine. +# +# Bwrite and strategy will be replaced when the VM page/buffer cache +# integration happens. +# +# To get around this problem for now we handle these ops as special cases. + +cat << END_OF_SPECIAL_CASES >> $HEADER +#include <sys/buf.h> + +struct vop_bwrite_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_bwrite_desc; +static int VOP_BWRITE __P(( + struct buf *bp)); +static __inline int VOP_BWRITE(bp) + struct buf *bp; +{ + struct vop_bwrite_args a; + + a.a_desc = VDESC(vop_bwrite); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a)); +} + +extern int vfs_opv_numops; +END_OF_SPECIAL_CASES + +cat << END_OF_SPECIAL_CASES >> $CFILE +static int vop_bwrite_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_bwrite_desc = { + 0, + "vop_bwrite", + 0, + vop_bwrite_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +END_OF_SPECIAL_CASES diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src new file mode 100644 index 0000000..48c9fef --- /dev/null +++ b/sys/kern/vnode_if.src @@ -0,0 +1,488 @@ +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)vnode_if.src 8.12 (Berkeley) 5/14/95 +# $Id: vnode_if.src,v 1.18 1998/07/04 20:45:32 julian Exp $ +# + +# +# Above each of the vop descriptors is a specification of the locking +# protocol used by each vop call. The first column is the name of +# the variable, the remaining three columns are in, out and error +# respectively. The "in" column defines the lock state on input, +# the "out" column defines the state on succesful return, and the +# "error" column defines the locking state on error exit. 
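+#
+# For example (reading the annotations below): "#% setattr vp L L L" means
+# the vnode must be locked on entry and stays locked on both successful and
+# error return, while "#% close vp U U U" requires it unlocked throughout.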
+# +# The locking value can take the following values: +# L: locked. +# U: unlocked/ +# -: not applicable. vnode does not yet (or no longer) exists. +# =: the same on input and output, may be either L or U. +# X: locked if not nil. +# + +# +#% lookup dvp L ? ? +#% lookup vpp - L - +# +# XXX - the lookup locking protocol defies simple description and depends +# on the flags and operation fields in the (cnp) structure. Note +# especially that *vpp may equal dvp and both may be locked. +# +vop_lookup { + IN struct vnode *dvp; + INOUT struct vnode **vpp; + IN struct componentname *cnp; +}; + +# +#% cachedlookup dvp L ? ? +#% cachedlookup vpp - L - +# +# This must be an exact copy of lookup. See kern/vfs_cache.c for details. +# +vop_cachedlookup { + IN struct vnode *dvp; + INOUT struct vnode **vpp; + IN struct componentname *cnp; +}; + +# +#% create dvp L L L +#% create vpp - L - +# +vop_create { + IN struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% whiteout dvp L L L +#% whiteout cnp - - - +#% whiteout flag - - - +# +vop_whiteout { + IN struct vnode *dvp; + IN struct componentname *cnp; + IN int flags; +}; + +# +#% mknod dvp L L L +#% mknod vpp - X - +# +vop_mknod { + IN struct vnode *dvp; + OUT WILLRELE struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% open vp L L L +# +vop_open { + IN struct vnode *vp; + IN int mode; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% close vp U U U +# +vop_close { + IN struct vnode *vp; + IN int fflag; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% access vp L L L +# +vop_access { + IN struct vnode *vp; + IN int mode; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% getattr vp = = = +# +vop_getattr { + IN struct vnode *vp; + IN struct vattr *vap; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% setattr vp L L L +# +vop_setattr { + IN struct vnode *vp; + IN struct vattr *vap; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% read vp L L L +# +vop_read { + IN struct vnode *vp; + INOUT struct uio *uio; + IN int ioflag; + IN struct ucred *cred; +}; + +# +#% write vp L L L +# +vop_write { + IN struct vnode *vp; + INOUT struct uio *uio; + IN int ioflag; + IN struct ucred *cred; +}; + +# +#% lease vp = = = +# +vop_lease { + IN struct vnode *vp; + IN struct proc *p; + IN struct ucred *cred; + IN int flag; +}; + +# +#% ioctl vp U U U +# +vop_ioctl { + IN struct vnode *vp; + IN u_long command; + IN caddr_t data; + IN int fflag; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% poll vp U U U +# +vop_poll { + IN struct vnode *vp; + IN int events; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% revoke vp U U U +# +vop_revoke { + IN struct vnode *vp; + IN int flags; +}; + +# +# XXX - not used +# +vop_mmap { + IN struct vnode *vp; + IN int fflags; + IN struct ucred *cred; + IN struct proc *p; +}; + +# +#% fsync vp L L L +# +vop_fsync { + IN struct vnode *vp; + IN struct ucred *cred; + IN int waitfor; + IN struct proc *p; +}; + +# +#% remove dvp L L L +#% remove vp L L L +# +vop_remove { + IN struct vnode *dvp; + IN struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% link tdvp L L L +#% link vp U U U +# +vop_link { + IN struct vnode *tdvp; + IN struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% rename fdvp U U U +#% rename fvp U U U +#% rename tdvp L U U +#% rename tvp X U U +# +vop_rename { + IN WILLRELE struct vnode *fdvp; + IN WILLRELE struct vnode *fvp; + IN struct 
componentname *fcnp; + IN WILLRELE struct vnode *tdvp; + IN WILLRELE struct vnode *tvp; + IN struct componentname *tcnp; +}; + +# +#% mkdir dvp L L L +#% mkdir vpp - L - +# +vop_mkdir { + IN struct vnode *dvp; + OUT struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; +}; + +# +#% rmdir dvp L L L +#% rmdir vp L L L +# +vop_rmdir { + IN struct vnode *dvp; + IN struct vnode *vp; + IN struct componentname *cnp; +}; + +# +#% symlink dvp L L L +#% symlink vpp - U - +# +# XXX - note that the return vnode has already been VRELE'ed +# by the filesystem layer. To use it you must use vget, +# possibly with a further namei. +# +vop_symlink { + IN struct vnode *dvp; + OUT WILLRELE struct vnode **vpp; + IN struct componentname *cnp; + IN struct vattr *vap; + IN char *target; +}; + +# +#% readdir vp L L L +# +vop_readdir { + IN struct vnode *vp; + INOUT struct uio *uio; + IN struct ucred *cred; + INOUT int *eofflag; + OUT int *ncookies; + INOUT u_long **cookies; +}; + +# +#% readlink vp L L L +# +vop_readlink { + IN struct vnode *vp; + INOUT struct uio *uio; + IN struct ucred *cred; +}; + +# +#% abortop dvp = = = +# +vop_abortop { + IN struct vnode *dvp; + IN struct componentname *cnp; +}; + +# +#% inactive vp L U U +# +vop_inactive { + IN struct vnode *vp; + IN struct proc *p; +}; + +# +#% reclaim vp U U U +# +vop_reclaim { + IN struct vnode *vp; + IN struct proc *p; +}; + +# +#% lock vp U L U +# +vop_lock { + IN struct vnode *vp; + IN int flags; + IN struct proc *p; +}; + +# +#% unlock vp L U L +# +vop_unlock { + IN struct vnode *vp; + IN int flags; + IN struct proc *p; +}; + +# +#% bmap vp L L L +#% bmap vpp - U - +# +vop_bmap { + IN struct vnode *vp; + IN daddr_t bn; + OUT struct vnode **vpp; + IN daddr_t *bnp; + OUT int *runp; + OUT int *runb; +}; + +# +# Needs work: no vp? +# +vop_strategy { + IN struct vnode *vp; + IN struct buf *bp; +}; + +# +#% print vp = = = +# +vop_print { + IN struct vnode *vp; +}; + +# +#% islocked vp = = = +# +vop_islocked { + IN struct vnode *vp; +}; + +# +#% pathconf vp L L L +# +vop_pathconf { + IN struct vnode *vp; + IN int name; + OUT register_t *retval; +}; + +# +#% advlock vp U U U +# +vop_advlock { + IN struct vnode *vp; + IN caddr_t id; + IN int op; + IN struct flock *fl; + IN int flags; +}; + +# +#% balloc vp L L L +# +vop_balloc { + IN struct vnode *vp; + IN off_t startoffset; + IN int size; + IN struct ucred *cred; + IN int flags; + OUT struct buf **bpp; +}; + +# +#% reallocblks vp L L L +# +vop_reallocblks { + IN struct vnode *vp; + IN struct cluster_save *buflist; +}; + +vop_getpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int reqpage; + IN vm_ooffset_t offset; +}; + +vop_putpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int sync; + IN int *rtvals; + IN vm_ooffset_t offset; +}; + +# +#% freeblks vp - - - +# +# This call is used by the filesystem to release blocks back to +# device-driver. This is useful if the driver has a lengthy +# erase handling or similar. +# + +vop_freeblks { + IN struct vnode *vp; + IN daddr_t addr; + IN daddr_t length; +}; + +# +# Needs work: no vp? +# +#vop_bwrite { +# IN struct buf *bp; +#}; |