diff options
Diffstat (limited to 'sys/kern')
97 files changed, 40170 insertions, 7231 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc index 79cb83a..a09e484 100644 --- a/sys/kern/Make.tags.inc +++ b/sys/kern/Make.tags.inc @@ -1,4 +1,5 @@ -# @(#)Make.tags.inc 8.2 (Berkeley) 11/23/94 +# @(#)Make.tags.inc 8.1 (Berkeley) 6/11/93 +# $Id$ # Common files for "make tags". # Included by the Makefile for each architecture. @@ -9,7 +10,6 @@ COMM= /sys/conf/*.[ch] \ /sys/dev/*.[ch] /sys/dev/scsi/*.[ch] \ - /sys/isofs/*/*.[ch] \ /sys/kern/*.[ch] /sys/libkern/*.[ch] \ /sys/miscfs/*/*.[ch] \ /sys/net/*.[ch] /sys/netccitt/*.[ch] /sys/netinet/*.[ch] \ diff --git a/sys/kern/Makefile b/sys/kern/Makefile index 3159d20..f42a44e 100644 --- a/sys/kern/Makefile +++ b/sys/kern/Makefile @@ -1,17 +1,20 @@ -# @(#)Makefile 8.3 (Berkeley) 2/14/95 +# @(#)Makefile 8.2 (Berkeley) 3/21/94 # Makefile for kernel tags files, init_sysent, etc. -ARCH= hp300 i386 luna68k news3400 pmax sparc tahoe vax +ARCH= i386 # luna68k news3400 pmax sparc tahoe vax all: @echo "make tags, make links or init_sysent.c only" -init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscallargs.h: makesyscalls.sh syscalls.master +init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscall-hide.h \ +../sys/sysproto.h: makesyscalls.sh syscalls.master -mv -f init_sysent.c init_sysent.c.bak -mv -f syscalls.c syscalls.c.bak -mv -f ../sys/syscall.h ../sys/syscall.h.bak - sh makesyscalls.sh syscalls.conf syscalls.master + -mv -f ../sys/syscall-hide.h ../sys/syscall-hide.h.bak + -mv -f ../sys/sysproto.h ../sys/sysproto.h.bak + sh makesyscalls.sh syscalls.master # Kernel tags: # Tags files are built in the top-level directory for each architecture, diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c new file mode 100644 index 0000000..4adbd05 --- /dev/null +++ b/sys/kern/imgact_aout.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id$ + */ + +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/resourcevar.h> +#include <sys/exec.h> +#include <sys/mman.h> +#include <sys/imgact.h> +#include <sys/imgact_aout.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/sysent.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> + +static int exec_aout_imgact __P((struct image_params *imgp)); + +static int +exec_aout_imgact(imgp) + struct image_params *imgp; +{ + const struct exec *a_out = (const struct exec *) imgp->image_header; + struct vmspace *vmspace = imgp->proc->p_vmspace; + vm_offset_t vmaddr; + unsigned long virtual_offset; + unsigned long file_offset; + unsigned long bss_size; + int error; + + /* + * Linux and *BSD binaries look very much alike, + * only the machine id is different: + * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI. + * NetBSD is in network byte order.. ugh. + */ + if (((a_out->a_magic >> 16) & 0xff) != 0x86 && + ((a_out->a_magic >> 16) & 0xff) != 0 && + ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86) + return -1; + + /* + * Set file/virtual offset based on a.out variant. + * We do two cases: host byte order and network byte order + * (for NetBSD compatibility) + */ + switch ((int)(a_out->a_magic & 0xffff)) { + case ZMAGIC: + virtual_offset = 0; + if (a_out->a_text) { + file_offset = PAGE_SIZE; + } else { + /* Bill's "screwball mode" */ + file_offset = 0; + } + break; + case QMAGIC: + virtual_offset = PAGE_SIZE; + file_offset = 0; + break; + default: + /* NetBSD compatibility */ + switch ((int)(ntohl(a_out->a_magic) & 0xffff)) { + case ZMAGIC: + case QMAGIC: + virtual_offset = PAGE_SIZE; + file_offset = 0; + break; + default: + return (-1); + } + } + + bss_size = roundup(a_out->a_bss, PAGE_SIZE); + + /* + * Check various fields in header for validity/bounds. 
+ */ + if (/* entry point must lay with text region */ + a_out->a_entry < virtual_offset || + a_out->a_entry >= virtual_offset + a_out->a_text || + + /* text and data size must each be page rounded */ + a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) + return (-1); + + /* text + data can't exceed file size */ + if (a_out->a_data + a_out->a_text > imgp->attr->va_size) + return (EFAULT); + + /* + * text/data/bss must not exceed limits + */ + if (/* text can't exceed maximum text size */ + a_out->a_text > MAXTSIZ || + + /* data + bss can't exceed maximum data size */ + a_out->a_data + bss_size > MAXDSIZ || + + /* data + bss can't exceed rlimit */ + a_out->a_data + bss_size > + imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur) + return (ENOMEM); + + /* copy in arguments and/or environment from old process */ + error = exec_extract_strings(imgp); + if (error) + return (error); + + /* + * Destroy old process VM and create a new one (with a new stack) + */ + exec_new_vmspace(imgp); + + /* + * Map text/data read/execute + */ + vmaddr = virtual_offset; + error = + vm_mmap(&vmspace->vm_map, /* map */ + &vmaddr, /* address */ + a_out->a_text + a_out->a_data, /* size */ + VM_PROT_READ | VM_PROT_EXECUTE, /* protection */ + VM_PROT_ALL, /* max protection */ + MAP_PRIVATE | MAP_FIXED, /* flags */ + (caddr_t)imgp->vp, /* vnode */ + file_offset); /* offset */ + if (error) + return (error); + + /* + * allow writing of data + */ + vm_map_protect(&vmspace->vm_map, + vmaddr + a_out->a_text, + vmaddr + a_out->a_text + a_out->a_data, + VM_PROT_ALL, + FALSE); + + if (bss_size != 0) { + /* + * Allocate demand-zeroed area for uninitialized data + * "bss" = 'block started by symbol' - named after the IBM 7090 + * instruction of the same name. 
+ */ + vmaddr = virtual_offset + a_out->a_text + a_out->a_data; + error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) + return (error); + } + + /* Fill in process VM information */ + vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; + vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t) virtual_offset; + vmspace->vm_daddr = (caddr_t) virtual_offset + a_out->a_text; + + /* Fill in image_params */ + imgp->interpreted = 0; + imgp->entry_addr = a_out->a_entry; + + imgp->proc->p_sysent = &aout_sysvec; + + /* Indicate that this file should not be modified */ + imgp->vp->v_flag |= VTEXT; + + return (0); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ +static const struct execsw aout_execsw = { exec_aout_imgact, "a.out" }; +TEXT_SET(execsw_set, aout_execsw); diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c new file mode 100644 index 0000000..525d76d --- /dev/null +++ b/sys/kern/imgact_elf.c @@ -0,0 +1,749 @@ +/*- + * Copyright (c) 1995-1996 Søren Schmidt + * Copyright (c) 1996 Peter Wemm + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the author may not be used to endorse or promote products + * derived from this software withough specific prior written permission + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $Id: imgact_elf.c,v 1.16 1997/02/22 09:38:56 peter Exp $ + */ + +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/resourcevar.h> +#include <sys/exec.h> +#include <sys/mman.h> +#include <sys/imgact.h> +#include <sys/imgact_elf.h> +#include <sys/kernel.h> +#include <sys/sysent.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/sysproto.h> +#include <sys/syscall.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <sys/lock.h> +#include <vm/vm_map.h> +#include <vm/vm_prot.h> +#include <vm/vm_extern.h> + +#include <machine/md_var.h> +#include <i386/linux/linux_syscall.h> +#include <i386/linux/linux.h> + +#define MAX_PHDR 32 /* XXX enough ? 
*/ + +static int map_pages __P((struct vnode *vp, vm_offset_t offset, vm_offset_t *buf, vm_size_t size)); +static void unmap_pages __P((vm_offset_t buf, vm_size_t size)); +static int elf_check_permissions __P((struct proc *p, struct vnode *vp)); +static int elf_check_header __P((const Elf32_Ehdr *hdr, int type)); +static int elf_load_section __P((struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)); +static int elf_load_file __P((struct proc *p, char *file, u_long *addr, u_long *entry)); +static int elf_freebsd_fixup __P((int **stack_base, struct image_params *imgp)); +int exec_elf_imgact __P((struct image_params *imgp)); + +int elf_trace = 0; +SYSCTL_INT(_debug, 1, elf_trace, CTLFLAG_RW, &elf_trace, 0, ""); +#define UPRINTF if (elf_trace) uprintf + +static struct sysentvec elf_freebsd_sysvec = { + SYS_MAXSYSCALL, + sysent, + 0, + 0, + 0, + 0, + 0, + elf_freebsd_fixup, + sendsig, + sigcode, + &szsigcode, + 0, + "FreeBSD ELF" +}; + +static Elf32_Brandinfo freebsd_brand_info = { + "FreeBSD", + "", + "/usr/libexec/ld-elf.so.1", + &elf_freebsd_sysvec + }; +static Elf32_Brandinfo *elf_brand_list[MAX_BRANDS] = { + &freebsd_brand_info, + NULL, NULL, NULL, + NULL, NULL, NULL, NULL + }; + +int +elf_insert_brand_entry(Elf32_Brandinfo *entry) +{ + int i; + + for (i=1; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] == NULL) { + elf_brand_list[i] = entry; + break; + } + } + if (i == MAX_BRANDS) + return -1; + return 0; +} + +int +elf_remove_brand_entry(Elf32_Brandinfo *entry) +{ + int i; + + for (i=1; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] == entry) { + elf_brand_list[i] = NULL; + break; + } + } + if (i == MAX_BRANDS) + return -1; + return 0; +} + +static int +map_pages(struct vnode *vp, vm_offset_t offset, + vm_offset_t *buf, vm_size_t size) +{ + int error; + vm_offset_t kern_buf; + vm_size_t pageoff; + + /* + * The request may not be aligned, and may even cross several + * page boundaries in the 
file... + */ + pageoff = (offset & PAGE_MASK); + offset -= pageoff; /* start of first aligned page to map */ + size += pageoff; + size = round_page(size); /* size of aligned pages to map */ + + if (error = vm_mmap(kernel_map, + &kern_buf, + size, + VM_PROT_READ, + VM_PROT_READ, + 0, + (caddr_t)vp, + offset)) + return error; + + *buf = kern_buf + pageoff; + + return 0; +} + +static void +unmap_pages(vm_offset_t buf, vm_size_t size) +{ + vm_size_t pageoff; + + pageoff = (buf & PAGE_MASK); + buf -= pageoff; /* start of first aligned page to map */ + size += pageoff; + size = round_page(size);/* size of aligned pages to map */ + + vm_map_remove(kernel_map, buf, buf + size); +} + +static int +elf_check_permissions(struct proc *p, struct vnode *vp) +{ + struct vattr attr; + int error; + + /* + * Check number of open-for-writes on the file and deny execution + * if there are any. + */ + if (vp->v_writecount) { + return (ETXTBSY); + } + + /* Get file attributes */ + error = VOP_GETATTR(vp, &attr, p->p_ucred, p); + if (error) + return (error); + + /* + * 1) Check if file execution is disabled for the filesystem that this + * file resides on. + * 2) Insure that at least one execute bit is on - otherwise root + * will always succeed, and we don't want to happen unless the + * file really is executable. + * 3) Insure that the file is a regular file. + */ + if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || + ((attr.va_mode & 0111) == 0) || + (attr.va_type != VREG)) { + return (EACCES); + } + + /* + * Zero length files can't be exec'd + */ + if (attr.va_size == 0) + return (ENOEXEC); + + /* + * Check for execute permission to file based on current credentials. + * Then call filesystem specific open routine (which does nothing + * in the general case). 
+ */ + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + if (error) + return (error); + + error = VOP_OPEN(vp, FREAD, p->p_ucred, p); + if (error) + return (error); + + return (0); +} + +static int +elf_check_header(const Elf32_Ehdr *hdr, int type) +{ + if (!(hdr->e_ident[EI_MAG0] == ELFMAG0 && + hdr->e_ident[EI_MAG1] == ELFMAG1 && + hdr->e_ident[EI_MAG2] == ELFMAG2 && + hdr->e_ident[EI_MAG3] == ELFMAG3)) + return ENOEXEC; + + if (hdr->e_machine != EM_386 && hdr->e_machine != EM_486) + return ENOEXEC; + + if (hdr->e_type != type) + return ENOEXEC; + + return 0; +} + +static int +elf_load_section(struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot) +{ + size_t map_len; + vm_offset_t map_addr; + int error; + unsigned char *data_buf = 0; + size_t copy_len; + + map_addr = trunc_page(vmaddr); + + if (memsz > filsz) + map_len = trunc_page(offset+filsz) - trunc_page(offset); + else + map_len = round_page(offset+filsz) - trunc_page(offset); + + if (error = vm_mmap (&vmspace->vm_map, + &map_addr, + map_len, + prot, + VM_PROT_ALL, + MAP_PRIVATE | MAP_FIXED, + (caddr_t)vp, + trunc_page(offset))) + return error; + + if (memsz == filsz) + return 0; + + /* + * We have to map the remaining bit of the file into the kernel's + * memory map, allocate some anonymous memory, and copy that last + * bit into it. The remaining space should be .bss... 
+ */ + copy_len = (offset + filsz) - trunc_page(offset + filsz); + map_addr = trunc_page(vmaddr + filsz); + map_len = round_page(vmaddr + memsz) - map_addr; + + if (map_len != 0) { + if (error = vm_map_find(&vmspace->vm_map, NULL, 0, + &map_addr, map_len, FALSE, + VM_PROT_ALL, VM_PROT_ALL,0)) + return error; + } + + if (error = vm_mmap(kernel_map, + (vm_offset_t *)&data_buf, + PAGE_SIZE, + VM_PROT_READ, + VM_PROT_READ, + 0, + (caddr_t)vp, + trunc_page(offset + filsz))) + return error; + + error = copyout(data_buf, (caddr_t)map_addr, copy_len); + + vm_map_remove(kernel_map, (vm_offset_t)data_buf, + (vm_offset_t)data_buf + PAGE_SIZE); + + /* + * set it to the specified protection + */ + vm_map_protect(&vmspace->vm_map, map_addr, map_addr + map_len, prot, + FALSE); + + UPRINTF("bss size %d (%x)\n", map_len-copy_len, map_len-copy_len); + return error; +} + +static int +elf_load_file(struct proc *p, char *file, u_long *addr, u_long *entry) +{ + Elf32_Ehdr *hdr = NULL; + Elf32_Phdr *phdr = NULL; + struct nameidata nd; + struct vmspace *vmspace = p->p_vmspace; + vm_prot_t prot = 0; + unsigned long text_size = 0, data_size = 0; + unsigned long text_addr = 0, data_addr = 0; + int header_size = 0; + int error, i; + + NDINIT(&nd, LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, p); + + if (error = namei(&nd)) + goto fail; + + if (nd.ni_vp == NULL) { + error = ENOEXEC; + goto fail; + } + + /* + * Check permissions, modes, uid, etc on the file, and "open" it. + */ + error = elf_check_permissions(p, nd.ni_vp); + + /* + * No longer need this, and it prevents demand paging. + */ + VOP_UNLOCK(nd.ni_vp, 0, p); + + if (error) + goto fail; + + /* + * Map in the header + */ + if (error = map_pages(nd.ni_vp, 0, (vm_offset_t *)&hdr, sizeof(hdr))) + goto fail; + + /* + * Do we have a valid ELF header ? 
+ */ + if (error = elf_check_header(hdr, ET_DYN)) + goto fail; + + /* + * ouch, need to bounds check in case user gives us a corrupted + * file with an insane header size + */ + if (hdr->e_phnum > MAX_PHDR) { /* XXX: ever more than this? */ + error = ENOEXEC; + goto fail; + } + + header_size = hdr->e_phentsize * hdr->e_phnum; + + if (error = map_pages(nd.ni_vp, hdr->e_phoff, (vm_offset_t *)&phdr, + header_size)) + goto fail; + + for (i = 0; i < hdr->e_phnum; i++) { + switch(phdr[i].p_type) { + + case PT_NULL: /* NULL section */ + UPRINTF ("ELF(file) PT_NULL section\n"); + break; + case PT_LOAD: /* Loadable segment */ + { + UPRINTF ("ELF(file) PT_LOAD section "); + if (phdr[i].p_flags & PF_X) + prot |= VM_PROT_EXECUTE; + if (phdr[i].p_flags & PF_W) + prot |= VM_PROT_WRITE; + if (phdr[i].p_flags & PF_R) + prot |= VM_PROT_READ; + + if (error = elf_load_section(vmspace, nd.ni_vp, + phdr[i].p_offset, + (caddr_t)phdr[i].p_vaddr + + (*addr), + phdr[i].p_memsz, + phdr[i].p_filesz, prot)) + goto fail; + + /* + * Is this .text or .data ?? 
+ * + * We only handle one each of those yet XXX + */ + if (hdr->e_entry >= phdr[i].p_vaddr && + hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) { + text_addr = trunc_page(phdr[i].p_vaddr+(*addr)); + text_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + trunc_page(phdr[i].p_vaddr)); + *entry=(unsigned long)hdr->e_entry+(*addr); + UPRINTF(".text <%08x,%08x> entry=%08x\n", + text_addr, text_size, *entry); + } else { + data_addr = trunc_page(phdr[i].p_vaddr+(*addr)); + data_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + trunc_page(phdr[i].p_vaddr)); + UPRINTF(".data <%08x,%08x>\n", + data_addr, data_size); + } + } + break; + + case PT_DYNAMIC:/* Dynamic link information */ + UPRINTF ("ELF(file) PT_DYNAMIC section\n"); + break; + case PT_INTERP: /* Path to interpreter */ + UPRINTF ("ELF(file) PT_INTERP section\n"); + break; + case PT_NOTE: /* Note section */ + UPRINTF ("ELF(file) PT_NOTE section\n"); + break; + case PT_SHLIB: /* Shared lib section */ + UPRINTF ("ELF(file) PT_SHLIB section\n"); + break; + case PT_PHDR: /* Program header table info */ + UPRINTF ("ELF(file) PT_PHDR section\n"); + break; + default: + UPRINTF ("ELF(file) %d section ??\n", phdr[i].p_type ); + } + } + +fail: + if (phdr) + unmap_pages((vm_offset_t)phdr, header_size); + if (hdr) + unmap_pages((vm_offset_t)hdr, sizeof(hdr)); + + return error; +} + +int +exec_elf_imgact(struct image_params *imgp) +{ + const Elf32_Ehdr *hdr = (const Elf32_Ehdr *) imgp->image_header; + const Elf32_Phdr *phdr, *mapped_phdr = NULL; + Elf32_Auxargs *elf_auxargs = NULL; + struct vmspace *vmspace = imgp->proc->p_vmspace; + vm_prot_t prot = 0; + u_long text_size = 0, data_size = 0; + u_long text_addr = 0, data_addr = 0; + u_long addr, entry = 0, proghdr = 0; + int error, i, header_size = 0, interp_len = 0; + char *interp = NULL; + char *brand = NULL; + char path[MAXPATHLEN]; + + /* + * Do we have a valid ELF header ? 
+ */ + if (elf_check_header(hdr, ET_EXEC)) + return -1; + + /* + * From here on down, we return an errno, not -1, as we've + * detected an ELF file. + */ + + /* + * ouch, need to bounds check in case user gives us a corrupted + * file with an insane header size + */ + if (hdr->e_phnum > MAX_PHDR) { /* XXX: ever more than this? */ + return ENOEXEC; + } + + header_size = hdr->e_phentsize * hdr->e_phnum; + + if ((hdr->e_phoff > PAGE_SIZE) || + (hdr->e_phoff + header_size) > PAGE_SIZE) { + /* + * Ouch ! we only get one page full of header... + * Try to map it in ourselves, and see how we go. + */ + if (error = map_pages(imgp->vp, hdr->e_phoff, + (vm_offset_t *)&mapped_phdr, header_size)) + return (error); + /* + * Save manual mapping for cleanup + */ + phdr = mapped_phdr; + } else { + phdr = (const Elf32_Phdr*) + ((const char *)imgp->image_header + hdr->e_phoff); + } + + /* + * From this point on, we may have resources that need to be freed. + */ + if (error = exec_extract_strings(imgp)) + goto fail; + + exec_new_vmspace(imgp); + + for (i = 0; i < hdr->e_phnum; i++) { + switch(phdr[i].p_type) { + + case PT_NULL: /* NULL section */ + UPRINTF ("ELF PT_NULL section\n"); + break; + case PT_LOAD: /* Loadable segment */ + { + UPRINTF ("ELF PT_LOAD section "); + if (phdr[i].p_flags & PF_X) + prot |= VM_PROT_EXECUTE; + if (phdr[i].p_flags & PF_W) + prot |= VM_PROT_WRITE; + if (phdr[i].p_flags & PF_R) + prot |= VM_PROT_READ; + + if (error = elf_load_section(vmspace, imgp->vp, + phdr[i].p_offset, + (caddr_t)phdr[i].p_vaddr, + phdr[i].p_memsz, + phdr[i].p_filesz, prot)) + goto fail; + + /* + * Is this .text or .data ?? 
+ * + * We only handle one each of those yet XXX + */ + if (hdr->e_entry >= phdr[i].p_vaddr && + hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) { + text_addr = trunc_page(phdr[i].p_vaddr); + text_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + text_addr); + entry = (u_long)hdr->e_entry; + UPRINTF(".text <%08x,%08x> entry=%08x\n", + text_addr, text_size, entry); + } else { + data_addr = trunc_page(phdr[i].p_vaddr); + data_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + data_addr); + UPRINTF(".data <%08x,%08x>\n", + data_addr, data_size); + } + } + break; + + case PT_DYNAMIC:/* Dynamic link information */ + UPRINTF ("ELF PT_DYNAMIC section ??\n"); + break; + case PT_INTERP: /* Path to interpreter */ + UPRINTF ("ELF PT_INTERP section "); + if (phdr[i].p_filesz > MAXPATHLEN) { + error = ENOEXEC; + goto fail; + } + interp_len = MAXPATHLEN; + if (error = map_pages(imgp->vp, phdr[i].p_offset, + (vm_offset_t *)&interp, interp_len)) + goto fail; + UPRINTF("<%s>\n", interp); + break; + case PT_NOTE: /* Note section */ + UPRINTF ("ELF PT_NOTE section\n"); + break; + case PT_SHLIB: /* Shared lib section */ + UPRINTF ("ELF PT_SHLIB section\n"); + break; + case PT_PHDR: /* Program header table info */ + UPRINTF ("ELF PT_PHDR section <%x>\n", phdr[i].p_vaddr); + proghdr = phdr[i].p_vaddr; + break; + default: + UPRINTF ("ELF %d section ??\n", phdr[i].p_type); + } + } + + vmspace->vm_tsize = text_size >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t)text_addr; + vmspace->vm_dsize = data_size >> PAGE_SHIFT; + vmspace->vm_daddr = (caddr_t)data_addr; + + addr = 2*MAXDSIZ; /* May depend on OS type XXX */ + + imgp->entry_addr = entry; + + /* + * So which kind (brand) of ELF binary do we have at hand + * FreeBSD, Linux, SVR4 or something else ?? 
+ * If its has a interpreter section try that first + */ + if (interp) { + for (i=0; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] != NULL) { + if (!strcmp(interp, elf_brand_list[i]->interp_path)) { + imgp->proc->p_sysent = + elf_brand_list[i]->sysvec; + strcpy(path, elf_brand_list[i]->emul_path); + strcat(path, elf_brand_list[i]->interp_path); + UPRINTF("interpreter=<%s> %s\n", + elf_brand_list[i]->interp_path, + elf_brand_list[i]->emul_path); + break; + } + } + } + } + + /* + * If there is no interpreter, or recognition of it + * failed, se if the binary is branded. + */ + if (!interp || i == MAX_BRANDS) { + brand = (char *)&(hdr->e_ident[EI_BRAND]); + for (i=0; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] != NULL) { + if (!strcmp(brand, elf_brand_list[i]->brand)) { + imgp->proc->p_sysent = elf_brand_list[i]->sysvec; + if (interp) { + strcpy(path, elf_brand_list[i]->emul_path); + strcat(path, elf_brand_list[i]->interp_path); + UPRINTF("interpreter=<%s> %s\n", + elf_brand_list[i]->interp_path, + elf_brand_list[i]->emul_path); + } + break; + } + } + } + } + if (i == MAX_BRANDS) { + uprintf("ELF binary type not known\n"); + error = ENOEXEC; + goto fail; + } + if (interp) { + if (error = elf_load_file(imgp->proc, + path, + &addr, /* XXX */ + &imgp->entry_addr)) { + uprintf("ELF interpreter %s not found\n", path); + goto fail; + } + } + + UPRINTF("Executing %s binary\n", elf_brand_list[i]->brand); + + /* + * Construct auxargs table (used by the fixup routine) + */ + elf_auxargs = malloc(sizeof(Elf32_Auxargs), M_TEMP, M_WAITOK); + elf_auxargs->execfd = -1; + elf_auxargs->phdr = proghdr; + elf_auxargs->phent = hdr->e_phentsize; + elf_auxargs->phnum = hdr->e_phnum; + elf_auxargs->pagesz = PAGE_SIZE; + elf_auxargs->base = addr; + elf_auxargs->flags = 0; + elf_auxargs->entry = entry; + elf_auxargs->trace = elf_trace; + + imgp->auxargs = elf_auxargs; + imgp->interpreted = 0; + + /* don't allow modifying the file while we run it */ + imgp->vp->v_flag |= VTEXT; + +fail: + if 
(mapped_phdr) + unmap_pages((vm_offset_t)mapped_phdr, header_size); + if (interp) + unmap_pages((vm_offset_t)interp, interp_len); + + return error; +} + +static int +elf_freebsd_fixup(int **stack_base, struct image_params *imgp) +{ + Elf32_Auxargs *args = (Elf32_Auxargs *)imgp->auxargs; + int *pos; + + pos = *stack_base + (imgp->argc + imgp->envc + 2); + + if (args->trace) { + AUXARGS_ENTRY(pos, AT_DEBUG, 1); + } + if (args->execfd != -1) { + AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); + } + AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); + AUXARGS_ENTRY(pos, AT_PHENT, args->phent); + AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); + AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); + AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); + AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); + AUXARGS_ENTRY(pos, AT_BASE, args->base); + AUXARGS_ENTRY(pos, AT_NULL, 0); + + free(imgp->auxargs, M_TEMP); + imgp->auxargs = NULL; + + (*stack_base)--; + **stack_base = (int)imgp->argc; + return 0; +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ +const struct execsw elf_execsw = {exec_elf_imgact, "ELF"}; +TEXT_SET(execsw_set, elf_execsw); + diff --git a/sys/kern/imgact_gzip.c b/sys/kern/imgact_gzip.c new file mode 100644 index 0000000..9a3237f --- /dev/null +++ b/sys/kern/imgact_gzip.c @@ -0,0 +1,378 @@ +/* + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@login.dkuug.dk> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $Id$ + * + * This module handles execution of a.out files which have been run through + * "gzip". 
This saves diskspace, but wastes cpu-cycles and VM. + * + * TODO: + * text-segments should be made R/O after being filled + * is the vm-stuff safe ? + * should handle the entire header of gzip'ed stuff. + * inflate isn't quite reentrant yet... + * error-handling is a mess... + * so is the rest... + * tidy up unnecesary includes + */ + +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/imgact_aout.h> +#include <sys/kernel.h> +#include <sys/mman.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sysent.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/inflate.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +struct imgact_gzip { + struct image_params *ip; + struct exec a_out; + int error; + int where; + u_char *inbuf; + u_long offset; + u_long output; + u_long len; + int idx; + u_long virtual_offset, file_offset, file_end, bss_size; +}; + +static int exec_gzip_imgact __P((struct image_params *imgp)); +static int NextByte __P((void *vp)); +static int do_aout_hdr __P((struct imgact_gzip *)); +static int Flush __P((void *vp, u_char *, u_long siz)); + +static int +exec_gzip_imgact(imgp) + struct image_params *imgp; +{ + int error, error2 = 0; + const u_char *p = (const u_char *) imgp->image_header; + struct imgact_gzip igz; + struct inflate infl; + struct vmspace *vmspace; + + /* If these four are not OK, it isn't a gzip file */ + if (p[0] != 0x1f) + return -1; /* 0 Simply magic */ + if (p[1] != 0x8b) + return -1; /* 1 Simply magic */ + if (p[2] != 0x08) + return -1; /* 2 Compression method */ + if (p[9] != 0x03) + return -1; /* 9 OS compressed on */ + + /* + * If this one contains anything but a comment or a filename marker, + * we don't want to chew on it + */ + if (p[3] & ~(0x18)) + return ENOEXEC; /* 3 Flags */ + + /* 
These are of no use to us */ + /* 4-7 Timestamp */ + /* 8 Extra flags */ + + bzero(&igz, sizeof igz); + bzero(&infl, sizeof infl); + infl.gz_private = (void *) &igz; + infl.gz_input = NextByte; + infl.gz_output = Flush; + + igz.ip = imgp; + igz.idx = 10; + + if (p[3] & 0x08) { /* skip a filename */ + while (p[igz.idx++]) + if (igz.idx >= PAGE_SIZE) + return ENOEXEC; + } + if (p[3] & 0x10) { /* skip a comment */ + while (p[igz.idx++]) + if (igz.idx >= PAGE_SIZE) + return ENOEXEC; + } + igz.len = imgp->attr->va_size; + + error = inflate(&infl); + + if ( !error ) { + vmspace = imgp->proc->p_vmspace; + error = vm_map_protect(&vmspace->vm_map, + (vm_offset_t) vmspace->vm_taddr, + (vm_offset_t) (vmspace->vm_taddr + + (vmspace->vm_tsize << PAGE_SHIFT)) , + VM_PROT_READ|VM_PROT_EXECUTE,0); + } + + if (igz.inbuf) { + error2 = + vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf, + (vm_offset_t) igz.inbuf + PAGE_SIZE); + } + if (igz.error || error || error2) { + printf("Output=%lu ", igz.output); + printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n", + error, igz.error, error2, igz.where); + } + if (igz.error) + return igz.error; + if (error) + return ENOEXEC; + if (error2) + return error2; + return 0; +} + +static int +do_aout_hdr(struct imgact_gzip * gz) +{ + int error; + struct vmspace *vmspace = gz->ip->proc->p_vmspace; + vm_offset_t vmaddr; + + /* + * Set file/virtual offset based on a.out variant. 
We do two cases: + * host byte order and network byte order (for NetBSD compatibility) + */ + switch ((int) (gz->a_out.a_magic & 0xffff)) { + case ZMAGIC: + gz->virtual_offset = 0; + if (gz->a_out.a_text) { + gz->file_offset = PAGE_SIZE; + } else { + /* Bill's "screwball mode" */ + gz->file_offset = 0; + } + break; + case QMAGIC: + gz->virtual_offset = PAGE_SIZE; + gz->file_offset = 0; + break; + default: + /* NetBSD compatibility */ + switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) { + case ZMAGIC: + case QMAGIC: + gz->virtual_offset = PAGE_SIZE; + gz->file_offset = 0; + break; + default: + gz->where = __LINE__; + return (-1); + } + } + + gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE); + + /* + * Check various fields in header for validity/bounds. + */ + if ( /* entry point must lay with text region */ + gz->a_out.a_entry < gz->virtual_offset || + gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text || + + /* text and data size must each be page rounded */ + gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) { + gz->where = __LINE__; + return (-1); + } + /* + * text/data/bss must not exceed limits + */ + if ( /* text can't exceed maximum text size */ + gz->a_out.a_text > MAXTSIZ || + + /* data + bss can't exceed maximum data size */ + gz->a_out.a_data + gz->bss_size > MAXDSIZ || + + /* data + bss can't exceed rlimit */ + gz->a_out.a_data + gz->bss_size > + gz->ip->proc->p_rlimit[RLIMIT_DATA].rlim_cur) { + gz->where = __LINE__; + return (ENOMEM); + } + /* Find out how far we should go */ + gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data; + + /* copy in arguments and/or environment from old process */ + error = exec_extract_strings(gz->ip); + if (error) { + gz->where = __LINE__; + return (error); + } + /* + * Destroy old process VM and create a new one (with a new stack) + */ + exec_new_vmspace(gz->ip); + + vmaddr = gz->virtual_offset; + + error = vm_mmap(&vmspace->vm_map, + &vmaddr, + gz->a_out.a_text + gz->a_out.a_data, 
+ VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED, + 0, + 0); + + if (error) { + gz->where = __LINE__; + return (error); + } + + if (gz->bss_size != 0) { + /* + * Allocate demand-zeroed area for uninitialized data. + * "bss" = 'block started by symbol' - named after the + * IBM 7090 instruction of the same name. + */ + vmaddr = gz->virtual_offset + gz->a_out.a_text + + gz->a_out.a_data; + error = vm_map_find(&vmspace->vm_map, + NULL, + 0, + &vmaddr, + gz->bss_size, + FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + gz->where = __LINE__; + return (error); + } + } + /* Fill in process VM information */ + vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT; + vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t) gz->virtual_offset; + vmspace->vm_daddr = (caddr_t) gz->virtual_offset + gz->a_out.a_text; + + /* Fill in image_params */ + gz->ip->interpreted = 0; + gz->ip->entry_addr = gz->a_out.a_entry; + + gz->ip->proc->p_sysent = &aout_sysvec; + + return 0; +} + +static int +NextByte(void *vp) +{ + int error; + struct imgact_gzip *igz = (struct imgact_gzip *) vp; + + if (igz->idx >= igz->len) { + igz->where = __LINE__; + return GZ_EOF; + } + if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) { + return igz->inbuf[(igz->idx++) - igz->offset]; + } + if (igz->inbuf) { + error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf, + (vm_offset_t) igz->inbuf + PAGE_SIZE); + if (error) { + igz->where = __LINE__; + igz->error = error; + return GZ_EOF; + } + } + igz->offset = igz->idx & ~PAGE_MASK; + + error = vm_mmap(kernel_map, /* map */ + (vm_offset_t *) & igz->inbuf, /* address */ + PAGE_SIZE, /* size */ + VM_PROT_READ, /* protection */ + VM_PROT_READ, /* max protection */ + 0, /* flags */ + (caddr_t) igz->ip->vp, /* vnode */ + igz->offset); /* offset */ + if (error) { + igz->where = __LINE__; + igz->error = error; + return GZ_EOF; + } + return igz->inbuf[(igz->idx++) - igz->offset]; +} + +static int +Flush(void *vp, 
u_char * ptr, u_long siz) +{ + struct imgact_gzip *gz = (struct imgact_gzip *) vp; + u_char *p = ptr, *q; + int i; + + /* First, find a a.out-header */ + if (gz->output < sizeof gz->a_out) { + q = (u_char *) & gz->a_out; + i = min(siz, sizeof gz->a_out - gz->output); + bcopy(p, q + gz->output, i); + gz->output += i; + p += i; + siz -= i; + if (gz->output == sizeof gz->a_out) { + i = do_aout_hdr(gz); + if (i == -1) { + if (!gz->where) + gz->where = __LINE__; + gz->error = ENOEXEC; + return ENOEXEC; + } else if (i) { + gz->where = __LINE__; + gz->error = i; + return ENOEXEC; + } + if (gz->file_offset < sizeof gz->a_out) { + q = (u_char *) gz->virtual_offset + gz->output - gz->file_offset; + bcopy(&gz->a_out, q, sizeof gz->a_out - gz->file_offset); + } + } + } + /* Skip over zero-padded first PAGE if needed */ + if (gz->output < gz->file_offset && (gz->output + siz) > gz->file_offset) { + i = min(siz, gz->file_offset - gz->output); + gz->output += i; + p += i; + siz -= i; + } + if (gz->output >= gz->file_offset && gz->output < gz->file_end) { + i = min(siz, gz->file_end - gz->output); + q = (u_char *) gz->virtual_offset + gz->output - gz->file_offset; + bcopy(p, q, i); + gz->output += i; + p += i; + siz -= i; + } + gz->output += siz; + return 0; +} + + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ + +static const struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"}; +TEXT_SET(execsw_set, gzip_execsw); diff --git a/sys/kern/imgact_shell.c b/sys/kern/imgact_shell.c new file mode 100644 index 0000000..fb03011 --- /dev/null +++ b/sys/kern/imgact_shell.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/resourcevar.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/kernel.h> +#include <machine/endian.h> + +#if BYTE_ORDER == LITTLE_ENDIAN +#define SHELLMAGIC 0x2123 /* #! */ +#else +#define SHELLMAGIC 0x2321 +#endif + +#define MAXSHELLCMDLEN 64 + +static int exec_shell_imgact __P((struct image_params *imgp)); + +/* + * Shell interpreter image activator. A interpreter name beginning + * at imgp->stringbase is the minimal successful exit requirement. + */ +static int +exec_shell_imgact(imgp) + struct image_params *imgp; +{ + const char *image_header = imgp->image_header; + const char *ihp, *line_endp; + char *interp; + + /* a shell script? 
*/ + if (((const short *) image_header)[0] != SHELLMAGIC) + return(-1); + + /* + * Don't allow a shell script to be the shell for a shell + * script. :-) + */ + if (imgp->interpreted) + return(ENOEXEC); + + imgp->interpreted = 1; + + /* + * Copy shell name and arguments from image_header into string + * buffer. + */ + + /* + * Find end of line; return if the line > MAXSHELLCMDLEN long. + */ + for (ihp = &image_header[2]; *ihp != '\n'; ++ihp) { + if (ihp >= &image_header[MAXSHELLCMDLEN]) + return(ENOEXEC); + } + line_endp = ihp; + + /* reset for another pass */ + ihp = &image_header[2]; + + /* Skip over leading spaces - until the interpreter name */ + while ((*ihp == ' ') || (*ihp == '\t')) ihp++; + + /* copy the interpreter name */ + interp = imgp->interpreter_name; + while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) + *interp++ = *ihp++; + *interp = '\0'; + + /* Disallow a null interpreter filename */ + if (*imgp->interpreter_name == '\0') + return(ENOEXEC); + + /* reset for another pass */ + ihp = &image_header[2]; + + /* copy the interpreter name and arguments */ + while (ihp < line_endp) { + /* Skip over leading spaces */ + while ((*ihp == ' ') || (*ihp == '\t')) ihp++; + + if (ihp < line_endp) { + /* + * Copy to end of token. No need to watch stringspace + * because this is at the front of the string buffer + * and the maximum shell command length is tiny. + */ + while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) { + *imgp->stringp++ = *ihp++; + imgp->stringspace--; + } + + *imgp->stringp++ = 0; + imgp->stringspace--; + + imgp->argc++; + } + } + + /* set argv[0] to point to original file name */ + suword(imgp->uap->argv, (int)imgp->uap->fname); + + return(0); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ +static const struct execsw shell_execsw = { exec_shell_imgact, "#!" 
}; +TEXT_SET(execsw_set, shell_execsw); diff --git a/sys/kern/inflate.c b/sys/kern/inflate.c new file mode 100644 index 0000000..2024bc1 --- /dev/null +++ b/sys/kern/inflate.c @@ -0,0 +1,1072 @@ +/* + * Most parts of this file are not covered by: + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@login.dknet.dk> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $Id$ + * + * + */ + +#include <sys/param.h> +#include <sys/inflate.h> +#ifdef KERNEL +#include <sys/systm.h> +#endif +#include <sys/mman.h> +#include <sys/malloc.h> + +/* needed to make inflate() work */ +#define uch u_char +#define ush u_short +#define ulg u_long + +/* Stuff to make inflate() work */ +#ifdef KERNEL +#define memzero(dest,len) bzero(dest,len) +#endif +#define NOMEMCPY +#ifdef KERNEL +#define FPRINTF printf +#else +extern void putstr (char *); +#define FPRINTF putstr +#endif + +#define FLUSH(x,y) { \ + int foo = (*x->gz_output)(x->gz_private,x->gz_slide,y); \ + if (foo) \ + return foo; \ + } + +static const int qflag = 0; + +#ifndef KERNEL /* want to use this file in kzip also */ +extern unsigned char *malloc (int, int, int); +extern void free (void*, int); +#endif + +/* + * This came from unzip-5.12. I have changed it the flow to pass + * a structure pointer around, thus hopefully making it re-entrant. + * Poul-Henning + */ + +/* inflate.c -- put in the public domain by Mark Adler + version c14o, 23 August 1994 */ + +/* You can do whatever you like with this source file, though I would + prefer that if you modify it and redistribute it that you include + comments to that effect with your name and the date. Thank you. 
+ + History: + vers date who what + ---- --------- -------------- ------------------------------------ + a ~~ Feb 92 M. Adler used full (large, one-step) lookup table + b1 21 Mar 92 M. Adler first version with partial lookup tables + b2 21 Mar 92 M. Adler fixed bug in fixed-code blocks + b3 22 Mar 92 M. Adler sped up match copies, cleaned up some + b4 25 Mar 92 M. Adler added prototypes; removed window[] (now + is the responsibility of unzip.h--also + changed name to slide[]), so needs diffs + for unzip.c and unzip.h (this allows + compiling in the small model on MSDOS); + fixed cast of q in huft_build(); + b5 26 Mar 92 M. Adler got rid of unintended macro recursion. + b6 27 Mar 92 M. Adler got rid of nextbyte() routine. fixed + bug in inflate_fixed(). + c1 30 Mar 92 M. Adler removed lbits, dbits environment variables. + changed BMAX to 16 for explode. Removed + OUTB usage, and replaced it with flush()-- + this was a 20% speed improvement! Added + an explode.c (to replace unimplod.c) that + uses the huft routines here. Removed + register union. + c2 4 Apr 92 M. Adler fixed bug for file sizes a multiple of 32k. + c3 10 Apr 92 M. Adler reduced memory of code tables made by + huft_build significantly (factor of two to + three). + c4 15 Apr 92 M. Adler added NOMEMCPY do kill use of memcpy(). + worked around a Turbo C optimization bug. + c5 21 Apr 92 M. Adler added the GZ_WSIZE #define to allow reducing + the 32K window size for specialized + applications. + c6 31 May 92 M. Adler added some typecasts to eliminate warnings + c7 27 Jun 92 G. Roelofs added some more typecasts (444: MSC bug). + c8 5 Oct 92 J-l. Gailly added ifdef'd code to deal with PKZIP bug. + c9 9 Oct 92 M. Adler removed a memory error message (~line 416). + c10 17 Oct 92 G. Roelofs changed ULONG/UWORD/byte to ulg/ush/uch, + removed old inflate, renamed inflate_entry + to inflate, added Mark's fix to a comment. + c10.5 14 Dec 92 M. Adler fix up error messages for incomplete trees. + c11 2 Jan 93 M. 
Adler fixed bug in detection of incomplete + tables, and removed assumption that EOB is + the longest code (bad assumption). + c12 3 Jan 93 M. Adler make tables for fixed blocks only once. + c13 5 Jan 93 M. Adler allow all zero length codes (pkzip 2.04c + outputs one zero length code for an empty + distance tree). + c14 12 Mar 93 M. Adler made inflate.c standalone with the + introduction of inflate.h. + c14b 16 Jul 93 G. Roelofs added (unsigned) typecast to w at 470. + c14c 19 Jul 93 J. Bush changed v[N_MAX], l[288], ll[28x+3x] arrays + to static for Amiga. + c14d 13 Aug 93 J-l. Gailly de-complicatified Mark's c[*p++]++ thing. + c14e 8 Oct 93 G. Roelofs changed memset() to memzero(). + c14f 22 Oct 93 G. Roelofs renamed quietflg to qflag; made Trace() + conditional; added inflate_free(). + c14g 28 Oct 93 G. Roelofs changed l/(lx+1) macro to pointer (Cray bug) + c14h 7 Dec 93 C. Ghisler huft_build() optimizations. + c14i 9 Jan 94 A. Verheijen set fixed_t{d,l} to NULL after freeing; + G. Roelofs check NEXTBYTE macro for GZ_EOF. + c14j 23 Jan 94 G. Roelofs removed Ghisler "optimizations"; ifdef'd + GZ_EOF check. + c14k 27 Feb 94 G. Roelofs added some typecasts to avoid warnings. + c14l 9 Apr 94 G. Roelofs fixed split comments on preprocessor lines + to avoid bug in Encore compiler. + c14m 7 Jul 94 P. Kienitz modified to allow assembler version of + inflate_codes() (define ASM_INFLATECODES) + c14n 22 Jul 94 G. Roelofs changed fprintf to FPRINTF for DLL versions + c14o 23 Aug 94 C. Spieler added a newline to a debug statement; + G. Roelofs added another typecast to avoid MSC warning + */ + + +/* + Inflate deflated (PKZIP's method 8 compressed) data. The compression + method searches for as much of the current string of bytes (up to a + length of 258) in the previous 32K bytes. If it doesn't find any + matches (of at least length 3), it codes the next byte. Otherwise, it + codes the length of the matched string and its distance backwards from + the current position. 
There is a single Huffman code that codes both + single bytes (called "literals") and match lengths. A second Huffman + code codes the distance information, which follows a length code. Each + length or distance code actually represents a base value and a number + of "extra" (sometimes zero) bits to get to add to the base value. At + the end of each deflated block is a special end-of-block (EOB) literal/ + length code. The decoding process is basically: get a literal/length + code; if EOB then done; if a literal, emit the decoded byte; if a + length then get the distance and emit the referred-to bytes from the + sliding window of previously emitted data. + + There are (currently) three kinds of inflate blocks: stored, fixed, and + dynamic. The compressor outputs a chunk of data at a time and decides + which method to use on a chunk-by-chunk basis. A chunk might typically + be 32K to 64K, uncompressed. If the chunk is uncompressible, then the + "stored" method is used. In this case, the bytes are simply stored as + is, eight bits per byte, with none of the above coding. The bytes are + preceded by a count, since there is no longer an EOB code. + + If the data is compressible, then either the fixed or dynamic methods + are used. In the dynamic method, the compressed data is preceded by + an encoding of the literal/length and distance Huffman codes that are + to be used to decode this block. The representation is itself Huffman + coded, and so is preceded by a description of that code. These code + descriptions take up a little space, and so for small blocks, there is + a predefined set of codes, called the fixed codes. The fixed method is + used if the block ends up smaller that way (usually for quite small + chunks); otherwise the dynamic method is used. In the latter case, the + codes are customized to the probabilities in the current block and so + can code it much better than the pre-determined fixed codes can. 
+ + The Huffman codes themselves are decoded using a mutli-level table + lookup, in order to maximize the speed of decoding plus the speed of + building the decoding tables. See the comments below that precede the + lbits and dbits tuning parameters. + */ + + +/* + Notes beyond the 1.93a appnote.txt: + + 1. Distance pointers never point before the beginning of the output + stream. + 2. Distance pointers can point back across blocks, up to 32k away. + 3. There is an implied maximum of 7 bits for the bit length table and + 15 bits for the actual data. + 4. If only one code exists, then it is encoded using one bit. (Zero + would be more efficient, but perhaps a little confusing.) If two + codes exist, they are coded using one bit each (0 and 1). + 5. There is no way of sending zero distance codes--a dummy must be + sent if there are none. (History: a pre 2.0 version of PKZIP would + store blocks with no distance codes, but this was discovered to be + too harsh a criterion.) Valid only for 1.93a. 2.04c does allow + zero distance codes, which is sent as one code of zero bits in + length. + 6. There are up to 286 literal/length codes. Code 256 represents the + end-of-block. Note however that the static length tree defines + 288 codes just to fill out the Huffman codes. Codes 286 and 287 + cannot be used though, since there is no length base or extra bits + defined for them. Similarily, there are up to 30 distance codes. + However, static trees define 32 codes (all 5 bits) to fill out the + Huffman codes, but the last two had better not show up in the data. + 7. Unzip can check dynamic Huffman blocks for complete code sets. + The exception is that a single code would not be complete (see #4). + 8. The five bits following the block type is really the number of + literal codes sent minus 257. + 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits + (1+6+6). 
Therefore, to output three times the length, you output + three codes (1+1+1), whereas to output four times the same length, + you only need two codes (1+3). Hmm. + 10. In the tree reconstruction algorithm, Code = Code + Increment + only if BitLength(i) is not zero. (Pretty obvious.) + 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19) + 12. Note: length code 284 can represent 227-258, but length code 285 + really is 258. The last length deserves its own, short code + since it gets used a lot in very redundant files. The length + 258 is special since 258 - 3 (the min match length) is 255. + 13. The literal/length and distance code bit lengths are read as a + single stream of lengths. It is possible (and advantageous) for + a repeat code (16, 17, or 18) to go across the boundary between + the two sets of lengths. + */ + + +#define PKZIP_BUG_WORKAROUND /* PKZIP 1.93a problem--live with it */ + +/* + inflate.h must supply the uch slide[GZ_WSIZE] array and the NEXTBYTE, + FLUSH() and memzero macros. If the window size is not 32K, it + should also define GZ_WSIZE. If INFMOD is defined, it can include + compiled functions to support the NEXTBYTE and/or FLUSH() macros. + There are defaults for NEXTBYTE and FLUSH() below for use as + examples of what those functions need to do. Normally, you would + also want FLUSH() to compute a crc on the data. inflate.h also + needs to provide these typedefs: + + typedef unsigned char uch; + typedef unsigned short ush; + typedef unsigned long ulg; + + This module uses the external functions malloc() and free() (and + probably memset() or bzero() in the memzero() macro). Their + prototypes are normally found in <string.h> and <stdlib.h>. + */ +#define INFMOD /* tell inflate.h to include code to be + * compiled */ + +/* Huffman code lookup table entry--this entry is four bytes for machines + that have 16-bit pointers (e.g. PC's in the small or medium model). + Valid extra bits are 0..13. 
e == 15 is EOB (end of block), e == 16 + means that v is a literal, 16 < e < 32 means that v is a pointer to + the next table, which codes e - 16 bits, and lastly e == 99 indicates + an unused code. If a code with e == 99 is looked up, this implies an + error in the data. */ +struct huft { + uch e; /* number of extra bits or operation */ + uch b; /* number of bits in this code or subcode */ + union { + ush n; /* literal, length base, or distance + * base */ + struct huft *t; /* pointer to next level of table */ + } v; +}; + + +/* Function prototypes */ +static int huft_build __P((struct inflate *, unsigned *, unsigned, unsigned, const ush *, const ush *, struct huft **, int *)); +static int huft_free __P((struct inflate *, struct huft *)); +static int inflate_codes __P((struct inflate *, struct huft *, struct huft *, int, int)); +static int inflate_stored __P((struct inflate *)); +static int xinflate __P((struct inflate *)); +static int inflate_fixed __P((struct inflate *)); +static int inflate_dynamic __P((struct inflate *)); +static int inflate_block __P((struct inflate *, int *)); + +/* The inflate algorithm uses a sliding 32K byte window on the uncompressed + stream to find repeated byte strings. This is implemented here as a + circular buffer. The index is updated simply by incrementing and then + and'ing with 0x7fff (32K-1). */ +/* It is left to other modules to supply the 32K area. It is assumed + to be usable as if it were declared "uch slide[32768];" or as just + "uch *slide;" and then malloc'ed in the latter case. The definition + must be in unzip.h, included above. */ + + +/* Tables for deflate from PKZIP's appnote.txt. 
*/ + +/* Order of the bit length code lengths */ +static const unsigned border[] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + +static const ush cplens[] = { /* Copy lengths for literal codes 257..285 */ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; + /* note: see note #13 above about the 258 in this list. */ + +static const ush cplext[] = { /* Extra bits for literal codes 257..285 */ + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */ + +static const ush cpdist[] = { /* Copy offsets for distance codes 0..29 */ + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, + 8193, 12289, 16385, 24577}; + +static const ush cpdext[] = { /* Extra bits for distance codes */ + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 12, 13, 13}; + +/* And'ing with mask[n] masks the lower n bits */ +static const ush mask[] = { + 0x0000, + 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, + 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff +}; + + +/* Macros for inflate() bit peeking and grabbing. + The usage is: + + NEEDBITS(glbl,j) + x = b & mask[j]; + DUMPBITS(j) + + where NEEDBITS makes sure that b has at least j bits in it, and + DUMPBITS removes the bits from b. The macros use the variable k + for the number of bits in b. Normally, b and k are register + variables for speed, and are initialized at the begining of a + routine that uses these macros from a global bit buffer and count. + + In order to not ask for more bits than there are in the compressed + stream, the Huffman tables are constructed to only ask for just + enough bits to make up the end-of-block code (value 256). Then no + bytes need to be "returned" to the buffer at the end of the last + block. 
See the huft_build() routine. + */ + +/* + * The following 2 were global variables. + * They are now fields of the inflate structure. + */ + +#define NEEDBITS(glbl,n) { \ + while(k<(n)) { \ + int c=(*glbl->gz_input)(glbl->gz_private); \ + if(c==GZ_EOF) \ + return 1; \ + b|=((ulg)c)<<k; \ + k+=8; \ + } \ + } + +#define DUMPBITS(n) {b>>=(n);k-=(n);} + +/* + Huffman code decoding is performed using a multi-level table lookup. + The fastest way to decode is to simply build a lookup table whose + size is determined by the longest code. However, the time it takes + to build this table can also be a factor if the data being decoded + is not very long. The most common codes are necessarily the + shortest codes, so those codes dominate the decoding time, and hence + the speed. The idea is you can have a shorter table that decodes the + shorter, more probable codes, and then point to subsidiary tables for + the longer codes. The time it costs to decode the longer codes is + then traded against the time it takes to make longer tables. + + This results of this trade are in the variables lbits and dbits + below. lbits is the number of bits the first level table for literal/ + length codes can decode in one step, and dbits is the same thing for + the distance codes. Subsequent tables are also less than or equal to + those sizes. These values may be adjusted either when all of the + codes are shorter than that, in which case the longest code length in + bits is used, or when the shortest code is *longer* than the requested + table size, in which case the length of the shortest code in bits is + used. + + There are two different values for the two tables, since they code a + different number of possibilities each. The literal/length table + codes 286 possible values, or in a flat code, a little over eight + bits. The distance table codes 30 possible values, or a little less + than five bits, flat. 
The optimum values for speed end up being + about one bit more than those, so lbits is 8+1 and dbits is 5+1. + The optimum values may differ though from machine to machine, and + possibly even between compilers. Your mileage may vary. + */ + +static const int lbits = 9; /* bits in base literal/length lookup table */ +static const int dbits = 6; /* bits in base distance lookup table */ + + +/* If BMAX needs to be larger than 16, then h and x[] should be ulg. */ +#define BMAX 16 /* maximum bit length of any code (16 for + * explode) */ +#define N_MAX 288 /* maximum number of codes in any set */ + +/* Given a list of code lengths and a maximum table size, make a set of + tables to decode that set of codes. Return zero on success, one if + the given code set is incomplete (the tables are still built in this + case), two if the input is invalid (all zero length codes or an + oversubscribed set of lengths), and three if not enough memory. + The code with value 256 is special, and the tables are constructed + so that no bits beyond that code are fetched when that code is + decoded. 
*/ +static int +huft_build(glbl, b, n, s, d, e, t, m) + struct inflate *glbl; + unsigned *b; /* code lengths in bits (all assumed <= BMAX) */ + unsigned n; /* number of codes (assumed <= N_MAX) */ + unsigned s; /* number of simple-valued codes (0..s-1) */ + const ush *d; /* list of base values for non-simple codes */ + const ush *e; /* list of extra bits for non-simple codes */ + struct huft **t; /* result: starting table */ + int *m; /* maximum lookup bits, returns actual */ +{ + unsigned a; /* counter for codes of length k */ + unsigned c[BMAX + 1]; /* bit length count table */ + unsigned el; /* length of EOB code (value 256) */ + unsigned f; /* i repeats in table every f entries */ + int g; /* maximum code length */ + int h; /* table level */ + register unsigned i; /* counter, current code */ + register unsigned j; /* counter */ + register int k; /* number of bits in current code */ + int lx[BMAX + 1]; /* memory for l[-1..BMAX-1] */ + int *l = lx + 1; /* stack of bits per table */ + register unsigned *p; /* pointer into c[], b[], or v[] */ + register struct huft *q;/* points to current table */ + struct huft r; /* table entry for structure assignment */ + struct huft *u[BMAX];/* table stack */ + unsigned v[N_MAX]; /* values in order of bit length */ + register int w; /* bits before this table == (l * h) */ + unsigned x[BMAX + 1]; /* bit offsets, then code stack */ + unsigned *xp; /* pointer into x */ + int y; /* number of dummy codes added */ + unsigned z; /* number of entries in current table */ + + /* Generate counts for each bit length */ + el = n > 256 ? 
b[256] : BMAX; /* set length of EOB code, if any */ +#ifdef KERNEL + memzero((char *) c, sizeof(c)); +#else + for (i = 0; i < BMAX+1; i++) + c [i] = 0; +#endif + p = b; + i = n; + do { + c[*p]++; + p++; /* assume all entries <= BMAX */ + } while (--i); + if (c[0] == n) { /* null input--all zero length codes */ + *t = (struct huft *) NULL; + *m = 0; + return 0; + } + /* Find minimum and maximum length, bound *m by those */ + for (j = 1; j <= BMAX; j++) + if (c[j]) + break; + k = j; /* minimum code length */ + if ((unsigned) *m < j) + *m = j; + for (i = BMAX; i; i--) + if (c[i]) + break; + g = i; /* maximum code length */ + if ((unsigned) *m > i) + *m = i; + + /* Adjust last length count to fill out codes, if needed */ + for (y = 1 << j; j < i; j++, y <<= 1) + if ((y -= c[j]) < 0) + return 2; /* bad input: more codes than bits */ + if ((y -= c[i]) < 0) + return 2; + c[i] += y; + + /* Generate starting offsets into the value table for each length */ + x[1] = j = 0; + p = c + 1; + xp = x + 2; + while (--i) { /* note that i == g from above */ + *xp++ = (j += *p++); + } + + /* Make a table of values in order of bit lengths */ + p = b; + i = 0; + do { + if ((j = *p++) != 0) + v[x[j]++] = i; + } while (++i < n); + + /* Generate the Huffman codes and for each, make the table entries */ + x[0] = i = 0; /* first Huffman code is zero */ + p = v; /* grab values in bit order */ + h = -1; /* no tables yet--level -1 */ + w = l[-1] = 0; /* no bits decoded yet */ + u[0] = (struct huft *) NULL; /* just to keep compilers happy */ + q = (struct huft *) NULL; /* ditto */ + z = 0; /* ditto */ + + /* go through the bit lengths (k already is bits in shortest code) */ + for (; k <= g; k++) { + a = c[k]; + while (a--) { + /* + * here i is the Huffman code of length k bits for + * value *p + */ + /* make tables up to required level */ + while (k > w + l[h]) { + w += l[h++]; /* add bits already decoded */ + + /* + * compute minimum size table less than or + * equal to *m bits + */ + z = (z = g 
- w) > (unsigned) *m ? *m : z; /* upper limit */ + if ((f = 1 << (j = k - w)) > a + 1) { /* try a k-w bit table *//* t + * oo few codes for k-w + * bit table */ + f -= a + 1; /* deduct codes from + * patterns left */ + xp = c + k; + while (++j < z) { /* try smaller tables up + * to z bits */ + if ((f <<= 1) <= *++xp) + break; /* enough codes to use + * up j bits */ + f -= *xp; /* else deduct codes + * from patterns */ + } + } + if ((unsigned) w + j > el && (unsigned) w < el) + j = el - w; /* make EOB code end at + * table */ + z = 1 << j; /* table entries for j-bit + * table */ + l[h] = j; /* set table size in stack */ + + /* allocate and link in new table */ + if ((q = (struct huft *) malloc((z + 1) * sizeof(struct huft), M_GZIP, M_WAITOK)) == + (struct huft *) NULL) { + if (h) + huft_free(glbl, u[0]); + return 3; /* not enough memory */ + } + glbl->gz_hufts += z + 1; /* track memory usage */ + *t = q + 1; /* link to list for + * huft_free() */ + *(t = &(q->v.t)) = (struct huft *) NULL; + u[h] = ++q; /* table starts after link */ + + /* connect to last table, if there is one */ + if (h) { + x[h] = i; /* save pattern for + * backing up */ + r.b = (uch) l[h - 1]; /* bits to dump before + * this table */ + r.e = (uch) (16 + j); /* bits in this table */ + r.v.t = q; /* pointer to this table */ + j = (i & ((1 << w) - 1)) >> (w - l[h - 1]); + u[h - 1][j] = r; /* connect to last table */ + } + } + + /* set up table entry in r */ + r.b = (uch) (k - w); + if (p >= v + n) + r.e = 99; /* out of values--invalid + * code */ + else if (*p < s) { + r.e = (uch) (*p < 256 ? 
16 : 15); /* 256 is end-of-block + * code */ + r.v.n = *p++; /* simple code is just the + * value */ + } else { + r.e = (uch) e[*p - s]; /* non-simple--look up + * in lists */ + r.v.n = d[*p++ - s]; + } + + /* fill code-like entries with r */ + f = 1 << (k - w); + for (j = i >> w; j < z; j += f) + q[j] = r; + + /* backwards increment the k-bit code i */ + for (j = 1 << (k - 1); i & j; j >>= 1) + i ^= j; + i ^= j; + + /* backup over finished tables */ + while ((i & ((1 << w) - 1)) != x[h]) + w -= l[--h]; /* don't need to update q */ + } + } + + /* return actual size of base table */ + *m = l[0]; + + /* Return true (1) if we were given an incomplete table */ + return y != 0 && g != 1; +} + +static int +huft_free(glbl, t) + struct inflate *glbl; + struct huft *t; /* table to free */ +/* Free the malloc'ed tables built by huft_build(), which makes a linked + list of the tables it made, with the links in a dummy first entry of + each table. */ +{ + register struct huft *p, *q; + + /* Go through linked list, freeing from the malloced (t[-1]) address. */ + p = t; + while (p != (struct huft *) NULL) { + q = (--p)->v.t; + free(p, M_GZIP); + p = q; + } + return 0; +} + +/* inflate (decompress) the codes in a deflated (compressed) block. + Return an error code or zero if it all goes ok. 
*/ +static int +inflate_codes(glbl, tl, td, bl, bd) + struct inflate *glbl; + struct huft *tl, *td;/* literal/length and distance decoder tables */ + int bl, bd; /* number of bits decoded by tl[] and td[] */ +{ + register unsigned e; /* table entry flag/number of extra bits */ + unsigned n, d; /* length and index for copy */ + unsigned w; /* current window position */ + struct huft *t; /* pointer to table entry */ + unsigned ml, md; /* masks for bl and bd bits */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local copies of globals */ + b = glbl->gz_bb; /* initialize bit buffer */ + k = glbl->gz_bk; + w = glbl->gz_wp; /* initialize window position */ + + /* inflate the coded data */ + ml = mask[bl]; /* precompute masks for speed */ + md = mask[bd]; + while (1) { /* do until end of block */ + NEEDBITS(glbl, (unsigned) bl) + if ((e = (t = tl + ((unsigned) b & ml))->e) > 16) + do { + if (e == 99) + return 1; + DUMPBITS(t->b) + e -= 16; + NEEDBITS(glbl, e) + } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16); + DUMPBITS(t->b) + if (e == 16) { /* then it's a literal */ + glbl->gz_slide[w++] = (uch) t->v.n; + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + } else { /* it's an EOB or a length */ + /* exit if end of block */ + if (e == 15) + break; + + /* get length of block to copy */ + NEEDBITS(glbl, e) + n = t->v.n + ((unsigned) b & mask[e]); + DUMPBITS(e); + + /* decode distance of block to copy */ + NEEDBITS(glbl, (unsigned) bd) + if ((e = (t = td + ((unsigned) b & md))->e) > 16) + do { + if (e == 99) + return 1; + DUMPBITS(t->b) + e -= 16; + NEEDBITS(glbl, e) + } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16); + DUMPBITS(t->b) + NEEDBITS(glbl, e) + d = w - t->v.n - ((unsigned) b & mask[e]); + DUMPBITS(e) + /* do the copy */ + do { + n -= (e = (e = GZ_WSIZE - ((d &= GZ_WSIZE - 1) > w ? d : w)) > n ? 
n : e); +#ifndef NOMEMCPY + if (w - d >= e) { /* (this test assumes + * unsigned comparison) */ + memcpy(glbl->gz_slide + w, glbl->gz_slide + d, e); + w += e; + d += e; + } else /* do it slow to avoid memcpy() + * overlap */ +#endif /* !NOMEMCPY */ + do { + glbl->gz_slide[w++] = glbl->gz_slide[d++]; + } while (--e); + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + } while (n); + } + } + + /* restore the globals from the locals */ + glbl->gz_wp = w; /* restore global window pointer */ + glbl->gz_bb = b; /* restore global bit buffer */ + glbl->gz_bk = k; + + /* done */ + return 0; +} + +/* "decompress" an inflated type 0 (stored) block. */ +static int +inflate_stored(glbl) + struct inflate *glbl; +{ + unsigned n; /* number of bytes in block */ + unsigned w; /* current window position */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local copies of globals */ + b = glbl->gz_bb; /* initialize bit buffer */ + k = glbl->gz_bk; + w = glbl->gz_wp; /* initialize window position */ + + /* go to byte boundary */ + n = k & 7; + DUMPBITS(n); + + /* get the length and its complement */ + NEEDBITS(glbl, 16) + n = ((unsigned) b & 0xffff); + DUMPBITS(16) + NEEDBITS(glbl, 16) + if (n != (unsigned) ((~b) & 0xffff)) + return 1; /* error in compressed data */ + DUMPBITS(16) + /* read and output the compressed data */ + while (n--) { + NEEDBITS(glbl, 8) + glbl->gz_slide[w++] = (uch) b; + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + DUMPBITS(8) + } + + /* restore the globals from the locals */ + glbl->gz_wp = w; /* restore global window pointer */ + glbl->gz_bb = b; /* restore global bit buffer */ + glbl->gz_bk = k; + return 0; +} + +/* decompress an inflated type 1 (fixed Huffman codes) block. We should + either replace this with a custom decoder, or at least precompute the + Huffman tables. 
*/ +static int +inflate_fixed(glbl) + struct inflate *glbl; +{ + /* if first time, set up tables for fixed blocks */ + if (glbl->gz_fixed_tl == (struct huft *) NULL) { + int i; /* temporary variable */ + static unsigned l[288]; /* length list for huft_build */ + + /* literal table */ + for (i = 0; i < 144; i++) + l[i] = 8; + for (; i < 256; i++) + l[i] = 9; + for (; i < 280; i++) + l[i] = 7; + for (; i < 288; i++) /* make a complete, but wrong code + * set */ + l[i] = 8; + glbl->gz_fixed_bl = 7; + if ((i = huft_build(glbl, l, 288, 257, cplens, cplext, + &glbl->gz_fixed_tl, &glbl->gz_fixed_bl)) != 0) { + glbl->gz_fixed_tl = (struct huft *) NULL; + return i; + } + /* distance table */ + for (i = 0; i < 30; i++) /* make an incomplete code + * set */ + l[i] = 5; + glbl->gz_fixed_bd = 5; + if ((i = huft_build(glbl, l, 30, 0, cpdist, cpdext, + &glbl->gz_fixed_td, &glbl->gz_fixed_bd)) > 1) { + huft_free(glbl, glbl->gz_fixed_tl); + glbl->gz_fixed_tl = (struct huft *) NULL; + return i; + } + } + /* decompress until an end-of-block code */ + return inflate_codes(glbl, glbl->gz_fixed_tl, glbl->gz_fixed_td, glbl->gz_fixed_bl, glbl->gz_fixed_bd) != 0; +} + +/* decompress an inflated type 2 (dynamic Huffman codes) block. 
*/ +static int +inflate_dynamic(glbl) + struct inflate *glbl; +{ + int i; /* temporary variables */ + unsigned j; + unsigned l; /* last length */ + unsigned m; /* mask for bit lengths table */ + unsigned n; /* number of lengths to get */ + struct huft *tl; /* literal/length code table */ + struct huft *td; /* distance code table */ + int bl; /* lookup bits for tl */ + int bd; /* lookup bits for td */ + unsigned nb; /* number of bit length codes */ + unsigned nl; /* number of literal/length codes */ + unsigned nd; /* number of distance codes */ +#ifdef PKZIP_BUG_WORKAROUND + unsigned ll[288 + 32]; /* literal/length and distance code + * lengths */ +#else + unsigned ll[286 + 30]; /* literal/length and distance code + * lengths */ +#endif + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local bit buffer */ + b = glbl->gz_bb; + k = glbl->gz_bk; + + /* read in table lengths */ + NEEDBITS(glbl, 5) + nl = 257 + ((unsigned) b & 0x1f); /* number of + * literal/length codes */ + DUMPBITS(5) + NEEDBITS(glbl, 5) + nd = 1 + ((unsigned) b & 0x1f); /* number of distance codes */ + DUMPBITS(5) + NEEDBITS(glbl, 4) + nb = 4 + ((unsigned) b & 0xf); /* number of bit length codes */ + DUMPBITS(4) +#ifdef PKZIP_BUG_WORKAROUND + if (nl > 288 || nd > 32) +#else + if (nl > 286 || nd > 30) +#endif + return 1; /* bad lengths */ + /* read in bit-length-code lengths */ + for (j = 0; j < nb; j++) { + NEEDBITS(glbl, 3) + ll[border[j]] = (unsigned) b & 7; + DUMPBITS(3) + } + for (; j < 19; j++) + ll[border[j]] = 0; + + /* build decoding table for trees--single level, 7 bit lookup */ + bl = 7; + if ((i = huft_build(glbl, ll, 19, 19, NULL, NULL, &tl, &bl)) != 0) { + if (i == 1) + huft_free(glbl, tl); + return i; /* incomplete code set */ + } + /* read in literal and distance code lengths */ + n = nl + nd; + m = mask[bl]; + i = l = 0; + while ((unsigned) i < n) { + NEEDBITS(glbl, (unsigned) bl) + j = (td = tl + ((unsigned) b & m))->b; + 
DUMPBITS(j) + j = td->v.n; + if (j < 16) /* length of code in bits (0..15) */ + ll[i++] = l = j; /* save last length in l */ + else if (j == 16) { /* repeat last length 3 to 6 times */ + NEEDBITS(glbl, 2) + j = 3 + ((unsigned) b & 3); + DUMPBITS(2) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = l; + } else if (j == 17) { /* 3 to 10 zero length codes */ + NEEDBITS(glbl, 3) + j = 3 + ((unsigned) b & 7); + DUMPBITS(3) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = 0; + l = 0; + } else { /* j == 18: 11 to 138 zero length codes */ + NEEDBITS(glbl, 7) + j = 11 + ((unsigned) b & 0x7f); + DUMPBITS(7) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = 0; + l = 0; + } + } + + /* free decoding table for trees */ + huft_free(glbl, tl); + + /* restore the global bit buffer */ + glbl->gz_bb = b; + glbl->gz_bk = k; + + /* build the decoding tables for literal/length and distance codes */ + bl = lbits; + i = huft_build(glbl, ll, nl, 257, cplens, cplext, &tl, &bl); + if (i != 0) { + if (i == 1 && !qflag) { + FPRINTF("(incomplete l-tree) "); + huft_free(glbl, tl); + } + return i; /* incomplete code set */ + } + bd = dbits; + i = huft_build(glbl, ll + nl, nd, 0, cpdist, cpdext, &td, &bd); + if (i != 0) { + if (i == 1 && !qflag) { + FPRINTF("(incomplete d-tree) "); +#ifdef PKZIP_BUG_WORKAROUND + i = 0; + } +#else + huft_free(glbl, td); + } + huft_free(glbl, tl); + return i; /* incomplete code set */ +#endif + } + /* decompress until an end-of-block code */ + if (inflate_codes(glbl, tl, td, bl, bd)) + return 1; + + /* free the decoding tables, return */ + huft_free(glbl, tl); + huft_free(glbl, td); + return 0; +} + +/* decompress an inflated block */ +static int +inflate_block(glbl, e) + struct inflate *glbl; + int *e; /* last block flag */ +{ + unsigned t; /* block type */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local bit buffer */ + b = glbl->gz_bb; + k = glbl->gz_bk; 
+ + /* read in last block bit */ + NEEDBITS(glbl, 1) + * e = (int) b & 1; + DUMPBITS(1) + /* read in block type */ + NEEDBITS(glbl, 2) + t = (unsigned) b & 3; + DUMPBITS(2) + /* restore the global bit buffer */ + glbl->gz_bb = b; + glbl->gz_bk = k; + + /* inflate that block type */ + if (t == 2) + return inflate_dynamic(glbl); + if (t == 0) + return inflate_stored(glbl); + if (t == 1) + return inflate_fixed(glbl); + /* bad block type */ + return 2; +} + + + +/* decompress an inflated entry */ +static int +xinflate(glbl) + struct inflate *glbl; +{ + int e; /* last block flag */ + int r; /* result code */ + unsigned h; /* maximum struct huft's malloc'ed */ + + glbl->gz_fixed_tl = (struct huft *) NULL; + + /* initialize window, bit buffer */ + glbl->gz_wp = 0; + glbl->gz_bk = 0; + glbl->gz_bb = 0; + + /* decompress until the last block */ + h = 0; + do { + glbl->gz_hufts = 0; + if ((r = inflate_block(glbl, &e)) != 0) + return r; + if (glbl->gz_hufts > h) + h = glbl->gz_hufts; + } while (!e); + + /* flush out slide */ + FLUSH(glbl, glbl->gz_wp); + + /* return success */ + return 0; +} + +/* Nobody uses this - why not? 
*/ +int +inflate(glbl) + struct inflate *glbl; +{ + int i; +#ifdef KERNEL + u_char *p = NULL; + + if (!glbl->gz_slide) + p = glbl->gz_slide = malloc(GZ_WSIZE, M_GZIP, M_WAITOK); +#endif + if (!glbl->gz_slide) +#ifdef KERNEL + return(ENOMEM); +#else + return 3; /* kzip expects 3 */ +#endif + i = xinflate(glbl); + + if (glbl->gz_fixed_td != (struct huft *) NULL) { + huft_free(glbl, glbl->gz_fixed_td); + glbl->gz_fixed_td = (struct huft *) NULL; + } + if (glbl->gz_fixed_tl != (struct huft *) NULL) { + huft_free(glbl, glbl->gz_fixed_tl); + glbl->gz_fixed_tl = (struct huft *) NULL; + } +#ifdef KERNEL + if (p == glbl->gz_slide) { + free(glbl->gz_slide, M_GZIP); + glbl->gz_slide = NULL; + } +#endif + return i; +} +/* ----------------------- END INFLATE.C */ diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 61a0a14..f108547 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -1,4 +1,7 @@ /* + * Copyright (c) 1995 Terrence R. Lambert + * All rights reserved. + * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. @@ -35,100 +38,270 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)init_main.c 8.16 (Berkeley) 5/14/95 + * @(#)init_main.c 8.9 (Berkeley) 1/21/94 + * $Id: init_main.c,v 1.58 1997/03/01 17:49:09 wosch Exp $ */ +#include "opt_rlimit.h" +#include "opt_devfs.h" + #include <sys/param.h> +#include <sys/file.h> #include <sys/filedesc.h> -#include <sys/errno.h> -#include <sys/exec.h> #include <sys/kernel.h> #include <sys/mount.h> -#include <sys/map.h> +#include <sys/sysctl.h> #include <sys/proc.h> #include <sys/resourcevar.h> #include <sys/signalvar.h> #include <sys/systm.h> #include <sys/vnode.h> -#include <sys/conf.h> -#include <sys/buf.h> -#include <sys/clist.h> -#include <sys/device.h> -#include <sys/protosw.h> +#include <sys/sysent.h> #include <sys/reboot.h> -#include <sys/user.h> -#include <sys/syscallargs.h> - -#include <ufs/ufs/quota.h> +#include <sys/sysproto.h> +#include <sys/vmmeter.h> #include <machine/cpu.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/user.h> +#include <sys/copyright.h> -#ifdef HPFPLIB -char copyright[] = -"Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California.\nCopyright (c) 1992 Hewlett-Packard Company\nCopyright (c) 1992 Motorola Inc.\nAll rights reserved.\n\n"; -#else -char copyright[] = -"Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California. All rights reserved.\n\n"; -#endif +extern struct linker_set sysinit_set; /* XXX */ + +extern void __main __P((void)); +extern void main __P((void *framep)); /* Components of the first process -- never freed. 
*/ -struct session session0; -struct pgrp pgrp0; +static struct session session0; +static struct pgrp pgrp0; struct proc proc0; -struct pcred cred0; -struct filedesc0 filedesc0; -struct plimit limit0; -struct vmspace vmspace0; +static struct pcred cred0; +static struct filedesc0 filedesc0; +static struct plimit limit0; +static struct vmspace vmspace0; struct proc *curproc = &proc0; -struct proc *initproc, *pageproc; +struct proc *initproc; -int cmask = CMASK; +int cmask = CMASK; extern struct user *proc0paddr; -struct vnode *rootvp, *swapdev_vp; +struct vnode *rootvp; int boothowto; + struct timeval boottime; +SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, + CTLFLAG_RW, &boottime, timeval, ""); + struct timeval runtime; -static void start_init __P((struct proc *p, void *framep)); +/* + * Promiscuous argument pass for start_init() + * + * This is a kludge because we use a return from main() rather than a call + * to a new routine in locore.s to kick the kernel alive from locore.s. + */ +static void *init_framep; + + +#if __GNUC__ >= 2 +void __main() {} +#endif + + +/* + * This ensures that there is at least one entry so that the sysinit_set + * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never + * executed. + */ +SYSINIT(placeholder, SI_SUB_DUMMY,SI_ORDER_ANY, NULL, NULL) + /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. + * + * This allows simple addition of new kernel subsystems that require + * boot time initialization. It also allows substitution of subsystem + * (for instance, a scheduler, kernel profiler, or VM system) by object + * module. Finally, it allows for optional "kernel threads", like an LFS + * cleaner. 
*/ +void main(framep) void *framep; { - register struct proc *p; - register struct filedesc0 *fdp; - register struct pdevinit *pdev; - register int i; - int s; - register_t rval[2]; - extern struct pdevinit pdevinit[]; - extern void roundrobin __P((void *)); - extern void schedcpu __P((void *)); + + register struct sysinit **sipp; /* system initialization*/ + register struct sysinit **xipp; /* interior loop of sort*/ + register struct sysinit *save; /* bubble*/ + int rval[2]; /* SI_TYPE_KTHREAD support*/ /* - * Initialize the current process pointer (curproc) before - * any possible traps/probes to simplify trap processing. + * Save the locore.s frame pointer for start_init(). */ - p = &proc0; - curproc = p; + init_framep = framep; + /* - * Attempt to find console and initialize - * in case of early panic or other messages. + * Perform a bubble sort of the system initialization objects by + * their subsystem (primary key) and order (secondary key). + * + * Since some things care about execution order, this is the + * operation which ensures continued function. */ - consinit(); - printf(copyright); + for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) { + for( xipp = sipp + 1; *xipp; xipp++) { + if( (*sipp)->subsystem < (*xipp)->subsystem || + ( (*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order < (*xipp)->order)) + continue; /* skip*/ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. + * + * The last item on the list is expected to be the scheduler, + * which will not return. 
+ */ + for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) { + if( (*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s)*/ + + switch( (*sipp)->type) { + case SI_TYPE_DEFAULT: + /* no special processing*/ + (*((*sipp)->func))( (*sipp)->udata); + break; + + case SI_TYPE_KTHREAD: + /* kernel thread*/ + if (fork(&proc0, NULL, rval)) + panic("fork kernel process"); + if (rval[1]) { + (*((*sipp)->func))( (*sipp)->udata); + /* + * The call to start "init" returns + * here after the scheduler has been + * started, and returns to the caller + * in i386/i386/locore.s. This is a + * necessary part of initialization + * and is rather non-obvious. + * + * No other "kernel threads" should + * return here. Call panic() instead. + */ + return; + } + break; + + default: + panic( "init_main: unrecognized init type"); + } + } + + /* NOTREACHED*/ +} + + +/* + * Start a kernel process. This is called after a fork() call in + * main() in the file kern/init_main.c. + * + * This function is used to start "internal" daemons. + */ +/* ARGSUSED*/ +void +kproc_start(udata) + void *udata; +{ + struct kproc_desc *kp = udata; + struct proc *p = curproc; + + /* save a global descriptor, if desired*/ + if( kp->global_procpp != NULL) + *kp->global_procpp = p; + + /* this is a non-swapped system process*/ + p->p_flag |= P_INMEM | P_SYSTEM; - vm_mem_init(); - kmeminit(); - cpu_startup(); + /* set up arg0 for 'ps', et al*/ + strcpy( p->p_comm, kp->arg0); + + /* call the processes' main()...*/ + (*kp->func)(); + + /* NOTREACHED */ + panic("kproc_start: %s", kp->arg0); +} + + +/* + *************************************************************************** + **** + **** The following SYSINIT's belong elsewhere, but have not yet + **** been moved. + **** + *************************************************************************** + */ +#ifdef OMIT +/* + * Handled by vfs_mountroot (bad idea) at this time... should be + * done the same as 4.4Lite2. 
+ */ +SYSINIT(swapinit, SI_SUB_SWAP, SI_ORDER_FIRST, swapinit, NULL) +#endif /* OMIT*/ + +static void print_caddr_t __P((void *data)); +static void +print_caddr_t(data) + void *data; +{ + printf("%s", (char *)data); +} +SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) + + +/* + *************************************************************************** + **** + **** The two following SYSINT's are proc0 specific glue code. I am not + **** convinced that they can not be safely combined, but their order of + **** operation has been maintained as the same as the original init_main.c + **** for right now. + **** + **** These probably belong in init_proc.c or kern_proc.c, since they + **** deal with proc0 (the fork template process). + **** + *************************************************************************** + */ +/* ARGSUSED*/ +static void proc0_init __P((void *dummy)); +static void +proc0_init(dummy) + void *dummy; +{ + register struct proc *p; + register struct filedesc0 *fdp; + register unsigned i; + + /* + * Initialize the current process pointer (curproc) before + * any possible traps/probes to simplify trap processing. + */ + p = &proc0; + curproc = p; /* XXX redundant*/ /* * Initialize process and pgrp structures. @@ -136,6 +309,11 @@ main(framep) procinit(); /* + * Initialize sleep queue hash table + */ + sleepinit(); + + /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); @@ -148,9 +326,14 @@ main(framep) session0.s_count = 1; session0.s_leader = p; + p->p_sysent = &aout_sysvec; + p->p_flag = P_INMEM | P_SYSTEM; p->p_stat = SRUN; p->p_nice = NZERO; + p->p_rtprio.type = RTP_PRIO_NORMAL; + p->p_rtprio.prio = 0; + bcopy("swapper", p->p_comm, sizeof ("swapper")); /* Create credentials. 
*/ @@ -173,8 +356,10 @@ main(framep) for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) limit0.pl_rlimit[i].rlim_cur = limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; - limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = NOFILE; - limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = MAXUPRC; + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; + limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = + limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; i = ptoa(cnt.v_free_count); limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; @@ -185,11 +370,22 @@ main(framep) p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; pmap_pinit(&vmspace0.vm_pmap); - vm_map_init(&p->p_vmspace->vm_map, round_page(VM_MIN_ADDRESS), - trunc_page(VM_MAX_ADDRESS), TRUE); + vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), + trunc_page(VM_MAXUSER_ADDRESS), TRUE); vmspace0.vm_map.pmap = &vmspace0.vm_pmap; p->p_addr = proc0paddr; /* XXX */ +#define INCOMPAT_LITES2 +#ifdef INCOMPAT_LITES2 + /* + * proc0 needs to have a coherent frame base, too. + * This probably makes the identical call for the init proc + * that happens later unnecessary since it should inherit + * it during the fork. + */ + cpu_set_init_frame(p, init_framep); /* XXX! */ +#endif /* INCOMPAT_LITES2*/ + /* * We continue to place resource usage info and signal * actions in the user struct so they're pageable. @@ -201,104 +397,127 @@ main(framep) * Charge root for one process. */ (void)chgproccnt(0, 1); +} +SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) - rqinit(); - - /* Configure virtual memory system, set vm rlimits. */ - vm_init_limits(p); - - /* Initialize the file systems. */ - vfsinit(); +/* ARGSUSED*/ +static void proc0_post __P((void *dummy)); +static void +proc0_post(dummy) + void *dummy; +{ + struct timeval tv; - /* Start real time and statistics clocks. 
*/ - initclocks(); + /* + * Now can look at time, having had a chance to verify the time + * from the file system. Reset p->p_rtime as it may have been + * munched in mi_switch() after the time got set. + */ + gettime(&boottime); + proc0.p_stats->p_start = runtime = mono_time = boottime; + proc0.p_rtime.tv_sec = proc0.p_rtime.tv_usec = 0; - /* Initialize mbuf's. */ - mbinit(); + /* + * Give the ``random'' number generator a thump. + */ + microtime(&tv); + srandom(tv.tv_sec ^ tv.tv_usec); - /* Initialize clists. */ - clist_init(); + /* Initialize signal state for process 0. */ + siginit(&proc0); +} +SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) -#ifdef SYSVSHM - /* Initialize System V style shared memory. */ - shminit(); -#endif - /* Attach pseudo-devices. */ - for (pdev = pdevinit; pdev->pdev_attach != NULL; pdev++) - (*pdev->pdev_attach)(pdev->pdev_count); - /* - * Initialize protocols. Block reception of incoming packets - * until everything is ready. - */ - s = splimp(); - ifinit(); - domaininit(); - splx(s); - -#ifdef GPROF - /* Initialize kernel profiling. */ - kmstartup(); -#endif +/* + *************************************************************************** + **** + **** The following SYSINIT's and glue code should be moved to the + **** respective files on a per subsystem basis. + **** + *************************************************************************** + */ +/* ARGSUSED*/ +static void sched_setup __P((void *dummy)); +static void +sched_setup(dummy) + void *dummy; +{ /* Kick off timeout driven events by calling first time. */ roundrobin(NULL); schedcpu(NULL); +} +SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) +/* ARGSUSED*/ +static void xxx_vfs_mountroot __P((void *fsnamep)); +static void +xxx_vfs_mountroot(fsnamep) + void *fsnamep; +{ /* Mount the root file system. 
*/ - if (vfs_mountroot()) + if (vfs_mountrootfs(*((char **) fsnamep))) panic("cannot mount root"); - mountlist.cqh_first->mnt_flag |= MNT_ROOTFS; +} +SYSINIT(mountroot, SI_SUB_ROOT, SI_ORDER_FIRST, xxx_vfs_mountroot, &mountrootfsname) + +/* ARGSUSED*/ +static void xxx_vfs_root_fdtab __P((void *dummy)); +static void +xxx_vfs_root_fdtab(dummy) + void *dummy; +{ + register struct filedesc0 *fdp = &filedesc0; /* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. */ if (VFS_ROOT(mountlist.cqh_first, &rootvnode)) panic("cannot find root vnode"); fdp->fd_fd.fd_cdir = rootvnode; VREF(fdp->fd_fd.fd_cdir); - VOP_UNLOCK(rootvnode, 0, p); + VOP_UNLOCK(rootvnode, 0, &proc0); fdp->fd_fd.fd_rdir = NULL; - swapinit(); +} +SYSINIT(retrofit, SI_SUB_ROOT_FDTAB, SI_ORDER_FIRST, xxx_vfs_root_fdtab, NULL) - /* - * Now can look at time, having had a chance to verify the time - * from the file system. Reset p->p_rtime as it may have been - * munched in mi_switch() after the time got set. - */ - p->p_stats->p_start = runtime = mono_time = boottime = time; - p->p_rtime.tv_sec = p->p_rtime.tv_usec = 0; - /* Initialize signal state for process 0. */ - siginit(p); +/* + *************************************************************************** + **** + **** The following code probably belongs in another file, like + **** kern/init_init.c. It is here for two reasons only: + **** + **** 1) This code returns to startup the system; this is + **** abnormal for a kernel thread. + **** 2) This code promiscuously uses init_frame + **** + *************************************************************************** + */ - /* Create process 1 (init(8)). */ - if (fork(p, NULL, rval)) - panic("fork init"); - if (rval[1]) { - start_init(curproc, framep); - return; - } +static void kthread_init __P((void *dummy)); +SYSINIT_KT(init,SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kthread_init, NULL) - /* Create process 2 (the pageout daemon). 
*/ - if (fork(p, NULL, rval)) - panic("fork pager"); - if (rval[1]) { - /* - * Now in process 2. - */ - p = curproc; - pageproc = p; - p->p_flag |= P_INMEM | P_SYSTEM; /* XXX */ - bcopy("pagedaemon", curproc->p_comm, sizeof ("pagedaemon")); - vm_pageout(); - /* NOTREACHED */ - } - /* The scheduler is an infinite loop. */ - scheduler(); - /* NOTREACHED */ +static void start_init __P((struct proc *p, void *framep)); + +/* ARGSUSED*/ +static void +kthread_init(dummy) + void *dummy; +{ + + /* Create process 1 (init(8)). */ + start_init(curproc, init_framep); + + /* + * This is the only kernel thread allowed to return yo the + * caller!!! + */ + return; } + /* * List of paths to try when searching for "init". */ @@ -306,6 +525,7 @@ static char *initpaths[] = { "/sbin/init", "/sbin/oinit", "/sbin/init.bak", + "/stand/sysinstall", NULL, }; @@ -319,14 +539,8 @@ start_init(p, framep) void *framep; { vm_offset_t addr; - struct execve_args /* { - syscallarg(char *) path; - syscallarg(char **) argp; - syscallarg(char **) envp; - } */ args; - int options, i, error; - register_t retval[2]; - char flags[4] = "-", *flagsp; + struct execve_args args; + int options, i, retval[2], error; char **pathp, *path, *ucp, **uap, *arg0, *arg1; initproc = p; @@ -343,66 +557,74 @@ start_init(p, framep) /* * Need just enough stack to hold the faked-up "execve()" arguments. */ - addr = trunc_page(VM_MAX_ADDRESS - PAGE_SIZE); - if (vm_allocate(&p->p_vmspace->vm_map, &addr, PAGE_SIZE, FALSE) != 0) + addr = trunc_page(VM_MAXUSER_ADDRESS - PAGE_SIZE); + if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; + p->p_vmspace->vm_ssize = 1; for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) { /* - * Construct the boot flag argument. + * Move out the boot flag argument. 
*/ options = 0; - flagsp = flags + 1; ucp = (char *)USRSTACK; + (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { - *flagsp++ = 's'; + (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { - *flagsp++ = 'f'; + (void)subyte(--ucp, 'f'); options = 1; } #endif - /* - * Move out the flags (arg 1), if necessary. - */ - if (options != 0) { - *flagsp++ = '\0'; - i = flagsp - flags; - (void)copyout((caddr_t)flags, (caddr_t)(ucp -= i), i); - arg1 = ucp; - } + +#ifdef BOOTCDROM + (void)subyte(--ucp, 'C'); + options = 1; +#endif + +#if defined(DEVFS) && defined(DEVFS_ROOT) + (void)subyte(--ucp, 'd'); + options = 1; +#endif + if (options == 0) + (void)subyte(--ucp, '-'); + (void)subyte(--ucp, '-'); /* leading hyphen */ + arg1 = ucp; /* * Move out the file name (also arg 0). */ - i = strlen(path) + 1; - (void)copyout((caddr_t)path, (caddr_t)(ucp -= i), i); + for (i = strlen(path) + 1; i >= 0; i--) + (void)subyte(--ucp, path[i]); arg0 = ucp; /* * Move out the arg pointers. */ - uap = (char **)((long)ucp & ~ALIGNBYTES); + uap = (char **)((int)ucp & ~(NBPW-1)); (void)suword((caddr_t)--uap, 0); /* terminator */ - if (options != 0) - (void)suword((caddr_t)--uap, (long)arg1); - (void)suword((caddr_t)--uap, (long)arg0); + (void)suword((caddr_t)--uap, (int)arg1); + (void)suword((caddr_t)--uap, (int)arg0); /* * Point at the arguments. */ - SCARG(&args, path) = arg0; - SCARG(&args, argp) = uap; - SCARG(&args, envp) = NULL; + args.fname = arg0; + args.argv = uap; + args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. + * + * Otherwise return to main() which returns to btext + * which completes the system startup. 
*/ - if ((error = execve(p, &args, retval)) == 0) + if ((error = execve(p, &args, &retval[0])) == 0) return; if (error != ENOENT) printf("exec %s: error %d\n", path, error); diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index 0bbdd20..6954a04 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -2,766 +2,286 @@ * System call switch table. * * DO NOT EDIT-- this file is automatically generated. - * created from @(#)syscalls.master 8.6 (Berkeley) 3/30/95 + * created from Id: syscalls.master,v 1.33 1997/02/22 09:39:21 peter Exp */ #include <sys/param.h> -#include <sys/systm.h> -#include <sys/signal.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> -int nosys(); -int exit(); -int fork(); -int read(); -int write(); -int open(); -int close(); -int wait4(); -int link(); -int unlink(); -int chdir(); -int fchdir(); -int mknod(); -int chmod(); -int chown(); -int obreak(); -int getfsstat(); -int getpid(); -int mount(); -int unmount(); -int setuid(); -int getuid(); -int geteuid(); -int ptrace(); -int recvmsg(); -int sendmsg(); -int recvfrom(); -int accept(); -int getpeername(); -int getsockname(); -int access(); -int chflags(); -int fchflags(); -int sync(); -int kill(); -int getppid(); -int dup(); -int pipe(); -int getegid(); -int profil(); -#ifdef KTRACE -int ktrace(); -#else -#endif -int sigaction(); -int getgid(); -int sigprocmask(); -int getlogin(); -int setlogin(); -int acct(); -int sigpending(); -int sigaltstack(); -int ioctl(); -int reboot(); -int revoke(); -int symlink(); -int readlink(); -int execve(); -int umask(); -int chroot(); -int msync(); -int vfork(); -int sbrk(); -int sstk(); -int ovadvise(); -int munmap(); -int mprotect(); -int madvise(); -int mincore(); -int getgroups(); -int setgroups(); -int getpgrp(); -int setpgid(); -int setitimer(); -int swapon(); -int getitimer(); -int getdtablesize(); -int dup2(); -int fcntl(); -int select(); -int fsync(); -int setpriority(); -int socket(); -int connect(); -int getpriority(); -int 
sigreturn(); -int bind(); -int setsockopt(); -int listen(); -int sigsuspend(); -#ifdef TRACE -int vtrace(); -#else -#endif -int gettimeofday(); -int getrusage(); -int getsockopt(); -#ifdef vax -int resuba(); -#else -#endif -int readv(); -int writev(); -int settimeofday(); -int fchown(); -int fchmod(); -int rename(); -int flock(); -int mkfifo(); -int sendto(); -int shutdown(); -int socketpair(); -int mkdir(); -int rmdir(); -int utimes(); -int adjtime(); -int setsid(); -int quotactl(); -#ifdef NFS -int nfssvc(); -#else -#endif -int statfs(); -int fstatfs(); -#ifdef NFS -int getfh(); -#else -#endif -#if defined(SYSVSHM) && !defined(alpha) -#else -#endif -int setgid(); -int setegid(); -int seteuid(); -#ifdef LFS -int lfs_bmapv(); -int lfs_markv(); -int lfs_segclean(); -int lfs_segwait(); -#else -#endif -int stat(); -int fstat(); -int lstat(); -int pathconf(); -int fpathconf(); -int getrlimit(); -int setrlimit(); -int getdirentries(); -int mmap(); -int nosys(); -int lseek(); -int truncate(); -int ftruncate(); -int __sysctl(); -int mlock(); -int munlock(); -int undelete(); -#if defined(SYSVSHM) && 0 -int shmat(); -int shmctl(); -int shmdt(); -int shmget(); -#else -#endif +#include <sys/sysent.h> +#include <sys/sysproto.h> #ifdef COMPAT_43 -#define compat_43(func) __CONCAT(compat_43_,func) - -int compat_43(creat)(); -int compat_43(lseek)(); -int compat_43(stat)(); -int compat_43(lstat)(); -#ifdef KTRACE +#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name) #else +#define compat(n, name) 0, (sy_call_t *)nosys #endif -int compat_43(fstat)(); -int compat_43(getkerninfo)(); -int compat_43(getpagesize)(); -int compat_43(mmap)(); -int compat_43(wait)(); -int compat_43(gethostname)(); -int compat_43(sethostname)(); -int compat_43(accept)(); -int compat_43(send)(); -int compat_43(recv)(); -int compat_43(sigvec)(); -int compat_43(sigblock)(); -int compat_43(sigsetmask)(); -int compat_43(sigstack)(); -int compat_43(recvmsg)(); -int compat_43(sendmsg)(); -#ifdef TRACE -#else 
-#endif -#ifdef vax -#else -#endif -int compat_43(recvfrom)(); -int compat_43(setreuid)(); -int compat_43(setregid)(); -int compat_43(truncate)(); -int compat_43(ftruncate)(); -int compat_43(getpeername)(); -int compat_43(gethostid)(); -int compat_43(sethostid)(); -int compat_43(getrlimit)(); -int compat_43(setrlimit)(); -int compat_43(killpg)(); -int compat_43(quota)(); -int compat_43(getsockname)(); -#ifdef NFS -#else -#endif -int compat_43(getdirentries)(); -#ifdef NFS -#else -#endif -#if defined(SYSVSHM) && !defined(alpha) -int compat_43(shmsys)(); -#else -#endif -#ifdef LFS -#else -#endif -#if defined(SYSVSHM) && 0 -#else -#endif - -#else /* COMPAT_43 */ -#define compat_43(func) nosys -#endif /* COMPAT_43 */ - -#define s(type) sizeof(type) +/* The casts are bogus but will do for now. */ struct sysent sysent[] = { - { 0, 0, - nosys }, /* 0 = syscall */ - { 1, s(struct exit_args), - exit }, /* 1 = exit */ - { 0, 0, - fork }, /* 2 = fork */ - { 3, s(struct read_args), - read }, /* 3 = read */ - { 3, s(struct write_args), - write }, /* 4 = write */ - { 3, s(struct open_args), - open }, /* 5 = open */ - { 1, s(struct close_args), - close }, /* 6 = close */ - { 4, s(struct wait4_args), - wait4 }, /* 7 = wait4 */ - { 2, s(struct compat_43_creat_args), - compat_43(creat) }, /* 8 = compat_43 creat */ - { 2, s(struct link_args), - link }, /* 9 = link */ - { 1, s(struct unlink_args), - unlink }, /* 10 = unlink */ - { 0, 0, - nosys }, /* 11 = obsolete execv */ - { 1, s(struct chdir_args), - chdir }, /* 12 = chdir */ - { 1, s(struct fchdir_args), - fchdir }, /* 13 = fchdir */ - { 3, s(struct mknod_args), - mknod }, /* 14 = mknod */ - { 2, s(struct chmod_args), - chmod }, /* 15 = chmod */ - { 3, s(struct chown_args), - chown }, /* 16 = chown */ - { 1, s(struct obreak_args), - obreak }, /* 17 = break */ - { 3, s(struct getfsstat_args), - getfsstat }, /* 18 = getfsstat */ - { 3, s(struct compat_43_lseek_args), - compat_43(lseek) }, /* 19 = compat_43 lseek */ - { 0, 0, - 
getpid }, /* 20 = getpid */ - { 4, s(struct mount_args), - mount }, /* 21 = mount */ - { 2, s(struct unmount_args), - unmount }, /* 22 = unmount */ - { 1, s(struct setuid_args), - setuid }, /* 23 = setuid */ - { 0, 0, - getuid }, /* 24 = getuid */ - { 0, 0, - geteuid }, /* 25 = geteuid */ - { 4, s(struct ptrace_args), - ptrace }, /* 26 = ptrace */ - { 3, s(struct recvmsg_args), - recvmsg }, /* 27 = recvmsg */ - { 3, s(struct sendmsg_args), - sendmsg }, /* 28 = sendmsg */ - { 6, s(struct recvfrom_args), - recvfrom }, /* 29 = recvfrom */ - { 3, s(struct accept_args), - accept }, /* 30 = accept */ - { 3, s(struct getpeername_args), - getpeername }, /* 31 = getpeername */ - { 3, s(struct getsockname_args), - getsockname }, /* 32 = getsockname */ - { 2, s(struct access_args), - access }, /* 33 = access */ - { 2, s(struct chflags_args), - chflags }, /* 34 = chflags */ - { 2, s(struct fchflags_args), - fchflags }, /* 35 = fchflags */ - { 0, 0, - sync }, /* 36 = sync */ - { 2, s(struct kill_args), - kill }, /* 37 = kill */ - { 2, s(struct compat_43_stat_args), - compat_43(stat) }, /* 38 = compat_43 stat */ - { 0, 0, - getppid }, /* 39 = getppid */ - { 2, s(struct compat_43_lstat_args), - compat_43(lstat) }, /* 40 = compat_43 lstat */ - { 1, s(struct dup_args), - dup }, /* 41 = dup */ - { 0, 0, - pipe }, /* 42 = pipe */ - { 0, 0, - getegid }, /* 43 = getegid */ - { 4, s(struct profil_args), - profil }, /* 44 = profil */ -#ifdef KTRACE - { 4, s(struct ktrace_args), - ktrace }, /* 45 = ktrace */ -#else - { 0, 0, - nosys }, /* 45 = unimplemented ktrace */ -#endif - { 3, s(struct sigaction_args), - sigaction }, /* 46 = sigaction */ - { 0, 0, - getgid }, /* 47 = getgid */ - { 2, s(struct sigprocmask_args), - sigprocmask }, /* 48 = sigprocmask */ - { 2, s(struct getlogin_args), - getlogin }, /* 49 = getlogin */ - { 1, s(struct setlogin_args), - setlogin }, /* 50 = setlogin */ - { 1, s(struct acct_args), - acct }, /* 51 = acct */ - { 0, 0, - sigpending }, /* 52 = sigpending */ - { 
2, s(struct sigaltstack_args), - sigaltstack }, /* 53 = sigaltstack */ - { 3, s(struct ioctl_args), - ioctl }, /* 54 = ioctl */ - { 1, s(struct reboot_args), - reboot }, /* 55 = reboot */ - { 1, s(struct revoke_args), - revoke }, /* 56 = revoke */ - { 2, s(struct symlink_args), - symlink }, /* 57 = symlink */ - { 3, s(struct readlink_args), - readlink }, /* 58 = readlink */ - { 3, s(struct execve_args), - execve }, /* 59 = execve */ - { 1, s(struct umask_args), - umask }, /* 60 = umask */ - { 1, s(struct chroot_args), - chroot }, /* 61 = chroot */ - { 2, s(struct compat_43_fstat_args), - compat_43(fstat) }, /* 62 = compat_43 fstat */ - { 4, s(struct compat_43_getkerninfo_args), - compat_43(getkerninfo) }, /* 63 = compat_43 getkerninfo */ - { 0, 0, - compat_43(getpagesize) }, /* 64 = compat_43 getpagesize */ - { 2, s(struct msync_args), - msync }, /* 65 = msync */ - { 0, 0, - vfork }, /* 66 = vfork */ - { 0, 0, - nosys }, /* 67 = obsolete vread */ - { 0, 0, - nosys }, /* 68 = obsolete vwrite */ - { 1, s(struct sbrk_args), - sbrk }, /* 69 = sbrk */ - { 1, s(struct sstk_args), - sstk }, /* 70 = sstk */ - { 6, s(struct compat_43_mmap_args), - compat_43(mmap) }, /* 71 = compat_43 mmap */ - { 1, s(struct ovadvise_args), - ovadvise }, /* 72 = vadvise */ - { 2, s(struct munmap_args), - munmap }, /* 73 = munmap */ - { 3, s(struct mprotect_args), - mprotect }, /* 74 = mprotect */ - { 3, s(struct madvise_args), - madvise }, /* 75 = madvise */ - { 0, 0, - nosys }, /* 76 = obsolete vhangup */ - { 0, 0, - nosys }, /* 77 = obsolete vlimit */ - { 3, s(struct mincore_args), - mincore }, /* 78 = mincore */ - { 2, s(struct getgroups_args), - getgroups }, /* 79 = getgroups */ - { 2, s(struct setgroups_args), - setgroups }, /* 80 = setgroups */ - { 0, 0, - getpgrp }, /* 81 = getpgrp */ - { 2, s(struct setpgid_args), - setpgid }, /* 82 = setpgid */ - { 3, s(struct setitimer_args), - setitimer }, /* 83 = setitimer */ - { 0, 0, - compat_43(wait) }, /* 84 = compat_43 wait */ - { 1, 
s(struct swapon_args), - swapon }, /* 85 = swapon */ - { 2, s(struct getitimer_args), - getitimer }, /* 86 = getitimer */ - { 2, s(struct compat_43_gethostname_args), - compat_43(gethostname) }, /* 87 = compat_43 gethostname */ - { 2, s(struct compat_43_sethostname_args), - compat_43(sethostname) }, /* 88 = compat_43 sethostname */ - { 0, 0, - getdtablesize }, /* 89 = getdtablesize */ - { 2, s(struct dup2_args), - dup2 }, /* 90 = dup2 */ - { 0, 0, - nosys }, /* 91 = unimplemented getdopt */ - { 3, s(struct fcntl_args), - fcntl }, /* 92 = fcntl */ - { 5, s(struct select_args), - select }, /* 93 = select */ - { 0, 0, - nosys }, /* 94 = unimplemented setdopt */ - { 1, s(struct fsync_args), - fsync }, /* 95 = fsync */ - { 3, s(struct setpriority_args), - setpriority }, /* 96 = setpriority */ - { 3, s(struct socket_args), - socket }, /* 97 = socket */ - { 3, s(struct connect_args), - connect }, /* 98 = connect */ - { 3, s(struct compat_43_accept_args), - compat_43(accept) }, /* 99 = compat_43 accept */ - { 2, s(struct getpriority_args), - getpriority }, /* 100 = getpriority */ - { 4, s(struct compat_43_send_args), - compat_43(send) }, /* 101 = compat_43 send */ - { 4, s(struct compat_43_recv_args), - compat_43(recv) }, /* 102 = compat_43 recv */ - { 1, s(struct sigreturn_args), - sigreturn }, /* 103 = sigreturn */ - { 3, s(struct bind_args), - bind }, /* 104 = bind */ - { 5, s(struct setsockopt_args), - setsockopt }, /* 105 = setsockopt */ - { 2, s(struct listen_args), - listen }, /* 106 = listen */ - { 0, 0, - nosys }, /* 107 = obsolete vtimes */ - { 3, s(struct compat_43_sigvec_args), - compat_43(sigvec) }, /* 108 = compat_43 sigvec */ - { 1, s(struct compat_43_sigblock_args), - compat_43(sigblock) }, /* 109 = compat_43 sigblock */ - { 1, s(struct compat_43_sigsetmask_args), - compat_43(sigsetmask) }, /* 110 = compat_43 sigsetmask */ - { 1, s(struct sigsuspend_args), - sigsuspend }, /* 111 = sigsuspend */ - { 2, s(struct compat_43_sigstack_args), - compat_43(sigstack) 
}, /* 112 = compat_43 sigstack */ - { 3, s(struct compat_43_recvmsg_args), - compat_43(recvmsg) }, /* 113 = compat_43 recvmsg */ - { 3, s(struct compat_43_sendmsg_args), - compat_43(sendmsg) }, /* 114 = compat_43 sendmsg */ -#ifdef TRACE - { 2, s(struct vtrace_args), - vtrace }, /* 115 = vtrace */ -#else - { 0, 0, - nosys }, /* 115 = obsolete vtrace */ -#endif - { 2, s(struct gettimeofday_args), - gettimeofday }, /* 116 = gettimeofday */ - { 2, s(struct getrusage_args), - getrusage }, /* 117 = getrusage */ - { 5, s(struct getsockopt_args), - getsockopt }, /* 118 = getsockopt */ -#ifdef vax - { 1, s(struct resuba_args), - resuba }, /* 119 = resuba */ -#else - { 0, 0, - nosys }, /* 119 = unimplemented resuba */ -#endif - { 3, s(struct readv_args), - readv }, /* 120 = readv */ - { 3, s(struct writev_args), - writev }, /* 121 = writev */ - { 2, s(struct settimeofday_args), - settimeofday }, /* 122 = settimeofday */ - { 3, s(struct fchown_args), - fchown }, /* 123 = fchown */ - { 2, s(struct fchmod_args), - fchmod }, /* 124 = fchmod */ - { 6, s(struct compat_43_recvfrom_args), - compat_43(recvfrom) }, /* 125 = compat_43 recvfrom */ - { 2, s(struct compat_43_setreuid_args), - compat_43(setreuid) }, /* 126 = compat_43 setreuid */ - { 2, s(struct compat_43_setregid_args), - compat_43(setregid) }, /* 127 = compat_43 setregid */ - { 2, s(struct rename_args), - rename }, /* 128 = rename */ - { 2, s(struct compat_43_truncate_args), - compat_43(truncate) }, /* 129 = compat_43 truncate */ - { 2, s(struct compat_43_ftruncate_args), - compat_43(ftruncate) }, /* 130 = compat_43 ftruncate */ - { 2, s(struct flock_args), - flock }, /* 131 = flock */ - { 2, s(struct mkfifo_args), - mkfifo }, /* 132 = mkfifo */ - { 6, s(struct sendto_args), - sendto }, /* 133 = sendto */ - { 2, s(struct shutdown_args), - shutdown }, /* 134 = shutdown */ - { 4, s(struct socketpair_args), - socketpair }, /* 135 = socketpair */ - { 2, s(struct mkdir_args), - mkdir }, /* 136 = mkdir */ - { 1, s(struct 
rmdir_args), - rmdir }, /* 137 = rmdir */ - { 2, s(struct utimes_args), - utimes }, /* 138 = utimes */ - { 0, 0, - nosys }, /* 139 = obsolete 4.2 sigreturn */ - { 2, s(struct adjtime_args), - adjtime }, /* 140 = adjtime */ - { 3, s(struct compat_43_getpeername_args), - compat_43(getpeername) }, /* 141 = compat_43 getpeername */ - { 0, 0, - compat_43(gethostid) }, /* 142 = compat_43 gethostid */ - { 1, s(struct compat_43_sethostid_args), - compat_43(sethostid) }, /* 143 = compat_43 sethostid */ - { 2, s(struct compat_43_getrlimit_args), - compat_43(getrlimit) }, /* 144 = compat_43 getrlimit */ - { 2, s(struct compat_43_setrlimit_args), - compat_43(setrlimit) }, /* 145 = compat_43 setrlimit */ - { 2, s(struct compat_43_killpg_args), - compat_43(killpg) }, /* 146 = compat_43 killpg */ - { 0, 0, - setsid }, /* 147 = setsid */ - { 4, s(struct quotactl_args), - quotactl }, /* 148 = quotactl */ - { 0, 0, - compat_43(quota) }, /* 149 = compat_43 quota */ - { 3, s(struct compat_43_getsockname_args), - compat_43(getsockname) }, /* 150 = compat_43 getsockname */ - { 0, 0, - nosys }, /* 151 = unimplemented */ - { 0, 0, - nosys }, /* 152 = unimplemented */ - { 0, 0, - nosys }, /* 153 = unimplemented */ - { 0, 0, - nosys }, /* 154 = unimplemented */ + { 0, (sy_call_t *)nosys }, /* 0 = syscall */ + { 1, (sy_call_t *)exit }, /* 1 = exit */ + { 0, (sy_call_t *)fork }, /* 2 = fork */ + { 3, (sy_call_t *)read }, /* 3 = read */ + { 3, (sy_call_t *)write }, /* 4 = write */ + { 3, (sy_call_t *)open }, /* 5 = open */ + { 1, (sy_call_t *)close }, /* 6 = close */ + { 4, (sy_call_t *)wait4 }, /* 7 = wait4 */ + { compat(2,creat) }, /* 8 = old creat */ + { 2, (sy_call_t *)link }, /* 9 = link */ + { 1, (sy_call_t *)unlink }, /* 10 = unlink */ + { 0, (sy_call_t *)nosys }, /* 11 = obsolete execv */ + { 1, (sy_call_t *)chdir }, /* 12 = chdir */ + { 1, (sy_call_t *)fchdir }, /* 13 = fchdir */ + { 3, (sy_call_t *)mknod }, /* 14 = mknod */ + { 2, (sy_call_t *)chmod }, /* 15 = chmod */ + { 3, 
(sy_call_t *)chown }, /* 16 = chown */ + { 1, (sy_call_t *)obreak }, /* 17 = break */ + { 3, (sy_call_t *)getfsstat }, /* 18 = getfsstat */ + { compat(3,lseek) }, /* 19 = old lseek */ + { 0, (sy_call_t *)getpid }, /* 20 = getpid */ + { 4, (sy_call_t *)mount }, /* 21 = mount */ + { 2, (sy_call_t *)unmount }, /* 22 = unmount */ + { 1, (sy_call_t *)setuid }, /* 23 = setuid */ + { 0, (sy_call_t *)getuid }, /* 24 = getuid */ + { 0, (sy_call_t *)geteuid }, /* 25 = geteuid */ + { 4, (sy_call_t *)ptrace }, /* 26 = ptrace */ + { 3, (sy_call_t *)recvmsg }, /* 27 = recvmsg */ + { 3, (sy_call_t *)sendmsg }, /* 28 = sendmsg */ + { 6, (sy_call_t *)recvfrom }, /* 29 = recvfrom */ + { 3, (sy_call_t *)accept }, /* 30 = accept */ + { 3, (sy_call_t *)getpeername }, /* 31 = getpeername */ + { 3, (sy_call_t *)getsockname }, /* 32 = getsockname */ + { 2, (sy_call_t *)access }, /* 33 = access */ + { 2, (sy_call_t *)chflags }, /* 34 = chflags */ + { 2, (sy_call_t *)fchflags }, /* 35 = fchflags */ + { 0, (sy_call_t *)sync }, /* 36 = sync */ + { 2, (sy_call_t *)kill }, /* 37 = kill */ + { compat(2,stat) }, /* 38 = old stat */ + { 0, (sy_call_t *)getppid }, /* 39 = getppid */ + { compat(2,lstat) }, /* 40 = old lstat */ + { 1, (sy_call_t *)dup }, /* 41 = dup */ + { 0, (sy_call_t *)pipe }, /* 42 = pipe */ + { 0, (sy_call_t *)getegid }, /* 43 = getegid */ + { 4, (sy_call_t *)profil }, /* 44 = profil */ + { 4, (sy_call_t *)ktrace }, /* 45 = ktrace */ + { 3, (sy_call_t *)sigaction }, /* 46 = sigaction */ + { 0, (sy_call_t *)getgid }, /* 47 = getgid */ + { 2, (sy_call_t *)sigprocmask }, /* 48 = sigprocmask */ + { 2, (sy_call_t *)getlogin }, /* 49 = getlogin */ + { 1, (sy_call_t *)setlogin }, /* 50 = setlogin */ + { 1, (sy_call_t *)acct }, /* 51 = acct */ + { 0, (sy_call_t *)sigpending }, /* 52 = sigpending */ + { 2, (sy_call_t *)sigaltstack }, /* 53 = sigaltstack */ + { 3, (sy_call_t *)ioctl }, /* 54 = ioctl */ + { 1, (sy_call_t *)reboot }, /* 55 = reboot */ + { 1, (sy_call_t *)revoke }, /* 56 = 
revoke */ + { 2, (sy_call_t *)symlink }, /* 57 = symlink */ + { 3, (sy_call_t *)readlink }, /* 58 = readlink */ + { 3, (sy_call_t *)execve }, /* 59 = execve */ + { 1, (sy_call_t *)umask }, /* 60 = umask */ + { 1, (sy_call_t *)chroot }, /* 61 = chroot */ + { compat(2,fstat) }, /* 62 = old fstat */ + { compat(4,getkerninfo) }, /* 63 = old getkerninfo */ + { compat(0,getpagesize) }, /* 64 = old getpagesize */ + { 3, (sy_call_t *)msync }, /* 65 = msync */ + { 0, (sy_call_t *)vfork }, /* 66 = vfork */ + { 0, (sy_call_t *)nosys }, /* 67 = obsolete vread */ + { 0, (sy_call_t *)nosys }, /* 68 = obsolete vwrite */ + { 1, (sy_call_t *)sbrk }, /* 69 = sbrk */ + { 1, (sy_call_t *)sstk }, /* 70 = sstk */ + { compat(6,mmap) }, /* 71 = old mmap */ + { 1, (sy_call_t *)ovadvise }, /* 72 = vadvise */ + { 2, (sy_call_t *)munmap }, /* 73 = munmap */ + { 3, (sy_call_t *)mprotect }, /* 74 = mprotect */ + { 3, (sy_call_t *)madvise }, /* 75 = madvise */ + { 0, (sy_call_t *)nosys }, /* 76 = obsolete vhangup */ + { 0, (sy_call_t *)nosys }, /* 77 = obsolete vlimit */ + { 3, (sy_call_t *)mincore }, /* 78 = mincore */ + { 2, (sy_call_t *)getgroups }, /* 79 = getgroups */ + { 2, (sy_call_t *)setgroups }, /* 80 = setgroups */ + { 0, (sy_call_t *)getpgrp }, /* 81 = getpgrp */ + { 2, (sy_call_t *)setpgid }, /* 82 = setpgid */ + { 3, (sy_call_t *)setitimer }, /* 83 = setitimer */ + { compat(0,wait) }, /* 84 = old wait */ + { 1, (sy_call_t *)swapon }, /* 85 = swapon */ + { 2, (sy_call_t *)getitimer }, /* 86 = getitimer */ + { compat(2,gethostname) }, /* 87 = old gethostname */ + { compat(2,sethostname) }, /* 88 = old sethostname */ + { 0, (sy_call_t *)getdtablesize }, /* 89 = getdtablesize */ + { 2, (sy_call_t *)dup2 }, /* 90 = dup2 */ + { 0, (sy_call_t *)nosys }, /* 91 = getdopt */ + { 3, (sy_call_t *)fcntl }, /* 92 = fcntl */ + { 5, (sy_call_t *)select }, /* 93 = select */ + { 0, (sy_call_t *)nosys }, /* 94 = setdopt */ + { 1, (sy_call_t *)fsync }, /* 95 = fsync */ + { 3, (sy_call_t *)setpriority 
}, /* 96 = setpriority */ + { 3, (sy_call_t *)socket }, /* 97 = socket */ + { 3, (sy_call_t *)connect }, /* 98 = connect */ + { compat(3,accept) }, /* 99 = old accept */ + { 2, (sy_call_t *)getpriority }, /* 100 = getpriority */ + { compat(4,send) }, /* 101 = old send */ + { compat(4,recv) }, /* 102 = old recv */ + { 1, (sy_call_t *)sigreturn }, /* 103 = sigreturn */ + { 3, (sy_call_t *)bind }, /* 104 = bind */ + { 5, (sy_call_t *)setsockopt }, /* 105 = setsockopt */ + { 2, (sy_call_t *)listen }, /* 106 = listen */ + { 0, (sy_call_t *)nosys }, /* 107 = obsolete vtimes */ + { compat(3,sigvec) }, /* 108 = old sigvec */ + { compat(1,sigblock) }, /* 109 = old sigblock */ + { compat(1,sigsetmask) }, /* 110 = old sigsetmask */ + { 1, (sy_call_t *)sigsuspend }, /* 111 = sigsuspend */ + { compat(2,sigstack) }, /* 112 = old sigstack */ + { compat(3,recvmsg) }, /* 113 = old recvmsg */ + { compat(3,sendmsg) }, /* 114 = old sendmsg */ + { 0, (sy_call_t *)nosys }, /* 115 = obsolete vtrace */ + { 2, (sy_call_t *)gettimeofday }, /* 116 = gettimeofday */ + { 2, (sy_call_t *)getrusage }, /* 117 = getrusage */ + { 5, (sy_call_t *)getsockopt }, /* 118 = getsockopt */ + { 0, (sy_call_t *)nosys }, /* 119 = resuba */ + { 3, (sy_call_t *)readv }, /* 120 = readv */ + { 3, (sy_call_t *)writev }, /* 121 = writev */ + { 2, (sy_call_t *)settimeofday }, /* 122 = settimeofday */ + { 3, (sy_call_t *)fchown }, /* 123 = fchown */ + { 2, (sy_call_t *)fchmod }, /* 124 = fchmod */ + { compat(6,recvfrom) }, /* 125 = old recvfrom */ + { 2, (sy_call_t *)setreuid }, /* 126 = setreuid */ + { 2, (sy_call_t *)setregid }, /* 127 = setregid */ + { 2, (sy_call_t *)rename }, /* 128 = rename */ + { compat(2,truncate) }, /* 129 = old truncate */ + { compat(2,ftruncate) }, /* 130 = old ftruncate */ + { 2, (sy_call_t *)flock }, /* 131 = flock */ + { 2, (sy_call_t *)mkfifo }, /* 132 = mkfifo */ + { 6, (sy_call_t *)sendto }, /* 133 = sendto */ + { 2, (sy_call_t *)shutdown }, /* 134 = shutdown */ + { 4, (sy_call_t 
*)socketpair }, /* 135 = socketpair */ + { 2, (sy_call_t *)mkdir }, /* 136 = mkdir */ + { 1, (sy_call_t *)rmdir }, /* 137 = rmdir */ + { 2, (sy_call_t *)utimes }, /* 138 = utimes */ + { 0, (sy_call_t *)nosys }, /* 139 = obsolete 4.2 sigreturn */ + { 2, (sy_call_t *)adjtime }, /* 140 = adjtime */ + { compat(3,getpeername) }, /* 141 = old getpeername */ + { compat(0,gethostid) }, /* 142 = old gethostid */ + { compat(1,sethostid) }, /* 143 = old sethostid */ + { compat(2,getrlimit) }, /* 144 = old getrlimit */ + { compat(2,setrlimit) }, /* 145 = old setrlimit */ + { compat(2,killpg) }, /* 146 = old killpg */ + { 0, (sy_call_t *)setsid }, /* 147 = setsid */ + { 4, (sy_call_t *)quotactl }, /* 148 = quotactl */ + { compat(0,quota) }, /* 149 = old quota */ + { compat(3,getsockname) }, /* 150 = old getsockname */ + { 0, (sy_call_t *)nosys }, /* 151 = sem_lock */ + { 0, (sy_call_t *)nosys }, /* 152 = sem_wakeup */ + { 0, (sy_call_t *)nosys }, /* 153 = asyncdaemon */ + { 0, (sy_call_t *)nosys }, /* 154 = nosys */ #ifdef NFS - { 2, s(struct nfssvc_args), - nfssvc }, /* 155 = nfssvc */ + { 2, (sy_call_t *)nfssvc }, /* 155 = nfssvc */ #else - { 0, 0, - nosys }, /* 155 = unimplemented nfssvc */ + { 0, (sy_call_t *)nosys }, /* 155 = nosys */ #endif - { 4, s(struct compat_43_getdirentries_args), - compat_43(getdirentries) }, /* 156 = compat_43 getdirentries */ - { 2, s(struct statfs_args), - statfs }, /* 157 = statfs */ - { 2, s(struct fstatfs_args), - fstatfs }, /* 158 = fstatfs */ - { 0, 0, - nosys }, /* 159 = unimplemented */ - { 0, 0, - nosys }, /* 160 = unimplemented */ -#ifdef NFS - { 2, s(struct getfh_args), - getfh }, /* 161 = getfh */ -#else - { 0, 0, - nosys }, /* 161 = unimplemented getfh */ -#endif - { 0, 0, - nosys }, /* 162 = unimplemented getdomainname */ - { 0, 0, - nosys }, /* 163 = unimplemented setdomainname */ - { 0, 0, - nosys }, /* 164 = unimplemented */ - { 0, 0, - nosys }, /* 165 = unimplemented */ - { 0, 0, - nosys }, /* 166 = unimplemented */ - { 0, 0, - 
nosys }, /* 167 = unimplemented */ - { 0, 0, - nosys }, /* 168 = unimplemented */ - { 0, 0, - nosys }, /* 169 = unimplemented semsys */ - { 0, 0, - nosys }, /* 170 = unimplemented msgsys */ -#if defined(SYSVSHM) && !defined(alpha) - { 4, s(struct compat_43_shmsys_args), - compat_43(shmsys) }, /* 171 = compat_43 shmsys */ + { compat(4,getdirentries) }, /* 156 = old getdirentries */ + { 2, (sy_call_t *)statfs }, /* 157 = statfs */ + { 2, (sy_call_t *)fstatfs }, /* 158 = fstatfs */ + { 0, (sy_call_t *)nosys }, /* 159 = nosys */ + { 0, (sy_call_t *)nosys }, /* 160 = nosys */ +#if defined(NFS) && !defined (NFS_NOSERVER) + { 2, (sy_call_t *)getfh }, /* 161 = getfh */ #else - { 0, 0, - nosys }, /* 171 = unimplemented shmsys */ + { 0, (sy_call_t *)nosys }, /* 161 = nosys */ #endif - { 0, 0, - nosys }, /* 172 = unimplemented */ - { 0, 0, - nosys }, /* 173 = unimplemented */ - { 0, 0, - nosys }, /* 174 = unimplemented */ - { 0, 0, - nosys }, /* 175 = unimplemented */ - { 0, 0, - nosys }, /* 176 = unimplemented */ - { 0, 0, - nosys }, /* 177 = unimplemented */ - { 0, 0, - nosys }, /* 178 = unimplemented */ - { 0, 0, - nosys }, /* 179 = unimplemented */ - { 0, 0, - nosys }, /* 180 = unimplemented */ - { 1, s(struct setgid_args), - setgid }, /* 181 = setgid */ - { 1, s(struct setegid_args), - setegid }, /* 182 = setegid */ - { 1, s(struct seteuid_args), - seteuid }, /* 183 = seteuid */ + { 2, (sy_call_t *)getdomainname }, /* 162 = getdomainname */ + { 2, (sy_call_t *)setdomainname }, /* 163 = setdomainname */ + { 1, (sy_call_t *)uname }, /* 164 = uname */ + { 2, (sy_call_t *)sysarch }, /* 165 = sysarch */ + { 3, (sy_call_t *)rtprio }, /* 166 = rtprio */ + { 0, (sy_call_t *)nosys }, /* 167 = nosys */ + { 0, (sy_call_t *)nosys }, /* 168 = nosys */ + { 5, (sy_call_t *)semsys }, /* 169 = semsys */ + { 6, (sy_call_t *)msgsys }, /* 170 = msgsys */ + { 4, (sy_call_t *)shmsys }, /* 171 = shmsys */ + { 0, (sy_call_t *)nosys }, /* 172 = nosys */ + { 0, (sy_call_t *)nosys }, /* 173 = 
nosys */ + { 0, (sy_call_t *)nosys }, /* 174 = nosys */ + { 0, (sy_call_t *)nosys }, /* 175 = nosys */ + { 1, (sy_call_t *)ntp_adjtime }, /* 176 = ntp_adjtime */ + { 0, (sy_call_t *)nosys }, /* 177 = sfork */ + { 0, (sy_call_t *)nosys }, /* 178 = getdescriptor */ + { 0, (sy_call_t *)nosys }, /* 179 = setdescriptor */ + { 0, (sy_call_t *)nosys }, /* 180 = nosys */ + { 1, (sy_call_t *)setgid }, /* 181 = setgid */ + { 1, (sy_call_t *)setegid }, /* 182 = setegid */ + { 1, (sy_call_t *)seteuid }, /* 183 = seteuid */ #ifdef LFS - { 3, s(struct lfs_bmapv_args), - lfs_bmapv }, /* 184 = lfs_bmapv */ - { 3, s(struct lfs_markv_args), - lfs_markv }, /* 185 = lfs_markv */ - { 2, s(struct lfs_segclean_args), - lfs_segclean }, /* 186 = lfs_segclean */ - { 2, s(struct lfs_segwait_args), - lfs_segwait }, /* 187 = lfs_segwait */ + { 3, (sy_call_t *)lfs_bmapv }, /* 184 = lfs_bmapv */ + { 3, (sy_call_t *)lfs_markv }, /* 185 = lfs_markv */ + { 2, (sy_call_t *)lfs_segclean }, /* 186 = lfs_segclean */ + { 2, (sy_call_t *)lfs_segwait }, /* 187 = lfs_segwait */ #else - { 0, 0, - nosys }, /* 184 = unimplemented lfs_bmapv */ - { 0, 0, - nosys }, /* 185 = unimplemented lfs_markv */ - { 0, 0, - nosys }, /* 186 = unimplemented lfs_segclean */ - { 0, 0, - nosys }, /* 187 = unimplemented lfs_segwait */ -#endif - { 2, s(struct stat_args), - stat }, /* 188 = stat */ - { 2, s(struct fstat_args), - fstat }, /* 189 = fstat */ - { 2, s(struct lstat_args), - lstat }, /* 190 = lstat */ - { 2, s(struct pathconf_args), - pathconf }, /* 191 = pathconf */ - { 2, s(struct fpathconf_args), - fpathconf }, /* 192 = fpathconf */ - { 0, 0, - nosys }, /* 193 = unimplemented */ - { 2, s(struct getrlimit_args), - getrlimit }, /* 194 = getrlimit */ - { 2, s(struct setrlimit_args), - setrlimit }, /* 195 = setrlimit */ - { 4, s(struct getdirentries_args), - getdirentries }, /* 196 = getdirentries */ - { 7, s(struct mmap_args), - mmap }, /* 197 = mmap */ - { 0, 0, - nosys }, /* 198 = __syscall */ - { 4, s(struct 
lseek_args), - lseek }, /* 199 = lseek */ - { 3, s(struct truncate_args), - truncate }, /* 200 = truncate */ - { 3, s(struct ftruncate_args), - ftruncate }, /* 201 = ftruncate */ - { 6, s(struct __sysctl_args), - __sysctl }, /* 202 = __sysctl */ - { 2, s(struct mlock_args), - mlock }, /* 203 = mlock */ - { 2, s(struct munlock_args), - munlock }, /* 204 = munlock */ - { 1, s(struct undelete_args), - undelete }, /* 205 = undelete */ - { 0, 0, - nosys }, /* 206 = unimplemented */ - { 0, 0, - nosys }, /* 207 = unimplemented */ - { 0, 0, - nosys }, /* 208 = unimplemented */ - { 0, 0, - nosys }, /* 209 = unimplemented */ - { 0, 0, - nosys }, /* 210 = unimplemented */ - { 0, 0, - nosys }, /* 211 = unimplemented */ - { 0, 0, - nosys }, /* 212 = unimplemented */ - { 0, 0, - nosys }, /* 213 = unimplemented */ - { 0, 0, - nosys }, /* 214 = unimplemented */ - { 0, 0, - nosys }, /* 215 = unimplemented */ - { 0, 0, - nosys }, /* 216 = unimplemented */ - { 0, 0, - nosys }, /* 217 = unimplemented */ - { 0, 0, - nosys }, /* 218 = unimplemented */ - { 0, 0, - nosys }, /* 219 = unimplemented */ - { 0, 0, - nosys }, /* 220 = unimplemented semctl */ - { 0, 0, - nosys }, /* 221 = unimplemented semget */ - { 0, 0, - nosys }, /* 222 = unimplemented semop */ - { 0, 0, - nosys }, /* 223 = unimplemented semconfig */ - { 0, 0, - nosys }, /* 224 = unimplemented msgctl */ - { 0, 0, - nosys }, /* 225 = unimplemented msgget */ - { 0, 0, - nosys }, /* 226 = unimplemented msgsnd */ - { 0, 0, - nosys }, /* 227 = unimplemented msgrcv */ -#if defined(SYSVSHM) && 0 - { 3, s(struct shmat_args), - shmat }, /* 228 = shmat */ - { 3, s(struct shmctl_args), - shmctl }, /* 229 = shmctl */ - { 1, s(struct shmdt_args), - shmdt }, /* 230 = shmdt */ - { 3, s(struct shmget_args), - shmget }, /* 231 = shmget */ -#else - { 0, 0, - nosys }, /* 228 = unimplemented shmat */ - { 0, 0, - nosys }, /* 229 = unimplemented shmctl */ - { 0, 0, - nosys }, /* 230 = unimplemented shmdt */ - { 0, 0, - nosys }, /* 231 = 
unimplemented shmget */ + { 0, (sy_call_t *)nosys }, /* 184 = nosys */ + { 0, (sy_call_t *)nosys }, /* 185 = nosys */ + { 0, (sy_call_t *)nosys }, /* 186 = nosys */ + { 0, (sy_call_t *)nosys }, /* 187 = nosys */ #endif + { 2, (sy_call_t *)stat }, /* 188 = stat */ + { 2, (sy_call_t *)fstat }, /* 189 = fstat */ + { 2, (sy_call_t *)lstat }, /* 190 = lstat */ + { 2, (sy_call_t *)pathconf }, /* 191 = pathconf */ + { 2, (sy_call_t *)fpathconf }, /* 192 = fpathconf */ + { 0, (sy_call_t *)nosys }, /* 193 = nosys */ + { 2, (sy_call_t *)getrlimit }, /* 194 = getrlimit */ + { 2, (sy_call_t *)setrlimit }, /* 195 = setrlimit */ + { 4, (sy_call_t *)getdirentries }, /* 196 = getdirentries */ + { 8, (sy_call_t *)mmap }, /* 197 = mmap */ + { 0, (sy_call_t *)nosys }, /* 198 = __syscall */ + { 5, (sy_call_t *)lseek }, /* 199 = lseek */ + { 4, (sy_call_t *)truncate }, /* 200 = truncate */ + { 4, (sy_call_t *)ftruncate }, /* 201 = ftruncate */ + { 6, (sy_call_t *)__sysctl }, /* 202 = __sysctl */ + { 2, (sy_call_t *)mlock }, /* 203 = mlock */ + { 2, (sy_call_t *)munlock }, /* 204 = munlock */ + { 2, (sy_call_t *)utrace }, /* 205 = utrace */ + { 1, (sy_call_t *)undelete }, /* 206 = undelete */ + { 0, (sy_call_t *)nosys }, /* 207 = nosys */ + { 0, (sy_call_t *)nosys }, /* 208 = nosys */ + { 0, (sy_call_t *)nosys }, /* 209 = nosys */ + { 0, (sy_call_t *)lkmnosys }, /* 210 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 211 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 212 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 213 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 214 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 215 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 216 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 217 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 218 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 219 = lkmnosys */ + { 4, (sy_call_t *)__semctl }, /* 220 = __semctl */ + { 3, (sy_call_t *)semget }, /* 221 = semget */ + { 3, (sy_call_t *)semop }, /* 222 
= semop */ + { 1, (sy_call_t *)semconfig }, /* 223 = semconfig */ + { 3, (sy_call_t *)msgctl }, /* 224 = msgctl */ + { 2, (sy_call_t *)msgget }, /* 225 = msgget */ + { 4, (sy_call_t *)msgsnd }, /* 226 = msgsnd */ + { 5, (sy_call_t *)msgrcv }, /* 227 = msgrcv */ + { 3, (sy_call_t *)shmat }, /* 228 = shmat */ + { 3, (sy_call_t *)shmctl }, /* 229 = shmctl */ + { 1, (sy_call_t *)shmdt }, /* 230 = shmdt */ + { 3, (sy_call_t *)shmget }, /* 231 = shmget */ + { 0, (sy_call_t *)nosys }, /* 232 = nosys */ + { 0, (sy_call_t *)nosys }, /* 233 = nosys */ + { 0, (sy_call_t *)nosys }, /* 234 = nosys */ + { 0, (sy_call_t *)nosys }, /* 235 = nosys */ + { 0, (sy_call_t *)nosys }, /* 236 = nosys */ + { 0, (sy_call_t *)nosys }, /* 237 = nosys */ + { 0, (sy_call_t *)nosys }, /* 238 = nosys */ + { 0, (sy_call_t *)nosys }, /* 239 = nosys */ + { 0, (sy_call_t *)nosys }, /* 240 = nosys */ + { 0, (sy_call_t *)nosys }, /* 241 = nosys */ + { 0, (sy_call_t *)nosys }, /* 242 = nosys */ + { 0, (sy_call_t *)nosys }, /* 243 = nosys */ + { 0, (sy_call_t *)nosys }, /* 244 = nosys */ + { 0, (sy_call_t *)nosys }, /* 245 = nosys */ + { 0, (sy_call_t *)nosys }, /* 246 = nosys */ + { 0, (sy_call_t *)nosys }, /* 247 = nosys */ + { 0, (sy_call_t *)nosys }, /* 248 = nosys */ + { 0, (sy_call_t *)nosys }, /* 249 = nosys */ + { 3, (sy_call_t *)minherit }, /* 250 = minherit */ + { 1, (sy_call_t *)rfork }, /* 251 = rfork */ }; - -int nsysent= sizeof(sysent) / sizeof(sysent[0]); diff --git a/sys/kern/init_sysvec.c b/sys/kern/init_sysvec.c new file mode 100644 index 0000000..379a1bf --- /dev/null +++ b/sys/kern/init_sysvec.c @@ -0,0 +1,30 @@ +/* + * sysentvec for native FreeBSD a.out executable format. 
+ * + * $Id$ + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/mount.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/syscall.h> +#include <sys/signalvar.h> +#include <machine/md_var.h> + +struct sysentvec aout_sysvec = { + SYS_MAXSYSCALL, + sysent, + 0, + 0, + 0, + 0, + 0, + 0, + sendsig, + sigcode, + &szsigcode, + 0, + "FreeBSD a.out" +}; diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c index a23543c..f72d2d0 100644 --- a/sys/kern/kern_acct.c +++ b/sys/kern/kern_acct.c @@ -1,4 +1,5 @@ /*- + * Copyright (c) 1994 Christopher G. Demetriou * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. @@ -35,91 +36,278 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * from: @(#)kern_acct.c 8.8 (Berkeley) 5/14/95 + * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93 + * $Id: kern_acct.c,v 1.14 1997/03/23 03:36:17 bde Exp $ */ #include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/proc.h> #include <sys/mount.h> #include <sys/vnode.h> -#include <sys/file.h> +#include <sys/fcntl.h> #include <sys/syslog.h> #include <sys/kernel.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/namei.h> +#include <sys/errno.h> +#include <sys/acct.h> +#include <sys/resourcevar.h> +#include <sys/tty.h> -acct(a1, a2, a3) +/* + * The routines implemented in this file are described in: + * Leffler, et al.: The Design and Implementation of the 4.3BSD + * UNIX Operating System (Addison Welley, 1989) + * on pages 62-63. + * + * Arguably, to simplify accounting operations, this mechanism should + * be replaced by one in which an accounting log file (similar to /dev/klog) + * is read by a user process, etc. However, that has its own problems. + */ + +/* + * Internal accounting functions. 
+ * The former's operation is described in Leffler, et al., and the latter + * was provided by UCB with the 4.4BSD-Lite release + */ +static comp_t encode_comp_t __P((u_long, u_long)); +static void acctwatch __P((void *)); + +/* + * Accounting vnode pointer, and saved vnode pointer. + */ +static struct vnode *acctp; +static struct vnode *savacctp; + +/* + * Values associated with enabling and disabling accounting + */ +static int acctsuspend = 2; /* stop accounting when < 2% free space left */ +SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW, + &acctsuspend, 0, ""); + +static int acctresume = 4; /* resume when free space risen to > 4% */ +SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW, + &acctresume, 0, ""); + +static int acctchkfreq = 15; /* frequency (in seconds) to check space */ +SYSCTL_INT(_kern, OID_AUTO, acct_chkfreq, CTLFLAG_RW, + &acctchkfreq, 0, ""); + +/* + * Accounting system call. Written based on the specification and + * previous implementation done by Mark Tinguely. + */ +int +acct(a1, uap, a3) struct proc *a1; struct acct_args /* { syscallarg(char *) path; - } */ *a2; + } */ *uap; int *a3; { + struct proc *p = curproc; /* XXX */ + struct nameidata nd; + int error; + + /* Make sure that the caller is root. */ + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + /* - * Body deleted. + * If accounting is to be started to a file, open that file for + * writing and make sure it's a 'normal'. */ - return (ENOSYS); -} + if (SCARG(uap, path) != NULL) { + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), + p); + error = vn_open(&nd, FWRITE, 0); + if (error) + return (error); + VOP_UNLOCK(nd.ni_vp, 0, p); + if (nd.ni_vp->v_type != VREG) { + vn_close(nd.ni_vp, FWRITE, p->p_ucred, p); + return (EACCES); + } + } -acct_process(a1) - struct proc *a1; -{ + /* + * If accounting was previously enabled, kill the old space-watcher, + * close the file, and (if no new file was specified, leave). 
+ */ + if (acctp != NULLVP || savacctp != NULLVP) { + untimeout(acctwatch, NULL); + error = vn_close((acctp != NULLVP ? acctp : savacctp), FWRITE, + p->p_ucred, p); + acctp = savacctp = NULLVP; + } + if (SCARG(uap, path) == NULL) + return (error); /* - * Body deleted. + * Save the new accounting file vnode, and schedule the new + * free space watcher. */ - return; + acctp = nd.ni_vp; + acctwatch(NULL); + return (error); } /* - * Periodically check the file system to see if accounting - * should be turned on or off. Beware the case where the vnode - * has been vgone()'d out from underneath us, e.g. when the file - * system containing the accounting file has been forcibly unmounted. + * Write out process accounting information, on process exit. + * Data to be written out is specified in Leffler, et al. + * and are enumerated below. (They're also noted in the system + * "acct.h" header file.) */ +int +acct_process(p) + struct proc *p; +{ + struct acct acct; + struct rusage *r; + struct timeval ut, st, tmp; + int t; + struct vnode *vp; + + /* If accounting isn't enabled, don't bother */ + vp = acctp; + if (vp == NULLVP) + return (0); + + /* + * Get process accounting information. 
+ */ + + /* (1) The name of the command that ran */ + bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); + + /* (2) The amount of user and system time that was used */ + calcru(p, &ut, &st, NULL); + acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec); + acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec); + + /* (3) The elapsed time the commmand ran (and its starting time) */ + acct.ac_btime = p->p_stats->p_start.tv_sec; + microtime(&tmp); + timevalsub(&tmp, &p->p_stats->p_start); + acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec); + + /* (4) The average amount of memory used */ + r = &p->p_stats->p_ru; + tmp = ut; + timevaladd(&tmp, &st); + t = tmp.tv_sec * hz + tmp.tv_usec / tick; + if (t) + acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t; + else + acct.ac_mem = 0; + + /* (5) The number of disk I/O operations done */ + acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0); + + /* (6) The UID and GID of the process */ + acct.ac_uid = p->p_cred->p_ruid; + acct.ac_gid = p->p_cred->p_rgid; + + /* (7) The terminal from which the process was started */ + if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp) + acct.ac_tty = p->p_pgrp->pg_session->s_ttyp->t_dev; + else + acct.ac_tty = NODEV; + + /* (8) The boolean flags that tell how the process terminated, etc. */ + acct.ac_flag = p->p_acflag; + + /* + * Now, just write the accounting information to the file. + */ + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + return (vn_rdwr(UIO_WRITE, vp, (caddr_t)&acct, sizeof (acct), + (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, p->p_ucred, + (int *)0, p)); +} + /* - * Values associated with enabling and disabling accounting + * Encode_comp_t converts from ticks in seconds and microseconds + * to ticks in 1/AHZ seconds. The encoding is described in + * Leffler, et al., on page 63. 
*/ -int acctsuspend = 2; /* stop accounting when < 2% free space left */ -int acctresume = 4; /* resume when free space risen to > 4% */ -int acctchkfreq = 15; /* frequency (in seconds) to check space */ + +#define MANTSIZE 13 /* 13 bit mantissa. */ +#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ +#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ + +static comp_t +encode_comp_t(s, us) + u_long s, us; +{ + int exp, rnd; + + exp = 0; + rnd = 0; + s *= AHZ; + s += us / (1000000 / AHZ); /* Maximize precision. */ + + while (s > MAXFRACT) { + rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */ + s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ + exp++; + } + + /* If we need to round up, do it (and handle overflow correctly). */ + if (rnd && (++s > MAXFRACT)) { + s >>= EXPSIZE; + exp++; + } + + /* Clean it up and polish it off. */ + exp <<= MANTSIZE; /* Shift the exponent into place */ + exp += s; /* and add on the mantissa. */ + return (exp); +} /* - * SHOULD REPLACE THIS WITH A DRIVER THAT CAN BE READ TO SIMPLIFY. + * Periodically check the file system to see if accounting + * should be turned on or off. Beware the case where the vnode + * has been vgone()'d out from underneath us, e.g. when the file + * system containing the accounting file has been forcibly unmounted. 
*/ -struct vnode *acctp; -struct vnode *savacctp; - /* ARGSUSED */ -void +static void acctwatch(a) void *a; { struct statfs sb; - if (savacctp) { + if (savacctp != NULLVP) { if (savacctp->v_type == VBAD) { (void) vn_close(savacctp, FWRITE, NOCRED, NULL); - savacctp = NULL; + savacctp = NULLVP; return; } (void)VFS_STATFS(savacctp->v_mount, &sb, (struct proc *)0); if (sb.f_bavail > acctresume * sb.f_blocks / 100) { acctp = savacctp; - savacctp = NULL; + savacctp = NULLVP; log(LOG_NOTICE, "Accounting resumed\n"); } } else { - if (acctp == NULL) + if (acctp == NULLVP) return; if (acctp->v_type == VBAD) { (void) vn_close(acctp, FWRITE, NOCRED, NULL); - acctp = NULL; + acctp = NULLVP; return; } (void)VFS_STATFS(acctp->v_mount, &sb, (struct proc *)0); if (sb.f_bavail <= acctsuspend * sb.f_blocks / 100) { savacctp = acctp; - acctp = NULL; + acctp = NULLVP; log(LOG_NOTICE, "Accounting suspended\n"); } } diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index f42900c..171ed0e 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -36,8 +36,28 @@ * SUCH DAMAGE. * * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_clock.c,v 1.34 1997/03/22 16:52:19 mpp Exp $ */ +/* Portions of this software are covered by the following: */ +/****************************************************************************** + * * + * Copyright (c) David L. Mills 1993, 1994 * + * * + * Permission to use, copy, modify, and distribute this software and its * + * documentation for any purpose and without fee is hereby granted, provided * + * that the above copyright notice appears in all copies and that both the * + * copyright notice and this permission notice appear in supporting * + * documentation, and that the name University of Delaware not be used in * + * advertising or publicity pertaining to distribution of the software * + * without specific, written prior permission. 
The University of Delaware * + * makes no representations about the suitability this software for any * + * purpose. It is provided "as is" without express or implied warranty. * + * * + *****************************************************************************/ + +#include "opt_cpu.h" /* XXX */ + #include <sys/param.h> #include <sys/systm.h> #include <sys/dkstat.h> @@ -45,13 +65,49 @@ #include <sys/kernel.h> #include <sys/proc.h> #include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/timex.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/sysctl.h> #include <machine/cpu.h> +#define CLOCK_HAIR /* XXX */ +#include <machine/clock.h> #ifdef GPROF #include <sys/gmon.h> #endif +static void initclocks __P((void *dummy)); +SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) + +/* Exported to machdep.c. */ +struct callout *callfree, *callout; + +static struct callout calltodo; + +/* Some of these don't belong here, but it's easiest to concentrate them. */ +static long cp_time[CPUSTATES]; +long dk_seek[DK_NDRIVE]; +static long dk_time[DK_NDRIVE]; +long dk_wds[DK_NDRIVE]; +long dk_wpms[DK_NDRIVE]; +long dk_xfer[DK_NDRIVE]; + +int dk_busy; +int dk_ndrive = 0; +char dk_names[DK_NDRIVE][DK_NAMELEN]; + +long tk_cancc; +long tk_nin; +long tk_nout; +long tk_rawcc; + /* * Clock handling routines. * @@ -97,19 +153,278 @@ int stathz; int profhz; -int profprocs; +static int profprocs; int ticks; static int psdiv, pscnt; /* prof => stat divider */ -int psratio; /* ratio: prof / stat */ +int psratio; /* ratio: prof / stat */ volatile struct timeval time; volatile struct timeval mono_time; /* - * Initialize clock frequencies and start both clocks running. + * Phase/frequency-lock loop (PLL/FLL) definitions + * + * The following variables are read and set by the ntp_adjtime() system + * call. 
+ * + * time_state shows the state of the system clock, with values defined + * in the timex.h header file. + * + * time_status shows the status of the system clock, with bits defined + * in the timex.h header file. + * + * time_offset is used by the PLL/FLL to adjust the system time in small + * increments. + * + * time_constant determines the bandwidth or "stiffness" of the PLL. + * + * time_tolerance determines maximum frequency error or tolerance of the + * CPU clock oscillator and is a property of the architecture; however, + * in principle it could change as result of the presence of external + * discipline signals, for instance. + * + * time_precision is usually equal to the kernel tick variable; however, + * in cases where a precision clock counter or external clock is + * available, the resolution can be much less than this and depend on + * whether the external clock is working or not. + * + * time_maxerror is initialized by a ntp_adjtime() call and increased by + * the kernel once each second to reflect the maximum error + * bound growth. + * + * time_esterror is set and read by the ntp_adjtime() call, but + * otherwise not used by the kernel. + */ +int time_status = STA_UNSYNC; /* clock status bits */ +int time_state = TIME_OK; /* clock state */ +long time_offset = 0; /* time offset (us) */ +long time_constant = 0; /* pll time constant */ +long time_tolerance = MAXFREQ; /* frequency tolerance (scaled ppm) */ +long time_precision = 1; /* clock precision (us) */ +long time_maxerror = MAXPHASE; /* maximum error (us) */ +long time_esterror = MAXPHASE; /* estimated error (us) */ + +/* + * The following variables establish the state of the PLL/FLL and the + * residual time and frequency offset of the local clock. The scale + * factors are defined in the timex.h header file. + * + * time_phase and time_freq are the phase increment and the frequency + * increment, respectively, of the kernel time variable at each tick of + * the clock. 
+ * + * time_freq is set via ntp_adjtime() from a value stored in a file when + * the synchronization daemon is first started. Its value is retrieved + * via ntp_adjtime() and written to the file about once per hour by the + * daemon. + * + * time_adj is the adjustment added to the value of tick at each timer + * interrupt and is recomputed from time_phase and time_freq at each + * seconds rollover. + * + * time_reftime is the second's portion of the system time on the last + * call to ntp_adjtime(). It is used to adjust the time_freq variable + * and to increase the time_maxerror as the time since last update + * increases. + */ +static long time_phase = 0; /* phase offset (scaled us) */ +long time_freq = 0; /* frequency offset (scaled ppm) */ +static long time_adj = 0; /* tick adjust (scaled 1 / hz) */ +static long time_reftime = 0; /* time at last adjustment (s) */ + +#ifdef PPS_SYNC +/* + * The following variables are used only if the kernel PPS discipline + * code is configured (PPS_SYNC). The scale factors are defined in the + * timex.h header file. + * + * pps_time contains the time at each calibration interval, as read by + * microtime(). pps_count counts the seconds of the calibration + * interval, the duration of which is nominally pps_shift in powers of + * two. + * + * pps_offset is the time offset produced by the time median filter + * pps_tf[], while pps_jitter is the dispersion (jitter) measured by + * this filter. + * + * pps_freq is the frequency offset produced by the frequency median + * filter pps_ff[], while pps_stabil is the dispersion (wander) measured + * by this filter. + * + * pps_usec is latched from a high resolution counter or external clock + * at pps_time. Here we want the hardware counter contents only, not the + * contents plus the time_tv.usec as usual. + * + * pps_valid counts the number of seconds since the last PPS update. It + * is used as a watchdog timer to disable the PPS discipline should the + * PPS signal be lost. 
+ * + * pps_glitch counts the number of seconds since the beginning of an + * offset burst more than tick/2 from current nominal offset. It is used + * mainly to suppress error bursts due to priority conflicts between the + * PPS interrupt and timer interrupt. + * + * pps_intcnt counts the calibration intervals for use in the interval- + * adaptation algorithm. It's just too complicated for words. + */ +struct timeval pps_time; /* kernel time at last interval */ +long pps_offset = 0; /* pps time offset (us) */ +long pps_jitter = MAXTIME; /* pps time dispersion (jitter) (us) */ +long pps_tf[] = {0, 0, 0}; /* pps time offset median filter (us) */ +long pps_freq = 0; /* frequency offset (scaled ppm) */ +long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */ +long pps_ff[] = {0, 0, 0}; /* frequency offset median filter */ +long pps_usec = 0; /* microsec counter at last interval */ +long pps_valid = PPS_VALID; /* pps signal watchdog counter */ +int pps_glitch = 0; /* pps signal glitch counter */ +int pps_count = 0; /* calibration interval counter (s) */ +int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */ +int pps_intcnt = 0; /* intervals at current duration */ + +/* + * PPS signal quality monitors + * + * pps_jitcnt counts the seconds that have been discarded because the + * jitter measured by the time median filter exceeds the limit MAXTIME + * (100 us). + * + * pps_calcnt counts the frequency calibration intervals, which are + * variable from 4 s to 256 s. + * + * pps_errcnt counts the calibration intervals which have been discarded + * because the wander exceeds the limit MAXFREQ (100 ppm) or where the + * calibration interval jitter exceeds two ticks. + * + * pps_stbcnt counts the calibration intervals that have been discarded + * because the frequency wander exceeds the limit MAXFREQ / 4 (25 us). 
+ */ +long pps_jitcnt = 0; /* jitter limit exceeded */ +long pps_calcnt = 0; /* calibration intervals */ +long pps_errcnt = 0; /* calibration errors */ +long pps_stbcnt = 0; /* stability limit exceeded */ +#endif /* PPS_SYNC */ + +/* XXX none of this stuff works under FreeBSD */ +#ifdef EXT_CLOCK +/* + * External clock definitions + * + * The following definitions and declarations are used only if an + * external clock (HIGHBALL or TPRO) is configured on the system. + */ +#define CLOCK_INTERVAL 30 /* CPU clock update interval (s) */ + +/* + * The clock_count variable is set to CLOCK_INTERVAL at each PPS + * interrupt and decremented once each second. + */ +int clock_count = 0; /* CPU clock counter */ + +#ifdef HIGHBALL +/* + * The clock_offset and clock_cpu variables are used by the HIGHBALL + * interface. The clock_offset variable defines the offset between + * system time and the HIGBALL counters. The clock_cpu variable contains + * the offset between the system clock and the HIGHBALL clock for use in + * disciplining the kernel time variable. + */ +extern struct timeval clock_offset; /* Highball clock offset */ +long clock_cpu = 0; /* CPU clock adjust */ +#endif /* HIGHBALL */ +#endif /* EXT_CLOCK */ + +/* + * hardupdate() - local clock update + * + * This routine is called by ntp_adjtime() to update the local clock + * phase and frequency. The implementation is of an adaptive-parameter, + * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new + * time and frequency offset estimates for each call. If the kernel PPS + * discipline code is configured (PPS_SYNC), the PPS signal itself + * determines the new time offset, instead of the calling argument. + * Presumably, calls to ntp_adjtime() occur only when the caller + * believes the local clock is valid within some bound (+-128 ms with + * NTP). If the caller's time is far different than the PPS time, an + * argument will ensue, and it's not clear who will lose. 
+ * + * For uncompensated quartz crystal oscillatores and nominal update + * intervals less than 1024 s, operation should be in phase-lock mode + * (STA_FLL = 0), where the loop is disciplined to phase. For update + * intervals greater than thiss, operation should be in frequency-lock + * mode (STA_FLL = 1), where the loop is disciplined to frequency. + * + * Note: splclock() is in effect. */ void -initclocks() +hardupdate(offset) + long offset; +{ + long ltemp, mtemp; + + if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME)) + return; + ltemp = offset; +#ifdef PPS_SYNC + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + ltemp = pps_offset; +#endif /* PPS_SYNC */ + + /* + * Scale the phase adjustment and clamp to the operating range. + */ + if (ltemp > MAXPHASE) + time_offset = MAXPHASE << SHIFT_UPDATE; + else if (ltemp < -MAXPHASE) + time_offset = -(MAXPHASE << SHIFT_UPDATE); + else + time_offset = ltemp << SHIFT_UPDATE; + + /* + * Select whether the frequency is to be controlled and in which + * mode (PLL or FLL). Clamp to the operating range. Ugly + * multiply/divide should be replaced someday. + */ + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = time.tv_sec; + mtemp = time.tv_sec - time_reftime; + time_reftime = time.tv_sec; + if (time_status & STA_FLL) { + if (mtemp >= MINSEC) { + ltemp = ((time_offset / mtemp) << (SHIFT_USEC - + SHIFT_UPDATE)); + if (ltemp < 0) + time_freq -= -ltemp >> SHIFT_KH; + else + time_freq += ltemp >> SHIFT_KH; + } + } else { + if (mtemp < MAXSEC) { + ltemp *= mtemp; + if (ltemp < 0) + time_freq -= -ltemp >> (time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC); + else + time_freq += ltemp >> (time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC); + } + } + if (time_freq > time_tolerance) + time_freq = time_tolerance; + else if (time_freq < -time_tolerance) + time_freq = -time_tolerance; +} + + + +/* + * Initialize clock frequencies and start both clocks running. 
+ */ +/* ARGSUSED*/ +static void +initclocks(dummy) + void *dummy; { register int i; @@ -138,9 +453,7 @@ hardclock(frame) { register struct callout *p1; register struct proc *p; - register int delta, needsoft; - extern int tickdelta; - extern long timedelta; + register int needsoft; /* * Update real-time timeout queue. @@ -185,18 +498,181 @@ hardclock(frame) statclock(frame); /* - * Increment the time-of-day. The increment is just ``tick'' unless - * we are still adjusting the clock; see adjtime(). + * Increment the time-of-day. */ ticks++; - if (timedelta == 0) - delta = tick; - else { - delta = tick + tickdelta; - timedelta -= tickdelta; + { + int time_update; + struct timeval newtime = time; + long ltemp; + + if (timedelta == 0) { + time_update = CPU_THISTICKLEN(tick); + } else { + time_update = CPU_THISTICKLEN(tick) + tickdelta; + timedelta -= tickdelta; + } + BUMPTIME(&mono_time, time_update); + + /* + * Compute the phase adjustment. If the low-order bits + * (time_phase) of the update overflow, bump the high-order bits + * (time_update). + */ + time_phase += time_adj; + if (time_phase <= -FINEUSEC) { + ltemp = -time_phase >> SHIFT_SCALE; + time_phase += ltemp << SHIFT_SCALE; + time_update -= ltemp; + } + else if (time_phase >= FINEUSEC) { + ltemp = time_phase >> SHIFT_SCALE; + time_phase -= ltemp << SHIFT_SCALE; + time_update += ltemp; + } + + newtime.tv_usec += time_update; + /* + * On rollover of the second the phase adjustment to be used for + * the next second is calculated. Also, the maximum error is + * increased by the tolerance. If the PPS frequency discipline + * code is present, the phase is increased to compensate for the + * CPU clock oscillator frequency error. + * + * On a 32-bit machine and given parameters in the timex.h + * header file, the maximum phase adjustment is +-512 ms and + * maximum frequency offset is a tad less than) +-512 ppm. On a + * 64-bit machine, you shouldn't need to ask. 
+ */ + if (newtime.tv_usec >= 1000000) { + newtime.tv_usec -= 1000000; + newtime.tv_sec++; + time_maxerror += time_tolerance >> SHIFT_USEC; + + /* + * Compute the phase adjustment for the next second. In + * PLL mode, the offset is reduced by a fixed factor + * times the time constant. In FLL mode the offset is + * used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread + * the adjustment over not more than the number of + * seconds between updates. + */ + if (time_offset < 0) { + ltemp = -time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << + SHIFT_UPDATE; + time_offset += ltemp; + time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - + SHIFT_UPDATE); + } else { + ltemp = time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << + SHIFT_UPDATE; + time_offset -= ltemp; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - + SHIFT_UPDATE); + } + + /* + * Compute the frequency estimate and additional phase + * adjustment due to frequency error for the next + * second. When the PPS signal is engaged, gnaw on the + * watchdog counter and update the frequency computed by + * the pll and the PPS signal. + */ +#ifdef PPS_SYNC + pps_valid++; + if (pps_valid == PPS_VALID) { + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; +#else + ltemp = time_freq; +#endif /* PPS_SYNC */ + if (ltemp < 0) + time_adj -= -ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + else + time_adj += ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + +#if SHIFT_HZ == 7 + /* + * When the CPU clock oscillator frequency is not a + * power of two in Hz, the SHIFT_HZ is only an + * approximate scale factor. 
In the SunOS kernel, this + * results in a PLL gain factor of 1/1.28 = 0.78 what it + * should be. In the following code the overall gain is + * increased by a factor of 1.25, which results in a + * residual error less than 3 percent. + */ + /* Same thing applies for FreeBSD --GAW */ + if (hz == 100) { + if (time_adj < 0) + time_adj -= -time_adj >> 2; + else + time_adj += time_adj >> 2; + } +#endif /* SHIFT_HZ */ + + /* XXX - this is really bogus, but can't be fixed until + xntpd's idea of the system clock is fixed to know how + the user wants leap seconds handled; in the mean time, + we assume that users of NTP are running without proper + leap second support (this is now the default anyway) */ + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. The microtime() routine or + * external clock driver will insure that reported time + * is always monotonic. The ugly divides should be + * replaced. 
+ */ + switch (time_state) { + + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + + case TIME_INS: + if (newtime.tv_sec % 86400 == 0) { + newtime.tv_sec--; + time_state = TIME_OOP; + } + break; + + case TIME_DEL: + if ((newtime.tv_sec + 1) % 86400 == 0) { + newtime.tv_sec++; + time_state = TIME_WAIT; + } + break; + + case TIME_OOP: + time_state = TIME_WAIT; + break; + + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + } + CPU_CLOCKUPDATE(&time, &newtime); } - BUMPTIME(&time, delta); - BUMPTIME(&mono_time, delta); /* * Process callouts at a very low cpu priority, so we don't keep the @@ -256,7 +732,7 @@ softclock() */ void timeout(ftn, arg, ticks) - void (*ftn) __P((void *)); + timeout_t ftn; void *arg; register int ticks; { @@ -301,7 +777,7 @@ timeout(ftn, arg, ticks) void untimeout(ftn, arg) - void (*ftn) __P((void *)); + timeout_t ftn; void *arg; { register struct callout *p, *t; @@ -323,6 +799,17 @@ untimeout(ftn, arg) splx(s); } +void +gettime(struct timeval *tvp) +{ + int s; + + s = splclock(); + /* XXX should use microtime() iff tv_usec is used. */ + *tvp = time; + splx(s); +} + /* * Compute number of hz until specified time. Used to * compute third argument to timeout() from an absolute time. @@ -331,28 +818,54 @@ int hzto(tv) struct timeval *tv; { - register long ticks, sec; + register unsigned long ticks; + register long sec, usec; int s; /* - * If number of milliseconds will fit in 32 bit arithmetic, - * then compute number of milliseconds to time and scale to - * ticks. Otherwise just compute number of hz in time, rounding - * times greater than representible to maximum value. + * If the number of usecs in the whole seconds part of the time + * difference fits in a long, then the total number of usecs will + * fit in an unsigned long. 
Compute the total and convert it to + * ticks, rounding up and adding 1 to allow for the current tick + * to expire. Rounding also depends on unsigned long arithmetic + * to avoid overflow. + * + * Otherwise, if the number of ticks in the whole seconds part of + * the time difference fits in a long, then convert the parts to + * ticks separately and add, using similar rounding methods and + * overflow avoidance. This method would work in the previous + * case but it is slightly slower and assumes that hz is integral. * - * Delta times less than 25 days can be computed ``exactly''. - * Maximum value for any timeout in 10ms ticks is 250 days. + * Otherwise, round the time difference down to the maximum + * representable value. + * + * If ints have 32 bits, then the maximum value for any timeout in + * 10ms ticks is 248 days. */ - s = splhigh(); + s = splclock(); sec = tv->tv_sec - time.tv_sec; - if (sec <= 0x7fffffff / 1000 - 1000) - ticks = ((tv->tv_sec - time.tv_sec) * 1000 + - (tv->tv_usec - time.tv_usec) / 1000) / (tick / 1000); - else if (sec <= 0x7fffffff / hz) - ticks = sec * hz; - else - ticks = 0x7fffffff; + usec = tv->tv_usec - time.tv_usec; splx(s); + if (usec < 0) { + sec--; + usec += 1000000; + } + if (sec < 0) { +#ifdef DIAGNOSTIC + printf("hzto: negative time difference %ld sec %ld usec\n", + sec, usec); +#endif + ticks = 1; + } else if (sec <= LONG_MAX / 1000000) + ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) + / tick + 1; + else if (sec <= LONG_MAX / hz) + ticks = sec * hz + + ((unsigned long)usec + (tick - 1)) / tick + 1; + else + ticks = LONG_MAX; + if (ticks > INT_MAX) + ticks = INT_MAX; return (ticks); } @@ -399,8 +912,6 @@ stopprofclock(p) } } -int dk_ndrive = DK_NDRIVE; - /* * Statistics clock. Grab profile sample, and if divider reaches 0, * do process and kernel statistics. 
@@ -414,6 +925,10 @@ statclock(frame) #endif register struct proc *p; register int i; + struct pstats *pstats; + long rss; + struct rusage *ru; + struct vmspace *vm; if (CLKF_USERMODE(frame)) { p = curproc; @@ -505,18 +1020,29 @@ statclock(frame) if (p->p_priority >= PUSER) p->p_priority = p->p_usrpri; } + + /* Update resource usage integrals and maximums. */ + if ((pstats = p->p_stats) != NULL && + (ru = &pstats->p_ru) != NULL && + (vm = p->p_vmspace) != NULL) { + ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024; + ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024; + ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024; + rss = vm->vm_pmap.pm_stats.resident_count * + PAGE_SIZE / 1024; + if (ru->ru_maxrss < rss) + ru->ru_maxrss = rss; + } } } /* * Return information about system clocks. */ -sysctl_clockrate(where, sizep) - register char *where; - size_t *sizep; +static int +sysctl_kern_clockrate SYSCTL_HANDLER_ARGS { struct clockinfo clkinfo; - /* * Construct clockinfo structure. */ @@ -524,5 +1050,254 @@ sysctl_clockrate(where, sizep) clkinfo.tick = tick; clkinfo.profhz = profhz; clkinfo.stathz = stathz ? stathz : hz; - return (sysctl_rdstruct(where, sizep, NULL, &clkinfo, sizeof(clkinfo))); + return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); +} + +SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, + 0, 0, sysctl_kern_clockrate, "S,clockinfo",""); + +#ifdef PPS_SYNC +/* + * hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS interrupt in order to discipline + * the CPU clock oscillator to the PPS signal. It measures the PPS phase + * and leaves it in a handy spot for the hardclock() routine. It + * integrates successive PPS phase differences and calculates the + * frequency offset. This is used in hardclock() to discipline the CPU + * clock oscillator so that intrinsic frequency error is cancelled out. 
+ * The code requires the caller to capture the time and hardware counter + * value at the on-time PPS signal transition. + * + * Note that, on some Unix systems, this routine runs at an interrupt + * priority level higher than the timer interrupt routine hardclock(). + * Therefore, the variables used are distinct from the hardclock() + * variables, except for certain exceptions: The PPS frequency pps_freq + * and phase pps_offset variables are determined by this routine and + * updated atomically. The time_tolerance variable can be considered a + * constant, since it is infrequently changed, and then only when the + * PPS signal is disabled. The watchdog counter pps_valid is updated + * once per second by hardclock() and is atomically cleared in this + * routine. + */ +void +hardpps(tvp, usec) + struct timeval *tvp; /* time at PPS */ + long usec; /* hardware counter at PPS */ +{ + long u_usec, v_usec, bigtick; + long cal_sec, cal_usec; + + /* + * An occasional glitch can be produced when the PPS interrupt + * occurs in the hardclock() routine before the time variable is + * updated. Here the offset is discarded when the difference + * between it and the last one is greater than tick/2, but not + * if the interval since the first discard exceeds 30 s. + */ + time_status |= STA_PPSSIGNAL; + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + pps_valid = 0; + u_usec = -tvp->tv_usec; + if (u_usec < -500000) + u_usec += 1000000; + v_usec = pps_offset - u_usec; + if (v_usec < 0) + v_usec = -v_usec; + if (v_usec > (tick >> 1)) { + if (pps_glitch > MAXGLITCH) { + pps_glitch = 0; + pps_tf[2] = u_usec; + pps_tf[1] = u_usec; + } else { + pps_glitch++; + u_usec = pps_offset; + } + } else + pps_glitch = 0; + + /* + * A three-stage median filter is used to help deglitch the pps + * time. The median sample becomes the time offset estimate; the + * difference between the other two samples becomes the time + * dispersion (jitter) estimate. 
+ */ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = u_usec; + if (pps_tf[0] > pps_tf[1]) { + if (pps_tf[1] > pps_tf[2]) { + pps_offset = pps_tf[1]; /* 0 1 2 */ + v_usec = pps_tf[0] - pps_tf[2]; + } else if (pps_tf[2] > pps_tf[0]) { + pps_offset = pps_tf[0]; /* 2 0 1 */ + v_usec = pps_tf[2] - pps_tf[1]; + } else { + pps_offset = pps_tf[2]; /* 0 2 1 */ + v_usec = pps_tf[0] - pps_tf[1]; + } + } else { + if (pps_tf[1] < pps_tf[2]) { + pps_offset = pps_tf[1]; /* 2 1 0 */ + v_usec = pps_tf[2] - pps_tf[0]; + } else if (pps_tf[2] < pps_tf[0]) { + pps_offset = pps_tf[0]; /* 1 0 2 */ + v_usec = pps_tf[1] - pps_tf[2]; + } else { + pps_offset = pps_tf[2]; /* 1 2 0 */ + v_usec = pps_tf[1] - pps_tf[0]; + } + } + if (v_usec > MAXTIME) + pps_jitcnt++; + v_usec = (v_usec << PPS_AVG) - pps_jitter; + if (v_usec < 0) + pps_jitter -= -v_usec >> PPS_AVG; + else + pps_jitter += v_usec >> PPS_AVG; + if (pps_jitter > (MAXTIME >> 1)) + time_status |= STA_PPSJITTER; + + /* + * During the calibration interval adjust the starting time when + * the tick overflows. At the end of the interval compute the + * duration of the interval and the difference of the hardware + * counters at the beginning and end of the interval. This code + * is deliciously complicated by the fact valid differences may + * exceed the value of tick when using long calibration + * intervals and small ticks. Note that the counter can be + * greater than tick if caught at just the wrong instant, but + * the values returned and used here are correct. 
+ */ + bigtick = (long)tick << SHIFT_USEC; + pps_usec -= pps_freq; + if (pps_usec >= bigtick) + pps_usec -= bigtick; + if (pps_usec < 0) + pps_usec += bigtick; + pps_time.tv_sec++; + pps_count++; + if (pps_count < (1 << pps_shift)) + return; + pps_count = 0; + pps_calcnt++; + u_usec = usec << SHIFT_USEC; + v_usec = pps_usec - u_usec; + if (v_usec >= bigtick >> 1) + v_usec -= bigtick; + if (v_usec < -(bigtick >> 1)) + v_usec += bigtick; + if (v_usec < 0) + v_usec = -(-v_usec >> pps_shift); + else + v_usec = v_usec >> pps_shift; + pps_usec = u_usec; + cal_sec = tvp->tv_sec; + cal_usec = tvp->tv_usec; + cal_sec -= pps_time.tv_sec; + cal_usec -= pps_time.tv_usec; + if (cal_usec < 0) { + cal_usec += 1000000; + cal_sec--; + } + pps_time = *tvp; + + /* + * Check for lost interrupts, noise, excessive jitter and + * excessive frequency error. The number of timer ticks during + * the interval may vary +-1 tick. Add to this a margin of one + * tick for the PPS signal jitter and maximum frequency + * deviation. If the limits are exceeded, the calibration + * interval is reset to the minimum and we start over. + */ + u_usec = (long)tick << 1; + if (!((cal_sec == -1 && cal_usec > (1000000 - u_usec)) + || (cal_sec == 0 && cal_usec < u_usec)) + || v_usec > time_tolerance || v_usec < -time_tolerance) { + pps_errcnt++; + pps_shift = PPS_SHIFT; + pps_intcnt = 0; + time_status |= STA_PPSERROR; + return; + } + + /* + * A three-stage median filter is used to help deglitch the pps + * frequency. The median sample becomes the frequency offset + * estimate; the difference between the other two samples + * becomes the frequency dispersion (stability) estimate. 
+ */ + pps_ff[2] = pps_ff[1]; + pps_ff[1] = pps_ff[0]; + pps_ff[0] = v_usec; + if (pps_ff[0] > pps_ff[1]) { + if (pps_ff[1] > pps_ff[2]) { + u_usec = pps_ff[1]; /* 0 1 2 */ + v_usec = pps_ff[0] - pps_ff[2]; + } else if (pps_ff[2] > pps_ff[0]) { + u_usec = pps_ff[0]; /* 2 0 1 */ + v_usec = pps_ff[2] - pps_ff[1]; + } else { + u_usec = pps_ff[2]; /* 0 2 1 */ + v_usec = pps_ff[0] - pps_ff[1]; + } + } else { + if (pps_ff[1] < pps_ff[2]) { + u_usec = pps_ff[1]; /* 2 1 0 */ + v_usec = pps_ff[2] - pps_ff[0]; + } else if (pps_ff[2] < pps_ff[0]) { + u_usec = pps_ff[0]; /* 1 0 2 */ + v_usec = pps_ff[1] - pps_ff[2]; + } else { + u_usec = pps_ff[2]; /* 1 2 0 */ + v_usec = pps_ff[1] - pps_ff[0]; + } + } + + /* + * Here the frequency dispersion (stability) is updated. If it + * is less than one-fourth the maximum (MAXFREQ), the frequency + * offset is updated as well, but clamped to the tolerance. It + * will be processed later by the hardclock() routine. + */ + v_usec = (v_usec >> 1) - pps_stabil; + if (v_usec < 0) + pps_stabil -= -v_usec >> PPS_AVG; + else + pps_stabil += v_usec >> PPS_AVG; + if (pps_stabil > MAXFREQ >> 2) { + pps_stbcnt++; + time_status |= STA_PPSWANDER; + return; + } + if (time_status & STA_PPSFREQ) { + if (u_usec < 0) { + pps_freq -= -u_usec >> PPS_AVG; + if (pps_freq < -time_tolerance) + pps_freq = -time_tolerance; + u_usec = -u_usec; + } else { + pps_freq += u_usec >> PPS_AVG; + if (pps_freq > time_tolerance) + pps_freq = time_tolerance; + } + } + + /* + * Here the calibration interval is adjusted. If the maximum + * time difference is greater than tick / 4, reduce the interval + * by half. If this is not the case for four consecutive + * intervals, double the interval. 
+ */ + if (u_usec << pps_shift > bigtick >> 2) { + pps_intcnt = 0; + if (pps_shift > PPS_SHIFT) + pps_shift--; + } else if (pps_intcnt >= 4) { + pps_intcnt = 0; + if (pps_shift < PPS_SHIFTMAX) + pps_shift++; + } else + pps_intcnt++; } +#endif /* PPS_SYNC */ diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c new file mode 100644 index 0000000..bee8b87 --- /dev/null +++ b/sys/kern/kern_conf.c @@ -0,0 +1,208 @@ +/*- + * Parts Copyright (c) 1995 Terrence R. Lambert + * Copyright (c) 1995 Julian R. Elischer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Terrence R. Lambert. + * 4. The name Terrence R. Lambert may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY Julian R. Elischer ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. 
LAMBERT BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/vnode.h> + +#define NUMBDEV 128 +#define NUMCDEV 256 +#define bdevsw_ALLOCSTART (NUMBDEV/2) +#define cdevsw_ALLOCSTART (NUMCDEV/2) + +struct bdevsw *bdevsw[NUMBDEV]; +int nblkdev = NUMBDEV; +struct cdevsw *cdevsw[NUMCDEV]; +int nchrdev = NUMCDEV; + + + +/* + * Routine to determine if a device is a disk. + * + * KLUDGE XXX add flags to cdevsw entries for disks XXX + * A minimal stub routine can always return 0. + */ +int +isdisk(dev, type) + dev_t dev; + int type; +{ + + switch (major(dev)) { + case 15: /* VBLK: vn, VCHR: cd */ + return (1); + case 0: /* wd */ + case 2: /* fd */ + case 4: /* sd */ + case 6: /* cd */ + case 7: /* mcd */ + case 16: /* scd */ + case 17: /* matcd */ + case 18: /* ata */ + case 19: /* wcd */ + case 20: /* od */ + case 22: /* gd */ + if (type == VBLK) + return (1); + return (0); + case 3: /* wd */ + case 9: /* fd */ + case 13: /* sd */ + case 29: /* mcd */ + case 43: /* vn */ + case 45: /* scd */ + case 46: /* matcd */ + case 69: /* wcd */ + case 70: /* od */ + case 78: /* gd */ + if (type == VCHR) + return (1); + /* fall through */ + default: + return (0); + } + /* NOTREACHED */ +} + + +/* + * Routine to convert from character to block device number. + * + * A minimal stub routine can always return NODEV. 
+ */ +dev_t +chrtoblk(dev_t dev) +{ + struct bdevsw *bd; + struct cdevsw *cd; + + if(cd = cdevsw[major(dev)]) { + if ( (bd = cd->d_bdev) ) + return(makedev(bd->d_maj,minor(dev))); + } + return(NODEV); +} + +/* + * (re)place an entry in the bdevsw or cdevsw table + * return the slot used in major(*descrip) + */ +#define ADDENTRY(TTYPE,NXXXDEV,ALLOCSTART) \ +int TTYPE##_add(dev_t *descrip, \ + struct TTYPE *newentry, \ + struct TTYPE **oldentry) \ +{ \ + int i ; \ + if ( (int)*descrip == NODEV) { /* auto (0 is valid) */ \ + /* \ + * Search the table looking for a slot... \ + */ \ + for (i = ALLOCSTART; i < NXXXDEV; i++) \ + if (TTYPE[i] == NULL) \ + break; /* found one! */ \ + /* out of allocable slots? */ \ + if (i >= NXXXDEV) { \ + return ENFILE; \ + } \ + } else { /* assign */ \ + i = major(*descrip); \ + if (i < 0 || i >= NXXXDEV) { \ + return EINVAL; \ + } \ + } \ + \ + /* maybe save old */ \ + if (oldentry) { \ + *oldentry = TTYPE[i]; \ + } \ + if (newentry) \ + newentry->d_maj = i; \ + /* replace with new */ \ + TTYPE[i] = newentry; \ + \ + /* done! let them know where we put it */ \ + *descrip = makedev(i,0); \ + return 0; \ +} \ + +ADDENTRY(bdevsw, nblkdev,bdevsw_ALLOCSTART) +ADDENTRY(cdevsw, nchrdev,cdevsw_ALLOCSTART) + +/* Maybe the author might indicate what the f*@# tehis is for? */ + +void +cdevsw_make(struct bdevsw *from) +{ + struct cdevsw *to = from->d_cdev; + + if (!to) + panic("No target cdevsw in bdevsw"); + to->d_open = from->d_open; + to->d_close = from->d_close; + to->d_read = rawread; + to->d_write = rawwrite; + to->d_ioctl = from->d_ioctl; + to->d_stop = nostop; + to->d_reset = nullreset; + to->d_devtotty = nodevtotty; + to->d_select = seltrue; + to->d_mmap = nommap; + to->d_strategy = from->d_strategy; + to->d_name = from->d_name; + to->d_bdev = from; + to->d_maj = -1; +} + +void +bdevsw_add_generic(int bdev, int cdev, struct bdevsw *bdevsw) +{ + dev_t dev; + /* + * XXX hack alert. 
+ */ + if (isdisk(makedev(bdev, 0), VBLK) && bdevsw->d_flags != D_DISK) { + printf("bdevsw_add_generic: adding D_DISK flag for device %d\n", + bdev); + bdevsw->d_flags = D_DISK; + } + cdevsw_make(bdevsw); + dev = makedev(cdev, 0); + cdevsw_add(&dev, bdevsw->d_cdev, NULL); + dev = makedev(bdev, 0); + bdevsw_add(&dev, bdevsw , NULL); +} diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 3f2e424..a5c6d94 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -35,111 +35,105 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95 + * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 + * $Id$ */ #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/conf.h> #include <sys/filedesc.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/vnode.h> #include <sys/proc.h> #include <sys/file.h> -#include <sys/socket.h> #include <sys/socketvar.h> #include <sys/stat.h> -#include <sys/ioctl.h> +#include <sys/filio.h> +#include <sys/ttycom.h> #include <sys/fcntl.h> #include <sys/malloc.h> -#include <sys/syslog.h> #include <sys/unistd.h> #include <sys/resourcevar.h> +#include <sys/pipe.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +static d_open_t fdopen; +#define NUMFDESC 64 + +#define CDEV_MAJOR 22 +static struct cdevsw fildesc_cdevsw = + { fdopen, noclose, noread, nowrite, /*22*/ + noioc, nostop, nullreset, nodevtotty,/*fd(!=Fd)*/ + noselect, nommap, nostrat }; + +static int finishdup(struct filedesc *fdp, int old, int new, int *retval); /* * Descriptor management. */ struct filelist filehead; /* head of list of open files */ int nfiles; /* actual number of open files */ +extern int cmask; /* * System calls on descriptors. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct getdtablesize_args { + int dummy; +}; +#endif /* ARGSUSED */ int getdtablesize(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getdtablesize_args *uap; + int *retval; { - *retval = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); + *retval = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); return (0); } /* - * Duplicate a file descriptor. - */ -/* ARGSUSED */ -int -dup(p, uap, retval) - struct proc *p; - struct dup_args /* { - syscallarg(u_int) fd; - } */ *uap; - register_t *retval; -{ - register struct filedesc *fdp; - u_int old; - int new, error; - - old = SCARG(uap, fd); - /* - * XXX Compatibility - */ - if (old &~ 077) { - SCARG(uap, fd) &= 077; - return (dup2(p, uap, retval)); - } - - fdp = p->p_fd; - if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) - return (EBADF); - if (error = fdalloc(p, 0, &new)) - return (error); - return (finishdup(fdp, (int)old, new, retval)); -} - -/* * Duplicate a file descriptor to a particular value. */ +#ifndef _SYS_SYSPROTO_H_ +struct dup2_args { + u_int from; + u_int to; +}; +#endif /* ARGSUSED */ int dup2(p, uap, retval) struct proc *p; - struct dup2_args /* { - syscallarg(u_int) from; - syscallarg(u_int) to; - } */ *uap; - register_t *retval; + struct dup2_args *uap; + int *retval; { register struct filedesc *fdp = p->p_fd; - register int old = SCARG(uap, from), new = SCARG(uap, to); + register u_int old = uap->from, new = uap->to; int i, error; if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL || new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || - new >= maxfiles) + new >= maxfilesperproc) return (EBADF); if (old == new) { *retval = new; return (0); } if (new >= fdp->fd_nfiles) { - if (error = fdalloc(p, new, &i)) + if ((error = fdalloc(p, new, &i))) return (error); if (new != i) panic("dup2: fdalloc"); @@ -155,20 +149,58 @@ dup2(p, uap, retval) } /* + * Duplicate a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct dup_args { + u_int fd; +}; +#endif +/* ARGSUSED */ +int +dup(p, uap, retval) + struct proc *p; + struct dup_args *uap; + int *retval; +{ + register struct filedesc *fdp; + u_int old; + int new, error; + + old = uap->fd; + +#if 0 + /* + * XXX Compatibility + */ + if (old &~ 077) { uap->fd &= 077; return (dup2(p, uap, retval)); } +#endif + + fdp = p->p_fd; + if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) + return (EBADF); + if ((error = fdalloc(p, 0, &new))) + return (error); + return (finishdup(fdp, (int)old, new, retval)); +} + +/* * The file control system call. */ +#ifndef _SYS_SYSPROTO_H_ +struct fcntl_args { + int fd; + int cmd; + int arg; +}; +#endif /* ARGSUSED */ int fcntl(p, uap, retval) struct proc *p; - register struct fcntl_args /* { - syscallarg(int) fd; - syscallarg(int) cmd; - syscallarg(void *) arg; - } */ *uap; - register_t *retval; + register struct fcntl_args *uap; + int *retval; { - int fd = SCARG(uap, fd); register struct filedesc *fdp = p->p_fd; register struct file *fp; register char *pop; @@ -177,27 +209,27 @@ fcntl(p, uap, retval) struct flock fl; u_int newmin; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); - pop = &fdp->fd_ofileflags[fd]; - switch (SCARG(uap, cmd)) { + pop = &fdp->fd_ofileflags[uap->fd]; + switch (uap->cmd) { case F_DUPFD: - newmin = (long)SCARG(uap, arg); + newmin = uap->arg; if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || - newmin >= maxfiles) + newmin >= maxfilesperproc) return (EINVAL); - if (error = fdalloc(p, newmin, &i)) + if ((error = fdalloc(p, newmin, &i))) return (error); - return (finishdup(fdp, fd, i, retval)); + return (finishdup(fdp, uap->fd, i, retval)); case F_GETFD: *retval = *pop & 1; return (0); case F_SETFD: - *pop = (*pop &~ 1) | ((long)SCARG(uap, arg) & 1); + *pop = (*pop &~ 1) | (uap->arg & 1); return (0); case 
F_GETFL: @@ -206,7 +238,7 @@ fcntl(p, uap, retval) case F_SETFL: fp->f_flag &= ~FCNTLFLAGS; - fp->f_flag |= FFLAGS((long)SCARG(uap, arg)) & FCNTLFLAGS; + fp->f_flag |= FFLAGS(uap->arg) & FCNTLFLAGS; tmp = fp->f_flag & FNONBLOCK; error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p); if (error) @@ -232,20 +264,19 @@ fcntl(p, uap, retval) case F_SETOWN: if (fp->f_type == DTYPE_SOCKET) { - ((struct socket *)fp->f_data)->so_pgid = - (long)SCARG(uap, arg); + ((struct socket *)fp->f_data)->so_pgid = uap->arg; return (0); } - if ((long)SCARG(uap, arg) <= 0) { - SCARG(uap, arg) = (void *)(-(long)SCARG(uap, arg)); + if (uap->arg <= 0) { + uap->arg = -uap->arg; } else { - struct proc *p1 = pfind((long)SCARG(uap, arg)); + struct proc *p1 = pfind(uap->arg); if (p1 == 0) return (ESRCH); - SCARG(uap, arg) = (void *)(long)p1->p_pgrp->pg_id; + uap->arg = p1->p_pgrp->pg_id; } return ((*fp->f_ops->fo_ioctl) - (fp, TIOCSPGRP, (caddr_t)&SCARG(uap, arg), p)); + (fp, TIOCSPGRP, (caddr_t)&uap->arg, p)); case F_SETLKW: flg |= F_WAIT; @@ -256,8 +287,7 @@ fcntl(p, uap, retval) return (EBADF); vp = (struct vnode *)fp->f_data; /* Copy in the lock structure */ - error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&fl, - sizeof (fl)); + error = copyin((caddr_t)uap->arg, (caddr_t)&fl, sizeof (fl)); if (error) return (error); if (fl.l_whence == SEEK_CUR) @@ -289,16 +319,17 @@ fcntl(p, uap, retval) return (EBADF); vp = (struct vnode *)fp->f_data; /* Copy in the lock structure */ - error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&fl, - sizeof (fl)); + error = copyin((caddr_t)uap->arg, (caddr_t)&fl, sizeof (fl)); if (error) return (error); + if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK && + fl.l_type != F_UNLCK) + return (EINVAL); if (fl.l_whence == SEEK_CUR) fl.l_start += fp->f_offset; - if (error = VOP_ADVLOCK(vp, (caddr_t)p, F_GETLK, &fl, F_POSIX)) + if ((error = VOP_ADVLOCK(vp,(caddr_t)p,F_GETLK,&fl,F_POSIX))) return (error); - return (copyout((caddr_t)&fl, (caddr_t)SCARG(uap, arg), 
- sizeof (fl))); + return (copyout((caddr_t)&fl, (caddr_t)uap->arg, sizeof (fl))); default: return (EINVAL); @@ -309,11 +340,10 @@ fcntl(p, uap, retval) /* * Common code for dup, dup2, and fcntl(F_DUPFD). */ -int +static int finishdup(fdp, old, new, retval) register struct filedesc *fdp; - register int old, new; - register_t *retval; + register int old, new, *retval; { register struct file *fp; @@ -330,21 +360,24 @@ finishdup(fdp, old, new, retval) /* * Close a file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct close_args { + int fd; +}; +#endif /* ARGSUSED */ int close(p, uap, retval) struct proc *p; - struct close_args /* { - syscallarg(int) fd; - } */ *uap; - register_t *retval; + struct close_args *uap; + int *retval; { - int fd = SCARG(uap, fd); register struct filedesc *fdp = p->p_fd; register struct file *fp; + register int fd = uap->fd; register u_char *pf; - if ((u_int)fd >= fdp->fd_nfiles || + if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) return (EBADF); pf = (u_char *)&fdp->fd_ofileflags[fd]; @@ -363,28 +396,31 @@ close(p, uap, retval) /* * Return status information about a file descriptor. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct ofstat_args { + int fd; + struct ostat *sb; +}; +#endif /* ARGSUSED */ int -compat_43_fstat(p, uap, retval) +ofstat(p, uap, retval) struct proc *p; - register struct compat_43_fstat_args /* { - syscallarg(int) fd; - syscallarg(struct ostat *) sb; - } */ *uap; - register_t *retval; + register struct ofstat_args *uap; + int *retval; { - int fd = SCARG(uap, fd); register struct filedesc *fdp = p->p_fd; register struct file *fp; struct stat ub; struct ostat oub; int error; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); switch (fp->f_type) { + case DTYPE_FIFO: case DTYPE_VNODE: error = vn_stat((struct vnode *)fp->f_data, &ub, p); break; @@ -393,14 +429,19 @@ compat_43_fstat(p, uap, retval) error = soo_stat((struct socket *)fp->f_data, &ub); break; +#ifndef OLD_PIPE + case DTYPE_PIPE: + error = pipe_stat((struct pipe *)fp->f_data, &ub); + break; +#endif + default: panic("ofstat"); /*NOTREACHED*/ } cvtstat(&ub, &oub); if (error == 0) - error = copyout((caddr_t)&oub, (caddr_t)SCARG(uap, sb), - sizeof (oub)); + error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub)); return (error); } #endif /* COMPAT_43 || COMPAT_SUNOS */ @@ -408,27 +449,30 @@ compat_43_fstat(p, uap, retval) /* * Return status information about a file descriptor. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct fstat_args { + int fd; + struct stat *sb; +}; +#endif /* ARGSUSED */ int fstat(p, uap, retval) struct proc *p; - register struct fstat_args /* { - syscallarg(int) fd; - syscallarg(struct stat *) sb; - } */ *uap; - register_t *retval; + register struct fstat_args *uap; + int *retval; { - int fd = SCARG(uap, fd); register struct filedesc *fdp = p->p_fd; register struct file *fp; struct stat ub; int error; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); switch (fp->f_type) { + case DTYPE_FIFO: case DTYPE_VNODE: error = vn_stat((struct vnode *)fp->f_data, &ub, p); break; @@ -437,48 +481,59 @@ fstat(p, uap, retval) error = soo_stat((struct socket *)fp->f_data, &ub); break; +#ifndef OLD_PIPE + case DTYPE_PIPE: + error = pipe_stat((struct pipe *)fp->f_data, &ub); + break; +#endif + default: panic("fstat"); /*NOTREACHED*/ } if (error == 0) - error = copyout((caddr_t)&ub, (caddr_t)SCARG(uap, sb), - sizeof (ub)); + error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub)); return (error); } /* * Return pathconf information about a file descriptor. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct fpathconf_args { + int fd; + int name; +}; +#endif /* ARGSUSED */ int fpathconf(p, uap, retval) struct proc *p; - register struct fpathconf_args /* { - syscallarg(int) fd; - syscallarg(int) name; - } */ *uap; - register_t *retval; + register struct fpathconf_args *uap; + int *retval; { - int fd = SCARG(uap, fd); struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); switch (fp->f_type) { +#ifndef OLD_PIPE + case DTYPE_PIPE: +#endif case DTYPE_SOCKET: - if (SCARG(uap, name) != _PC_PIPE_BUF) + if (uap->name != _PC_PIPE_BUF) return (EINVAL); *retval = PIPE_BUF; return (0); + case DTYPE_FIFO: case DTYPE_VNODE: vp = (struct vnode *)fp->f_data; - return (VOP_PATHCONF(vp, SCARG(uap, name), retval)); + return (VOP_PATHCONF(vp, uap->name, retval)); default: panic("fpathconf"); @@ -489,7 +544,8 @@ fpathconf(p, uap, retval) /* * Allocate a file descriptor for the process. */ -int fdexpand; +static int fdexpand; +SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, ""); int fdalloc(p, want, result) @@ -508,7 +564,7 @@ fdalloc(p, want, result) * of want or fd_freefile. If that fails, consider * expanding the ofile array. 
*/ - lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); for (;;) { last = min(fdp->fd_nfiles, lim); if ((i = want) < fdp->fd_freefile) @@ -554,6 +610,7 @@ fdalloc(p, want, result) fdp->fd_nfiles = nfiles; fdexpand++; } + return (0); } /* @@ -567,13 +624,15 @@ fdavail(p, n) { register struct filedesc *fdp = p->p_fd; register struct file **fpp; - register int i, lim; + register int i, lim, last; - lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) return (1); + + last = min(fdp->fd_nfiles, lim); fpp = &fdp->fd_ofiles[fdp->fd_freefile]; - for (i = fdp->fd_nfiles - fdp->fd_freefile; --i >= 0; fpp++) + for (i = last - fdp->fd_freefile; --i >= 0; fpp++) if (*fpp == NULL && --n <= 0) return (1); return (0); @@ -592,7 +651,7 @@ falloc(p, resultfp, resultfd) register struct file *fp, *fq; int error, i; - if (error = fdalloc(p, 0, &i)) + if ((error = fdalloc(p, 0, &i))) return (error); if (nfiles >= maxfiles) { tablefull("file"); @@ -607,7 +666,7 @@ falloc(p, resultfp, resultfd) nfiles++; MALLOC(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK); bzero(fp, sizeof(struct file)); - if (fq = p->p_fd->fd_ofiles[0]) { + if ((fq = p->p_fd->fd_ofiles[0])) { LIST_INSERT_AFTER(fq, fp, f_list); } else { LIST_INSERT_HEAD(&filehead, fp, f_list); @@ -615,6 +674,7 @@ falloc(p, resultfp, resultfd) p->p_fd->fd_ofiles[i] = fp; fp->f_count = 1; fp->f_cred = p->p_ucred; + fp->f_seqcount = 1; crhold(fp->f_cred); if (resultfp) *resultfp = fp; @@ -630,8 +690,6 @@ void ffree(fp) register struct file *fp; { - register struct file *fq; - LIST_REMOVE(fp, f_list); crfree(fp->f_cred); #ifdef DIAGNOSTIC @@ -642,6 +700,49 @@ ffree(fp) } /* + * Build a new filedesc structure. 
+ */ +struct filedesc * +fdinit(p) + struct proc *p; +{ + register struct filedesc0 *newfdp; + register struct filedesc *fdp = p->p_fd; + + MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0), + M_FILEDESC, M_WAITOK); + bzero(newfdp, sizeof(struct filedesc0)); + newfdp->fd_fd.fd_cdir = fdp->fd_cdir; + VREF(newfdp->fd_fd.fd_cdir); + newfdp->fd_fd.fd_rdir = fdp->fd_rdir; + if (newfdp->fd_fd.fd_rdir) + VREF(newfdp->fd_fd.fd_rdir); + + /* Create the file descriptor table. */ + newfdp->fd_fd.fd_refcnt = 1; + newfdp->fd_fd.fd_cmask = cmask; + newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; + newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; + newfdp->fd_fd.fd_nfiles = NDFILE; + + newfdp->fd_fd.fd_freefile = 0; + newfdp->fd_fd.fd_lastfile = 0; + + return (&newfdp->fd_fd); +} + +/* + * Share a filedesc structure. + */ +struct filedesc * +fdshare(p) + struct proc *p; +{ + p->p_fd->fd_refcnt++; + return (p->p_fd); +} + +/* * Copy a filedesc structure. */ struct filedesc * @@ -720,6 +821,34 @@ fdfree(p) } /* + * Close any files on exec? + */ +void +fdcloseexec(p) + struct proc *p; +{ + struct filedesc *fdp = p->p_fd; + struct file **fpp; + char *fdfp; + register int i; + + fpp = fdp->fd_ofiles; + fdfp = fdp->fd_ofileflags; + for (i = 0; i <= fdp->fd_lastfile; i++, fpp++, fdfp++) + if (*fpp != NULL && (*fdfp & UF_EXCLOSE)) { + if (*fdfp & UF_MAPPED) + (void) munmapfd(p, i); + (void) closef(*fpp, p); + *fpp = NULL; + *fdfp = 0; + if (i < fdp->fd_freefile) + fdp->fd_freefile = i; + } + while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; +} + +/* * Internal form of close. * Decrement reference count on file structure. * Note: p may be NULL when closing a file @@ -778,25 +907,26 @@ closef(fp, p) * Just attempt to get a record lock of the requested type on * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct flock_args { + int fd; + int how; +}; +#endif /* ARGSUSED */ int flock(p, uap, retval) struct proc *p; - register struct flock_args /* { - syscallarg(int) fd; - syscallarg(int) how; - } */ *uap; - register_t *retval; + register struct flock_args *uap; + int *retval; { - int fd = SCARG(uap, fd); - int how = SCARG(uap, how); register struct filedesc *fdp = p->p_fd; register struct file *fp; struct vnode *vp; struct flock lf; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); if (fp->f_type != DTYPE_VNODE) return (EOPNOTSUPP); @@ -804,19 +934,19 @@ flock(p, uap, retval) lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; - if (how & LOCK_UN) { + if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; fp->f_flag &= ~FHASLOCK; return (VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK)); } - if (how & LOCK_EX) + if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; - else if (how & LOCK_SH) + else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else return (EBADF); fp->f_flag |= FHASLOCK; - if (how & LOCK_NB) + if (uap->how & LOCK_NB) return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK)); return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT)); } @@ -830,7 +960,7 @@ flock(p, uap, retval) * references to this file will be direct to the other driver. */ /* ARGSUSED */ -int +static int fdopen(dev, mode, type, p) dev_t dev; int mode, type; @@ -839,7 +969,7 @@ fdopen(dev, mode, type, p) /* * XXX Kludge: set curproc->p_dupfd to contain the value of the - * the file descriptor being sought for duplication. The error + * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. 
Other callers of vn_open or VOP_OPEN @@ -928,3 +1058,89 @@ dupfdopen(fdp, indx, dfd, mode, error) } /* NOTREACHED */ } + +/* + * Get file structures. + */ +static int +sysctl_kern_file SYSCTL_HANDLER_ARGS +{ + int error; + struct file *fp; + + if (!req->oldptr) { + /* + * overestimate by 10 files + */ + return (SYSCTL_OUT(req, 0, sizeof(filehead) + + (nfiles + 10) * sizeof(struct file))); + } + + error = SYSCTL_OUT(req, (caddr_t)&filehead, sizeof(filehead)); + if (error) + return (error); + + /* + * followed by an array of file structures + */ + for (fp = filehead.lh_first; fp != NULL; fp = fp->f_list.le_next) { + error = SYSCTL_OUT(req, (caddr_t)fp, sizeof (struct file)); + if (error) + return (error); + } + return (0); +} + +SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_kern_file, "S,file", ""); + +SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, + CTLFLAG_RW, &maxfilesperproc, 0, ""); + +SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, ""); + +static fildesc_devsw_installed = 0; +#ifdef DEVFS +static void *devfs_token_stdin; +static void *devfs_token_stdout; +static void *devfs_token_stderr; +static void *devfs_token_fildesc[NUMFDESC]; +#endif + +static void fildesc_drvinit(void *unused) +{ + dev_t dev; +#ifdef DEVFS + int fd; +#endif + + if( ! 
fildesc_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&fildesc_cdevsw,NULL); + fildesc_devsw_installed = 1; +#ifdef DEVFS + for (fd = 0; fd < NUMFDESC; fd++) + devfs_token_fildesc[fd] = + devfs_add_devswf(&fildesc_cdevsw, fd, DV_CHR, + UID_BIN, GID_BIN, 0666, + "fd/%d", fd); + devfs_token_stdin = + devfs_add_devswf(&fildesc_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0666, + "stdin", fd); + devfs_token_stdout = + devfs_add_devswf(&fildesc_cdevsw, 1, DV_CHR, + UID_ROOT, GID_WHEEL, 0666, + "stdout", fd); + devfs_token_stderr = + devfs_add_devswf(&fildesc_cdevsw, 2, DV_CHR, + UID_ROOT, GID_WHEEL, 0666, + "stderr", fd); +#endif + } +} + +SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR, + fildesc_drvinit,NULL) + + diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index fbb4444..21049a3 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -1,11 +1,6 @@ -/*- - * Copyright (c) 1982, 1986, 1991, 1993 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. - * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -15,18 +10,11 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -35,30 +23,597 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * from: @(#)kern_exec.c 8.1 (Berkeley) 6/10/93 + * $Id$ */ #include <sys/param.h> -#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/filedesc.h> +#include <sys/fcntl.h> +#include <sys/acct.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/imgact_elf.h> +#include <sys/wait.h> #include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/namei.h> +#include <sys/sysent.h> +#include <sys/syslog.h> +#include <sys/shm.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +#include <machine/reg.h> + +static int *exec_copyout_strings __P((struct image_params *)); + +static int exec_check_permissions(struct image_params *); /* - * exec system call + * XXX trouble here if sizeof(caddr_t) != sizeof(int), other parts + * of the sysctl code also assumes this, and sizeof(int) == sizeof(long). */ +static struct ps_strings *ps_strings = PS_STRINGS; +SYSCTL_INT(_kern, KERN_PS_STRINGS, ps_strings, 0, &ps_strings, 0, ""); + +static caddr_t usrstack = (caddr_t)USRSTACK; +SYSCTL_INT(_kern, KERN_USRSTACK, usrstack, 0, &usrstack, 0, ""); + +/* + * execsw_set is constructed for us by the linker. Each of the items + * is a pointer to a `const struct execsw', hence the double pointer here. + */ +static const struct execsw **execsw = + (const struct execsw **)&execsw_set.ls_items[0]; + +#ifndef _SYS_SYSPROTO_H_ struct execve_args { - char *fname; - char **argp; - char **envp; + char *fname; + char **argv; + char **envv; }; -/* ARGSUSED */ -execve(a1, a2, a3) - struct proc *a1; - struct execve_args *a2; - int *a3; +#endif + +/* + * execve() system call. 
+ */ +int +execve(p, uap, retval) + struct proc *p; + register struct execve_args *uap; + int *retval; +{ + struct nameidata nd, *ndp; + int *stack_base; + int error, len, i; + struct image_params image_params, *imgp; + struct vattr attr; + + imgp = &image_params; + + /* + * Initialize part of the common data + */ + imgp->proc = p; + imgp->uap = uap; + imgp->attr = &attr; + imgp->image_header = NULL; + imgp->argc = imgp->envc = 0; + imgp->entry_addr = 0; + imgp->vmspace_destroyed = 0; + imgp->interpreted = 0; + imgp->interpreter_name[0] = '\0'; + imgp->auxargs = NULL; + + /* + * Allocate temporary demand zeroed space for argument and + * environment strings + */ + imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX); + if (imgp->stringbase == NULL) { + error = ENOMEM; + goto exec_fail; + } + imgp->stringp = imgp->stringbase; + imgp->stringspace = ARG_MAX; + + /* + * Translate the file name. namei() returns a vnode pointer + * in ni_vp amoung other things. + */ + ndp = &nd; + NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, + UIO_USERSPACE, uap->fname, p); + +interpret: + + error = namei(ndp); + if (error) { + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); + goto exec_fail; + } + + imgp->vp = ndp->ni_vp; + if (imgp->vp == NULL) { + error = ENOEXEC; + goto exec_fail_dealloc; + } + + /* + * Check file permissions (also 'opens' file) + */ + error = exec_check_permissions(imgp); + + /* + * Lose the lock on the vnode. It's no longer needed, and must not + * exist for the pagefault paging to work below. 
+ */ + VOP_UNLOCK(imgp->vp, 0, p); + + if (error) + goto exec_fail_dealloc; + + /* + * Map the image header (first page) of the file into + * kernel address space + */ + error = vm_mmap(exech_map, /* map */ + (vm_offset_t *)&imgp->image_header, /* address */ + PAGE_SIZE, /* size */ + VM_PROT_READ, /* protection */ + VM_PROT_READ, /* max protection */ + 0, /* flags */ + (caddr_t)imgp->vp, /* vnode */ + 0); /* offset */ + if (error) { + uprintf("mmap failed: %d\n",error); + goto exec_fail_dealloc; + } + + /* + * Loop through list of image activators, calling each one. + * If there is no match, the activator returns -1. If there + * is a match, but there was an error during the activation, + * the error is returned. Otherwise 0 means success. If the + * image is interpreted, loop back up and try activating + * the interpreter. + */ + for (i = 0; execsw[i]; ++i) { + if (execsw[i]->ex_imgact) + error = (*execsw[i]->ex_imgact)(imgp); + else + continue; + + if (error == -1) + continue; + if (error) + goto exec_fail_dealloc; + if (imgp->interpreted) { + /* free old vnode and name buffer */ + vrele(ndp->ni_vp); + FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI); + if (vm_map_remove(exech_map, (vm_offset_t)imgp->image_header, + (vm_offset_t)imgp->image_header + PAGE_SIZE)) + panic("execve: header dealloc failed (1)"); + + /* set new name to that of the interpreter */ + NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, + UIO_SYSSPACE, imgp->interpreter_name, p); + goto interpret; + } + break; + } + /* If we made it through all the activators and none matched, exit. */ + if (error == -1) { + error = ENOEXEC; + goto exec_fail_dealloc; + } + + /* + * Copy out strings (args and env) and initialize stack base + */ + stack_base = exec_copyout_strings(imgp); + p->p_vmspace->vm_minsaddr = (char *)stack_base; + + /* + * If custom stack fixup routine present for this process + * let it do the stack setup. 
+ * Else stuff argument count as first item on stack + */ + if (p->p_sysent->sv_fixup) + (*p->p_sysent->sv_fixup)(&stack_base, imgp); + else + suword(--stack_base, imgp->argc); + + /* close files on exec */ + fdcloseexec(p); + + /* reset caught signals */ + execsigs(p); + + /* name this process - nameiexec(p, ndp) */ + len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); + bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); + p->p_comm[len] = 0; + + /* + * mark as execed, wakeup the process that vforked (if any) and tell + * it that it now has it's own resources back + */ + p->p_flag |= P_EXEC; + if (p->p_pptr && (p->p_flag & P_PPWAIT)) { + p->p_flag &= ~P_PPWAIT; + wakeup((caddr_t)p->p_pptr); + } + + /* + * Implement image setuid/setgid. Disallow if the process is + * being traced. + */ + if ((attr.va_mode & (VSUID | VSGID)) && + (p->p_flag & P_TRACED) == 0) { + /* + * Turn off syscall tracing for set-id programs, except for + * root. + */ + if (p->p_tracep && suser(p->p_ucred, &p->p_acflag)) { + p->p_traceflag = 0; + vrele(p->p_tracep); + p->p_tracep = NULL; + } + /* + * Set the new credentials. + */ + p->p_ucred = crcopy(p->p_ucred); + if (attr.va_mode & VSUID) + p->p_ucred->cr_uid = attr.va_uid; + if (attr.va_mode & VSGID) + p->p_ucred->cr_groups[0] = attr.va_gid; + p->p_flag |= P_SUGID; + } else { + if (p->p_ucred->cr_uid == p->p_cred->p_ruid && + p->p_ucred->cr_gid == p->p_cred->p_rgid) + p->p_flag &= ~P_SUGID; + } + + /* + * Implement correct POSIX saved-id behavior. + */ + p->p_cred->p_svuid = p->p_ucred->cr_uid; + p->p_cred->p_svgid = p->p_ucred->cr_gid; + + /* + * Store the vp for use in procfs + */ + if (p->p_textvp) /* release old reference */ + vrele(p->p_textvp); + VREF(ndp->ni_vp); + p->p_textvp = ndp->ni_vp; + + /* + * If tracing the process, trap to debugger so breakpoints + * can be set before the program executes. 
+ */ + if (p->p_flag & P_TRACED) + psignal(p, SIGTRAP); + + /* clear "fork but no exec" flag, as we _are_ execing */ + p->p_acflag &= ~AFORK; + + /* Set entry address */ + setregs(p, imgp->entry_addr, (u_long)stack_base); + + /* + * free various allocated resources + */ + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); + if (vm_map_remove(exech_map, (vm_offset_t)imgp->image_header, + (vm_offset_t)imgp->image_header + PAGE_SIZE)) + panic("execve: header dealloc failed (2)"); + vrele(ndp->ni_vp); + FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI); + + return (0); + +exec_fail_dealloc: + if (imgp->stringbase != NULL) + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); + if (imgp->image_header && imgp->image_header != (char *)-1) + if (vm_map_remove(exech_map, (vm_offset_t)imgp->image_header, + (vm_offset_t)imgp->image_header + PAGE_SIZE)) + panic("execve: header dealloc failed (3)"); + if (ndp->ni_vp) + vrele(ndp->ni_vp); + FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI); + +exec_fail: + if (imgp->vmspace_destroyed) { + /* sorry, no more process anymore. exit gracefully */ + exit1(p, W_EXITCODE(0, SIGABRT)); + /* NOT REACHED */ + return(0); + } else { + return(error); + } +} + +/* + * Destroy old address space, and allocate a new stack + * The new stack is only SGROWSIZ large because it is grown + * automatically in trap.c. 
+ */ +int +exec_new_vmspace(imgp) + struct image_params *imgp; +{ + int error; + struct vmspace *vmspace = imgp->proc->p_vmspace; + caddr_t stack_addr = (caddr_t) (USRSTACK - SGROWSIZ); + + imgp->vmspace_destroyed = 1; + + /* Blow away entire process VM */ + if (vmspace->vm_shm) + shmexit(imgp->proc); + pmap_remove_pages(&vmspace->vm_pmap, 0, USRSTACK); + vm_map_remove(&vmspace->vm_map, 0, USRSTACK); + + /* Allocate a new stack */ + error = vm_map_find(&vmspace->vm_map, NULL, 0, (vm_offset_t *)&stack_addr, + SGROWSIZ, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) + return(error); + + vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT; + + /* Initialize maximum stack address */ + vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ; + + return(0); +} + +/* + * Copy out argument and environment strings from the old process + * address space into the temporary string buffer. + */ +int +exec_extract_strings(imgp) + struct image_params *imgp; +{ + char **argv, **envv; + char *argp, *envp; + int error, length; + + /* + * extract arguments first + */ + + argv = imgp->uap->argv; + + if (argv) { + while ((argp = (caddr_t) fuword(argv++))) { + if (argp == (caddr_t) -1) + return (EFAULT); + if ((error = copyinstr(argp, imgp->stringp, + imgp->stringspace, &length))) { + if (error == ENAMETOOLONG) + return(E2BIG); + return (error); + } + imgp->stringspace -= length; + imgp->stringp += length; + imgp->argc++; + } + } + + /* + * extract environment strings + */ + + envv = imgp->uap->envv; + + if (envv) { + while ((envp = (caddr_t) fuword(envv++))) { + if (envp == (caddr_t) -1) + return (EFAULT); + if ((error = copyinstr(envp, imgp->stringp, + imgp->stringspace, &length))) { + if (error == ENAMETOOLONG) + return(E2BIG); + return (error); + } + imgp->stringspace -= length; + imgp->stringp += length; + imgp->envc++; + } + } + + return (0); +} + +/* + * Copy strings out to the new process address space, constructing + * new arg and env vector tables. 
Return a pointer to the base + * so that it can be used as the initial stack pointer. + */ +int * +exec_copyout_strings(imgp) + struct image_params *imgp; { + int argc, envc; + char **vectp; + char *stringp, *destp; + int *stack_base; + struct ps_strings *arginfo; + int szsigcode; + + /* + * Calculate string base and vector table pointers. + * Also deal with signal trampoline code for this exec type. + */ + arginfo = PS_STRINGS; + szsigcode = *(imgp->proc->p_sysent->sv_szsigcode); + destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - + roundup((ARG_MAX - imgp->stringspace), sizeof(char *)); + + /* + * install sigcode + */ + if (szsigcode) + copyout(imgp->proc->p_sysent->sv_sigcode, + ((caddr_t)arginfo - szsigcode), szsigcode); + + /* + * If we have a valid auxargs ptr, prepare some room + * on the stack. + */ + if (imgp->auxargs) + /* + * The '+ 2' is for the null pointers at the end of each of the + * arg and env vector sets, and 'AT_COUNT*2' is room for the + * ELF Auxargs data. + */ + vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 + + AT_COUNT*2) * sizeof(char*)); + else + /* + * The '+ 2' is for the null pointers at the end of each of the + * arg and env vector sets + */ + vectp = (char **) + (destp - (imgp->argc + imgp->envc + 2) * sizeof(char*)); /* - * Body deleted. + * vectp also becomes our initial stack base */ - return (ENOSYS); + stack_base = (int *)vectp; + + stringp = imgp->stringbase; + argc = imgp->argc; + envc = imgp->envc; + + /* + * Copy out strings - arguments and environment. + */ + copyout(stringp, destp, ARG_MAX - imgp->stringspace); + + /* + * Fill in "ps_strings" struct for ps, w, etc. + */ + suword(&arginfo->ps_argvstr, (int)vectp); + suword(&arginfo->ps_nargvstr, argc); + + /* + * Fill in argument portion of vector table. 
+ */ + for (; argc > 0; --argc) { + suword(vectp++, (int)destp); + while (*stringp++ != 0) + destp++; + destp++; + } + + /* a null vector table pointer seperates the argp's from the envp's */ + suword(vectp++, 0); + + suword(&arginfo->ps_envstr, (int)vectp); + suword(&arginfo->ps_nenvstr, envc); + + /* + * Fill in environment portion of vector table. + */ + for (; envc > 0; --envc) { + suword(vectp++, (int)destp); + while (*stringp++ != 0) + destp++; + destp++; + } + + /* end of vector table is a null pointer */ + suword(vectp, 0); + + return (stack_base); +} + +/* + * Check permissions of file to execute. + * Return 0 for success or error code on failure. + */ +static int +exec_check_permissions(imgp) + struct image_params *imgp; +{ + struct proc *p = imgp->proc; + struct vnode *vp = imgp->vp; + struct vattr *attr = imgp->attr; + int error; + + /* + * Check number of open-for-writes on the file and deny execution + * if there are any. + */ + if (vp->v_writecount) { + return (ETXTBSY); + } + + /* Get file attributes */ + error = VOP_GETATTR(vp, attr, p->p_ucred, p); + if (error) + return (error); + + /* + * 1) Check if file execution is disabled for the filesystem that this + * file resides on. + * 2) Insure that at least one execute bit is on - otherwise root + * will always succeed, and we don't want to happen unless the + * file really is executable. + * 3) Insure that the file is a regular file. + */ + if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || + ((attr->va_mode & 0111) == 0) || + (attr->va_type != VREG)) { + return (EACCES); + } + + /* + * Zero length files can't be exec'd + */ + if (attr->va_size == 0) + return (ENOEXEC); + + /* + * Disable setuid/setgid if the filesystem prohibits it or if + * the process is being traced. + */ + if ((vp->v_mount->mnt_flag & MNT_NOSUID) || (p->p_flag & P_TRACED)) + attr->va_mode &= ~(VSUID | VSGID); + + /* + * Check for execute permission to file based on current credentials. 
+ * Then call filesystem specific open routine (which does nothing + * in the general case). + */ + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + if (error) + return (error); + + error = VOP_OPEN(vp, FREAD, p->p_ucred, p); + if (error) + return (error); + + return (0); } diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 4ed48ac..2f8074c 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -35,13 +35,16 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_exit.c 8.10 (Berkeley) 2/23/95 + * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 + * $Id: kern_exit.c,v 1.45 1997/02/22 09:39:04 peter Exp $ */ +#include "opt_ktrace.h" + #include <sys/param.h> #include <sys/systm.h> -#include <sys/map.h> -#include <sys/ioctl.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> #include <sys/proc.h> #include <sys/tty.h> #include <sys/time.h> @@ -54,31 +57,48 @@ #include <sys/syslog.h> #include <sys/malloc.h> #include <sys/resourcevar.h> +#include <sys/signalvar.h> #include <sys/ptrace.h> +#include <sys/acct.h> /* for acct_process() function prototype */ +#include <sys/filedesc.h> +#include <sys/shm.h> +#include <sys/sem.h> -#include <machine/cpu.h> #ifdef COMPAT_43 #include <machine/reg.h> #include <machine/psl.h> #endif #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> #include <vm/vm_kern.h> -__dead void cpu_exit __P((struct proc *)); -__dead void exit1 __P((struct proc *, int)); +static int wait1 __P((struct proc *, struct wait_args *, int [], int)); + +/* + * callout list for things to do at exit time + */ +typedef struct exit_list_element { + struct exit_list_element *next; + exitlist_fn function; +} *ele_p; + +static ele_p exit_list; /* * exit -- * Death of process. 
*/ -struct rexit_args { - int rval; -}; -__dead void +void exit(p, uap, retval) struct proc *p; - struct rexit_args *uap; + struct rexit_args /* { + int rval; + } */ *uap; int *retval; { @@ -91,21 +111,33 @@ exit(p, uap, retval) * to zombie, and unlink proc from allproc and parent's lists. Save exit * status and rusage for wait(). Check for child processes and orphan them. */ -__dead void +void exit1(p, rv) register struct proc *p; int rv; { register struct proc *q, *nq; - register struct proc **pp; register struct vmspace *vm; + ele_p ep = exit_list; - if (p->p_pid == 1) - panic("init died (signal %d, exit %d)", + if (p->p_pid == 1) { + printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); + panic("Going nowhere without my init!"); + } #ifdef PGINPROF vmsizmon(); #endif + /* + * Check if any LKMs need anything done at process exit. + * e.g. SYSV IPC stuff + * XXX what if one of these generates an error? + */ + while (ep) { + (*ep->function)(p); + ep = ep->next; + } + if (p->p_flag & P_PROFIL) stopprofclock(p); MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), @@ -126,12 +158,21 @@ exit1(p, rv) */ fdfree(p); + /* + * Delete select() buffers + */ + if (p->p_selbits) + free (p->p_selbits, M_SELECT); + + /* + * XXX Shutdown SYSV semaphores + */ + semexit(p); + /* The next two chunks should probably be moved to vmspace_exit. */ vm = p->p_vmspace; -#ifdef SYSVSHM if (vm->vm_shm) shmexit(p); -#endif /* * Release user portion of address space. * This releases references to vnodes, @@ -140,9 +181,12 @@ exit1(p, rv) * Can't free the entire vmspace as the kernel stack * may be mapped within that space also. 
*/ - if (vm->vm_refcnt == 1) + if (vm->vm_refcnt == 1) { + pmap_remove_pages(&vm->vm_pmap, VM_MIN_ADDRESS, + VM_MAXUSER_ADDRESS); (void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); + } if (SESS_LEADER(p)) { register struct session *sp = p->p_session; @@ -154,7 +198,7 @@ exit1(p, rv) * drain controlling terminal * and revoke access to controlling terminal. */ - if (sp->s_ttyp->t_session == sp) { + if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) { if (sp->s_ttyp->t_pgrp) pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); (void) ttywait(sp->s_ttyp); @@ -177,10 +221,15 @@ exit1(p, rv) sp->s_leader = NULL; } fixjobc(p, p->p_pgrp, 0); + if (p->p_limit->p_refcnt > 1 && + (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { + p->p_limit->p_refcnt--; + p->p_limit = limcopy(p->p_limit); + } p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; (void)acct_process(p); #ifdef KTRACE - /* + /* * release trace file */ p->p_traceflag = 0; /* don't trace the vrele() */ @@ -244,8 +293,10 @@ exit1(p, rv) * Other substructures are freed from wait(). */ curproc = NULL; - if (--p->p_limit->p_refcnt == 0) + if (--p->p_limit->p_refcnt == 0) { FREE(p->p_limit, M_SUBPROC); + p->p_limit = NULL; + } /* * Finally, call machine-dependent code to release the remaining @@ -253,22 +304,12 @@ exit1(p, rv) * The address space is released by "vmspace_free(p->p_vmspace)"; * This is machine-dependent, as we may have to change stacks * or ensure that the current one isn't reallocated before we - * finish. cpu_exit will end with a call to cpu_swtch(), finishing + * finish. cpu_exit will end with a call to cpu_switch(), finishing * our execution (pun intended). 
*/ cpu_exit(p); } -struct wait_args { - int pid; - int *status; - int options; - struct rusage *rusage; -#ifdef COMPAT_43 - int compat; /* pseudo */ -#endif -}; - #ifdef COMPAT_43 #if defined(hp300) || defined(luna68k) #include <machine/frame.h> @@ -277,48 +318,55 @@ struct wait_args { #define GETPS(rp) (rp)[PS] #endif -compat_43_wait(p, uap, retval) +int +owait(p, uap, retval) struct proc *p; - register struct wait_args *uap; + register struct owait_args /* { + int dummy; + } */ *uap; int *retval; { + struct wait_args w; #ifdef PSL_ALLCC if ((GETPS(p->p_md.md_regs) & PSL_ALLCC) != PSL_ALLCC) { - uap->options = 0; - uap->rusage = NULL; + w.options = 0; + w.rusage = NULL; } else { - uap->options = p->p_md.md_regs[R0]; - uap->rusage = (struct rusage *)p->p_md.md_regs[R1]; + w.options = p->p_md.md_regs[R0]; + w.rusage = (struct rusage *)p->p_md.md_regs[R1]; } #else - uap->options = 0; - uap->rusage = NULL; + w.options = 0; + w.rusage = NULL; #endif - uap->pid = WAIT_ANY; - uap->status = NULL; - uap->compat = 1; - return (wait1(p, uap, retval)); + w.pid = WAIT_ANY; + w.status = NULL; + return (wait1(p, &w, retval, 1)); } +#endif /* COMPAT_43 */ +int wait4(p, uap, retval) struct proc *p; struct wait_args *uap; int *retval; { - uap->compat = 0; - return (wait1(p, uap, retval)); + return (wait1(p, uap, retval, 0)); } -#else -#define wait1 wait4 -#endif -int -wait1(q, uap, retval) +static int +wait1(q, uap, retval, compat) register struct proc *q; - register struct wait_args *uap; + register struct wait_args /* { + int pid; + int *status; + int options; + struct rusage *rusage; + } */ *uap; int retval[]; + int compat; { register int nfound; register struct proc *p, *t; @@ -338,16 +386,22 @@ loop: continue; nfound++; if (p->p_stat == SZOMB) { + /* charge childs scheduling cpu usage to parent */ + if (curproc->p_pid != 1) { + curproc->p_estcpu = min(curproc->p_estcpu + + p->p_estcpu, UCHAR_MAX); + } + retval[0] = p->p_pid; #ifdef COMPAT_43 - if (uap->compat) + if (compat) 
retval[1] = p->p_xstat; else #endif if (uap->status) { status = p->p_xstat; /* convert to int */ - if (error = copyout((caddr_t)&status, - (caddr_t)uap->status, sizeof(status))) + if ((error = copyout((caddr_t)&status, + (caddr_t)uap->status, sizeof(status)))) return (error); } if (uap->rusage && (error = copyout((caddr_t)p->p_ru, @@ -367,6 +421,7 @@ loop: p->p_xstat = 0; ruadd(&q->p_stats->p_cru, p->p_ru); FREE(p->p_ru, M_ZOMBIE); + p->p_ru = NULL; /* * Decrement the count of procs running with this uid. @@ -374,20 +429,21 @@ loop: (void)chgproccnt(p->p_cred->p_ruid, -1); /* + * Release reference to text vnode + */ + if (p->p_textvp) + vrele(p->p_textvp); + + /* * Free up credentials. */ if (--p->p_cred->p_refcnt == 0) { crfree(p->p_cred->pc_ucred); FREE(p->p_cred, M_SUBPROC); + p->p_cred = NULL; } /* - * Release reference to text vnode - */ - if (p->p_textvp) - vrele(p->p_textvp); - - /* * Finally finished with old proc entry. * Unlink it from its process group and free it. */ @@ -410,7 +466,7 @@ loop: p->p_flag |= P_WAITED; retval[0] = p->p_pid; #ifdef COMPAT_43 - if (uap->compat) { + if (compat) { retval[1] = W_STOPCODE(p->p_xstat); error = 0; } else @@ -430,7 +486,7 @@ loop: retval[0] = 0; return (0); } - if (error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0)) + if ((error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0))) return (error); goto loop; } @@ -451,3 +507,57 @@ proc_reparent(child, parent) LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); child->p_pptr = parent; } + +/* + * The next two functions are to handle adding/deleting items on the + * exit callout list + * + * at_exit(): + * Take the arguments given and put them onto the exit callout list, + * However first make sure that it's not already there. + * returns 0 on success. 
+ */ +int +at_exit(exitlist_fn function) +{ + ele_p ep; + + /* Be noisy if the programmer has lost track of things */ + if (rm_at_exit(function)) + printf("exit callout entry already present\n"); + ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->next = exit_list; + ep->function = function; + exit_list = ep; + return (0); +} +/* + * Scan the exit callout list for the given items and remove them. + * Returns the number of items removed. + * Logically this can only be 0 or 1. + */ +int +rm_at_exit(exitlist_fn function) +{ + ele_p *epp, ep; + int count; + + count = 0; + epp = &exit_list; + ep = *epp; + while (ep) { + if (ep->function == function) { + *epp = ep->next; + free(ep, M_TEMP); + count++; + } else { + epp = &ep->next; + } + ep = *epp; + } + return (count); +} + + diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 6c5f22f..8327b81 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -35,55 +35,104 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)kern_fork.c 8.8 (Berkeley) 2/14/95 + * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 + * $Id$ */ +#include "opt_ktrace.h" + #include <sys/param.h> #include <sys/systm.h> -#include <sys/map.h> +#include <sys/sysproto.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/proc.h> #include <sys/resourcevar.h> #include <sys/vnode.h> -#include <sys/file.h> #include <sys/acct.h> #include <sys/ktrace.h> +#include <sys/unistd.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/vm_inherit.h> + +static int fork1 __P((struct proc *p, int flags, int *retval)); + +/* + * These are the stuctures used to create a callout list for things to do + * when forking a process + */ +typedef struct fork_list_element { + struct fork_list_element *next; + forklist_fn function; +} *fle_p; + +static fle_p fork_list; + +#ifndef _SYS_SYSPROTO_H_ +struct fork_args { + int dummy; +}; +#endif /* ARGSUSED */ +int fork(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct fork_args *uap; + int retval[]; { - - return (fork1(p, 0, retval)); + return (fork1(p, (RFFDG|RFPROC), retval)); } /* ARGSUSED */ +int vfork(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct vfork_args *uap; + int retval[]; { + return (fork1(p, (RFFDG|RFPROC|RFPPWAIT), retval)); +} - return (fork1(p, 1, retval)); +/* ARGSUSED */ +int +rfork(p, uap, retval) + struct proc *p; + struct rfork_args *uap; + int retval[]; +{ + return (fork1(p, uap->flags, retval)); } + int nprocs = 1; /* process 0 */ +static int nextpid = 0; -fork1(p1, isvfork, retval) +static int +fork1(p1, flags, retval) register struct proc *p1; - int isvfork; - register_t *retval; + int flags; + int retval[]; { - register struct proc *p2; + register struct proc *p2, *pptr; register uid_t uid; struct proc *newproc; - struct proc **hash; int count; - static int nextpid, 
pidchecked = 0; + static int pidchecked = 0; + fle_p ep ; + + ep = fork_list; + if ((flags & RFPROC) == 0) + return (EINVAL); + if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) + return (EINVAL); /* * Although process entries are dynamically created, we still keep @@ -97,6 +146,11 @@ fork1(p1, isvfork, retval) tablefull("proc"); return (EAGAIN); } + /* + * Increment the nprocs resource before blocking can occur. There + * are hard-limits as to the number of processes that can run. + */ + nprocs++; /* * Increment the count of procs running with this uid. Don't allow @@ -105,6 +159,10 @@ fork1(p1, isvfork, retval) count = chgproccnt(uid, 1); if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) { (void)chgproccnt(uid, -1); + /* + * Back out the process count + */ + nprocs--; return (EAGAIN); } @@ -146,7 +204,7 @@ again: } if (p2->p_pid > nextpid && pidchecked > p2->p_pid) pidchecked = p2->p_pid; - if (p2->p_pgrp->pg_id > nextpid && + if (p2->p_pgrp->pg_id > nextpid && pidchecked > p2->p_pgrp->pg_id) pidchecked = p2->p_pgrp->pg_id; } @@ -157,12 +215,10 @@ again: } } - nprocs++; p2 = newproc; p2->p_stat = SIDL; /* protect against others */ p2->p_pid = nextpid; LIST_INSERT_HEAD(&allproc, p2, p_list); - p2->p_forw = p2->p_back = NULL; /* shouldn't be necessary */ LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); /* @@ -176,6 +232,11 @@ again: (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy)); /* + * XXX: this should be done as part of the startzero above + */ + p2->p_vmspace = 0; /* XXX */ + + /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. * The p_stats and p_sigacts substructs are set in vm_fork. 
@@ -194,7 +255,13 @@ again: if (p2->p_textvp) VREF(p2->p_textvp); - p2->p_fd = fdcopy(p1); + if (flags & RFCFDG) + p2->p_fd = fdinit(p1); + else if (flags & RFFDG) + p2->p_fd = fdcopy(p1); + else + p2->p_fd = fdshare(p1); + /* * If p_limit is still copy-on-write, bump refcnt, * otherwise get a copy that won't be modified. @@ -208,13 +275,29 @@ again: p2->p_limit->p_refcnt++; } + /* + * Preserve some flags in subprocess. + */ + p2->p_flag |= p1->p_flag & P_SUGID; if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; - if (isvfork) + if (flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; LIST_INSERT_AFTER(p1, p2, p_pglist); - p2->p_pptr = p1; - LIST_INSERT_HEAD(&p1->p_children, p2, p_sibling); + + /* + * Attach the new process to its parent. + * + * If RFNOWAIT is set, the newly created process becomes a child + * of init. This effectively disassociates the child from the + * parent. + */ + if (flags & RFNOWAIT) + pptr = initproc; + else + pptr = p1; + p2->p_pptr = pptr; + LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); LIST_INIT(&p2->p_children); #ifdef KTRACE @@ -230,10 +313,25 @@ again: #endif /* + * set priority of child to be that of parent + */ + p2->p_estcpu = p1->p_estcpu; + + /* * This begins the section where we must prevent the parent * from being swapped. */ p1->p_flag |= P_NOSWAP; + + /* + * share as much address space as possible + * XXX this should probably go in vm_fork() + */ + if (flags & RFMEM) + (void) vm_map_inherit(&p1->p_vmspace->vm_map, + VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS - MAXSSIZ, + VM_INHERIT_SHARE); + /* * Set return values for child before vm_fork, * so they can be copied to child stack. @@ -244,18 +342,28 @@ again: */ retval[0] = p1->p_pid; retval[1] = 1; - if (vm_fork(p1, p2, isvfork)) { + if (vm_fork(p1, p2)) { /* * Child process. Set start time and get to work. 
*/ - (void) splclock(); - p2->p_stats->p_start = time; + microtime(&runtime); (void) spl0(); + p2->p_stats->p_start = runtime; p2->p_acflag = AFORK; return (0); } /* + * Both processes are set up, now check if any LKMs want + * to adjust anything. + * What if they have an error? XXX + */ + while (ep) { + (*ep->function)(p1, p2, flags); + ep = ep->next; + } + + /* * Make child runnable and add to run queue. */ (void) splhigh(); @@ -273,9 +381,8 @@ again: * child to exec or exit, set P_PPWAIT on child, and sleep on our * proc (in case of exit). */ - if (isvfork) - while (p2->p_flag & P_PPWAIT) - tsleep(p1, PWAIT, "ppwait", 0); + while (p2->p_flag & P_PPWAIT) + tsleep(p1, PWAIT, "ppwait", 0); /* * Return child pid to parent process, @@ -285,3 +392,58 @@ again: retval[1] = 0; return (0); } + +/* + * The next two functionms are general routines to handle adding/deleting + * items on the fork callout list. + * + * at_fork(): + * Take the arguments given and put them onto the fork callout list, + * However first make sure that it's not already there. + * Returns 0 on success or a standard error number. + */ +int +at_fork(forklist_fn function) +{ + fle_p ep; + + /* let the programmer know if he's been stupid */ + if (rm_at_fork(function)) + printf("fork callout entry already present\n"); + ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->next = fork_list; + ep->function = function; + fork_list = ep; + return (0); +} + +/* + * Scan the exit callout list for the given items and remove them. + * Returns the number of items removed. + * Theoretically this value can only be 0 or 1. 
+ */ +int +rm_at_fork(forklist_fn function) +{ + fle_p *epp, ep; + int count; + + count= 0; + epp = &fork_list; + ep = *epp; + while (ep) { + if (ep->function == function) { + *epp = ep->next; + free(ep, M_TEMP); + count++; + } else { + epp = &ep->next; + } + ep = *epp; + } + return (count); +} + + diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c index b841754..f8e4e25 100644 --- a/sys/kern/kern_ktrace.c +++ b/sys/kern/kern_ktrace.c @@ -30,33 +30,40 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_ktrace.c 8.5 (Berkeley) 5/14/95 + * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93 + * $Id: kern_ktrace.c,v 1.17 1997/02/22 09:39:05 peter Exp $ */ -#ifdef KTRACE +#include "opt_ktrace.h" #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/proc.h> -#include <sys/file.h> +#include <sys/fcntl.h> #include <sys/namei.h> #include <sys/vnode.h> #include <sys/ktrace.h> #include <sys/malloc.h> #include <sys/syslog.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> +#ifdef KTRACE +static struct ktr_header *ktrgetheader __P((int type)); +static void ktrwrite __P((struct vnode *, struct ktr_header *)); +static int ktrcanset __P((struct proc *,struct proc *)); +static int ktrsetchildren __P((struct proc *,struct proc *,int,int,struct vnode *)); +static int ktrops __P((struct proc *,struct proc *,int,int,struct vnode *)); + -struct ktr_header * +static struct ktr_header * ktrgetheader(type) int type; { register struct ktr_header *kth; struct proc *p = curproc; /* XXX */ - MALLOC(kth, struct ktr_header *, sizeof (struct ktr_header), - M_TEMP, M_WAITOK); + MALLOC(kth, struct ktr_header *, sizeof (struct ktr_header), + M_KTRACE, M_WAITOK); kth->ktr_type = type; microtime(&kth->ktr_time); kth->ktr_pid = p->p_pid; @@ -65,31 +72,29 @@ ktrgetheader(type) } void -ktrsyscall(vp, code, argsize, args) +ktrsyscall(vp, code, narg, args) struct vnode *vp; - int code, argsize; - register_t args[]; 
+ int code, narg, args[]; { struct ktr_header *kth; struct ktr_syscall *ktp; - register len = sizeof(struct ktr_syscall) + argsize; + register len = sizeof(struct ktr_syscall) + (narg * sizeof(int)); struct proc *p = curproc; /* XXX */ - register_t *argp; - int i; + int *argp, i; p->p_traceflag |= KTRFAC_ACTIVE; kth = ktrgetheader(KTR_SYSCALL); - MALLOC(ktp, struct ktr_syscall *, len, M_TEMP, M_WAITOK); + MALLOC(ktp, struct ktr_syscall *, len, M_KTRACE, M_WAITOK); ktp->ktr_code = code; - ktp->ktr_argsize = argsize; - argp = (register_t *)((char *)ktp + sizeof(struct ktr_syscall)); - for (i = 0; i < (argsize / sizeof *argp); i++) + ktp->ktr_narg = narg; + argp = (int *)((char *)ktp + sizeof(struct ktr_syscall)); + for (i = 0; i < narg; i++) *argp++ = args[i]; kth->ktr_buf = (caddr_t)ktp; kth->ktr_len = len; ktrwrite(vp, kth); - FREE(ktp, M_TEMP); - FREE(kth, M_TEMP); + FREE(ktp, M_KTRACE); + FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } @@ -112,7 +117,7 @@ ktrsysret(vp, code, error, retval) kth->ktr_len = sizeof(struct ktr_sysret); ktrwrite(vp, kth); - FREE(kth, M_TEMP); + FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } @@ -130,7 +135,7 @@ ktrnamei(vp, path) kth->ktr_buf = path; ktrwrite(vp, kth); - FREE(kth, M_TEMP); + FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } @@ -147,13 +152,13 @@ ktrgenio(vp, fd, rw, iov, len, error) register caddr_t cp; register int resid = len, cnt; struct proc *p = curproc; /* XXX */ - + if (error) return; p->p_traceflag |= KTRFAC_ACTIVE; kth = ktrgetheader(KTR_GENIO); MALLOC(ktp, struct ktr_genio *, sizeof(struct ktr_genio) + len, - M_TEMP, M_WAITOK); + M_KTRACE, M_WAITOK); ktp->ktr_fd = fd; ktp->ktr_rw = rw; cp = (caddr_t)((char *)ktp + sizeof (struct ktr_genio)); @@ -171,8 +176,8 @@ ktrgenio(vp, fd, rw, iov, len, error) ktrwrite(vp, kth); done: - FREE(kth, M_TEMP); - FREE(ktp, M_TEMP); + FREE(kth, M_KTRACE); + FREE(ktp, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } @@ -197,7 +202,7 @@ ktrpsig(vp, sig, 
action, mask, code) kth->ktr_len = sizeof (struct ktr_psig); ktrwrite(vp, kth); - FREE(kth, M_TEMP); + FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } @@ -218,33 +223,38 @@ ktrcsw(vp, out, user) kth->ktr_len = sizeof (struct ktr_csw); ktrwrite(vp, kth); - FREE(kth, M_TEMP); + FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } +#endif /* Interface and common routines */ /* * ktrace system call */ +#ifndef _SYS_SYSPROTO_H_ +struct ktrace_args { + char *fname; + int ops; + int facs; + int pid; +}; +#endif /* ARGSUSED */ int ktrace(curp, uap, retval) struct proc *curp; - register struct ktrace_args /* { - syscallarg(char *) fname; - syscallarg(int) ops; - syscallarg(int) facs; - syscallarg(int) pid; - } */ *uap; - register_t *retval; + register struct ktrace_args *uap; + int *retval; { +#ifdef KTRACE register struct vnode *vp = NULL; register struct proc *p; struct pgrp *pg; - int facs = SCARG(uap, facs) & ~KTRFAC_ROOT; - int ops = KTROP(SCARG(uap, ops)); - int descend = SCARG(uap, ops) & KTRFLAG_DESCEND; + int facs = uap->facs & ~KTRFAC_ROOT; + int ops = KTROP(uap->ops); + int descend = uap->ops & KTRFLAG_DESCEND; int ret = 0; int error = 0; struct nameidata nd; @@ -254,14 +264,14 @@ ktrace(curp, uap, retval) /* * an operation which requires a file argument. 
*/ - NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, fname), - curp); - if (error = vn_open(&nd, FREAD|FWRITE, 0)) { + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->fname, curp); + error = vn_open(&nd, FREAD|FWRITE, 0); + if (error) { curp->p_traceflag &= ~KTRFAC_ACTIVE; return (error); } vp = nd.ni_vp; - VOP_UNLOCK(vp, 0, p); + VOP_UNLOCK(vp, 0, curp); if (vp->v_type != VREG) { (void) vn_close(vp, FREAD|FWRITE, curp->p_ucred, curp); curp->p_traceflag &= ~KTRFAC_ACTIVE; @@ -292,14 +302,14 @@ ktrace(curp, uap, retval) error = EINVAL; goto done; } - /* + /* * do it */ - if (SCARG(uap, pid) < 0) { + if (uap->pid < 0) { /* * by process group */ - pg = pgfind(-SCARG(uap, pid)); + pg = pgfind(-uap->pid); if (pg == NULL) { error = ESRCH; goto done; @@ -307,14 +317,14 @@ ktrace(curp, uap, retval) for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) if (descend) ret |= ktrsetchildren(curp, p, ops, facs, vp); - else + else ret |= ktrops(curp, p, ops, facs, vp); - + } else { /* * by pid */ - p = pfind(SCARG(uap, pid)); + p = pfind(uap->pid); if (p == NULL) { error = ESRCH; goto done; @@ -331,9 +341,48 @@ done: (void) vn_close(vp, FWRITE, curp->p_ucred, curp); curp->p_traceflag &= ~KTRFAC_ACTIVE; return (error); +#else + return ENOSYS; +#endif } +/* + * utrace system call + */ +/* ARGSUSED */ int +utrace(curp, uap, retval) + struct proc *curp; + register struct utrace_args *uap; + int *retval; +{ +#ifdef KTRACE + struct ktr_header *kth; + struct proc *p = curproc; /* XXX */ + register caddr_t cp; + + if (!KTRPOINT(p, KTR_USER)) + return (0); + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_USER); + MALLOC(cp, caddr_t, uap->len, M_KTRACE, M_WAITOK); + if (!copyin(uap->addr, cp, uap->len)) { + kth->ktr_buf = cp; + kth->ktr_len = uap->len; + ktrwrite(p->p_tracep, kth); + } + FREE(kth, M_KTRACE); + FREE(cp, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; + + return (0); +#else + return (ENOSYS); +#endif +} + +#ifdef KTRACE +static int ktrops(curp, 
p, ops, facs, vp) struct proc *p, *curp; int ops, facs; @@ -343,7 +392,7 @@ ktrops(curp, p, ops, facs, vp) if (!ktrcanset(curp, p)) return (0); if (ops == KTROP_SET) { - if (p->p_tracep != vp) { + if (p->p_tracep != vp) { /* * if trace file already in use, relinquish */ @@ -355,7 +404,7 @@ ktrops(curp, p, ops, facs, vp) p->p_traceflag |= facs; if (curp->p_ucred->cr_uid == 0) p->p_traceflag |= KTRFAC_ROOT; - } else { + } else { /* KTROP_CLEAR */ if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) { /* no more tracing */ @@ -370,6 +419,7 @@ ktrops(curp, p, ops, facs, vp) return (1); } +static int ktrsetchildren(curp, top, ops, facs, vp) struct proc *curp, *top; int ops, facs; @@ -401,6 +451,7 @@ ktrsetchildren(curp, top, ops, facs, vp) /*NOTREACHED*/ } +static void ktrwrite(vp, kth) struct vnode *vp; register struct ktr_header *kth; @@ -450,11 +501,12 @@ ktrwrite(vp, kth) * Return true if caller has permission to set the ktracing state * of target. Essentially, the target can't possess any * more permissions than the caller. KTRFAC_ROOT signifies that - * root previously set the tracing status on the target process, and + * root previously set the tracing status on the target process, and * so, only root may further change it. * * TODO: check groups. use caller effective gid. */ +static int ktrcanset(callp, targetp) struct proc *callp, *targetp; { @@ -472,4 +524,4 @@ ktrcanset(callp, targetp) return (0); } -#endif +#endif /* KTRACE */ diff --git a/sys/kern/kern_lkm.c b/sys/kern/kern_lkm.c new file mode 100644 index 0000000..f371c37 --- /dev/null +++ b/sys/kern/kern_lkm.c @@ -0,0 +1,957 @@ +/*- + * Copyright (c) 1992 Terrence R. Lambert. + * Copyright (c) 1994 Christopher G. Demetriou + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Terrence R. Lambert. + * 4. The name Terrence R. Lambert may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY TERRENCE R. LAMBERT ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id: kern_lkm.c,v 1.38 1997/03/23 03:36:20 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/tty.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/sysent.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/lkm.h> +#include <sys/vnode.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + + +#define PAGESIZE 1024 /* kmem_alloc() allocation quantum */ + +#define LKM_ALLOC 0x01 +#define LKM_WANT 0x02 + +#define LKMS_IDLE 0x00 +#define LKMS_RESERVED 0x01 +#define LKMS_LOADING 0x02 +#define LKMS_LOADED 0x04 +#define LKMS_UNLOADING 0x08 + +static int lkm_v = 0; +static int lkm_state = LKMS_IDLE; + +#ifndef MAXLKMS +#define MAXLKMS 20 +#endif + +static struct lkm_table lkmods[MAXLKMS]; /* table of loaded modules */ +static struct lkm_table *curp; /* global for in-progress ops */ + +static int _lkm_dev __P((struct lkm_table *lkmtp, int cmd)); +static int _lkm_exec __P((struct lkm_table *lkmtp, int cmd)); +static int _lkm_vfs __P((struct lkm_table *lkmtp, int cmd)); +static int _lkm_syscall __P((struct lkm_table *lkmtp, int cmd)); +static void lkmunreserve __P((void)); + +static d_open_t lkmcopen; +static d_close_t lkmcclose; +static d_ioctl_t lkmcioctl; + +#define CDEV_MAJOR 32 +static struct cdevsw lkmc_cdevsw = + { lkmcopen, lkmcclose, noread, nowrite, /*32*/ + lkmcioctl, nostop, nullreset, nodevtotty, + noselect, nommap, NULL, "lkm", NULL, -1 }; + + +/*ARGSUSED*/ +static int +lkmcopen(dev, flag, devtype, p) + dev_t dev; + int flag; + int devtype; + struct proc *p; +{ + int error; + + if (minor(dev) != 0) + return(ENXIO); /* bad minor # */ + + /* + * Use of the loadable kernel module device must be exclusive; we + * may try to remove this restriction 
later, but it's really no + * hardship. + */ + while (lkm_v & LKM_ALLOC) { + if (flag & FNONBLOCK) /* don't hang */ + return(EBUSY); + lkm_v |= LKM_WANT; + /* + * Sleep pending unlock; we use tsleep() to allow + * an alarm out of the open. + */ + error = tsleep((caddr_t)&lkm_v, TTIPRI|PCATCH, "lkmopn", 0); + if (error) + return(error); /* leave LKM_WANT set -- no problem */ + } + lkm_v |= LKM_ALLOC; + + return(0); /* pseudo-device open */ +} + +/* + * Unreserve the memory associated with the current loaded module; done on + * a coerced close of the lkm device (close on premature exit of modload) + * or explicitly by modload as a result of a link failure. + */ +static void +lkmunreserve() +{ + + if (lkm_state == LKMS_IDLE) + return; + + /* + * Actually unreserve the memory + */ + if (curp && curp->area) { + kmem_free(kernel_map, curp->area, curp->size);/**/ + curp->area = 0; + if (curp->private.lkm_any != NULL) + curp->private.lkm_any = NULL; + } + + lkm_state = LKMS_IDLE; +} + +static int +lkmcclose(dev, flag, mode, p) + dev_t dev; + int flag; + int mode; + struct proc *p; +{ + + if (!(lkm_v & LKM_ALLOC)) { +#ifdef DEBUG + printf("LKM: close before open!\n"); +#endif /* DEBUG */ + return(EBADF); + } + + /* do this before waking the herd... */ + if (curp && !curp->used) { + /* + * If we close before setting used, we have aborted + * by way of error or by way of close-on-exit from + * a premature exit of "modload". 
+ */ + lkmunreserve(); /* coerce state to LKM_IDLE */ + } + + lkm_v &= ~LKM_ALLOC; + wakeup((caddr_t)&lkm_v); /* thundering herd "problem" here */ + + return(0); /* pseudo-device closed */ +} + +/*ARGSUSED*/ +static int +lkmcioctl(dev, cmd, data, flag, p) + dev_t dev; + int cmd; + caddr_t data; + int flag; + struct proc *p; +{ + int err = 0; + int i; + struct lmc_resrv *resrvp; + struct lmc_loadbuf *loadbufp; + struct lmc_unload *unloadp; + struct lmc_stat *statp; + char istr[MAXLKMNAME]; + + switch(cmd) { + case LMRESERV: /* reserve pages for a module */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + resrvp = (struct lmc_resrv *)data; + + /* + * Find a free slot. + */ + for (i = 0; i < MAXLKMS; i++) + if (!lkmods[i].used) + break; + if (i == MAXLKMS) { + err = ENOMEM; /* no slots available */ + break; + } + curp = &lkmods[i]; + curp->id = i; /* self reference slot offset */ + + resrvp->slot = i; /* return slot */ + + /* + * Get memory for module + */ + curp->size = resrvp->size; + + curp->area = kmem_alloc(kernel_map, curp->size);/**/ + + curp->offset = 0; /* load offset */ + + resrvp->addr = curp->area; /* ret kernel addr */ + +#ifdef DEBUG + printf("LKM: LMRESERV (actual = 0x%08x)\n", curp->area); + printf("LKM: LMRESERV (adjusted = 0x%08x)\n", + trunc_page(curp->area)); +#endif /* DEBUG */ + lkm_state = LKMS_RESERVED; + break; + + case LMLOADBUF: /* Copy in; stateful, follows LMRESERV */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + loadbufp = (struct lmc_loadbuf *)data; + i = loadbufp->cnt; + if ((lkm_state != LKMS_RESERVED && lkm_state != LKMS_LOADING) + || i < 0 + || i > MODIOBUF + || i > curp->size - curp->offset) { + err = ENOMEM; + break; + } + + /* copy in buffer full of data */ + err = copyin((caddr_t)loadbufp->data, + (caddr_t)curp->area + curp->offset, i); + if (err) + break; + + if ((curp->offset + i) < curp->size) 
{ + lkm_state = LKMS_LOADING; +#ifdef DEBUG + printf("LKM: LMLOADBUF (loading @ %d of %d, i = %d)\n", + curp->offset, curp->size, i); +#endif /* DEBUG */ + } else { + lkm_state = LKMS_LOADED; +#ifdef DEBUG + printf("LKM: LMLOADBUF (loaded)\n"); +#endif /* DEBUG */ + } + curp->offset += i; + break; + + case LMUNRESRV: /* discard reserved pages for a module */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + lkmunreserve(); /* coerce state to LKM_IDLE */ +#ifdef DEBUG + printf("LKM: LMUNRESERV\n"); +#endif /* DEBUG */ + break; + + case LMREADY: /* module loaded: call entry */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing or insecure */ + return EPERM; + + switch (lkm_state) { + case LKMS_LOADED: + break; + case LKMS_LOADING: + /* The remainder must be bss, so we clear it */ + bzero((caddr_t)curp->area + curp->offset, + curp->size - curp->offset); + break; + default: + +#ifdef DEBUG + printf("lkm_state is %02x\n", lkm_state); +#endif /* DEBUG */ + return ENXIO; + } + + /* XXX gack */ + curp->entry = (int (*) __P((struct lkm_table *, int, int))) + (*((int *)data)); + + /* call entry(load)... (assigns "private" portion) */ + err = (*(curp->entry))(curp, LKM_E_LOAD, LKM_VERSION); + if (err) { + /* + * Module may refuse loading or may have a + * version mismatch... + */ + lkm_state = LKMS_UNLOADING; /* for lkmunreserve */ + lkmunreserve(); /* free memory */ + curp->used = 0; /* free slot */ + break; + } + /* + * It's possible for a user to load a module that doesn't + * initialize itself correctly. (You can even get away with + * using it for a while.) 
Unfortunately, we are faced with + * the following problems: + * - we can't tell a good module from a bad one until + * after we've run its entry function (if the private + * section is uninitalized after we return from the + * entry, then something's fishy) + * - now that we've called the entry function, we can't + * forcibly unload the module without risking a crash + * - since we don't know what the module's entry function + * did, we can't easily clean up the mess it may have + * made, so we can't know just how unstable the system + * may be + * So, being stuck between a rock and a hard place, we + * have no choice but to do this... + */ + if (curp->private.lkm_any == NULL) + panic("loadable module initialization failed"); + + curp->used = 1; +#ifdef DEBUG + printf("LKM: LMREADY\n"); +#endif /* DEBUG */ + lkm_state = LKMS_IDLE; + break; + + case LMUNLOAD: /* unload a module */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + unloadp = (struct lmc_unload *)data; + + if ((i = unloadp->id) == -1) { /* unload by name */ + /* + * Copy name and lookup id from all loaded + * modules. May fail. + */ + err =copyinstr(unloadp->name, istr, MAXLKMNAME-1, NULL); + if (err) + break; + + /* + * look up id... 
+ */ + for (i = 0; i < MAXLKMS; i++) { + if (!lkmods[i].used) + continue; + if (!strcmp(istr, + lkmods[i].private.lkm_any->lkm_name)) + break; + } + } + + /* + * Range check the value; on failure, return EINVAL + */ + if (i < 0 || i >= MAXLKMS) { + err = EINVAL; + break; + } + + curp = &lkmods[i]; + + if (!curp->used) { + err = ENOENT; + break; + } + + /* call entry(unload) */ + if ((*(curp->entry))(curp, LKM_E_UNLOAD, LKM_VERSION)) { + err = EBUSY; + break; + } + + lkm_state = LKMS_UNLOADING; /* non-idle for lkmunreserve */ + lkmunreserve(); /* free memory */ + curp->used = 0; /* free slot */ + break; + + case LMSTAT: /* stat a module by id/name */ + /* allow readers and writers to stat */ + + statp = (struct lmc_stat *)data; + + if ((i = statp->id) == -1) { /* stat by name */ + /* + * Copy name and lookup id from all loaded + * modules. + */ + copystr(statp->name, istr, MAXLKMNAME-1, NULL); + /* + * look up id... + */ + for (i = 0; i < MAXLKMS; i++) { + if (!lkmods[i].used) + continue; + if (!strcmp(istr, + lkmods[i].private.lkm_any->lkm_name)) + break; + } + + if (i == MAXLKMS) { /* Not found */ + err = ENOENT; + break; + } + } + + /* + * Range check the value; on failure, return EINVAL + */ + if (i < 0 || i >= MAXLKMS) { + err = EINVAL; + break; + } + + curp = &lkmods[i]; + + if (!curp->used) { /* Not found */ + err = ENOENT; + break; + } + + /* + * Copy out stat information for this module... + */ + statp->id = curp->id; + statp->offset = curp->private.lkm_any->lkm_offset; + statp->type = curp->private.lkm_any->lkm_type; + statp->area = curp->area; + statp->size = curp->size / PAGESIZE; + statp->private = (unsigned long)curp->private.lkm_any; + statp->ver = curp->private.lkm_any->lkm_ver; + copystr(curp->private.lkm_any->lkm_name, + statp->name, + MAXLKMNAME - 2, + NULL); + + break; + + default: /* bad ioctl()... 
*/ + err = ENOTTY; + break; + } + + return (err); +} + +/* + * Acts like "nosys" but can be identified in sysent for dynamic call + * number assignment for a limited number of calls. + * + * Place holder for system call slots reserved for loadable modules. + */ +int +lkmnosys(p, args, retval) + struct proc *p; + struct nosys_args *args; + int *retval; +{ + + return(nosys(p, args, retval)); +} + +int +lkmexists(lkmtp) + struct lkm_table *lkmtp; +{ + int i; + + /* + * see if name exists... + */ + for (i = 0; i < MAXLKMS; i++) { + /* + * An unused module and the one we are testing are not + * considered. + */ + if (!lkmods[i].used || &lkmods[i] == lkmtp) + continue; + if (!strcmp(lkmtp->private.lkm_any->lkm_name, + lkmods[i].private.lkm_any->lkm_name)) + return(1); /* already loaded... */ + } + + return(0); /* module not loaded... */ +} + +/* + * For the loadable system call described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_syscall(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_syscall *args = lkmtp->private.lkm_syscall; + int i; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + if ((i = args->lkm_offset) == -1) { /* auto */ + /* + * Search the table looking for a slot... + */ + for (i = 0; i < aout_sysvec.sv_size; i++) + if (aout_sysvec.sv_table[i].sy_call == + (sy_call_t *)lkmnosys) + break; /* found it! */ + /* out of allocable slots? */ + if (i == aout_sysvec.sv_size) { + err = ENFILE; + break; + } + } else { /* assign */ + if (i < 0 || i >= aout_sysvec.sv_size) { + err = EINVAL; + break; + } + } + + /* save old */ + bcopy(&aout_sysvec.sv_table[i], + &(args->lkm_oldent), + sizeof(struct sysent)); + + /* replace with new */ + bcopy(args->lkm_sysent, + &aout_sysvec.sv_table[i], + sizeof(struct sysent)); + + /* done! 
*/ + args->lkm_offset = i; /* slot in sysent[] */ + + break; + + case LKM_E_UNLOAD: + /* current slot... */ + i = args->lkm_offset; + + /* replace current slot contents with old contents */ + bcopy(&(args->lkm_oldent), + &aout_sysvec.sv_table[i], + sizeof(struct sysent)); + + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + + return(err); +} + +/* + * For the loadable virtual file system described by the structure pointed + * to by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_vfs(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_vfs *args = lkmtp->private.lkm_vfs; + struct vfsconf *vfc = args->lkm_vfsconf; + struct vfsconf *vfsp, *prev_vfsp; + int i, maxtypenum; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + + for (vfsp = vfsconf; vfsp->vfc_next; vfsp = vfsp->vfc_next) { + if (!strcmp(vfc->vfc_name, vfsp->vfc_name)) { + return EEXIST; + } + } + + i = args->lkm_offset = vfc->vfc_typenum; + if (i < 0) { + i = maxvfsconf; + } + args->lkm_offset = vfc->vfc_typenum = i; + + if (maxvfsconf <= i) + maxvfsconf = i + 1; + + vfsp->vfc_next = vfc; + vfc->vfc_next = NULL; + + /* like in vfs_op_init */ + for(i = 0; args->lkm_vnodeops->ls_items[i]; i++) { + const struct vnodeopv_desc *opv = + args->lkm_vnodeops->ls_items[i]; + *(opv->opv_desc_vector_p) = NULL; + } + vfs_opv_init((struct vnodeopv_desc **)args->lkm_vnodeops->ls_items); + + /* + * Call init function for this VFS... + */ + (*(vfc->vfc_vfsops->vfs_init))(vfc); + + /* done! */ + break; + + case LKM_E_UNLOAD: + /* current slot... 
*/ + i = args->lkm_offset; + + prev_vfsp = NULL; + for (vfsp = vfsconf; vfsp; + prev_vfsp = vfsp, vfsp = vfsp->vfc_next) { + if (!strcmp(vfc->vfc_name, vfsp->vfc_name)) + break; + } + if (vfsp == NULL) { + return EINVAL; + } + + if (vfsp->vfc_refcount) { + return EBUSY; + } + + FREE(vfsp, M_VFSCONF); + + prev_vfsp->vfc_next = vfsp->vfc_next; + + /* + * Maintain maxvfsconf. + */ + maxtypenum = 0; + for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next) + if (maxtypenum < vfsp->vfc_typenum) + maxtypenum = vfsp->vfc_typenum; + maxvfsconf = maxtypenum + 1; + + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + return(err); +} + +/* + * For the loadable device driver described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_dev(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_dev *args = lkmtp->private.lkm_dev; + int i; + dev_t descrip; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + switch(args->lkm_devtype) { + case LM_DT_BLOCK: + if ((i = args->lkm_offset) == -1) + descrip = (dev_t) -1; + else + descrip = makedev(args->lkm_offset,0); + if ( err = bdevsw_add(&descrip, args->lkm_dev.bdev, + &(args->lkm_olddev.bdev))) { + break; + } + args->lkm_offset = major(descrip) ; + break; + + case LM_DT_CHAR: + if ((i = args->lkm_offset) == -1) + descrip = (dev_t) -1; + else + descrip = makedev(args->lkm_offset,0); + if ( err = cdevsw_add(&descrip, args->lkm_dev.cdev, + &(args->lkm_olddev.cdev))) { + break; + } + args->lkm_offset = major(descrip) ; + break; + + default: + err = ENODEV; + break; + } + break; + + case LKM_E_UNLOAD: + /* current slot... 
*/ + i = args->lkm_offset; + descrip = makedev(i,0); + + switch(args->lkm_devtype) { + case LM_DT_BLOCK: + /* replace current slot contents with old contents */ + bdevsw_add(&descrip, args->lkm_olddev.bdev,NULL); + break; + + case LM_DT_CHAR: + /* replace current slot contents with old contents */ + cdevsw_add(&descrip, args->lkm_olddev.cdev,NULL); + break; + + default: + err = ENODEV; + break; + } + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + + return(err); +} + +#ifdef STREAMS +/* + * For the loadable streams module described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_strmod(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_strmod *args = lkmtp->private.lkm_strmod; + int i; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + break; + + case LKM_E_UNLOAD: + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + + return(err); +} +#endif /* STREAMS */ + +/* + * For the loadable execution class described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_exec(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_exec *args = lkmtp->private.lkm_exec; + int i; + int err = 0; + const struct execsw **execsw = + (const struct execsw **)&execsw_set.ls_items[0]; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + if ((i = args->lkm_offset) == -1) { /* auto */ + /* + * Search the table looking for a slot... + */ + for (i = 0; execsw[i] != NULL; i++) + if (execsw[i]->ex_imgact == NULL) + break; /* found it! */ + /* out of allocable slots? 
*/ + if (execsw[i] == NULL) { + err = ENFILE; + break; + } + } else { /* assign */ + err = EINVAL; + break; + } + + /* save old */ + bcopy(&execsw[i], &(args->lkm_oldexec), sizeof(struct execsw*)); + + /* replace with new */ + bcopy(&(args->lkm_exec), &execsw[i], sizeof(struct execsw*)); + + /* done! */ + args->lkm_offset = i; /* slot in execsw[] */ + + break; + + case LKM_E_UNLOAD: + /* current slot... */ + i = args->lkm_offset; + + /* replace current slot contents with old contents */ + bcopy(&(args->lkm_oldexec), &execsw[i], sizeof(struct execsw*)); + + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + return(err); +} + +/* XXX: This is bogus. we should find a better method RSN! */ +static const struct execsw lkm_exec_dummy1 = { NULL, "lkm" }; +static const struct execsw lkm_exec_dummy2 = { NULL, "lkm" }; +static const struct execsw lkm_exec_dummy3 = { NULL, "lkm" }; +static const struct execsw lkm_exec_dummy4 = { NULL, "lkm" }; +TEXT_SET(execsw_set, lkm_exec_dummy1); +TEXT_SET(execsw_set, lkm_exec_dummy2); +TEXT_SET(execsw_set, lkm_exec_dummy3); +TEXT_SET(execsw_set, lkm_exec_dummy4); + +/* + * This code handles the per-module type "wiring-in" of loadable modules + * into existing kernel tables. For "LM_MISC" modules, wiring and unwiring + * is assumed to be done in their entry routines internal to the module + * itself. 
+ */ +int +lkmdispatch(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + int err = 0; /* default = success */ + + switch(lkmtp->private.lkm_any->lkm_type) { + case LM_SYSCALL: + err = _lkm_syscall(lkmtp, cmd); + break; + + case LM_VFS: + err = _lkm_vfs(lkmtp, cmd); + break; + + case LM_DEV: + err = _lkm_dev(lkmtp, cmd); + break; + +#ifdef STREAMS + case LM_STRMOD: + { + struct lkm_strmod *args = lkmtp->private.lkm_strmod; + } + break; + +#endif /* STREAMS */ + + case LM_EXEC: + err = _lkm_exec(lkmtp, cmd); + break; + + case LM_MISC: /* ignore content -- no "misc-specific" procedure */ + if (lkmexists(lkmtp)) + err = EEXIST; + break; + + default: + err = ENXIO; /* unknown type */ + break; + } + + return(err); +} + +int +lkm_nullcmd(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + + return (0); +} + +static lkm_devsw_installed = 0; +#ifdef DEVFS +static void *lkmc_devfs_token; +#endif + +static void lkm_drvinit(void *unused) +{ + dev_t dev; + + if( ! lkm_devsw_installed ) { + dev = makedev(CDEV_MAJOR, 0); + cdevsw_add(&dev,&lkmc_cdevsw, NULL); + lkm_devsw_installed = 1; +#ifdef DEVFS + lkmc_devfs_token = devfs_add_devswf(&lkmc_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0644, + "lkm"); +#endif + } +} + +SYSINIT(lkmdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,lkm_drvinit,NULL) + + diff --git a/sys/kern/kern_lockf.c b/sys/kern/kern_lockf.c new file mode 100644 index 0000000..fb1a8a0 --- /dev/null +++ b/sys/kern/kern_lockf.c @@ -0,0 +1,796 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Scooter Morris at Genentech Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94 + * $Id$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/fcntl.h> + +#include <sys/lockf.h> + +/* + * This variable controls the maximum number of processes that will + * be checked in doing deadlock detection. 
+ */ +static int maxlockdepth = MAXDEPTH; + +#ifdef LOCKF_DEBUG +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> + +int lockf_debug = 0; +SYSCTL_INT(_debug, 4, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, ""); +#endif + +#define NOLOCKF (struct lockf *)0 +#define SELF 0x1 +#define OTHERS 0x2 +static int lf_clearlock __P((struct lockf *)); +static int lf_findoverlap __P((struct lockf *, + struct lockf *, int, struct lockf ***, struct lockf **)); +static struct lockf * + lf_getblock __P((struct lockf *)); +static int lf_getlock __P((struct lockf *, struct flock *)); +static int lf_setlock __P((struct lockf *)); +static void lf_split __P((struct lockf *, struct lockf *)); +static void lf_wakelock __P((struct lockf *)); + +/* + * Advisory record locking support + */ +int +lf_advlock(ap, head, size) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; + struct lockf **head; + u_quad_t size; +{ + register struct flock *fl = ap->a_fl; + register struct lockf *lock; + off_t start, end; + int error; + + /* + * Convert the flock structure into a start and end. + */ + switch (fl->l_whence) { + + case SEEK_SET: + case SEEK_CUR: + /* + * Caller is responsible for adding any necessary offset + * when SEEK_CUR is used. + */ + start = fl->l_start; + break; + + case SEEK_END: + start = size + fl->l_start; + break; + + default: + return (EINVAL); + } + if (start < 0) + return (EINVAL); + if (fl->l_len == 0) + end = -1; + else { + end = start + fl->l_len - 1; + if (end < start) + return (EINVAL); + } + /* + * Avoid the common case of unlocking when inode has no locks. 
+ */ + if (*head == (struct lockf *)0) { + if (ap->a_op != F_SETLK) { + fl->l_type = F_UNLCK; + return (0); + } + } + /* + * Create the lockf structure + */ + MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + lock->lf_start = start; + lock->lf_end = end; + lock->lf_id = ap->a_id; +/* lock->lf_inode = ip; */ /* XXX JH */ + lock->lf_type = fl->l_type; + lock->lf_head = head; + lock->lf_next = (struct lockf *)0; + TAILQ_INIT(&lock->lf_blkhd); + lock->lf_flags = ap->a_flags; + /* + * Do the requested operation. + */ + switch(ap->a_op) { + case F_SETLK: + return (lf_setlock(lock)); + + case F_UNLCK: + error = lf_clearlock(lock); + FREE(lock, M_LOCKF); + return (error); + + case F_GETLK: + error = lf_getlock(lock, fl); + FREE(lock, M_LOCKF); + return (error); + + default: + free(lock, M_LOCKF); + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Set a byte-range lock. + */ +static int +lf_setlock(lock) + register struct lockf *lock; +{ + register struct lockf *block; + struct lockf **head = lock->lf_head; + struct lockf **prev, *overlap, *ltmp; + static char lockstr[] = "lockf"; + int ovcase, priority, needtolink, error; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_setlock", lock); +#endif /* LOCKF_DEBUG */ + + /* + * Set the priority + */ + priority = PLOCK; + if (lock->lf_type == F_WRLCK) + priority += 4; + priority |= PCATCH; + /* + * Scan lock list for this file looking for locks that would block us. + */ + while ((block = lf_getblock(lock))) { + /* + * Free the structure and return if nonblocking. + */ + if ((lock->lf_flags & F_WAIT) == 0) { + FREE(lock, M_LOCKF); + return (EAGAIN); + } + /* + * We are blocked. Since flock style locks cover + * the whole file, there is no chance for deadlock. + * For byte-range locks we must check for deadlock. + * + * Deadlock detection is done by looking through the + * wait channels to see if there are any cycles that + * involve us. 
MAXDEPTH is set just to make sure we + * do not go off into neverland. + */ + if ((lock->lf_flags & F_POSIX) && + (block->lf_flags & F_POSIX)) { + register struct proc *wproc; + register struct lockf *waitblock; + int i = 0; + + /* The block is waiting on something */ + wproc = (struct proc *)block->lf_id; + while (wproc->p_wchan && + (wproc->p_wmesg == lockstr) && + (i++ < maxlockdepth)) { + waitblock = (struct lockf *)wproc->p_wchan; + /* Get the owner of the blocking lock */ + waitblock = waitblock->lf_next; + if ((waitblock->lf_flags & F_POSIX) == 0) + break; + wproc = (struct proc *)waitblock->lf_id; + if (wproc == (struct proc *)lock->lf_id) { + free(lock, M_LOCKF); + return (EDEADLK); + } + } + } + /* + * For flock type locks, we must first remove + * any shared locks that we hold before we sleep + * waiting for an exclusive lock. + */ + if ((lock->lf_flags & F_FLOCK) && + lock->lf_type == F_WRLCK) { + lock->lf_type = F_UNLCK; + (void) lf_clearlock(lock); + lock->lf_type = F_WRLCK; + } + /* + * Add our lock to the blocked list and sleep until we're free. + * Remember who blocked us (for deadlock detection). + */ + lock->lf_next = block; + TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: blocking on", block); + lf_printlist("lf_setlock", block); + } +#endif /* LOCKF_DEBUG */ + if ((error = tsleep((caddr_t)lock, priority, lockstr, 0))) { + /* + * We may have been awakened by a signal (in + * which case we must remove ourselves from the + * blocked list) and/or by another process + * releasing a lock (in which case we have already + * been removed from the blocked list and our + * lf_next field set to NOLOCKF). + */ + if (lock->lf_next) + TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, + lf_block); + free(lock, M_LOCKF); + return (error); + } + } + /* + * No blocks!! Add the lock. Note that we will + * downgrade or upgrade any overlapping locks this + * process already owns. 
+ * + * Skip over locks owned by other processes. + * Handle any locks that overlap and are owned by ourselves. + */ + prev = head; + block = *head; + needtolink = 1; + for (;;) { + ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap); + if (ovcase) + block = overlap->lf_next; + /* + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + switch (ovcase) { + case 0: /* no overlap */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + break; + + case 1: /* overlap == lock */ + /* + * If downgrading lock, others may be + * able to acquire it. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) + lf_wakelock(overlap); + overlap->lf_type = lock->lf_type; + FREE(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + + case 2: /* overlap contains lock */ + /* + * Check for common starting point and different types. + */ + if (overlap->lf_type == lock->lf_type) { + free(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + } + if (overlap->lf_start == lock->lf_start) { + *prev = lock; + lock->lf_next = overlap; + overlap->lf_start = lock->lf_end + 1; + } else + lf_split(overlap, lock); + lf_wakelock(overlap); + break; + + case 3: /* lock contains overlap */ + /* + * If downgrading lock, others may be able to + * acquire it, otherwise take the list. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) { + lf_wakelock(overlap); + } else { + while (ltmp = overlap->lf_blkhd.tqh_first) { + TAILQ_REMOVE(&overlap->lf_blkhd, ltmp, + lf_block); + TAILQ_INSERT_TAIL(&lock->lf_blkhd, + ltmp, lf_block); + } + } + /* + * Add the new lock if necessary and delete the overlap. 
+ */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap->lf_next; + prev = &lock->lf_next; + needtolink = 0; + } else + *prev = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + /* + * Add lock after overlap on the list. + */ + lock->lf_next = overlap->lf_next; + overlap->lf_next = lock; + overlap->lf_end = lock->lf_start - 1; + prev = &lock->lf_next; + lf_wakelock(overlap); + needtolink = 0; + continue; + + case 5: /* overlap ends after lock */ + /* + * Add the new lock before overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + overlap->lf_start = lock->lf_end + 1; + lf_wakelock(overlap); + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: got the lock", lock); + lf_printlist("lf_setlock", lock); + } +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Remove a byte-range lock on an inode. + * + * Generally, find the lock (or an overlap to that lock) + * and remove it (or shrink it), then wakeup anyone we can. + */ +static int +lf_clearlock(unlock) + register struct lockf *unlock; +{ + struct lockf **head = unlock->lf_head; + register struct lockf *lf = *head; + struct lockf *overlap, **prev; + int ovcase; + + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (unlock->lf_type != F_UNLCK) + panic("lf_clearlock: bad type"); + if (lockf_debug & 1) + lf_print("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + prev = head; + while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap))) { + /* + * Wakeup the list of locks to be retried. 
+ */ + lf_wakelock(overlap); + + switch (ovcase) { + + case 1: /* overlap == lock */ + *prev = overlap->lf_next; + FREE(overlap, M_LOCKF); + break; + + case 2: /* overlap contains lock: split it */ + if (overlap->lf_start == unlock->lf_start) { + overlap->lf_start = unlock->lf_end + 1; + break; + } + lf_split(overlap, unlock); + overlap->lf_next = unlock->lf_next; + break; + + case 3: /* lock contains overlap */ + *prev = overlap->lf_next; + lf = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + overlap->lf_end = unlock->lf_start - 1; + prev = &overlap->lf_next; + lf = overlap->lf_next; + continue; + + case 5: /* overlap ends after lock */ + overlap->lf_start = unlock->lf_end + 1; + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_printlist("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Check whether there is a blocking lock, + * and if so return its process identifier. + */ +static int +lf_getlock(lock, fl) + register struct lockf *lock; + register struct flock *fl; +{ + register struct lockf *block; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_getlock", lock); +#endif /* LOCKF_DEBUG */ + + if ((block = lf_getblock(lock))) { + fl->l_type = block->lf_type; + fl->l_whence = SEEK_SET; + fl->l_start = block->lf_start; + if (block->lf_end == -1) + fl->l_len = 0; + else + fl->l_len = block->lf_end - block->lf_start + 1; + if (block->lf_flags & F_POSIX) + fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; + else + fl->l_pid = -1; + } else { + fl->l_type = F_UNLCK; + } + return (0); +} + +/* + * Walk the list of locks for an inode and + * return the first blocking lock. 
+ */ +static struct lockf * +lf_getblock(lock) + register struct lockf *lock; +{ + struct lockf **prev, *overlap, *lf = *(lock->lf_head); + int ovcase; + + prev = lock->lf_head; + while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap))) { + /* + * We've found an overlap, see if it blocks us + */ + if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) + return (overlap); + /* + * Nope, point to the next one on the list and + * see if it blocks us + */ + lf = overlap->lf_next; + } + return (NOLOCKF); +} + +/* + * Walk the list of locks for an inode to + * find an overlapping lock (if any). + * + * NOTE: this returns only the FIRST overlapping lock. There + * may be more than one. + */ +static int +lf_findoverlap(lf, lock, type, prev, overlap) + register struct lockf *lf; + struct lockf *lock; + int type; + struct lockf ***prev; + struct lockf **overlap; +{ + off_t start, end; + + *overlap = lf; + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_findoverlap: looking for overlap in", lock); +#endif /* LOCKF_DEBUG */ + start = lock->lf_start; + end = lock->lf_end; + while (lf != NOLOCKF) { + if (((type & SELF) && lf->lf_id != lock->lf_id) || + ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("\tchecking", lf); +#endif /* LOCKF_DEBUG */ + /* + * OK, check for overlap + * + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + if ((lf->lf_end != -1 && start > lf->lf_end) || + (end != -1 && lf->lf_start > end)) { + /* Case 0 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("no overlap\n"); +#endif /* LOCKF_DEBUG */ + if ((type & SELF) && end != -1 && lf->lf_start > end) + return (0); + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } + 
if ((lf->lf_start == start) && (lf->lf_end == end)) { + /* Case 1 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap == lock\n"); +#endif /* LOCKF_DEBUG */ + return (1); + } + if ((lf->lf_start <= start) && + (end != -1) && + ((lf->lf_end >= end) || (lf->lf_end == -1))) { + /* Case 2 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap contains lock\n"); +#endif /* LOCKF_DEBUG */ + return (2); + } + if (start <= lf->lf_start && + (end == -1 || + (lf->lf_end != -1 && end >= lf->lf_end))) { + /* Case 3 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("lock contains overlap\n"); +#endif /* LOCKF_DEBUG */ + return (3); + } + if ((lf->lf_start < start) && + ((lf->lf_end >= start) || (lf->lf_end == -1))) { + /* Case 4 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap starts before lock\n"); +#endif /* LOCKF_DEBUG */ + return (4); + } + if ((lf->lf_start > start) && + (end != -1) && + ((lf->lf_end > end) || (lf->lf_end == -1))) { + /* Case 5 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap ends after lock\n"); +#endif /* LOCKF_DEBUG */ + return (5); + } + panic("lf_findoverlap: default"); + } + return (0); +} + +/* + * Split a lock and a contained region into + * two or three locks as necessary. + */ +static void +lf_split(lock1, lock2) + register struct lockf *lock1; + register struct lockf *lock2; +{ + register struct lockf *splitlock; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) { + lf_print("lf_split", lock1); + lf_print("splitting from", lock2); + } +#endif /* LOCKF_DEBUG */ + /* + * Check to see if spliting into only two pieces. 
+ */ + if (lock1->lf_start == lock2->lf_start) { + lock1->lf_start = lock2->lf_end + 1; + lock2->lf_next = lock1; + return; + } + if (lock1->lf_end == lock2->lf_end) { + lock1->lf_end = lock2->lf_start - 1; + lock2->lf_next = lock1->lf_next; + lock1->lf_next = lock2; + return; + } + /* + * Make a new lock consisting of the last part of + * the encompassing lock + */ + MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK); + bcopy((caddr_t)lock1, (caddr_t)splitlock, sizeof *splitlock); + splitlock->lf_start = lock2->lf_end + 1; + TAILQ_INIT(&splitlock->lf_blkhd); + lock1->lf_end = lock2->lf_start - 1; + /* + * OK, now link it in + */ + splitlock->lf_next = lock1->lf_next; + lock2->lf_next = splitlock; + lock1->lf_next = lock2; +} + +/* + * Wakeup a blocklist + */ +static void +lf_wakelock(listhead) + struct lockf *listhead; +{ + register struct lockf *wakelock; + + while (wakelock = listhead->lf_blkhd.tqh_first) { + TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); + wakelock->lf_next = NOLOCKF; +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_wakelock: awakening", wakelock); +#endif /* LOCKF_DEBUG */ + wakeup((caddr_t)wakelock); + } +} + +#ifdef LOCKF_DEBUG +/* + * Print out a lock. + */ +void +lf_print(tag, lock) + char *tag; + register struct lockf *lock; +{ + + printf("%s: lock 0x%lx for ", tag, lock); + if (lock->lf_flags & F_POSIX) + printf("proc %d", ((struct proc *)(lock->lf_id))->p_pid); + else + printf("id 0x%x", lock->lf_id); + printf(" in ino %d on dev <%d, %d>, %s, start %d, end %d", + lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev), + lock->lf_type == F_RDLCK ? "shared" : + lock->lf_type == F_WRLCK ? "exclusive" : + lock->lf_type == F_UNLCK ? 
"unlock" : + "unknown", lock->lf_start, lock->lf_end); + if (lock->lf_blkhd.tqh_first) + printf(" block 0x%x\n", lock->lf_blkhd.tqh_first); + else + printf("\n"); +} + +void +lf_printlist(tag, lock) + char *tag; + struct lockf *lock; +{ + register struct lockf *lf, *blk; + + printf("%s: Lock list for ino %d on dev <%d, %d>:\n", + tag, lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev)); + for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { + printf("\tlock 0x%lx for ", lf); + if (lf->lf_flags & F_POSIX) + printf("proc %d", ((struct proc *)(lf->lf_id))->p_pid); + else + printf("id 0x%x", lf->lf_id); + printf(", %s, start %d, end %d", + lf->lf_type == F_RDLCK ? "shared" : + lf->lf_type == F_WRLCK ? "exclusive" : + lf->lf_type == F_UNLCK ? "unlock" : + "unknown", lf->lf_start, lf->lf_end); + for (blk = lf->lf_blkhd.tqh_first; blk; + blk = blk->lf_block.tqe_next) { + printf("\n\t\tlock request 0x%lx for ", blk); + if (blk->lf_flags & F_POSIX) + printf("proc %d", + ((struct proc *)(blk->lf_id))->p_pid); + else + printf("id 0x%x", blk->lf_id); + printf(", %s, start %d, end %d", + blk->lf_type == F_RDLCK ? "shared" : + blk->lf_type == F_WRLCK ? "exclusive" : + blk->lf_type == F_UNLCK ? "unlock" : + "unknown", blk->lf_start, blk->lf_end); + if (blk->lf_blkhd.tqh_first) + panic("lf_printlist: bad list"); + } + printf("\n"); + } +} +#endif /* LOCKF_DEBUG */ diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index 363cde5..94c6b4e 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -30,19 +30,27 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)kern_malloc.c 8.4 (Berkeley) 5/20/95 + * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94 + * $Id$ */ #include <sys/param.h> +#include <sys/systm.h> #include <sys/proc.h> -#include <sys/map.h> #include <sys/kernel.h> #include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/vmmeter.h> #include <vm/vm.h> +#include <vm/vm_param.h> #include <vm/vm_kern.h> +#include <vm/vm_extern.h> -struct kmembuckets bucket[MINBUCKET + 16]; +static void kmeminit __P((void *)); +SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL) + +static struct kmembuckets bucket[MINBUCKET + 16]; struct kmemstats kmemstats[M_LAST]; struct kmemusage *kmemusage; char *kmembase, *kmemlimit; @@ -52,7 +60,7 @@ char *memname[] = INITKMEMNAMES; /* * This structure provides a set of masks to catch unaligned frees. */ -long addrmask[] = { 0, +static long addrmask[] = { 0, 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, @@ -63,8 +71,8 @@ long addrmask[] = { 0, * The WEIRD_ADDR is used as known text to copy into free objects so * that modifications after frees can be detected. */ -#define WEIRD_ADDR 0xdeadbeef -#define MAX_COPY 32 +#define WEIRD_ADDR 0xdeadc0de +#define MAX_COPY 64 /* * Normally the first word of the structure is used to hold the list @@ -103,9 +111,6 @@ malloc(size, type, flags) int copysize; char *savedtype; #endif -#ifdef DEBUG - extern int simplelockrecurse; -#endif #ifdef KMEMSTATS register struct kmemstats *ksp = &kmemstats[type]; @@ -114,7 +119,7 @@ malloc(size, type, flags) #endif indx = BUCKETINDX(size); kbp = &bucket[indx]; - s = splimp(); + s = splhigh(); #ifdef KMEMSTATS while (ksp->ks_memuse >= ksp->ks_limit) { if (flags & M_NOWAIT) { @@ -130,25 +135,16 @@ malloc(size, type, flags) #ifdef DIAGNOSTIC copysize = 1 << indx < MAX_COPY ? 
1 << indx : MAX_COPY; #endif -#ifdef DEBUG - if (flags & M_NOWAIT) - simplelockrecurse++; -#endif if (kbp->kb_next == NULL) { kbp->kb_last = NULL; if (size > MAXALLOCSAVE) - allocsize = roundup(size, CLBYTES); + allocsize = roundup(size, PAGE_SIZE); else allocsize = 1 << indx; - npg = clrnd(btoc(allocsize)); - va = (caddr_t) kmem_malloc(kmem_map, (vm_size_t)ctob(npg), - !(flags & M_NOWAIT)); + npg = btoc(allocsize); + va = (caddr_t) kmem_malloc(kmem_map, (vm_size_t)ctob(npg), flags); if (va == NULL) { splx(s); -#ifdef DEBUG - if (flags & M_NOWAIT) - simplelockrecurse--; -#endif return ((void *) NULL); } #ifdef KMEMSTATS @@ -175,7 +171,7 @@ malloc(size, type, flags) * bucket, don't assume the list is still empty. */ savedlist = kbp->kb_next; - kbp->kb_next = cp = va + (npg * NBPG) - allocsize; + kbp->kb_next = cp = va + (npg * PAGE_SIZE) - allocsize; for (;;) { freep = (struct freelist *)cp; #ifdef DIAGNOSTIC @@ -205,7 +201,7 @@ malloc(size, type, flags) memname[freep->type] : "???"; if (kbp->kb_next && !kernacc(kbp->kb_next, sizeof(struct freelist), 0)) { - printf("%s of object 0x%x size %d %s %s (invalid addr 0x%x)\n", + printf("%s of object %p size %ld %s %s (invalid addr %p)\n", "Data modified on freelist: word 2.5", va, size, "previous type", savedtype, kbp->kb_next); kbp->kb_next = NULL; @@ -224,7 +220,7 @@ malloc(size, type, flags) for (lp = (long *)va; lp < end; lp++) { if (*lp == WEIRD_ADDR) continue; - printf("%s %d of object 0x%x size %d %s %s (0x%x != 0x%x)\n", + printf("%s %d of object %p size %ld %s %s (0x%lx != 0x%x)\n", "Data modified on freelist: word", lp - (long *)va, va, size, "previous type", savedtype, *lp, WEIRD_ADDR); break; @@ -250,10 +246,6 @@ out: out: #endif splx(s); -#ifdef DEBUG - if (flags & M_NOWAIT) - simplelockrecurse--; -#endif return ((void *) va); } @@ -271,34 +263,42 @@ free(addr, type) long size; int s; #ifdef DIAGNOSTIC - caddr_t cp; + struct freelist *fp; long *end, *lp, alloc, copysize; #endif #ifdef KMEMSTATS register 
struct kmemstats *ksp = &kmemstats[type]; #endif +#ifdef DIAGNOSTIC + if ((char *)addr < kmembase || (char *)addr >= kmemlimit) { + panic("free: address 0x%x out of range", addr); + } + if ((u_long)type > M_LAST) { + panic("free: type %d out of range", type); + } +#endif kup = btokup(addr); size = 1 << kup->ku_indx; kbp = &bucket[kup->ku_indx]; - s = splimp(); + s = splhigh(); #ifdef DIAGNOSTIC /* * Check for returns of data that do not point to the * beginning of the allocation. */ - if (size > NBPG * CLSIZE) - alloc = addrmask[BUCKETINDX(NBPG * CLSIZE)]; + if (size > PAGE_SIZE) + alloc = addrmask[BUCKETINDX(PAGE_SIZE)]; else alloc = addrmask[kup->ku_indx]; if (((u_long)addr & alloc) != 0) - panic("free: unaligned addr 0x%x, size %d, type %s, mask %d\n", + panic("free: unaligned addr 0x%x, size %d, type %s, mask %d", addr, size, memname[type], alloc); #endif /* DIAGNOSTIC */ if (size > MAXALLOCSAVE) { kmem_free(kmem_map, (vm_offset_t)addr, ctob(kup->ku_pagecnt)); #ifdef KMEMSTATS - size = kup->ku_pagecnt << PGSHIFT; + size = kup->ku_pagecnt << PAGE_SHIFT; ksp->ks_memuse -= size; kup->ku_indx = 0; kup->ku_pagecnt = 0; @@ -318,11 +318,16 @@ free(addr, type) * it looks free before laboriously searching the freelist. 
*/ if (freep->spare0 == WEIRD_ADDR) { - for (cp = kbp->kb_next; cp; cp = *(caddr_t *)cp) { - if (addr != cp) - continue; - printf("multiply freed item 0x%x\n", addr); - panic("free: duplicated free"); + fp = (struct freelist *)kbp->kb_next; + while (fp) { + if (fp->spare0 != WEIRD_ADDR) { + printf("trashed free item %p\n", fp); + panic("free: free item modified"); + } else if (addr == (caddr_t)fp) { + printf("multiple freed item %p\n", addr); + panic("free: multiple free"); + } + fp = (struct freelist *)fp->next; } } /* @@ -351,46 +356,75 @@ free(addr, type) wakeup((caddr_t)ksp); ksp->ks_inuse--; #endif +#ifdef OLD_MALLOC_MEMORY_POLICY if (kbp->kb_next == NULL) kbp->kb_next = addr; else ((struct freelist *)kbp->kb_last)->next = addr; freep->next = NULL; kbp->kb_last = addr; +#else + /* + * Return memory to the head of the queue for quick reuse. This + * can improve performance by improving the probability of the + * item being in the cache when it is reused. + */ + if (kbp->kb_next == NULL) { + kbp->kb_next = addr; + kbp->kb_last = addr; + freep->next = NULL; + } else { + freep->next = kbp->kb_next; + kbp->kb_next = addr; + } +#endif splx(s); } /* * Initialize the kernel memory allocator */ -kmeminit() +/* ARGSUSED*/ +static void +kmeminit(dummy) + void *dummy; { register long indx; int npg; #if ((MAXALLOCSAVE & (MAXALLOCSAVE - 1)) != 0) - ERROR!_kmeminit:_MAXALLOCSAVE_not_power_of_2 +#error "kmeminit: MAXALLOCSAVE not power of 2" #endif #if (MAXALLOCSAVE > MINALLOCSIZE * 32768) - ERROR!_kmeminit:_MAXALLOCSAVE_too_big +#error "kmeminit: MAXALLOCSAVE too big" #endif -#if (MAXALLOCSAVE < CLBYTES) - ERROR!_kmeminit:_MAXALLOCSAVE_too_small +#if (MAXALLOCSAVE < PAGE_SIZE) +#error "kmeminit: MAXALLOCSAVE too small" #endif - npg = VM_KMEM_SIZE/ NBPG; + npg = (nmbufs * MSIZE + nmbclusters * MCLBYTES + VM_KMEM_SIZE) + / PAGE_SIZE; + kmemusage = (struct kmemusage *) kmem_alloc(kernel_map, (vm_size_t)(npg * sizeof(struct kmemusage))); kmem_map = kmem_suballoc(kernel_map, 
(vm_offset_t *)&kmembase, - (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * NBPG), FALSE); + (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE), + FALSE); #ifdef KMEMSTATS for (indx = 0; indx < MINBUCKET + 16; indx++) { - if (1 << indx >= CLBYTES) + if (1 << indx >= PAGE_SIZE) bucket[indx].kb_elmpercl = 1; else - bucket[indx].kb_elmpercl = CLBYTES / (1 << indx); + bucket[indx].kb_elmpercl = PAGE_SIZE / (1 << indx); bucket[indx].kb_highwat = 5 * bucket[indx].kb_elmpercl; } - for (indx = 0; indx < M_LAST; indx++) - kmemstats[indx].ks_limit = npg * NBPG * 6 / 10; + /* + * Limit maximum memory for each type to 60% of malloc area size or + * 60% of physical memory, whichever is smaller. + */ + for (indx = 0; indx < M_LAST; indx++) { + kmemstats[indx].ks_limit = min(cnt.v_page_count * PAGE_SIZE, + (npg * PAGE_SIZE - nmbclusters * MCLBYTES + - nmbufs * MSIZE)) * 6 / 10; + } #endif } diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c new file mode 100644 index 0000000..8105aa4 --- /dev/null +++ b/sys/kern/kern_mib.c @@ -0,0 +1,167 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Karels at Berkeley Software Design, Inc. + * + * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD + * project, to make these variables more userfriendly. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 + * $Id: kern_mib.c,v 1.7 1997/03/03 12:58:19 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/unistd.h> + +SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, + "Sysctl internal magic"); +SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0, + "High kernel, proc, limits &c"); +SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0, + "Virtual memory"); +SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0, + "File system"); +SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0, + "Network, (see socket.h)"); +SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0, + "Debugging"); +SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0, + "hardware"); +SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0, + "machine dependent"); +SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0, + "user-level"); + +SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD, osrelease, 0, ""); + +SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD, 0, BSD, ""); + +SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD, version, 0, ""); + +SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD, ostype, 0, ""); + +extern int osreldate; +SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD, &osreldate, 0, ""); + +SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RW, &maxproc, 0, ""); + +SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, + CTLFLAG_RW, &maxprocperuid, 0, ""); + +SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD, 0, ARG_MAX, ""); + +SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD, 0, _POSIX_VERSION, ""); + +SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD, 0, NGROUPS_MAX, ""); + +SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD, 0, 1, ""); + +#ifdef _POSIX_SAVED_IDS +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 1, ""); +#else +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 0, ""); +#endif + +char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */ + 
+SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, + CTLFLAG_RW, kernelname, sizeof kernelname, ""); + +SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, 0, 1, ""); + +SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD, 0, BYTE_ORDER, ""); + +SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD, 0, PAGE_SIZE, ""); + +char hostname[MAXHOSTNAMELEN]; + +SYSCTL_STRING(_kern, KERN_HOSTNAME, hostname, CTLFLAG_RW, + hostname, sizeof(hostname), ""); + +int securelevel = -1; + +static int +sysctl_kern_securelvl SYSCTL_HANDLER_ARGS +{ + int error, level; + + level = securelevel; + error = sysctl_handle_int(oidp, &level, 0, req); + if (error || !req->newptr) + return (error); + if (level < securelevel && req->p->p_pid != 1) + return (EPERM); + securelevel = level; + return (error); +} + +SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, CTLTYPE_INT|CTLFLAG_RW, + 0, 0, sysctl_kern_securelvl, "I", ""); + +char domainname[MAXHOSTNAMELEN]; +SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW, + &domainname, sizeof(domainname), ""); + +long hostid; +/* Some trouble here, if sizeof (int) != sizeof (long) */ +SYSCTL_INT(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, ""); + +/* + * This is really cheating. These actually live in the libc, something + * which I'm not quite sure is a good idea anyway, but in order for + * getnext and friends to actually work, we define dummies here. 
+ */ +SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, "", 0, ""); +SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, 0, 0, ""); diff --git a/sys/kern/kern_ntptime.c b/sys/kern/kern_ntptime.c new file mode 100644 index 0000000..88ba077b --- /dev/null +++ b/sys/kern/kern_ntptime.c @@ -0,0 +1,269 @@ +/****************************************************************************** + * * + * Copyright (c) David L. 
Mills 1993, 1994 * + * * + * Permission to use, copy, modify, and distribute this software and its * + * documentation for any purpose and without fee is hereby granted, provided * + * that the above copyright notice appears in all copies and that both the * + * copyright notice and this permission notice appear in supporting * + * documentation, and that the name University of Delaware not be used in * + * advertising or publicity pertaining to distribution of the software * + * without specific, written prior permission. The University of Delaware * + * makes no representations about the suitability this software for any * + * purpose. It is provided "as is" without express or implied warranty. * + * * + ******************************************************************************/ + +/* + * Modification history kern_ntptime.c + * + * 24 Sep 94 David L. Mills + * Tightened code at exits. + * + * 24 Mar 94 David L. Mills + * Revised syscall interface to include new variables for PPS + * time discipline. + * + * 14 Feb 94 David L. Mills + * Added code for external clock + * + * 28 Nov 93 David L. Mills + * Revised frequency scaling to conform with adjusted parameters + * + * 17 Sep 93 David L. Mills + * Created file + */ +/* + * ntp_gettime(), ntp_adjtime() - precision time interface for SunOS + * V4.1.1 and V4.1.3 + * + * These routines consitute the Network Time Protocol (NTP) interfaces + * for user and daemon application programs. The ntp_gettime() routine + * provides the time, maximum error (synch distance) and estimated error + * (dispersion) to client user application programs. The ntp_adjtime() + * routine is used by the NTP daemon to adjust the system clock to an + * externally derived time. The time offset and related variables set by + * this routine are used by hardclock() to adjust the phase and + * frequency of the phase-lock loop which controls the system clock. 
+ */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/timex.h> +#include <sys/sysctl.h> + +/* + * The following variables are used by the hardclock() routine in the + * kern_clock.c module and are described in that module. + */ +extern int time_state; /* clock state */ +extern int time_status; /* clock status bits */ +extern long time_offset; /* time adjustment (us) */ +extern long time_freq; /* frequency offset (scaled ppm) */ +extern long time_maxerror; /* maximum error (us) */ +extern long time_esterror; /* estimated error (us) */ +extern long time_constant; /* pll time constant */ +extern long time_precision; /* clock precision (us) */ +extern long time_tolerance; /* frequency tolerance (scaled ppm) */ + +#ifdef PPS_SYNC +/* + * The following variables are used only if the PPS signal discipline + * is configured in the kernel. + */ +extern int pps_shift; /* interval duration (s) (shift) */ +extern long pps_freq; /* pps frequency offset (scaled ppm) */ +extern long pps_jitter; /* pps jitter (us) */ +extern long pps_stabil; /* pps stability (scaled ppm) */ +extern long pps_jitcnt; /* jitter limit exceeded */ +extern long pps_calcnt; /* calibration intervals */ +extern long pps_errcnt; /* calibration errors */ +extern long pps_stbcnt; /* stability limit exceeded */ +#endif /* PPS_SYNC */ + +static int +ntp_sysctl SYSCTL_HANDLER_ARGS +{ + struct timeval atv; + struct ntptimeval ntv; + int s; + + s = splclock(); +#ifdef EXT_CLOCK + /* + * The microtime() external clock routine returns a + * status code. If less than zero, we declare an error + * in the clock status word and return the kernel + * (software) time variable. While there are other + * places that call microtime(), this is the only place + * that matters from an application point of view. 
+ */ + if (microtime(&atv) < 0) { + time_status |= STA_CLOCKERR; + ntv.time = time; + } else { + time_status &= ~STA_CLOCKERR; + } +#else /* EXT_CLOCK */ + microtime(&atv); +#endif /* EXT_CLOCK */ + ntv.time = atv; + ntv.maxerror = time_maxerror; + ntv.esterror = time_esterror; + splx(s); + + ntv.time_state = time_state; + + /* + * Status word error decode. If any of these conditions + * occur, an error is returned, instead of the status + * word. Most applications will care only about the fact + * the system clock may not be trusted, not about the + * details. + * + * Hardware or software error + */ + if (time_status & (STA_UNSYNC | STA_CLOCKERR)) { + ntv.time_state = TIME_ERROR; + } + + /* + * PPS signal lost when either time or frequency + * synchronization requested + */ + if (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) { + ntv.time_state = TIME_ERROR; + } + + /* + * PPS jitter exceeded when time synchronization + * requested + */ + if (time_status & STA_PPSTIME && + time_status & STA_PPSJITTER) { + ntv.time_state = TIME_ERROR; + } + + /* + * PPS wander exceeded or calibration error when + * frequency synchronization requested + */ + if (time_status & STA_PPSFREQ && + time_status & (STA_PPSWANDER | STA_PPSERROR)) { + ntv.time_state = TIME_ERROR; + } + return (sysctl_handle_opaque(oidp, &ntv, sizeof ntv, req)); +} + +SYSCTL_NODE(_kern, KERN_NTP_PLL, ntp_pll, CTLFLAG_RW, 0, + "NTP kernel PLL related stuff"); +SYSCTL_PROC(_kern_ntp_pll, NTP_PLL_GETTIME, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, sizeof(struct ntptimeval) , ntp_sysctl, "S,ntptimeval", ""); + +/* + * ntp_adjtime() - NTP daemon application interface + */ +#ifndef _SYS_SYSPROTO_H_ +struct ntp_adjtime_args { + struct timex *tp; +}; +#endif + +int +ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, int *retval) +{ + struct timex ntv; + int modes; + int s; + int error; + + error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv)); + if (error) + return error; + 
+ /* + * Update selected clock variables - only the superuser can + * change anything. Note that there is no error checking here on + * the assumption the superuser should know what it is doing. + */ + modes = ntv.modes; + if ((modes != 0) + && (error = suser(p->p_cred->pc_ucred, &p->p_acflag))) + return error; + + s = splclock(); + if (modes & MOD_FREQUENCY) +#ifdef PPS_SYNC + time_freq = ntv.freq - pps_freq; +#else /* PPS_SYNC */ + time_freq = ntv.freq; +#endif /* PPS_SYNC */ + if (modes & MOD_MAXERROR) + time_maxerror = ntv.maxerror; + if (modes & MOD_ESTERROR) + time_esterror = ntv.esterror; + if (modes & MOD_STATUS) { + time_status &= STA_RONLY; + time_status |= ntv.status & ~STA_RONLY; + } + if (modes & MOD_TIMECONST) + time_constant = ntv.constant; + if (modes & MOD_OFFSET) + hardupdate(ntv.offset); + + /* + * Retrieve all clock variables + */ + if (time_offset < 0) + ntv.offset = -(-time_offset >> SHIFT_UPDATE); + else + ntv.offset = time_offset >> SHIFT_UPDATE; +#ifdef PPS_SYNC + ntv.freq = time_freq + pps_freq; +#else /* PPS_SYNC */ + ntv.freq = time_freq; +#endif /* PPS_SYNC */ + ntv.maxerror = time_maxerror; + ntv.esterror = time_esterror; + ntv.status = time_status; + ntv.constant = time_constant; + ntv.precision = time_precision; + ntv.tolerance = time_tolerance; +#ifdef PPS_SYNC + ntv.shift = pps_shift; + ntv.ppsfreq = pps_freq; + ntv.jitter = pps_jitter >> PPS_AVG; + ntv.stabil = pps_stabil; + ntv.calcnt = pps_calcnt; + ntv.errcnt = pps_errcnt; + ntv.jitcnt = pps_jitcnt; + ntv.stbcnt = pps_stbcnt; +#endif /* PPS_SYNC */ + (void)splx(s); + + error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv)); + if (!error) { + /* + * Status word error decode. See comments in + * ntp_gettime() routine. 
+ */ + retval[0] = time_state; + if (time_status & (STA_UNSYNC | STA_CLOCKERR)) + retval[0] = TIME_ERROR; + if (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) + retval[0] = TIME_ERROR; + if (time_status & STA_PPSTIME && + time_status & STA_PPSJITTER) + retval[0] = TIME_ERROR; + if (time_status & STA_PPSFREQ && + time_status & (STA_PPSWANDER | STA_PPSERROR)) + retval[0] = TIME_ERROR; + } + return error; +} + + diff --git a/sys/kern/kern_opt.c b/sys/kern/kern_opt.c new file mode 100644 index 0000000..08b04b2 --- /dev/null +++ b/sys/kern/kern_opt.c @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 1997 Bruce D. Evans + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id$ + */ + +#include "opt_defunct.h" + +#ifdef ARP_PROXYALL +#warning "obsolete option ARP_PROXYALL - use `sysctl -w net.link.ether.inet.proxyall=1'" +#endif + +#ifdef CHILD_MAX +#warning "obsolete option CHILD_MAX - use /etc/login.conf" +#endif + +#ifdef EXTRAVNODES +#warning "obsolete option EXTRAVNODES - use `sysctl -w kern.maxvnodes=value'" +#endif + +#ifdef GATEWAY +#warning "obsolete option GATEWAY - use `sysctl -w net.inet.ip.forwarding=1'" +#endif + +#ifdef OPEN_MAX +#warning "obsolete option OPEN_MAX - use /etc/login.conf" +#endif diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c index 1eaae35..42d1d21 100644 --- a/sys/kern/kern_physio.c +++ b/sys/kern/kern_physio.c @@ -1,41 +1,22 @@ -/*- - * Copyright (c) 1982, 1986, 1990, 1993 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. - * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. +/* + * Copyright (c) 1994 John S. Dyson + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. 
* - * from: @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 + * $Id$ */ #include <sys/param.h> @@ -43,51 +24,176 @@ #include <sys/buf.h> #include <sys/conf.h> #include <sys/proc.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_extern.h> + +static void physwakeup __P((struct buf *bp)); -physio(a1, a2, a3, a4, a5, a6) - int (*a1)(); - struct buf *a2; - dev_t a3; - int a4; - u_int (*a5)(); - struct uio *a6; +int +physio(strategy, bp, dev, rw, minp, uio) + d_strategy_t *strategy; + struct buf *bp; + dev_t dev; + int rw; + u_int (*minp) __P((struct buf *bp)); + struct uio *uio; { + int i; + int bufflags = rw?B_READ:0; + int error; + int spl; + caddr_t sa; + int bp_alloc = (bp == 0); + struct buf *bpa; + +/* + * keep the process from being swapped + */ + curproc->p_flag |= P_PHYSIO; + + /* create and build a buffer header for a transfer */ + bpa = (struct buf *)getpbuf(); + if (!bp_alloc) { + spl = splbio(); + while (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep((caddr_t)bp, PRIBIO, "physbw", 0); + } + bp->b_flags |= B_BUSY; + splx(spl); + } else { + bp = bpa; + } /* - * Body deleted. + * get a copy of the kva from the physical buffer */ - return (EIO); + sa = bpa->b_data; + bp->b_proc = curproc; + bp->b_dev = dev; + error = bp->b_error = 0; + + for(i=0;i<uio->uio_iovcnt;i++) { + while( uio->uio_iov[i].iov_len) { + + bp->b_bcount = uio->uio_iov[i].iov_len; + bp->b_flags = B_BUSY | B_PHYS | B_CALL | bufflags; + bp->b_iodone = physwakeup; + bp->b_data = uio->uio_iov[i].iov_base; + bp->b_bcount = minp( bp); + if( minp != minphys) + bp->b_bcount = minphys( bp); + bp->b_bufsize = bp->b_bcount; + /* + * pass in the kva from the physical buffer + * for the temporary kernel mapping. 
+ */ + bp->b_saveaddr = sa; + bp->b_blkno = btodb(uio->uio_offset); + + + if (uio->uio_segflg == UIO_USERSPACE) { + if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) { + error = EFAULT; + goto doerror; + } + if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) { + error = EFAULT; + goto doerror; + } + + /* bring buffer into kernel space */ + vmapbuf(bp); + } + + /* perform transfer */ + (*strategy)(bp); + + spl = splbio(); + while ((bp->b_flags & B_DONE) == 0) + tsleep((caddr_t)bp, PRIBIO, "physstr", 0); + splx(spl); + + /* release mapping into kernel space */ + if (uio->uio_segflg == UIO_USERSPACE) + vunmapbuf(bp); + + /* + * update the uio data + */ + { + int iolen = bp->b_bcount - bp->b_resid; + + if (iolen == 0 && !(bp->b_flags & B_ERROR)) + goto doerror; /* EOF */ + uio->uio_iov[i].iov_len -= iolen; + uio->uio_iov[i].iov_base += iolen; + uio->uio_resid -= iolen; + uio->uio_offset += iolen; + } + + /* + * check for an error + */ + if( bp->b_flags & B_ERROR) { + error = bp->b_error; + goto doerror; + } + } + } + + +doerror: + relpbuf(bpa); + if (!bp_alloc) { + bp->b_flags &= ~(B_BUSY|B_PHYS); + if( bp->b_flags & B_WANTED) { + bp->b_flags &= ~B_WANTED; + wakeup((caddr_t)bp); + } + } +/* + * allow the process to be swapped + */ + curproc->p_flag &= ~P_PHYSIO; + + return (error); } u_int -minphys(a1) - struct buf *a1; +minphys(struct buf *bp) { + u_int maxphys = MAXPHYS; - /* - * Body deleted. - */ - return (0); + if( ((vm_offset_t) bp->b_data) & PAGE_MASK) { + maxphys = MAXPHYS - PAGE_SIZE; + } + + if( bp->b_bcount > maxphys) { + bp->b_bcount = maxphys; + } + return bp->b_bcount; } -/* - * Do a read on a device for a user process. 
- */ -rawread(dev, uio) - dev_t dev; - struct uio *uio; +int +rawread(dev_t dev, struct uio *uio, int ioflag) { - return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL, - dev, B_READ, minphys, uio)); + return (physio(cdevsw[major(dev)]->d_strategy, (struct buf *)NULL, + dev, 1, minphys, uio)); } -/* - * Do a write on a device for a user process. - */ -rawwrite(dev, uio) - dev_t dev; - struct uio *uio; +int +rawwrite(dev_t dev, struct uio *uio, int ioflag) +{ + return (physio(cdevsw[major(dev)]->d_strategy, (struct buf *)NULL, + dev, 0, minphys, uio)); +} + +static void +physwakeup(bp) + struct buf *bp; { - return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL, - dev, B_WRITE, minphys, uio)); + wakeup((caddr_t) bp); + bp->b_flags &= ~B_CALL; } diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c index 6701793..cecf89f 100644 --- a/sys/kern/kern_proc.c +++ b/sys/kern/kern_proc.c @@ -31,12 +31,13 @@ * SUCH DAMAGE. * * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 + * $Id: kern_proc.c,v 1.25 1997/02/22 09:39:08 peter Exp $ */ #include <sys/param.h> #include <sys/systm.h> -#include <sys/map.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/proc.h> #include <sys/buf.h> #include <sys/acct.h> @@ -46,8 +47,21 @@ #include <sys/uio.h> #include <sys/malloc.h> #include <sys/mbuf.h> -#include <sys/ioctl.h> #include <sys/tty.h> +#include <sys/signalvar.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/user.h> + +struct prochd qs[NQS]; /* as good a place as any... */ +struct prochd rtqs[NQS]; /* Space for REALTIME queues too */ +struct prochd idqs[NQS]; /* Space for IDLE queues too */ + +static void pgdelete __P((struct pgrp *)); /* * Structure associated with user cacheing. 
@@ -59,7 +73,9 @@ struct uidinfo { }; #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) LIST_HEAD(uihashhead, uidinfo) *uihashtbl; -u_long uihash; /* size of hash table - 1 */ +static u_long uihash; /* size of hash table - 1 */ + +static void orphanpg __P((struct pgrp *pg)); /* * Other process lists @@ -126,6 +142,7 @@ chgproccnt(uid, diff) /* * Is p an inferior of the current process? */ +int inferior(p) register struct proc *p; { @@ -263,12 +280,12 @@ leavepgrp(p) /* * delete a process group */ -void +static void pgdelete(pgrp) register struct pgrp *pgrp; { - if (pgrp->pg_session->s_ttyp != NULL && + if (pgrp->pg_session->s_ttyp != NULL && pgrp->pg_session->s_ttyp->t_pgrp == pgrp) pgrp->pg_session->s_ttyp->t_pgrp = NULL; LIST_REMOVE(pgrp, pg_hash); @@ -277,8 +294,6 @@ pgdelete(pgrp) FREE(pgrp, M_PGRP); } -static void orphanpg(); - /* * Adjust pgrp jobc counters when specified process changes process group. * We count the number of processes in each process group that "qualify" @@ -324,7 +339,7 @@ fixjobc(p, pgrp, entering) orphanpg(hispgrp); } -/* +/* * A process group has become orphaned; * if there are any stopped processes in the group, * hang-up all process in that group. @@ -347,8 +362,11 @@ orphanpg(pg) } } -#ifdef DEBUG -pgrpdump() +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(pgrpdump, pgrpdump) { register struct pgrp *pgrp; register struct proc *p; @@ -371,4 +389,204 @@ pgrpdump() } } } -#endif /* DEBUG */ +#endif /* DDB */ + +/* + * Fill in an eproc structure for the specified process. 
+ */ +void +fill_eproc(p, ep) + register struct proc *p; + register struct eproc *ep; +{ + register struct tty *tp; + + bzero(ep, sizeof(*ep)); + + ep->e_paddr = p; + if (p->p_cred) { + ep->e_pcred = *p->p_cred; + if (p->p_ucred) + ep->e_ucred = *p->p_ucred; + } + if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) { + register struct vmspace *vm = p->p_vmspace; + +#ifdef pmap_resident_count + ep->e_vm.vm_rssize = pmap_resident_count(&vm->vm_pmap); /*XXX*/ +#else + ep->e_vm.vm_rssize = vm->vm_rssize; +#endif + ep->e_vm.vm_tsize = vm->vm_tsize; + ep->e_vm.vm_dsize = vm->vm_dsize; + ep->e_vm.vm_ssize = vm->vm_ssize; +#ifndef sparc + ep->e_vm.vm_pmap = vm->vm_pmap; +#endif + } + if (p->p_pptr) + ep->e_ppid = p->p_pptr->p_pid; + if (p->p_pgrp) { + ep->e_pgid = p->p_pgrp->pg_id; + ep->e_jobc = p->p_pgrp->pg_jobc; + ep->e_sess = p->p_pgrp->pg_session; + + if (ep->e_sess) { + bcopy(ep->e_sess->s_login, ep->e_login, sizeof(ep->e_login)); + if (ep->e_sess->s_ttyvp) + ep->e_flag = EPROC_CTTY; + if (p->p_session && SESS_LEADER(p)) + ep->e_flag |= EPROC_SLEADER; + } + } + if ((p->p_flag & P_CONTROLT) && + (ep->e_sess != NULL) && + ((tp = ep->e_sess->s_ttyp) != NULL)) { + ep->e_tdev = tp->t_dev; + ep->e_tpgid = tp->t_pgrp ? 
tp->t_pgrp->pg_id : NO_PID; + ep->e_tsess = tp->t_session; + } else + ep->e_tdev = NODEV; + if (p->p_wmesg) { + strncpy(ep->e_wmesg, p->p_wmesg, WMESGLEN); + ep->e_wmesg[WMESGLEN] = 0; + } +} + +static struct proc * +zpfind(pid_t pid) +{ + struct proc *p; + + for (p = zombproc.lh_first; p != 0; p = p->p_list.le_next) + if (p->p_pid == pid) + return (p); + return (NULL); +} + + +static int +sysctl_out_proc(struct proc *p, struct sysctl_req *req, int doingzomb) +{ + struct eproc eproc; + int error; + pid_t pid = p->p_pid; + + fill_eproc(p, &eproc); + error = SYSCTL_OUT(req,(caddr_t)p, sizeof(struct proc)); + if (error) + return (error); + error = SYSCTL_OUT(req,(caddr_t)&eproc, sizeof(eproc)); + if (error) + return (error); + if (!doingzomb && pid && (pfind(pid) != p)) + return EAGAIN; + if (doingzomb && zpfind(pid) != p) + return EAGAIN; + return (0); +} + +static int +sysctl_kern_proc SYSCTL_HANDLER_ARGS +{ + int *name = (int*) arg1; + u_int namelen = arg2; + struct proc *p; + int doingzomb; + int error = 0; + + if (oidp->oid_number == KERN_PROC_PID) { + if (namelen != 1) + return (EINVAL); + p = pfind((pid_t)name[0]); + if (!p) + return (0); + error = sysctl_out_proc(p, req, 0); + return (error); + } + if (oidp->oid_number == KERN_PROC_ALL && !namelen) + ; + else if (oidp->oid_number != KERN_PROC_ALL && namelen == 1) + ; + else + return (EINVAL); + + if (!req->oldptr) { + /* overestimate by 5 procs */ + error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); + if (error) + return (error); + } + for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { + if (!doingzomb) + p = allproc.lh_first; + else + p = zombproc.lh_first; + for (; p != 0; p = p->p_list.le_next) { + /* + * Skip embryonic processes. + */ + if (p->p_stat == SIDL) + continue; + /* + * TODO - make more efficient (see notes below). + * do by session. 
+ */ + switch (oidp->oid_number) { + + case KERN_PROC_PGRP: + /* could do this by traversing pgrp */ + if (p->p_pgrp == NULL || + p->p_pgrp->pg_id != (pid_t)name[0]) + continue; + break; + + case KERN_PROC_TTY: + if ((p->p_flag & P_CONTROLT) == 0 || + p->p_session == NULL || + p->p_session->s_ttyp == NULL || + p->p_session->s_ttyp->t_dev != (dev_t)name[0]) + continue; + break; + + case KERN_PROC_UID: + if (p->p_ucred == NULL || + p->p_ucred->cr_uid != (uid_t)name[0]) + continue; + break; + + case KERN_PROC_RUID: + if (p->p_ucred == NULL || + p->p_cred->p_ruid != (uid_t)name[0]) + continue; + break; + } + + error = sysctl_out_proc(p, req, doingzomb); + if (error) + return (error); + } + } + return (0); +} + + +SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); + +SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT, + 0, 0, sysctl_kern_proc, "S,proc", ""); + +SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c index 29e4c67..5c2ec5b 100644 --- a/sys/kern/kern_prot.c +++ b/sys/kern/kern_prot.c @@ -35,7 +35,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)kern_prot.c 8.9 (Berkeley) 2/14/95 + * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94 + * $Id: kern_prot.c,v 1.25 1997/03/03 22:46:16 ache Exp $ */ /* @@ -45,21 +46,26 @@ #include <sys/param.h> #include <sys/acct.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/ucred.h> #include <sys/proc.h> #include <sys/timeb.h> #include <sys/times.h> #include <sys/malloc.h> +#include <sys/unistd.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> +#ifndef _SYS_SYSPROTO_H_ +struct getpid_args { + int dummy; +}; +#endif /* ARGSUSED */ int getpid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getpid_args *uap; + int *retval; { *retval = p->p_pid; @@ -69,12 +75,17 @@ getpid(p, uap, retval) return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct getppid_args { + int dummy; +}; +#endif /* ARGSUSED */ int getppid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getppid_args *uap; + int *retval; { *retval = p->p_pptr->p_pid; @@ -82,23 +93,35 @@ getppid(p, uap, retval) } /* Get process group ID; note that POSIX getpgrp takes no parameter */ +#ifndef _SYS_SYSPROTO_H_ +struct getpgrp_args { + int dummy; +}; +#endif + int getpgrp(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getpgrp_args *uap; + int *retval; { *retval = p->p_pgrp->pg_id; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct getuid_args { + int dummy; +}; +#endif + /* ARGSUSED */ int getuid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getuid_args *uap; + int *retval; { *retval = p->p_cred->p_ruid; @@ -108,24 +131,36 @@ getuid(p, uap, retval) return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct geteuid_args { + int dummy; +}; +#endif + /* ARGSUSED */ int geteuid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct geteuid_args *uap; + int *retval; { *retval = p->p_ucred->cr_uid; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct getgid_args { + int dummy; +}; +#endif + /* ARGSUSED */ 
int getgid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getgid_args *uap; + int *retval; { *retval = p->p_cred->p_rgid; @@ -140,51 +175,66 @@ getgid(p, uap, retval) * via getgroups. This syscall exists because it is somewhat painful to do * correctly in a library function. */ +#ifndef _SYS_SYSPROTO_H_ +struct getegid_args { + int dummy; +}; +#endif + /* ARGSUSED */ int getegid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getegid_args *uap; + int *retval; { *retval = p->p_ucred->cr_groups[0]; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct getgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +#endif int getgroups(p, uap, retval) struct proc *p; - register struct getgroups_args /* { - syscallarg(u_int) gidsetsize; - syscallarg(gid_t *) gidset; - } */ *uap; - register_t *retval; + register struct getgroups_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; register u_int ngrp; int error; - if ((ngrp = SCARG(uap, gidsetsize)) == 0) { + if ((ngrp = uap->gidsetsize) == 0) { *retval = pc->pc_ucred->cr_ngroups; return (0); } if (ngrp < pc->pc_ucred->cr_ngroups) return (EINVAL); ngrp = pc->pc_ucred->cr_ngroups; - if (error = copyout((caddr_t)pc->pc_ucred->cr_groups, - (caddr_t)SCARG(uap, gidset), ngrp * sizeof(gid_t))) + if ((error = copyout((caddr_t)pc->pc_ucred->cr_groups, + (caddr_t)uap->gidset, ngrp * sizeof(gid_t)))) return (error); *retval = ngrp; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct setsid_args { + int dummy; +}; +#endif + /* ARGSUSED */ int setsid(p, uap, retval) register struct proc *p; - void *uap; - register_t *retval; + struct setsid_args *uap; + int *retval; { if (p->p_pgid == p->p_pid || pgfind(p->p_pid)) { @@ -209,23 +259,28 @@ setsid(p, uap, retval) * there must exist some pid in same session having pgid (EPERM) * pid must not be session leader (EPERM) */ +#ifndef _SYS_SYSPROTO_H_ +struct setpgid_args { + int pid; /* target process id */ + int pgid; /* target pgrp 
id */ +}; +#endif /* ARGSUSED */ int setpgid(curp, uap, retval) struct proc *curp; - register struct setpgid_args /* { - syscallarg(int) pid; - syscallarg(int) pgid; - } */ *uap; - register_t *retval; + register struct setpgid_args *uap; + int *retval; { register struct proc *targp; /* target process */ register struct pgrp *pgrp; /* target pgrp */ - if (SCARG(uap, pid) != 0 && SCARG(uap, pid) != curp->p_pid) { - if ((targp = pfind(SCARG(uap, pid))) == 0 || !inferior(targp)) + if (uap->pgid < 0) + return (EINVAL); + if (uap->pid != 0 && uap->pid != curp->p_pid) { + if ((targp = pfind(uap->pid)) == 0 || !inferior(targp)) return (ESRCH); - if (targp->p_session != curp->p_session) + if (targp->p_pgrp == NULL || targp->p_session != curp->p_session) return (EPERM); if (targp->p_flag & P_EXEC) return (EACCES); @@ -233,30 +288,36 @@ setpgid(curp, uap, retval) targp = curp; if (SESS_LEADER(targp)) return (EPERM); - if (SCARG(uap, pgid) == 0) - SCARG(uap, pgid) = targp->p_pid; - else if (SCARG(uap, pgid) != targp->p_pid) - if ((pgrp = pgfind(SCARG(uap, pgid))) == 0 || + if (uap->pgid == 0) + uap->pgid = targp->p_pid; + else if (uap->pgid != targp->p_pid) + if ((pgrp = pgfind(uap->pgid)) == 0 || pgrp->pg_session != curp->p_session) return (EPERM); - return (enterpgrp(targp, SCARG(uap, pgid), 0)); + return (enterpgrp(targp, uap->pgid, 0)); } +#ifndef _SYS_SYSPROTO_H_ +struct setuid_args { + uid_t uid; +}; +#endif /* ARGSUSED */ int setuid(p, uap, retval) struct proc *p; - struct setuid_args /* { - syscallarg(uid_t) uid; - } */ *uap; - register_t *retval; + struct setuid_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; register uid_t uid; int error; - uid = SCARG(uap, uid); + uid = uap->uid; if (uid != pc->p_ruid && +#ifdef _POSIX_SAVED_IDS + uid != pc->p_svuid && +#endif (error = suser(pc->pc_ucred, &p->p_acflag))) return (error); /* @@ -264,30 +325,45 @@ setuid(p, uap, retval) * Transfer proc count to new user. 
* Copy credentials so other references do not see our changes. */ - (void)chgproccnt(pc->p_ruid, -1); - (void)chgproccnt(uid, 1); + if ( +#ifdef _POSIX_SAVED_IDS + pc->pc_ucred->cr_uid == 0 && +#endif + uid != pc->p_ruid) { + (void)chgproccnt(pc->p_ruid, -1); + (void)chgproccnt(uid, 1); + } pc->pc_ucred = crcopy(pc->pc_ucred); +#ifdef _POSIX_SAVED_IDS + if (pc->pc_ucred->cr_uid == 0) { +#endif + pc->p_ruid = uid; + pc->p_svuid = uid; +#ifdef _POSIX_SAVED_IDS + } +#endif pc->pc_ucred->cr_uid = uid; - pc->p_ruid = uid; - pc->p_svuid = uid; p->p_flag |= P_SUGID; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct seteuid_args { + uid_t euid; +}; +#endif /* ARGSUSED */ int seteuid(p, uap, retval) struct proc *p; - struct seteuid_args /* { - syscallarg(uid_t) euid; - } */ *uap; - register_t *retval; + struct seteuid_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; register uid_t euid; int error; - euid = SCARG(uap, euid); + euid = uap->euid; if (euid != pc->p_ruid && euid != pc->p_svuid && (error = suser(pc->pc_ucred, &p->p_acflag))) return (error); @@ -301,44 +377,60 @@ seteuid(p, uap, retval) return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct setgid_args { + gid_t gid; +}; +#endif /* ARGSUSED */ int setgid(p, uap, retval) struct proc *p; - struct setgid_args /* { - syscallarg(gid_t) gid; - } */ *uap; - register_t *retval; + struct setgid_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; register gid_t gid; int error; - gid = SCARG(uap, gid); - if (gid != pc->p_rgid && (error = suser(pc->pc_ucred, &p->p_acflag))) + gid = uap->gid; + if (gid != pc->p_rgid && +#ifdef _POSIX_SAVED_IDS + gid != pc->p_svgid && +#endif + (error = suser(pc->pc_ucred, &p->p_acflag))) return (error); pc->pc_ucred = crcopy(pc->pc_ucred); pc->pc_ucred->cr_groups[0] = gid; - pc->p_rgid = gid; - pc->p_svgid = gid; /* ??? 
*/ +#ifdef _POSIX_SAVED_IDS + if (pc->pc_ucred->cr_uid == 0) { +#endif + pc->p_rgid = gid; + pc->p_svgid = gid; +#ifdef _POSIX_SAVED_IDS + } +#endif p->p_flag |= P_SUGID; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct setegid_args { + gid_t egid; +}; +#endif /* ARGSUSED */ int setegid(p, uap, retval) struct proc *p; - struct setegid_args /* { - syscallarg(gid_t) egid; - } */ *uap; - register_t *retval; + struct setegid_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; register gid_t egid; int error; - egid = SCARG(uap, egid); + egid = uap->egid; if (egid != pc->p_rgid && egid != pc->p_svgid && (error = suser(pc->pc_ucred, &p->p_acflag))) return (error); @@ -348,113 +440,109 @@ setegid(p, uap, retval) return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct setgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +#endif /* ARGSUSED */ int setgroups(p, uap, retval) struct proc *p; - struct setgroups_args /* { - syscallarg(u_int) gidsetsize; - syscallarg(gid_t *) gidset; - } */ *uap; - register_t *retval; + struct setgroups_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; register u_int ngrp; int error; - if (error = suser(pc->pc_ucred, &p->p_acflag)) + if ((error = suser(pc->pc_ucred, &p->p_acflag))) return (error); - ngrp = SCARG(uap, gidsetsize); + ngrp = uap->gidsetsize; if (ngrp < 1 || ngrp > NGROUPS) return (EINVAL); pc->pc_ucred = crcopy(pc->pc_ucred); - if (error = copyin((caddr_t)SCARG(uap, gidset), - (caddr_t)pc->pc_ucred->cr_groups, ngrp * sizeof(gid_t))) + if ((error = copyin((caddr_t)uap->gidset, + (caddr_t)pc->pc_ucred->cr_groups, ngrp * sizeof(gid_t)))) return (error); pc->pc_ucred->cr_ngroups = ngrp; p->p_flag |= P_SUGID; return (0); } -#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct setreuid_args { + uid_t ruid; + uid_t euid; +}; +#endif /* ARGSUSED */ int -compat_43_setreuid(p, uap, retval) +setreuid(p, uap, retval) register struct proc *p; - struct compat_43_setreuid_args /* { - 
syscallarg(int) ruid; - syscallarg(int) euid; - } */ *uap; - register_t *retval; + struct setreuid_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; - union { - struct setuid_args sa; - struct seteuid_args ea; - } args; + register uid_t ruid, euid; + int error; - /* - * If ruid == euid then setreuid is being used to emulate setuid, - * just do it. - */ - if (SCARG(uap, ruid) != -1 && SCARG(uap, ruid) == SCARG(uap, euid)) { - SCARG(&args.sa, uid) = SCARG(uap, ruid); - return (setuid(p, &args.sa, retval)); + ruid = uap->ruid; + euid = uap->euid; + if ((ruid != (uid_t)-1 && ruid != pc->p_ruid && ruid != pc->p_svuid || + euid != (uid_t)-1 && euid != pc->p_ruid && euid != pc->p_svuid) && + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + + pc->pc_ucred = crcopy(pc->pc_ucred); + if (euid != (uid_t)-1) + pc->pc_ucred->cr_uid = euid; + if (ruid != (uid_t)-1 && ruid != pc->p_ruid) { + (void)chgproccnt(pc->p_ruid, -1); + (void)chgproccnt(ruid, 1); + pc->p_ruid = ruid; } - /* - * Otherwise we assume that the intent of setting ruid is to be - * able to get back ruid priviledge (i.e. swapping ruid and euid). - * So we make sure that we will be able to do so, but do not - * actually set the ruid. 
- */ - if (SCARG(uap, ruid) != (uid_t)-1 && SCARG(uap, ruid) != pc->p_ruid && - SCARG(uap, ruid) != pc->p_svuid) - return (EPERM); - if (SCARG(uap, euid) == (uid_t)-1) - return (0); - SCARG(&args.ea, euid) = SCARG(uap, euid); - return (seteuid(p, &args.ea, retval)); + if (ruid != (uid_t)-1 || pc->pc_ucred->cr_uid != pc->p_ruid) + pc->p_svuid = pc->pc_ucred->cr_uid; + p->p_flag |= P_SUGID; + return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct setregid_args { + gid_t rgid; + gid_t egid; +}; +#endif /* ARGSUSED */ int -compat_43_setregid(p, uap, retval) +setregid(p, uap, retval) register struct proc *p; - struct compat_43_setregid_args /* { - syscallarg(int) rgid; - syscallarg(int) egid; - } */ *uap; - register_t *retval; + struct setregid_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; - union { - struct setgid_args sa; - struct setegid_args ea; - } args; + register gid_t rgid, egid; + int error; - /* - * If rgid == egid then setreuid is being used to emulate setgid, - * just do it. - */ - if (SCARG(uap, rgid) != -1 && SCARG(uap, rgid) == SCARG(uap, egid)) { - SCARG(&args.sa, gid) = SCARG(uap, rgid); - return (setgid(p, &args.sa, retval)); - } - /* - * Otherwise we assume that the intent of setting rgid is to be - * able to get back rgid priviledge (i.e. swapping rgid and egid). - * So we make sure that we will be able to do so, but do not - * actually set the rgid. 
- */ - if (SCARG(uap, rgid) != (gid_t)-1 && SCARG(uap, rgid) != pc->p_rgid && - SCARG(uap, rgid) != pc->p_svgid) - return (EPERM); - if (SCARG(uap, egid) == (gid_t)-1) - return (0); - SCARG(&args.ea, egid) = SCARG(uap, egid); - return (setegid(p, &args.ea, retval)); + rgid = uap->rgid; + egid = uap->egid; + if ((rgid != (gid_t)-1 && rgid != pc->p_rgid && rgid != pc->p_svgid || + egid != (gid_t)-1 && egid != pc->p_rgid && egid != pc->p_svgid) && + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + + pc->pc_ucred = crcopy(pc->pc_ucred); + if (egid != (gid_t)-1) + pc->pc_ucred->cr_groups[0] = egid; + if (rgid != (gid_t)-1) + pc->p_rgid = rgid; + if (rgid != (gid_t)-1 || pc->pc_ucred->cr_groups[0] != pc->p_rgid) + pc->p_svgid = pc->pc_ucred->cr_groups[0]; + p->p_flag |= P_SUGID; + return (0); } -#endif /* defined(COMPAT_43) || defined(COMPAT_SUNOS) */ /* * Check if gid is a member of the group set. @@ -559,43 +647,52 @@ crdup(cr) /* * Get login name, if available. */ +#ifndef _SYS_SYSPROTO_H_ +struct getlogin_args { + char *namebuf; + u_int namelen; +}; +#endif /* ARGSUSED */ int getlogin(p, uap, retval) struct proc *p; - struct getlogin_args /* { - syscallarg(char *) namebuf; - syscallarg(u_int) namelen; - } */ *uap; - register_t *retval; + struct getlogin_args *uap; + int *retval; { - if (SCARG(uap, namelen) > sizeof (p->p_pgrp->pg_session->s_login)) - SCARG(uap, namelen) = sizeof (p->p_pgrp->pg_session->s_login); + if (uap->namelen > MAXLOGNAME) + uap->namelen = MAXLOGNAME; return (copyout((caddr_t) p->p_pgrp->pg_session->s_login, - (caddr_t) SCARG(uap, namebuf), SCARG(uap, namelen))); + (caddr_t) uap->namebuf, uap->namelen)); } /* * Set login name. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct setlogin_args { + char *namebuf; +}; +#endif /* ARGSUSED */ int setlogin(p, uap, retval) struct proc *p; - struct setlogin_args /* { - syscallarg(char *) namebuf; - } */ *uap; - register_t *retval; + struct setlogin_args *uap; + int *retval; { int error; + char logintmp[MAXLOGNAME]; - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); - error = copyinstr((caddr_t) SCARG(uap, namebuf), - (caddr_t) p->p_pgrp->pg_session->s_login, - sizeof (p->p_pgrp->pg_session->s_login) - 1, (u_int *)0); + error = copyinstr((caddr_t) uap->namebuf, (caddr_t) logintmp, + sizeof(logintmp), (u_int *)0); if (error == ENAMETOOLONG) error = EINVAL; + else if (!error) + (void) memcpy(p->p_pgrp->pg_session->s_login, logintmp, + sizeof(logintmp)); return (error); } diff --git a/sys/kern/kern_random.c b/sys/kern/kern_random.c new file mode 100644 index 0000000..64c215c --- /dev/null +++ b/sys/kern/kern_random.c @@ -0,0 +1,515 @@ +/* + * random_machdep.c -- A strong random number generator + * + * $Id$ + * + * Version 0.95, last modified 18-Oct-95 + * + * Copyright Theodore Ts'o, 1994, 1995. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. 
+ * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. (This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_cpu.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/select.h> +#include <sys/fcntl.h> + +#include <machine/clock.h> +#include <machine/random.h> + +#include <i386/isa/icu.h> +#ifdef PC98 +#include <pc98/pc98/pc98.h> +#else +#include <i386/isa/isa.h> +#endif +#include <i386/isa/timerreg.h> + +#define MAX_BLKDEV 4 + +/* + * The pool is stirred with a primitive polynomial of degree 128 + * over GF(2), namely x^128 + x^99 + x^59 + x^31 + x^9 + x^7 + 1. + * For a pool of size 64, try x^64+x^62+x^38+x^10+x^6+x+1. 
+ */ +#define POOLWORDS 128 /* Power of 2 - note that this is 32-bit words */ +#define POOLBITS (POOLWORDS*32) + +#if POOLWORDS == 128 +#define TAP1 99 /* The polynomial taps */ +#define TAP2 59 +#define TAP3 31 +#define TAP4 9 +#define TAP5 7 +#elif POOLWORDS == 64 +#define TAP1 62 /* The polynomial taps */ +#define TAP2 38 +#define TAP3 10 +#define TAP4 6 +#define TAP5 1 +#else +#error No primitive polynomial available for chosen POOLWORDS +#endif + +#define WRITEBUFFER 512 /* size in bytes */ + +/* There is actually only one of these, globally. */ +struct random_bucket { + u_int add_ptr; + u_int entropy_count; + int input_rotate; + u_int32_t *pool; + struct selinfo rsel; +}; + +/* There is one of these per entropy source */ +struct timer_rand_state { + u_long last_time; + int last_delta; + int nbits; +}; + +static struct random_bucket random_state; +static u_int32_t random_pool[POOLWORDS]; +static struct timer_rand_state keyboard_timer_state; +static struct timer_rand_state extract_timer_state; +static struct timer_rand_state irq_timer_state[ICU_LEN]; +#ifdef notyet +static struct timer_rand_state blkdev_timer_state[MAX_BLKDEV]; +#endif +static struct wait_queue *random_wait; + +inthand2_t *sec_intr_handler[ICU_LEN]; +int sec_intr_unit[ICU_LEN]; + +#ifndef MIN +#define MIN(a,b) (((a) < (b)) ? (a) : (b)) +#endif + +void +rand_initialize(void) +{ + random_state.add_ptr = 0; + random_state.entropy_count = 0; + random_state.pool = random_pool; + random_wait = NULL; + random_state.rsel.si_flags = 0; + random_state.rsel.si_pid = 0; +} + +/* + * This function adds an int into the entropy "pool". It does not + * update the entropy estimate. The caller must do this if appropriate. + * + * The pool is stirred with a primitive polynomial of degree 128 + * over GF(2), namely x^128 + x^99 + x^59 + x^31 + x^9 + x^7 + 1. + * For a pool of size 64, try x^64+x^62+x^38+x^10+x^6+x+1. 
+ * + * We rotate the input word by a changing number of bits, to help + * assure that all bits in the entropy get toggled. Otherwise, if we + * consistently feed the entropy pool small numbers (like ticks and + * scancodes, for example), the upper bits of the entropy pool don't + * get affected. --- TYT, 10/11/95 + */ +static inline void +add_entropy_word(struct random_bucket *r, const u_int32_t input) +{ + u_int i; + u_int32_t w; + + w = (input << r->input_rotate) | (input >> (32 - r->input_rotate)); + i = r->add_ptr = (r->add_ptr - 1) & (POOLWORDS-1); + if (i) + r->input_rotate = (r->input_rotate + 7) & 31; + else + /* + * At the beginning of the pool, add an extra 7 bits + * rotation, so that successive passes spread the + * input bits across the pool evenly. + */ + r->input_rotate = (r->input_rotate + 14) & 31; + + /* XOR in the various taps */ + w ^= r->pool[(i+TAP1)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP2)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP3)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP4)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP5)&(POOLWORDS-1)]; + w ^= r->pool[i]; + /* Rotate w left 1 bit (stolen from SHA) and store */ + r->pool[i] = (w << 1) | (w >> 31); +} + +/* + * This function adds entropy to the entropy "pool" by using timing + * delays. It uses the timer_rand_state structure to make an estimate + * of how any bits of entropy this call has added to the pool. + * + * The number "num" is also added to the pool - it should somehow describe + * the type of event which just happened. This is currently 0-255 for + * keyboard scan codes, and 256 upwards for interrupts. + * On the i386, this is assumed to be at most 16 bits, and the high bits + * are used for a high-resolution timer. 
+ */ +static void +add_timer_randomness(struct random_bucket *r, struct timer_rand_state *state, + u_int num) +{ + int delta, delta2; + u_int nbits; + u_int32_t time; + +#if defined(I586_CPU) || defined(I686_CPU) + if (i586_ctr_freq != 0) { + num ^= (u_int32_t) rdtsc() << 16; + r->entropy_count += 2; + } else { +#endif + disable_intr(); + outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); + num ^= inb(TIMER_CNTR0) << 16; + num ^= inb(TIMER_CNTR0) << 24; + enable_intr(); + r->entropy_count += 2; +#if defined(I586_CPU) || defined(I686_CPU) + } +#endif + + time = ticks; + + add_entropy_word(r, (u_int32_t) num); + add_entropy_word(r, time); + + /* + * Calculate number of bits of randomness we probably + * added. We take into account the first and second order + * deltas in order to make our estimate. + */ + delta = time - state->last_time; + state->last_time = time; + + delta2 = delta - state->last_delta; + state->last_delta = delta; + + if (delta < 0) delta = -delta; + if (delta2 < 0) delta2 = -delta2; + delta = MIN(delta, delta2) >> 1; + for (nbits = 0; delta; nbits++) + delta >>= 1; + + r->entropy_count += nbits; + + /* Prevent overflow */ + if (r->entropy_count > POOLBITS) + r->entropy_count = POOLBITS; + + if (r->entropy_count >= 8) + selwakeup(&random_state.rsel); +} + +void +add_keyboard_randomness(u_char scancode) +{ + add_timer_randomness(&random_state, &keyboard_timer_state, scancode); +} + +void +add_interrupt_randomness(int irq) +{ + (sec_intr_handler[irq])(sec_intr_unit[irq]); + add_timer_randomness(&random_state, &irq_timer_state[irq], irq); +} + +#ifdef notused +void +add_blkdev_randomness(int major) +{ + if (major >= MAX_BLKDEV) + return; + + add_timer_randomness(&random_state, &blkdev_timer_state[major], + 0x200+major); +} +#endif /* notused */ + +/* + * MD5 transform algorithm, taken from code written by Colin Plumb, + * and put into the public domain + * + * QUESTION: Replace this with SHA, which as generally received better + * reviews from the 
cryptographic community? + */ + +/* The four core functions - F1 is optimized somewhat */ + +/* #define F1(x, y, z) (x & y | ~x & z) */ +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +/* This is the central step in the MD5 algorithm. */ +#define MD5STEP(f, w, x, y, z, data, s) \ + ( w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x ) + +/* + * The core of the MD5 algorithm, this alters an existing MD5 hash to + * reflect the addition of 16 longwords of new data. MD5Update blocks + * the data and converts bytes into longwords for this routine. + */ +static void +MD5Transform(u_int32_t buf[4], + u_int32_t const in[16]) +{ + u_int32_t a, b, c, d; + + a = buf[0]; + b = buf[1]; + c = buf[2]; + d = buf[3]; + + MD5STEP(F1, a, b, c, d, in[ 0]+0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[ 1]+0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[ 2]+0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[ 3]+0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[ 4]+0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[ 5]+0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[ 6]+0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[ 7]+0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[ 8]+0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[ 9]+0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10]+0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11]+0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12]+0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13]+0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14]+0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15]+0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[ 1]+0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[ 6]+0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11]+0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[ 0]+0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[ 5]+0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10]+0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15]+0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[ 
4]+0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[ 9]+0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14]+0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[ 3]+0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[ 8]+0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13]+0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[ 2]+0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[ 7]+0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12]+0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[ 5]+0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[ 8]+0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11]+0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14]+0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[ 1]+0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[ 4]+0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[ 7]+0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10]+0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13]+0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[ 0]+0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[ 3]+0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[ 6]+0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[ 9]+0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12]+0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15]+0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[ 2]+0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[ 0]+0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[ 7]+0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14]+0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[ 5]+0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12]+0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[ 3]+0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10]+0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[ 1]+0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[ 8]+0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15]+0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[ 6]+0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13]+0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[ 4]+0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11]+0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[ 2]+0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[ 
9]+0xeb86d391, 21); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +#undef F1 +#undef F2 +#undef F3 +#undef F4 +#undef MD5STEP + + +#if POOLWORDS % 16 +#error extract_entropy() assumes that POOLWORDS is a multiple of 16 words. +#endif +/* + * This function extracts randomness from the "entropy pool", and + * returns it in a buffer. This function computes how many remaining + * bits of entropy are left in the pool, but it does not restrict the + * number of bytes that are actually obtained. + */ +static inline int +extract_entropy(struct random_bucket *r, char *buf, int nbytes) +{ + int ret, i; + u_int32_t tmp[4]; + + add_timer_randomness(r, &extract_timer_state, nbytes); + + /* Redundant, but just in case... */ + if (r->entropy_count > POOLBITS) + r->entropy_count = POOLBITS; + /* Why is this here? Left in from Ted Ts'o. Perhaps to limit time. */ + if (nbytes > 32768) + nbytes = 32768; + + ret = nbytes; + if (r->entropy_count / 8 >= nbytes) + r->entropy_count -= nbytes*8; + else + r->entropy_count = 0; + + while (nbytes) { + /* Hash the pool to get the output */ + tmp[0] = 0x67452301; + tmp[1] = 0xefcdab89; + tmp[2] = 0x98badcfe; + tmp[3] = 0x10325476; + for (i = 0; i < POOLWORDS; i += 16) + MD5Transform(tmp, r->pool+i); + /* Modify pool so next hash will produce different results */ + add_entropy_word(r, tmp[0]); + add_entropy_word(r, tmp[1]); + add_entropy_word(r, tmp[2]); + add_entropy_word(r, tmp[3]); + /* + * Run the MD5 Transform one more time, since we want + * to add at least minimal obscuring of the inputs to + * add_entropy_word(). --- TYT + */ + MD5Transform(tmp, r->pool); + + /* Copy data to destination buffer */ + i = MIN(nbytes, 16); + bcopy(tmp, buf, i); + nbytes -= i; + buf += i; + } + + /* Wipe data from memory */ + bzero(tmp, sizeof(tmp)); + + return ret; +} + +#ifdef notused /* XXX NOT the exported kernel interface */ +/* + * This function is the exported kernel interface. 
It returns some + * number of good random numbers, suitable for seeding TCP sequence + * numbers, etc. + */ +void +get_random_bytes(void *buf, u_int nbytes) +{ + extract_entropy(&random_state, (char *) buf, nbytes); +} +#endif /* notused */ + +u_int +read_random(char *buf, u_int nbytes) +{ + if ((nbytes * 8) > random_state.entropy_count) + nbytes = random_state.entropy_count / 8; + + return extract_entropy(&random_state, buf, nbytes); +} + +u_int +read_random_unlimited(char *buf, u_int nbytes) +{ + return extract_entropy(&random_state, buf, nbytes); +} + +#ifdef notused +u_int +write_random(const char *buf, u_int nbytes) +{ + u_int i; + u_int32_t word, *p; + + for (i = nbytes, p = (u_int32_t *)buf; + i >= sizeof(u_int32_t); + i-= sizeof(u_int32_t), p++) + add_entropy_word(&random_state, *p); + if (i) { + word = 0; + bcopy(p, &word, i); + add_entropy_word(&random_state, word); + } + return nbytes; +} +#endif /* notused */ + +int +random_select(dev_t dev, int rw, struct proc *p) +{ + int s, ret; + + if (rw == FWRITE) + return 1; /* heh. */ + + s = splhigh(); + if (random_state.entropy_count >= 8) + ret = 1; + else { + selrecord(p, &random_state.rsel); + ret = 0; + } + splx(s); + + return ret; +} + diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c index 569b9d9..fe50cf9 100644 --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -35,21 +35,27 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)kern_resource.c 8.8 (Berkeley) 2/14/95 + * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 + * $Id$ */ +#include "opt_rlimit.h" + #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/resourcevar.h> #include <sys/malloc.h> #include <sys/proc.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> - #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> int donice __P((struct proc *curp, struct proc *chgp, int n)); int dosetrlimit __P((struct proc *p, u_int which, struct rlimit *limp)); @@ -58,25 +64,28 @@ int dosetrlimit __P((struct proc *p, u_int which, struct rlimit *limp)); * Resource controls and accounting. */ +#ifndef _SYS_SYSPROTO_H_ +struct getpriority_args { + int which; + int who; +}; +#endif int getpriority(curp, uap, retval) struct proc *curp; - register struct getpriority_args /* { - syscallarg(int) which; - syscallarg(int) who; - } */ *uap; - register_t *retval; + register struct getpriority_args *uap; + int *retval; { register struct proc *p; register int low = PRIO_MAX + 1; - switch (SCARG(uap, which)) { + switch (uap->which) { case PRIO_PROCESS: - if (SCARG(uap, who) == 0) + if (uap->who == 0) p = curp; else - p = pfind(SCARG(uap, who)); + p = pfind(uap->who); if (p == 0) break; low = p->p_nice; @@ -85,9 +94,9 @@ getpriority(curp, uap, retval) case PRIO_PGRP: { register struct pgrp *pg; - if (SCARG(uap, who) == 0) + if (uap->who == 0) pg = curp->p_pgrp; - else if ((pg = pgfind(SCARG(uap, who))) == NULL) + else if ((pg = pgfind(uap->who)) == NULL) break; for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) { @@ -98,10 +107,10 @@ getpriority(curp, uap, retval) } case PRIO_USER: - if (SCARG(uap, who) == 0) - SCARG(uap, who) = curp->p_ucred->cr_uid; + if (uap->who == 0) + uap->who = curp->p_ucred->cr_uid; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) - if 
(p->p_ucred->cr_uid == SCARG(uap, who) && + if (p->p_ucred->cr_uid == uap->who && p->p_nice < low) low = p->p_nice; break; @@ -115,54 +124,57 @@ getpriority(curp, uap, retval) return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct setpriority_args { + int which; + int who; + int prio; +}; +#endif /* ARGSUSED */ int setpriority(curp, uap, retval) struct proc *curp; - register struct setpriority_args /* { - syscallarg(int) which; - syscallarg(int) who; - syscallarg(int) prio; - } */ *uap; - register_t *retval; + register struct setpriority_args *uap; + int *retval; { register struct proc *p; int found = 0, error = 0; - switch (SCARG(uap, which)) { + switch (uap->which) { case PRIO_PROCESS: - if (SCARG(uap, who) == 0) + if (uap->who == 0) p = curp; else - p = pfind(SCARG(uap, who)); + p = pfind(uap->who); if (p == 0) break; - error = donice(curp, p, SCARG(uap, prio)); + error = donice(curp, p, uap->prio); found++; break; case PRIO_PGRP: { register struct pgrp *pg; - - if (SCARG(uap, who) == 0) + + if (uap->who == 0) pg = curp->p_pgrp; - else if ((pg = pgfind(SCARG(uap, who))) == NULL) + else if ((pg = pgfind(uap->who)) == NULL) break; for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) { - error = donice(curp, p, SCARG(uap, prio)); + error = donice(curp, p, uap->prio); found++; } break; } case PRIO_USER: - if (SCARG(uap, who) == 0) - SCARG(uap, who) = curp->p_ucred->cr_uid; + if (uap->who == 0) + uap->who = curp->p_ucred->cr_uid; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) - if (p->p_ucred->cr_uid == SCARG(uap, who)) { - error = donice(curp, p, SCARG(uap, prio)); + if (p->p_ucred->cr_uid == uap->who) { + error = donice(curp, p, uap->prio); found++; } break; @@ -197,71 +209,150 @@ donice(curp, chgp, n) return (0); } +/* rtprio system call */ +#ifndef _SYS_SYSPROTO_H_ +struct rtprio_args { + int function; + pid_t pid; + struct rtprio *rtp; +}; +#endif + +/* + * Set realtime priority + */ + +/* ARGSUSED */ +int +rtprio(curp, uap, retval) + struct 
proc *curp; + register struct rtprio_args *uap; + int *retval; +{ + register struct proc *p; + register struct pcred *pcred = curp->p_cred; + struct rtprio rtp; + int error; + + error = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); + if (error) + return (error); + + if (uap->pid == 0) + p = curp; + else + p = pfind(uap->pid); + + if (p == 0) + return (ESRCH); + + switch (uap->function) { + case RTP_LOOKUP: + return (copyout(&p->p_rtprio, uap->rtp, sizeof(struct rtprio))); + case RTP_SET: + if (pcred->pc_ucred->cr_uid && pcred->p_ruid && + pcred->pc_ucred->cr_uid != p->p_ucred->cr_uid && + pcred->p_ruid != p->p_ucred->cr_uid) + return (EPERM); + /* disallow setting rtprio in most cases if not superuser */ + if (suser(pcred->pc_ucred, &curp->p_acflag)) { + /* can't set someone else's */ + if (uap->pid) + return (EPERM); + /* can't set realtime priority */ + if (rtp.type == RTP_PRIO_REALTIME) + return (EPERM); + } + switch (rtp.type) { + case RTP_PRIO_REALTIME: + case RTP_PRIO_NORMAL: + case RTP_PRIO_IDLE: + if (rtp.prio > RTP_PRIO_MAX) + return (EINVAL); + p->p_rtprio = rtp; + return (0); + default: + return (EINVAL); + } + + default: + return (EINVAL); + } +} + #if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct osetrlimit_args { + u_int which; + struct orlimit *rlp; +}; +#endif /* ARGSUSED */ int -compat_43_setrlimit(p, uap, retval) +osetrlimit(p, uap, retval) struct proc *p; - struct compat_43_setrlimit_args /* { - syscallarg(u_int) which; - syscallarg(struct ogetrlimit *) rlp; - } */ *uap; - register_t *retval; + register struct osetrlimit_args *uap; + int *retval; { struct orlimit olim; struct rlimit lim; int error; - if (error = copyin((caddr_t)SCARG(uap, rlp), (caddr_t)&olim, - sizeof (struct orlimit))) + if ((error = + copyin((caddr_t)uap->rlp, (caddr_t)&olim, sizeof(struct orlimit)))) return (error); lim.rlim_cur = olim.rlim_cur; lim.rlim_max = olim.rlim_max; - return (dosetrlimit(p, SCARG(uap, which), &lim)); + return 
(dosetrlimit(p, uap->which, &lim)); } +#ifndef _SYS_SYSPROTO_H_ +struct ogetrlimit_args { + u_int which; + struct orlimit *rlp; +}; +#endif /* ARGSUSED */ int -compat_43_getrlimit(p, uap, retval) +ogetrlimit(p, uap, retval) struct proc *p; - register struct compat_43_getrlimit_args /* { - syscallarg(u_int) which; - syscallarg(struct ogetrlimit *) rlp; - } */ *uap; - register_t *retval; + register struct ogetrlimit_args *uap; + int *retval; { struct orlimit olim; - if (SCARG(uap, which) >= RLIM_NLIMITS) + if (uap->which >= RLIM_NLIMITS) return (EINVAL); - olim.rlim_cur = p->p_rlimit[SCARG(uap, which)].rlim_cur; + olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur; if (olim.rlim_cur == -1) olim.rlim_cur = 0x7fffffff; - olim.rlim_max = p->p_rlimit[SCARG(uap, which)].rlim_max; + olim.rlim_max = p->p_rlimit[uap->which].rlim_max; if (olim.rlim_max == -1) olim.rlim_max = 0x7fffffff; - return (copyout((caddr_t)&olim, (caddr_t)SCARG(uap, rlp), - sizeof(olim))); + return (copyout((caddr_t)&olim, (caddr_t)uap->rlp, sizeof(olim))); } #endif /* COMPAT_43 || COMPAT_SUNOS */ +#ifndef _SYS_SYSPROTO_H_ +struct __setrlimit_args { + u_int which; + struct rlimit *rlp; +}; +#endif /* ARGSUSED */ int setrlimit(p, uap, retval) struct proc *p; - register struct setrlimit_args /* { - syscallarg(u_int) which; - syscallarg(struct rlimit *) rlp; - } */ *uap; - register_t *retval; + register struct __setrlimit_args *uap; + int *retval; { struct rlimit alim; int error; - if (error = copyin((caddr_t)SCARG(uap, rlp), (caddr_t)&alim, - sizeof (struct rlimit))) + if ((error = + copyin((caddr_t)uap->rlp, (caddr_t)&alim, sizeof (struct rlimit)))) return (error); - return (dosetrlimit(p, SCARG(uap, which), &alim)); + return (dosetrlimit(p, uap->which, &alim)); } int @@ -271,15 +362,23 @@ dosetrlimit(p, which, limp) struct rlimit *limp; { register struct rlimit *alimp; - extern unsigned maxdmap; int error; if (which >= RLIM_NLIMITS) return (EINVAL); alimp = &p->p_rlimit[which]; - if (limp->rlim_cur > 
alimp->rlim_max || + + /* + * Preserve historical bugs by treating negative limits as unsigned. + */ + if (limp->rlim_cur < 0) + limp->rlim_cur = RLIM_INFINITY; + if (limp->rlim_max < 0) + limp->rlim_max = RLIM_INFINITY; + + if (limp->rlim_cur > alimp->rlim_max || limp->rlim_max > alimp->rlim_max) - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); if (limp->rlim_cur > limp->rlim_max) limp->rlim_cur = limp->rlim_max; @@ -293,17 +392,17 @@ dosetrlimit(p, which, limp) switch (which) { case RLIMIT_DATA: - if (limp->rlim_cur > maxdmap) - limp->rlim_cur = maxdmap; - if (limp->rlim_max > maxdmap) - limp->rlim_max = maxdmap; + if (limp->rlim_cur > MAXDSIZ) + limp->rlim_cur = MAXDSIZ; + if (limp->rlim_max > MAXDSIZ) + limp->rlim_max = MAXDSIZ; break; case RLIMIT_STACK: - if (limp->rlim_cur > maxdmap) - limp->rlim_cur = maxdmap; - if (limp->rlim_max > maxdmap) - limp->rlim_max = maxdmap; + if (limp->rlim_cur > MAXSSIZ) + limp->rlim_cur = MAXSSIZ; + if (limp->rlim_max > MAXSSIZ) + limp->rlim_max = MAXSSIZ; /* * Stack is allocated to the max at exec time with only * "rlim_cur" bytes accessible. 
If stack limit is going @@ -331,38 +430,41 @@ dosetrlimit(p, which, limp) break; case RLIMIT_NOFILE: - if (limp->rlim_cur > maxfiles) - limp->rlim_cur = maxfiles; - if (limp->rlim_max > maxfiles) - limp->rlim_max = maxfiles; + if (limp->rlim_cur > maxfilesperproc) + limp->rlim_cur = maxfilesperproc; + if (limp->rlim_max > maxfilesperproc) + limp->rlim_max = maxfilesperproc; break; case RLIMIT_NPROC: - if (limp->rlim_cur > maxproc) - limp->rlim_cur = maxproc; - if (limp->rlim_max > maxproc) - limp->rlim_max = maxproc; + if (limp->rlim_cur > maxprocperuid) + limp->rlim_cur = maxprocperuid; + if (limp->rlim_max > maxprocperuid) + limp->rlim_max = maxprocperuid; break; } *alimp = *limp; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct __getrlimit_args { + u_int which; + struct rlimit *rlp; +}; +#endif /* ARGSUSED */ int getrlimit(p, uap, retval) struct proc *p; - register struct getrlimit_args /* { - syscallarg(u_int) which; - syscallarg(struct rlimit *) rlp; - } */ *uap; - register_t *retval; + register struct __getrlimit_args *uap; + int *retval; { - if (SCARG(uap, which) >= RLIM_NLIMITS) + if (uap->which >= RLIM_NLIMITS) return (EINVAL); - return (copyout((caddr_t)&p->p_rlimit[SCARG(uap, which)], - (caddr_t)SCARG(uap, rlp), sizeof (struct rlimit))); + return (copyout((caddr_t)&p->p_rlimit[uap->which], (caddr_t)uap->rlp, + sizeof (struct rlimit))); } /* @@ -371,14 +473,15 @@ getrlimit(p, uap, retval) */ void calcru(p, up, sp, ip) - register struct proc *p; - register struct timeval *up; - register struct timeval *sp; - register struct timeval *ip; + struct proc *p; + struct timeval *up; + struct timeval *sp; + struct timeval *ip; { - register u_quad_t u, st, ut, it, tot; - register u_long sec, usec; - register int s; + quad_t totusec; + u_quad_t u, st, ut, it, tot; + long sec, usec; + int s; struct timeval tv; s = splstatclock(); @@ -389,11 +492,8 @@ calcru(p, up, sp, ip) tot = st + ut + it; if (tot == 0) { - up->tv_sec = up->tv_usec = 0; - sp->tv_sec = sp->tv_usec = 
0; - if (ip != NULL) - ip->tv_sec = ip->tv_usec = 0; - return; + st = 1; + tot = 1; } sec = p->p_rtime.tv_sec; @@ -408,7 +508,13 @@ calcru(p, up, sp, ip) sec += tv.tv_sec - runtime.tv_sec; usec += tv.tv_usec - runtime.tv_usec; } - u = sec * 1000000 + usec; + totusec = (quad_t)sec * 1000000 + usec; + if (totusec < 0) { + /* XXX no %qd in kernel. Truncate. */ + printf("calcru: negative time: %ld usec\n", (long)totusec); + totusec = 0; + } + u = totusec; st = (u * st) / tot; sp->tv_sec = st / 1000000; sp->tv_usec = st % 1000000; @@ -422,19 +528,22 @@ calcru(p, up, sp, ip) } } +#ifndef _SYS_SYSPROTO_H_ +struct getrusage_args { + int who; + struct rusage *rusage; +}; +#endif /* ARGSUSED */ int getrusage(p, uap, retval) register struct proc *p; - register struct getrusage_args /* { - syscallarg(int) who; - syscallarg(struct rusage *) rusage; - } */ *uap; - register_t *retval; + register struct getrusage_args *uap; + int *retval; { register struct rusage *rup; - switch (SCARG(uap, who)) { + switch (uap->who) { case RUSAGE_SELF: rup = &p->p_stats->p_ru; @@ -448,7 +557,7 @@ getrusage(p, uap, retval) default: return (EINVAL); } - return (copyout((caddr_t)rup, (caddr_t)SCARG(uap, rusage), + return (copyout((caddr_t)rup, (caddr_t)uap->rusage, sizeof (struct rusage))); } diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c new file mode 100644 index 0000000..c4922d0 --- /dev/null +++ b/sys/kern/kern_shutdown.c @@ -0,0 +1,445 @@ +/*- + * Copyright (c) 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 + * $Id$ + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/reboot.h> +#include <sys/msgbuf.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/tty.h> +#include <sys/tprintf.h> +#include <sys/syslog.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/sysctl.h> +#include <sys/conf.h> +#include <sys/sysproto.h> + +#include <machine/pcb.h> +#include <machine/clock.h> +#include <machine/cons.h> +#include <machine/md_var.h> + +#include <sys/utsname.h> +#include <sys/signalvar.h> + +#ifndef PANIC_REBOOT_WAIT_TIME +#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ +#endif + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. + */ +#include <machine/stdarg.h> + +#if defined(DDB) +#ifdef DDB_UNATTENDED + static int debugger_on_panic = 0; +#else + static int debugger_on_panic = 1; +#endif + +SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW, + &debugger_on_panic, 0, ""); +#endif + + +/* + * Variable panicstr contains argument to first call to panic; used as flag + * to indicate that the kernel has already called panic. + */ +const char *panicstr; + +/* + * callout list for things to do a shutdown + */ +typedef struct shutdown_list_element { + struct shutdown_list_element *next; + bootlist_fn function; + void *arg; +} *sle_p; + +/* + * there are two shutdown lists. Some things need to be shut down + * Earlier than others. 
+ */ +static sle_p shutdown_list1; +static sle_p shutdown_list2; + + +static void dumpsys(void); + +#ifndef _SYS_SYSPROTO_H_ +struct reboot_args { + int opt; +}; +#endif +/* ARGSUSED */ + +/* + * The system call that results in a reboot + */ +int +reboot(p, uap, retval) + struct proc *p; + struct reboot_args *uap; + int *retval; +{ + int error; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + + boot(uap->opt); + return (0); +} + +/* + * Called by events that want to shut down.. e.g <CTL><ALT><DEL> on a PC + */ +void +shutdown_nice(void) +{ + /* Send a signal to init(8) and have it shutdown the world */ + if (initproc != NULL) { + psignal(initproc, SIGINT); + } else { + /* No init(8) running, so simply reboot */ + boot(RB_NOSYNC); + } + return; +} +static int waittime = -1; +static struct pcb dumppcb; + +/* + * Go through the rigmarole of shutting down.. + * this used to be in machdep.c but I'll be dammned if I could see + * anything machine dependant in it. + */ +void +boot(howto) + int howto; +{ + sle_p ep; + + ep = shutdown_list1; + while (ep) { + shutdown_list1 = ep->next; + (*ep->function)(howto, ep->arg); + ep = ep->next; + } + if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) { + register struct buf *bp; + int iter, nbusy; + + waittime = 0; + printf("\nsyncing disks... "); + + sync(&proc0, NULL, NULL); + + for (iter = 0; iter < 20; iter++) { + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) { + nbusy++; + } + } + if (nbusy == 0) + break; + printf("%d ", nbusy); + DELAY(40000 * iter); + } + if (nbusy) { + /* + * Failed to sync all blocks. Indicate this and don't + * unmount filesystems (thus forcing an fsck on reboot). 
+ */ + printf("giving up\n"); +#ifdef SHOW_BUSYBUFS + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) { + nbusy++; + printf("%d: dev:%08x, flags:%08x, blkno:%d, lblkno:%d\n", nbusy, bp->b_dev, bp->b_flags, bp->b_blkno, bp->b_lblkno); + } + } + DELAY(5000000); /* 5 seconds */ +#endif + } else { + printf("done\n"); + /* + * Unmount filesystems + */ + if (panicstr == 0) + vfs_unmountall(); + } + DELAY(100000); /* wait for console output to finish */ + } + ep = shutdown_list2; + while (ep) { + shutdown_list2 = ep->next; + (*ep->function)(howto, ep->arg); + ep = ep->next; + } + splhigh(); + if (howto & RB_HALT) { + printf("\n"); + printf("The operating system has halted.\n"); + printf("Please press any key to reboot.\n\n"); + switch (cngetc()) { + case -1: /* No console, just die */ + cpu_halt(); + /* NOTREACHED */ + default: + break; + } + } else { + if (howto & RB_DUMP) { + if (!cold) { + savectx(&dumppcb); + dumppcb.pcb_cr3 = rcr3(); + dumpsys(); + } + + if (PANIC_REBOOT_WAIT_TIME != 0) { + if (PANIC_REBOOT_WAIT_TIME != -1) { + int loop; + printf("Automatic reboot in %d seconds - press a key on the console to abort\n", + PANIC_REBOOT_WAIT_TIME); + for (loop = PANIC_REBOOT_WAIT_TIME * 10; loop > 0; --loop) { + DELAY(1000 * 100); /* 1/10th second */ + /* Did user type a key? 
*/ + if (cncheckc() != -1) + break; + } + if (!loop) + goto die; + } + } else { /* zero time specified - reboot NOW */ + goto die; + } + printf("--> Press a key on the console to reboot <--\n"); + cngetc(); + } + } +die: + printf("Rebooting...\n"); + DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ + /* cpu_boot(howto); */ /* doesn't do anything at the moment */ + cpu_reset(); + for(;;) ; + /* NOTREACHED */ +} + +/* + * Magic number for savecore + * + * exported (symorder) and used at least by savecore(8) + * + */ +static u_long const dumpmag = 0x8fca0101UL; + +static int dumpsize = 0; /* also for savecore */ + +static int dodump = 1; +SYSCTL_INT(_machdep, OID_AUTO, do_dump, CTLFLAG_RW, &dodump, 0, ""); + +/* + * Doadump comes here after turning off memory management and + * getting on the dump stack, either when called above, or by + * the auto-restart code. + */ +static void +dumpsys(void) +{ + + if (!dodump) + return; + if (dumpdev == NODEV) + return; + if ((minor(dumpdev)&07) != 1) + return; + if (!(bdevsw[major(dumpdev)])) + return; + if (!(bdevsw[major(dumpdev)]->d_dump)) + return; + dumpsize = Maxmem; + printf("\ndumping to dev %lx, offset %ld\n", dumpdev, dumplo); + printf("dump "); + switch ((*bdevsw[major(dumpdev)]->d_dump)(dumpdev)) { + + case ENXIO: + printf("device bad\n"); + break; + + case EFAULT: + printf("device not ready\n"); + break; + + case EINVAL: + printf("area improper\n"); + break; + + case EIO: + printf("i/o error\n"); + break; + + case EINTR: + printf("aborted from console\n"); + break; + + default: + printf("succeeded\n"); + break; + } +} + +/* + * Panic is called on unresolvable fatal errors. It prints "panic: mesg", + * and then reboots. If we are called twice, then we avoid trying to sync + * the disks as this often leads to recursive panics. + */ +void +panic(const char *fmt, ...) 
+{ + int bootopt; + va_list ap; + + bootopt = RB_AUTOBOOT | RB_DUMP; + if (panicstr) + bootopt |= RB_NOSYNC; + else + panicstr = fmt; + + printf("panic: "); + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); + printf("\n"); + +#if defined(DDB) + if (debugger_on_panic) + Debugger ("panic"); +#endif + boot(bootopt); +} + +/* + * Two routines to handle adding/deleting items on the + * shutdown callout lists + * + * at_shutdown(): + * Take the arguments given and put them onto the shutdown callout list. + * However first make sure that it's not already there. + * returns 0 on success. + */ +int +at_shutdown(bootlist_fn function, void *arg, int position) +{ + sle_p ep, *epp; + + switch(position) { + case SHUTDOWN_PRE_SYNC: + epp = &shutdown_list1; + break; + case SHUTDOWN_POST_SYNC: + epp = &shutdown_list2; + break; + default: + printf("bad exit callout list specified\n"); + return (EINVAL); + } + if (rm_at_shutdown(function, arg)) + printf("exit callout entry already present\n"); + ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->next = *epp; + ep->function = function; + ep->arg = arg; + *epp = ep; + return (0); +} + +/* + * Scan the exit callout lists for the given items and remove them. + * Returns the number of items removed. 
+ */ +int +rm_at_shutdown(bootlist_fn function, void *arg) +{ + sle_p *epp, ep; + int count; + + count = 0; + epp = &shutdown_list1; + ep = *epp; + while (ep) { + if ((ep->function == function) && (ep->arg == arg)) { + *epp = ep->next; + free(ep, M_TEMP); + count++; + } else { + epp = &ep->next; + } + ep = *epp; + } + epp = &shutdown_list2; + ep = *epp; + while (ep) { + if ((ep->function == function) && (ep->arg == arg)) { + *epp = ep->next; + free(ep, M_TEMP); + count++; + } else { + epp = &ep->next; + } + ep = *epp; + } + return (count); +} + diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 5683b9c..e0b28e0 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -35,11 +35,15 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_sig.c 8.14 (Berkeley) 5/14/95 + * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 + * $Id: kern_sig.c,v 1.30 1997/02/22 09:39:11 peter Exp $ */ +#include "opt_ktrace.h" + #define SIGPROP /* include signal properties table */ #include <sys/param.h> +#include <sys/sysproto.h> #include <sys/signalvar.h> #include <sys/resourcevar.h> #include <sys/namei.h> @@ -50,22 +54,27 @@ #include <sys/times.h> #include <sys/buf.h> #include <sys/acct.h> -#include <sys/file.h> +#include <sys/fcntl.h> #include <sys/kernel.h> #include <sys/wait.h> #include <sys/ktrace.h> #include <sys/syslog.h> #include <sys/stat.h> - -#include <sys/mount.h> -#include <sys/syscallargs.h> +#include <sys/sysent.h> #include <machine/cpu.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> #include <sys/user.h> /* for coredump */ -void stop __P((struct proc *p)); +static int coredump __P((struct proc *p)); +static int killpg1 __P((struct proc *cp, int signum, int pgid, int all)); +static void stop __P((struct proc *)); /* * Can process p, with pcred pc, send the signal signum to process q? 
@@ -78,16 +87,19 @@ void stop __P((struct proc *p)); (pc)->pc_ucred->cr_uid == (q)->p_ucred->cr_uid || \ ((signum) == SIGCONT && (q)->p_session == (p)->p_session)) +#ifndef _SYS_SYSPROTO_H_ +struct sigaction_args { + int signum; + struct sigaction *nsa; + struct sigaction *osa; +}; +#endif /* ARGSUSED */ int sigaction(p, uap, retval) struct proc *p; - register struct sigaction_args /* { - syscallarg(int) signum; - syscallarg(struct sigaction *) nsa; - syscallarg(struct sigaction *) osa; - } */ *uap; - register_t *retval; + register struct sigaction_args *uap; + int *retval; { struct sigaction vec; register struct sigaction *sa; @@ -95,12 +107,11 @@ sigaction(p, uap, retval) register int signum; int bit, error; - signum = SCARG(uap, signum); - if (signum <= 0 || signum >= NSIG || - signum == SIGKILL || signum == SIGSTOP) + signum = uap->signum; + if (signum <= 0 || signum >= NSIG) return (EINVAL); sa = &vec; - if (SCARG(uap, osa)) { + if (uap->osa) { sa->sa_handler = ps->ps_sigact[signum]; sa->sa_mask = ps->ps_catchmask[signum]; bit = sigmask(signum); @@ -109,16 +120,23 @@ sigaction(p, uap, retval) sa->sa_flags |= SA_ONSTACK; if ((ps->ps_sigintr & bit) == 0) sa->sa_flags |= SA_RESTART; - if (p->p_flag & P_NOCLDSTOP) + if ((ps->ps_sigreset & bit) != 0) + sa->sa_flags |= SA_RESETHAND; + if ((ps->ps_signodefer & bit) != 0) + sa->sa_flags |= SA_NODEFER; + if (signum == SIGCHLD && p->p_flag & P_NOCLDSTOP) sa->sa_flags |= SA_NOCLDSTOP; - if (error = copyout((caddr_t)sa, (caddr_t)SCARG(uap, osa), - sizeof (vec))) + if ((error = copyout((caddr_t)sa, (caddr_t)uap->osa, + sizeof (vec)))) return (error); } - if (SCARG(uap, nsa)) { - if (error = copyin((caddr_t)SCARG(uap, nsa), (caddr_t)sa, - sizeof (vec))) + if (uap->nsa) { + if ((error = copyin((caddr_t)uap->nsa, (caddr_t)sa, + sizeof (vec)))) return (error); + if ((signum == SIGKILL || signum == SIGSTOP) && + sa->sa_handler != SIG_DFL) + return (EINVAL); setsigvec(p, signum, sa); } return (0); @@ -148,6 +166,14 @@ 
setsigvec(p, signum, sa) ps->ps_sigonstack |= bit; else ps->ps_sigonstack &= ~bit; + if (sa->sa_flags & SA_RESETHAND) + ps->ps_sigreset |= bit; + else + ps->ps_sigreset &= ~bit; + if (sa->sa_flags & SA_NODEFER) + ps->ps_signodefer |= bit; + else + ps->ps_signodefer &= ~bit; #ifdef COMPAT_SUNOS if (sa->sa_flags & SA_USERTRAMP) ps->ps_usertramp |= bit; @@ -227,9 +253,9 @@ execsigs(p) * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ - ps->ps_sigstk.ss_flags = SA_DISABLE; + ps->ps_sigstk.ss_flags = SS_DISABLE; ps->ps_sigstk.ss_size = 0; - ps->ps_sigstk.ss_base = 0; + ps->ps_sigstk.ss_sp = 0; ps->ps_flags = 0; } @@ -239,33 +265,36 @@ execsigs(p) * and return old mask as return value; * the library stub does the rest. */ +#ifndef _SYS_SYSPROTO_H_ +struct sigprocmask_args { + int how; + sigset_t mask; +}; +#endif int sigprocmask(p, uap, retval) register struct proc *p; - struct sigprocmask_args /* { - syscallarg(int) how; - syscallarg(sigset_t) mask; - } */ *uap; - register_t *retval; + struct sigprocmask_args *uap; + int *retval; { int error = 0; *retval = p->p_sigmask; (void) splhigh(); - switch (SCARG(uap, how)) { + switch (uap->how) { case SIG_BLOCK: - p->p_sigmask |= SCARG(uap, mask) &~ sigcantmask; + p->p_sigmask |= uap->mask &~ sigcantmask; break; case SIG_UNBLOCK: - p->p_sigmask &= ~SCARG(uap, mask); + p->p_sigmask &= ~uap->mask; break; case SIG_SETMASK: - p->p_sigmask = SCARG(uap, mask) &~ sigcantmask; + p->p_sigmask = uap->mask &~ sigcantmask; break; - + default: error = EINVAL; break; @@ -274,12 +303,17 @@ sigprocmask(p, uap, retval) return (error); } +#ifndef _SYS_SYSPROTO_H_ +struct sigpending_args { + int dummy; +}; +#endif /* ARGSUSED */ int sigpending(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct sigpending_args *uap; + int *retval; { *retval = p->p_siglist; @@ -290,16 +324,19 @@ sigpending(p, uap, retval) /* * Generalized interface signal handler, 4.3-compatible. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct osigvec_args { + int signum; + struct sigvec *nsv; + struct sigvec *osv; +}; +#endif /* ARGSUSED */ int -compat_43_sigvec(p, uap, retval) +osigvec(p, uap, retval) struct proc *p; - register struct compat_43_sigvec_args /* { - syscallarg(int) signum; - syscallarg(struct sigvec *) nsv; - syscallarg(struct sigvec *) osv; - } */ *uap; - register_t *retval; + register struct osigvec_args *uap; + int *retval; { struct sigvec vec; register struct sigacts *ps = p->p_sigacts; @@ -307,12 +344,11 @@ compat_43_sigvec(p, uap, retval) register int signum; int bit, error; - signum = SCARG(uap, signum); - if (signum <= 0 || signum >= NSIG || - signum == SIGKILL || signum == SIGSTOP) + signum = uap->signum; + if (signum <= 0 || signum >= NSIG) return (EINVAL); sv = &vec; - if (SCARG(uap, osv)) { + if (uap->osv) { *(sig_t *)&sv->sv_handler = ps->ps_sigact[signum]; sv->sv_mask = ps->ps_catchmask[signum]; bit = sigmask(signum); @@ -321,26 +357,26 @@ compat_43_sigvec(p, uap, retval) sv->sv_flags |= SV_ONSTACK; if ((ps->ps_sigintr & bit) != 0) sv->sv_flags |= SV_INTERRUPT; + if ((ps->ps_sigreset & bit) != 0) + sv->sv_flags |= SV_RESETHAND; + if ((ps->ps_signodefer & bit) != 0) + sv->sv_flags |= SV_NODEFER; #ifndef COMPAT_SUNOS - if (p->p_flag & P_NOCLDSTOP) - sv->sv_flags |= SA_NOCLDSTOP; + if (signum == SIGCHLD && p->p_flag & P_NOCLDSTOP) + sv->sv_flags |= SV_NOCLDSTOP; #endif - if (error = copyout((caddr_t)sv, (caddr_t)SCARG(uap, osv), - sizeof (vec))) + if ((error = copyout((caddr_t)sv, (caddr_t)uap->osv, + sizeof (vec)))) return (error); } - if (SCARG(uap, nsv)) { - if (error = copyin((caddr_t)SCARG(uap, nsv), (caddr_t)sv, - sizeof (vec))) + if (uap->nsv) { + if ((error = copyin((caddr_t)uap->nsv, (caddr_t)sv, + sizeof (vec)))) return (error); -#ifdef COMPAT_SUNOS - /* - * SunOS uses this bit (4, aka SA_DISABLE) as SV_RESETHAND, - * `reset to SIG_DFL on delivery'. We have no such option - * now or ever! 
- */ - if (sv->sv_flags & SA_DISABLE) + if ((signum == SIGKILL || signum == SIGSTOP) && + sv->sv_handler != SIG_DFL) return (EINVAL); +#ifdef COMPAT_SUNOS sv->sv_flags |= SA_USERTRAMP; #endif sv->sv_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ @@ -349,34 +385,40 @@ compat_43_sigvec(p, uap, retval) return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct osigblock_args { + int mask; +}; +#endif int -compat_43_sigblock(p, uap, retval) +osigblock(p, uap, retval) register struct proc *p; - struct compat_43_sigblock_args /* { - syscallarg(int) mask; - } */ *uap; - register_t *retval; + struct osigblock_args *uap; + int *retval; { (void) splhigh(); *retval = p->p_sigmask; - p->p_sigmask |= SCARG(uap, mask) &~ sigcantmask; + p->p_sigmask |= uap->mask &~ sigcantmask; (void) spl0(); return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct osigsetmask_args { + int mask; +}; +#endif int -compat_43_sigsetmask(p, uap, retval) +osigsetmask(p, uap, retval) struct proc *p; - struct compat_43_sigsetmask_args /* { - syscallarg(int) mask; - } */ *uap; - register_t *retval; + struct osigsetmask_args *uap; + int *retval; { (void) splhigh(); *retval = p->p_sigmask; - p->p_sigmask = SCARG(uap, mask) &~ sigcantmask; + p->p_sigmask = uap->mask &~ sigcantmask; (void) spl0(); return (0); } @@ -387,14 +429,17 @@ compat_43_sigsetmask(p, uap, retval) * in the meantime. Note nonstandard calling convention: * libc stub passes mask, not pointer, to save a copyin. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct sigsuspend_args { + sigset_t mask; +}; +#endif /* ARGSUSED */ int sigsuspend(p, uap, retval) register struct proc *p; - struct sigsuspend_args /* { - syscallarg(int) mask; - } */ *uap; - register_t *retval; + struct sigsuspend_args *uap; + int *retval; { register struct sigacts *ps = p->p_sigacts; @@ -407,7 +452,7 @@ sigsuspend(p, uap, retval) */ ps->ps_oldmask = p->p_sigmask; ps->ps_flags |= SAS_OLDMASK; - p->p_sigmask = SCARG(uap, mask) &~ sigcantmask; + p->p_sigmask = uap->mask &~ sigcantmask; while (tsleep((caddr_t) ps, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; /* always return EINTR rather than ERESTART... */ @@ -415,46 +460,52 @@ sigsuspend(p, uap, retval) } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct osigstack_args { + struct sigstack *nss; + struct sigstack *oss; +}; +#endif /* ARGSUSED */ int -compat_43_sigstack(p, uap, retval) +osigstack(p, uap, retval) struct proc *p; - register struct compat_43_sigstack_args /* { - syscallarg(struct sigstack *) nss; - syscallarg(struct sigstack *) oss; - } */ *uap; - register_t *retval; + register struct osigstack_args *uap; + int *retval; { struct sigstack ss; struct sigacts *psp; int error = 0; psp = p->p_sigacts; - ss.ss_sp = psp->ps_sigstk.ss_base; - ss.ss_onstack = psp->ps_sigstk.ss_flags & SA_ONSTACK; - if (SCARG(uap, oss) && (error = copyout((caddr_t)&ss, - (caddr_t)SCARG(uap, oss), sizeof (struct sigstack)))) + ss.ss_sp = psp->ps_sigstk.ss_sp; + ss.ss_onstack = psp->ps_sigstk.ss_flags & SS_ONSTACK; + if (uap->oss && (error = copyout((caddr_t)&ss, (caddr_t)uap->oss, + sizeof (struct sigstack)))) return (error); - if (SCARG(uap, nss) && (error = copyin((caddr_t)SCARG(uap, nss), - (caddr_t)&ss, sizeof (ss))) == 0) { - psp->ps_sigstk.ss_base = ss.ss_sp; + if (uap->nss && (error = copyin((caddr_t)uap->nss, (caddr_t)&ss, + sizeof (ss))) == 0) { + psp->ps_sigstk.ss_sp = ss.ss_sp; psp->ps_sigstk.ss_size = 0; - psp->ps_sigstk.ss_flags |= 
ss.ss_onstack & SA_ONSTACK; + psp->ps_sigstk.ss_flags |= ss.ss_onstack & SS_ONSTACK; psp->ps_flags |= SAS_ALTSTACK; } return (error); } #endif /* COMPAT_43 || COMPAT_SUNOS */ +#ifndef _SYS_SYSPROTO_H_ +struct sigaltstack_args { + struct sigaltstack *nss; + struct sigaltstack *oss; +}; +#endif /* ARGSUSED */ int sigaltstack(p, uap, retval) struct proc *p; - register struct sigaltstack_args /* { - syscallarg(struct sigaltstack *) nss; - syscallarg(struct sigaltstack *) oss; - } */ *uap; - register_t *retval; + register struct sigaltstack_args *uap; + int *retval; { struct sigacts *psp; struct sigaltstack ss; @@ -462,17 +513,16 @@ sigaltstack(p, uap, retval) psp = p->p_sigacts; if ((psp->ps_flags & SAS_ALTSTACK) == 0) - psp->ps_sigstk.ss_flags |= SA_DISABLE; - if (SCARG(uap, oss) && (error = copyout((caddr_t)&psp->ps_sigstk, - (caddr_t)SCARG(uap, oss), sizeof (struct sigaltstack)))) + psp->ps_sigstk.ss_flags |= SS_DISABLE; + if (uap->oss && (error = copyout((caddr_t)&psp->ps_sigstk, + (caddr_t)uap->oss, sizeof (struct sigaltstack)))) return (error); - if (SCARG(uap, nss) == 0) + if (uap->nss == 0) return (0); - if (error = copyin((caddr_t)SCARG(uap, nss), (caddr_t)&ss, - sizeof (ss))) + if ((error = copyin((caddr_t)uap->nss, (caddr_t)&ss, sizeof (ss)))) return (error); - if (ss.ss_flags & SA_DISABLE) { - if (psp->ps_sigstk.ss_flags & SA_ONSTACK) + if (ss.ss_flags & SS_DISABLE) { + if (psp->ps_sigstk.ss_flags & SS_ONSTACK) return (EINVAL); psp->ps_flags &= ~SAS_ALTSTACK; psp->ps_sigstk.ss_flags = ss.ss_flags; @@ -485,60 +535,6 @@ sigaltstack(p, uap, retval) return (0); } -/* ARGSUSED */ -int -kill(cp, uap, retval) - register struct proc *cp; - register struct kill_args /* { - syscallarg(int) pid; - syscallarg(int) signum; - } */ *uap; - register_t *retval; -{ - register struct proc *p; - register struct pcred *pc = cp->p_cred; - - if ((u_int)SCARG(uap, signum) >= NSIG) - return (EINVAL); - if (SCARG(uap, pid) > 0) { - /* kill single process */ - if ((p = 
pfind(SCARG(uap, pid))) == NULL) - return (ESRCH); - if (!CANSIGNAL(cp, pc, p, SCARG(uap, signum))) - return (EPERM); - if (SCARG(uap, signum)) - psignal(p, SCARG(uap, signum)); - return (0); - } - switch (SCARG(uap, pid)) { - case -1: /* broadcast signal */ - return (killpg1(cp, SCARG(uap, signum), 0, 1)); - case 0: /* signal own process group */ - return (killpg1(cp, SCARG(uap, signum), 0, 0)); - default: /* negative explicit process group */ - return (killpg1(cp, SCARG(uap, signum), -SCARG(uap, pid), 0)); - } - /* NOTREACHED */ -} - -#if defined(COMPAT_43) || defined(COMPAT_SUNOS) -/* ARGSUSED */ -int -compat_43_killpg(p, uap, retval) - struct proc *p; - register struct compat_43_killpg_args /* { - syscallarg(int) pgid; - syscallarg(int) signum; - } */ *uap; - register_t *retval; -{ - - if ((u_int)SCARG(uap, signum) >= NSIG) - return (EINVAL); - return (killpg1(p, SCARG(uap, signum), SCARG(uap, pgid), 0)); -} -#endif /* COMPAT_43 || COMPAT_SUNOS */ - /* * Common code for kill process group/broadcast kill. * cp is calling process. @@ -552,13 +548,13 @@ killpg1(cp, signum, pgid, all) register struct pcred *pc = cp->p_cred; struct pgrp *pgrp; int nfound = 0; - - if (all) - /* - * broadcast + + if (all) + /* + * broadcast */ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { - if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == cp || !CANSIGNAL(cp, pc, p, signum)) continue; nfound++; @@ -566,8 +562,8 @@ killpg1(cp, signum, pgid, all) psignal(p, signum); } else { - if (pgid == 0) - /* + if (pgid == 0) + /* * zero pgid means send to my process group. */ pgrp = cp->p_pgrp; @@ -590,6 +586,66 @@ killpg1(cp, signum, pgid, all) return (nfound ? 
0 : ESRCH); } +#ifndef _SYS_SYSPROTO_H_ +struct kill_args { + int pid; + int signum; +}; +#endif +/* ARGSUSED */ +int +kill(cp, uap, retval) + register struct proc *cp; + register struct kill_args *uap; + int *retval; +{ + register struct proc *p; + register struct pcred *pc = cp->p_cred; + + if ((u_int)uap->signum >= NSIG) + return (EINVAL); + if (uap->pid > 0) { + /* kill single process */ + if ((p = pfind(uap->pid)) == NULL) + return (ESRCH); + if (!CANSIGNAL(cp, pc, p, uap->signum)) + return (EPERM); + if (uap->signum) + psignal(p, uap->signum); + return (0); + } + switch (uap->pid) { + case -1: /* broadcast signal */ + return (killpg1(cp, uap->signum, 0, 1)); + case 0: /* signal own process group */ + return (killpg1(cp, uap->signum, 0, 0)); + default: /* negative explicit process group */ + return (killpg1(cp, uap->signum, -uap->pid, 0)); + } + /* NOTREACHED */ +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct okillpg_args { + int pgid; + int signum; +}; +#endif +/* ARGSUSED */ +int +okillpg(p, uap, retval) + struct proc *p; + register struct okillpg_args *uap; + int *retval; +{ + + if ((u_int)uap->signum >= NSIG) + return (EINVAL); + return (killpg1(p, uap->signum, uap->pgid, 0)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + /* * Send a signal to a process group. */ @@ -641,11 +697,22 @@ trapsignal(p, signum, code) p->p_stats->p_ru.ru_nsignals++; #ifdef KTRACE if (KTRPOINT(p, KTR_PSIG)) - ktrpsig(p->p_tracep, signum, ps->ps_sigact[signum], + ktrpsig(p->p_tracep, signum, ps->ps_sigact[signum], p->p_sigmask, code); #endif - sendsig(ps->ps_sigact[signum], signum, p->p_sigmask, code); - p->p_sigmask |= ps->ps_catchmask[signum] | mask; + (*p->p_sysent->sv_sendsig)(ps->ps_sigact[signum], signum, + p->p_sigmask, code); + p->p_sigmask |= ps->ps_catchmask[signum] | + (mask & ~ps->ps_signodefer); + if ((ps->ps_sigreset & mask) != 0) { + /* + * See setsigvec() for origin of this code. 
+ */ + p->p_sigcatch &= ~mask; + if (signum != SIGCONT && sigprop[signum] & SA_IGNORE) + p->p_sigignore |= mask; + ps->ps_sigact[signum] = SIG_DFL; + } } else { ps->ps_code = code; /* XXX for core dump/debugger */ ps->ps_sig = signum; /* XXX to verify code */ @@ -719,7 +786,7 @@ psignal(p, signum) */ if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 && action == SIG_DFL) - return; + return; p->p_siglist &= ~contsigmask; } p->p_siglist |= mask; @@ -900,17 +967,8 @@ issignal(p) /* * If traced, always stop, and stay * stopped until released by the parent. - * - * Note that we must clear the pending signal - * before we call trace_req since that routine - * might cause a fault, calling tsleep and - * leading us back here again with the same signal. - * Then we would be deadlocked because the tracer - * would still be blocked on the ipc struct from - * the initial request. */ p->p_xstat = signum; - p->p_siglist &= ~mask; psignal(p->p_pptr, SIGCHLD); do { stop(p); @@ -918,10 +976,19 @@ issignal(p) } while (!trace_req(p) && p->p_flag & P_TRACED); /* + * If the traced bit got turned off, go back up + * to the top to rescan signals. This ensures + * that p_sig* and ps_sigact are consistent. + */ + if ((p->p_flag & P_TRACED) == 0) + continue; + + /* * If parent wants us to take the signal, * then it will leave it in p->p_xstat; * otherwise we just look for signals again. */ + p->p_siglist &= ~mask; /* clear the old signal */ signum = p->p_xstat; if (signum == 0) continue; @@ -934,14 +1001,6 @@ issignal(p) p->p_siglist |= mask; if (p->p_sigmask & mask) continue; - - /* - * If the traced bit got turned off, go back up - * to the top to rescan signals. This ensures - * that p_sig* and ps_sigact are consistent. - */ - if ((p->p_flag & P_TRACED) == 0) - continue; } /* @@ -949,9 +1008,9 @@ issignal(p) * Return the signal's number, or fall through * to clear it from the pending mask. 
*/ - switch ((long)p->p_sigacts->ps_sigact[signum]) { + switch ((int)p->p_sigacts->ps_sigact[signum]) { - case (long)SIG_DFL: + case (int)SIG_DFL: /* * Don't take default actions on system processes. */ @@ -961,8 +1020,8 @@ issignal(p) * Are you sure you want to ignore SIGSEGV * in init? XXX */ - printf("Process (pid %d) got signal %d\n", - p->p_pid, signum); + printf("Process (pid %lu) got signal %d\n", + (u_long)p->p_pid, signum); #endif break; /* == ignore */ } @@ -994,7 +1053,7 @@ issignal(p) return (signum); /*NOTREACHED*/ - case (long)SIG_IGN: + case (int)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other @@ -1043,8 +1102,7 @@ postsig(signum) register struct proc *p = curproc; register struct sigacts *ps = p->p_sigacts; register sig_t action; - u_long code; - int mask, returnmask; + int code, mask, returnmask; #ifdef DIAGNOSTIC if (signum == 0) @@ -1089,7 +1147,17 @@ postsig(signum) ps->ps_flags &= ~SAS_OLDMASK; } else returnmask = p->p_sigmask; - p->p_sigmask |= ps->ps_catchmask[signum] | mask; + p->p_sigmask |= ps->ps_catchmask[signum] | + (mask & ~ps->ps_signodefer); + if ((ps->ps_sigreset & mask) != 0) { + /* + * See setsigvec() for origin of this code. + */ + p->p_sigcatch &= ~mask; + if (signum != SIGCONT && sigprop[signum] & SA_IGNORE) + p->p_sigignore |= mask; + ps->ps_sigact[signum] = SIG_DFL; + } (void) spl0(); p->p_stats->p_ru.ru_nsignals++; if (ps->ps_sig != signum) { @@ -1099,7 +1167,7 @@ postsig(signum) ps->ps_code = 0; ps->ps_sig = 0; } - sendsig(action, signum, returnmask, code); + (*p->p_sysent->sv_sendsig)(action, signum, returnmask, code); } } @@ -1111,9 +1179,8 @@ killproc(p, why) struct proc *p; char *why; { - - log(LOG_ERR, "pid %d was killed: %s\n", p->p_pid, why); - uprintf("sorry, pid %d was killed: %s\n", p->p_pid, why); + log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, + p->p_cred && p->p_ucred ? 
p->p_ucred->cr_uid : -1, why); psignal(p, SIGKILL); } @@ -1134,8 +1201,19 @@ sigexit(p, signum) p->p_acflag |= AXSIG; if (sigprop[signum] & SA_CORE) { p->p_sigacts->ps_sig = signum; + /* + * Log signals which would cause core dumps + * (Log as LOG_INFO to appease those who don't want + * these messages.) + * XXX : Todo, as well as euid, write out ruid too + */ if (coredump(p) == 0) signum |= WCOREFLAG; + log(LOG_INFO, "pid %d (%s), uid %d: exited on signal %d%s\n", + p->p_pid, p->p_comm, + p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1, + signum &~ WCOREFLAG, + signum & WCOREFLAG ? " (core dumped)" : ""); } exit1(p, W_EXITCODE(0, signum)); /* NOTREACHED */ @@ -1145,28 +1223,27 @@ sigexit(p, signum) * Dump core, into a file named "progname.core", unless the process was * setuid/setgid. */ -int +static int coredump(p) register struct proc *p; { register struct vnode *vp; - register struct pcred *pcred = p->p_cred; - register struct ucred *cred = pcred->pc_ucred; + register struct ucred *cred = p->p_cred->pc_ucred; register struct vmspace *vm = p->p_vmspace; struct nameidata nd; struct vattr vattr; int error, error1; char name[MAXCOMLEN+6]; /* progname.core */ - if (pcred->p_svuid != pcred->p_ruid || pcred->p_svgid != pcred->p_rgid) + if (p->p_flag & P_SUGID) return (EFAULT); if (ctob(UPAGES + vm->vm_dsize + vm->vm_ssize) >= p->p_rlimit[RLIMIT_CORE].rlim_cur) return (EFAULT); sprintf(name, "%s.core", p->p_comm); NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, name, p); - if (error = vn_open(&nd, - O_CREAT | FWRITE, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) + if ((error = vn_open(&nd, + O_CREAT | FWRITE, S_IRUSR | S_IWUSR))) return (error); vp = nd.ni_vp; @@ -1206,14 +1283,19 @@ out: * Nonexistent system call-- signal process (may want to handle it). * Flag error in case process won't see signal immediately (blocked or ignored). 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct nosys_args { + int dummy; +}; +#endif /* ARGSUSED */ int nosys(p, args, retval) struct proc *p; - void *args; - register_t *retval; + struct nosys_args *args; + int *retval; { psignal(p, SIGSYS); - return (ENOSYS); + return (EINVAL); } diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index df83710..d0097df 100644 --- a/sys/kern/kern_subr.c +++ b/sys/kern/kern_subr.c @@ -35,7 +35,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_subr.c 8.4 (Berkeley) 2/14/95 + * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 + * $Id$ */ #include <sys/param.h> @@ -52,7 +53,7 @@ uiomove(cp, n, uio) { register struct iovec *iov; u_int cnt; - int error = 0; + int error; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ && uio->uio_rw != UIO_WRITE) @@ -70,6 +71,7 @@ uiomove(cp, n, uio) } if (cnt > n) cnt = n; + switch (uio->uio_segflg) { case UIO_USERSPACE: @@ -88,6 +90,8 @@ uiomove(cp, n, uio) else bcopy(iov->iov_base, (caddr_t)cp, cnt); break; + case UIO_NOCOPY: + break; } iov->iov_base += cnt; iov->iov_len -= cnt; @@ -96,7 +100,7 @@ uiomove(cp, n, uio) cp += cnt; n -= cnt; } - return (error); + return (0); } /* @@ -109,13 +113,11 @@ ureadc(c, uio) { register struct iovec *iov; - if (uio->uio_resid <= 0) - panic("ureadc: non-positive resid"); again: - if (uio->uio_iovcnt <= 0) - panic("ureadc: non-positive iovcnt"); + if (uio->uio_iovcnt == 0 || uio->uio_resid == 0) + panic("ureadc"); iov = uio->uio_iov; - if (iov->iov_len <= 0) { + if (iov->iov_len == 0) { uio->uio_iovcnt--; uio->uio_iov++; goto again; @@ -135,6 +137,8 @@ again: if (suibyte(iov->iov_base, c) < 0) return (EFAULT); break; + case UIO_NOCOPY: + break; } iov->iov_base++; iov->iov_len--; @@ -158,7 +162,7 @@ uwritec(uio) return (-1); again: if (uio->uio_iovcnt <= 0) - panic("uwritec: non-positive iovcnt"); + panic("uwritec"); iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iov++; @@ -203,7 +207,7 @@ hashinit(elements, type, 
hashmask) int i; if (elements <= 0) - panic("hashinit: bad cnt"); + panic("hashinit: bad elements"); for (hashsize = 1; hashsize <= elements; hashsize <<= 1) continue; hashsize >>= 1; @@ -213,3 +217,36 @@ hashinit(elements, type, hashmask) *hashmask = hashsize - 1; return (hashtbl); } + +#define NPRIMES 27 +static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039, + 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653, + 7159, 7673, 8191, 12281, 16381, 24571, 32749 }; + +/* + * General routine to allocate a prime number sized hash table. + */ +void * +phashinit(elements, type, nentries) + int elements, type; + u_long *nentries; +{ + long hashsize; + LIST_HEAD(generic, generic) *hashtbl; + int i; + + if (elements <= 0) + panic("phashinit: bad elements"); + for (i = 1, hashsize = primes[1]; hashsize <= elements;) { + i++; + if (i == NPRIMES) + break; + hashsize = primes[i]; + } + hashsize = primes[i - 1]; + hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); + for (i = 0; i < hashsize; i++) + LIST_INIT(&hashtbl[i]); + *nentries = hashsize; + return (hashtbl); +} diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index 6c82027..04339cd 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -36,8 +36,11 @@ * SUCH DAMAGE. 
* * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 + * $Id: kern_synch.c,v 1.29 1997/02/22 09:39:12 peter Exp $ */ +#include "opt_ktrace.h" + #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> @@ -45,16 +48,26 @@ #include <sys/buf.h> #include <sys/signalvar.h> #include <sys/resourcevar.h> +#include <sys/signalvar.h> #include <sys/vmmeter.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_extern.h> #ifdef KTRACE #include <sys/ktrace.h> #endif #include <machine/cpu.h> +static void rqinit __P((void *)); +SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL) + u_char curpriority; /* usrpri of curproc */ int lbolt; /* once a second sleep address */ +extern void endtsleep __P((void *)); +extern void updatepri __P((struct proc *p)); + /* * Force switch among equal priority processes every 100ms. */ @@ -75,7 +88,7 @@ roundrobin(arg) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * - * Note that hardclock updates p_estcpu and p_cpticks independently. + * Note that statclock updates p_estcpu and p_cpticks independently. * * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds. * That is, the system wants to compute a value of decay such @@ -104,7 +117,7 @@ roundrobin(arg) * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) - * + * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... . @@ -183,7 +196,7 @@ schedcpu(arg) */ if (p->p_slptime > 1) continue; - s = splstatclock(); /* prevent state changes */ + s = splhigh(); /* prevent state changes and protect run queue */ /* * p_pctcpu is only for ps. */ @@ -215,8 +228,6 @@ schedcpu(arg) splx(s); } vmmeter(); - if (bclnlist != NULL) - wakeup((caddr_t)pageproc); timeout(schedcpu, (void *)0, hz); } @@ -249,11 +260,8 @@ updatepri(p) * of 2. Shift right by 8, i.e. 
drop the bottom 256 worth. */ #define TABLESIZE 128 +TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE]; #define LOOKUP(x) (((long)(x) >> 8) & (TABLESIZE - 1)) -struct slpque { - struct proc *sq_head; - struct proc **sq_tailp; -} slpque[TABLESIZE]; /* * During autoconfiguration or after a panic, a sleep will simply @@ -266,6 +274,15 @@ struct slpque { */ int safepri; +void +sleepinit() +{ + int i; + + for (i = 0; i < TABLESIZE; i++) + TAILQ_INIT(&slpque[i]); +} + /* * General sleep call. Suspends the current process until a wakeup is * performed on the specified identifier. The process will then be made @@ -283,12 +300,8 @@ tsleep(ident, priority, wmesg, timo) int priority, timo; char *wmesg; { - register struct proc *p = curproc; - register struct slpque *qp; - register s; - int sig, catch = priority & PCATCH; - extern int cold; - void endtsleep __P((void *)); + struct proc *p = curproc; + int s, sig, catch = priority & PCATCH; #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) @@ -307,19 +320,14 @@ tsleep(ident, priority, wmesg, timo) return (0); } #ifdef DIAGNOSTIC - if (ident == NULL || p->p_stat != SRUN || p->p_back) + if (ident == NULL || p->p_stat != SRUN) panic("tsleep"); #endif p->p_wchan = ident; p->p_wmesg = wmesg; p->p_slptime = 0; p->p_priority = priority & PRIMASK; - qp = &slpque[LOOKUP(ident)]; - if (qp->sq_head == 0) - qp->sq_head = p; - else - *qp->sq_tailp = p; - *(qp->sq_tailp = &p->p_forw) = 0; + TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq); if (timo) timeout(endtsleep, (void *)p, timo); /* @@ -333,7 +341,7 @@ tsleep(ident, priority, wmesg, timo) */ if (catch) { p->p_flag |= P_SINTR; - if (sig = CURSIG(p)) { + if ((sig = CURSIG(p))) { if (p->p_wchan) unsleep(p); p->p_stat = SRUN; @@ -405,85 +413,17 @@ endtsleep(arg) } /* - * Short-term, non-interruptable sleep. 
- */ -void -sleep(ident, priority) - void *ident; - int priority; -{ - register struct proc *p = curproc; - register struct slpque *qp; - register s; - extern int cold; - -#ifdef DIAGNOSTIC - if (priority > PZERO) { - printf("sleep called with priority %d > PZERO, wchan: %x\n", - priority, ident); - panic("old sleep"); - } -#endif - s = splhigh(); - if (cold || panicstr) { - /* - * After a panic, or during autoconfiguration, - * just give interrupts a chance, then just return; - * don't run any other procs or panic below, - * in case this is the idle process and already asleep. - */ - splx(safepri); - splx(s); - return; - } -#ifdef DIAGNOSTIC - if (ident == NULL || p->p_stat != SRUN || p->p_back) - panic("sleep"); -#endif - p->p_wchan = ident; - p->p_wmesg = NULL; - p->p_slptime = 0; - p->p_priority = priority; - qp = &slpque[LOOKUP(ident)]; - if (qp->sq_head == 0) - qp->sq_head = p; - else - *qp->sq_tailp = p; - *(qp->sq_tailp = &p->p_forw) = 0; - p->p_stat = SSLEEP; - p->p_stats->p_ru.ru_nvcsw++; -#ifdef KTRACE - if (KTRPOINT(p, KTR_CSW)) - ktrcsw(p->p_tracep, 1, 0); -#endif - mi_switch(); -#ifdef KTRACE - if (KTRPOINT(p, KTR_CSW)) - ktrcsw(p->p_tracep, 0, 0); -#endif - curpriority = p->p_usrpri; - splx(s); -} - -/* * Remove a process from its wait queue */ void unsleep(p) register struct proc *p; { - register struct slpque *qp; - register struct proc **hp; int s; s = splhigh(); if (p->p_wchan) { - hp = &(qp = &slpque[LOOKUP(p->p_wchan)])->sq_head; - while (*hp != p) - hp = &(*hp)->p_forw; - *hp = p->p_forw; - if (qp->sq_tailp == &p->p_forw) - qp->sq_tailp = hp; + TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq); p->p_wchan = 0; } splx(s); @@ -496,45 +436,83 @@ void wakeup(ident) register void *ident; { - register struct slpque *qp; - register struct proc *p, **q; + register struct slpquehead *qp; + register struct proc *p; int s; s = splhigh(); qp = &slpque[LOOKUP(ident)]; restart: - for (q = &qp->sq_head; p = *q; ) { + for (p = qp->tqh_first; p != NULL; p 
= p->p_procq.tqe_next) { #ifdef DIAGNOSTIC - if (p->p_back || p->p_stat != SSLEEP && p->p_stat != SSTOP) + if (p->p_stat != SSLEEP && p->p_stat != SSTOP) panic("wakeup"); #endif if (p->p_wchan == ident) { + TAILQ_REMOVE(qp, p, p_procq); p->p_wchan = 0; - *q = p->p_forw; - if (qp->sq_tailp == &p->p_forw) - qp->sq_tailp = q; if (p->p_stat == SSLEEP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; p->p_stat = SRUN; - if (p->p_flag & P_INMEM) + if (p->p_flag & P_INMEM) { setrunqueue(p); - /* - * Since curpriority is a user priority, - * p->p_priority is always better than - * curpriority. - */ - if ((p->p_flag & P_INMEM) == 0) - wakeup((caddr_t)&proc0); - else need_resched(); + } else { + p->p_flag |= P_SWAPINREQ; + wakeup((caddr_t)&proc0); + } /* END INLINE EXPANSION */ goto restart; } - } else - q = &p->p_forw; + } + } + splx(s); +} + +/* + * Make a process sleeping on the specified identifier runnable. + * May wake more than one process if a target prcoess is currently + * swapped out. 
+ */ +void +wakeup_one(ident) + register void *ident; +{ + register struct slpquehead *qp; + register struct proc *p; + int s; + + s = splhigh(); + qp = &slpque[LOOKUP(ident)]; + + for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) { +#ifdef DIAGNOSTIC + if (p->p_stat != SSLEEP && p->p_stat != SSTOP) + panic("wakeup_one"); +#endif + if (p->p_wchan == ident) { + TAILQ_REMOVE(qp, p, p_procq); + p->p_wchan = 0; + if (p->p_stat == SSLEEP) { + /* OPTIMIZED EXPANSION OF setrunnable(p); */ + if (p->p_slptime > 1) + updatepri(p); + p->p_slptime = 0; + p->p_stat = SRUN; + if (p->p_flag & P_INMEM) { + setrunqueue(p); + need_resched(); + break; + } else { + p->p_flag |= P_SWAPINREQ; + wakeup((caddr_t)&proc0); + } + /* END INLINE EXPANSION */ + } + } } splx(s); } @@ -549,11 +527,31 @@ mi_switch() register struct proc *p = curproc; /* XXX */ register struct rlimit *rlim; register long s, u; + int x; struct timeval tv; -#ifdef DEBUG + /* + * XXX this spl is almost unnecessary. It is partly to allow for + * sloppy callers that don't do it (issignal() via CURSIG() is the + * main offender). It is partly to work around a bug in the i386 + * cpu_switch() (the ipl is not preserved). We ran for years + * without it. I think there was only a interrupt latency problem. + * The main caller, tsleep(), does an splx() a couple of instructions + * after calling here. The buggy caller, issignal(), usually calls + * here at spl0() and sometimes returns at splhigh(). The process + * then runs for a little too long at splhigh(). The ipl gets fixed + * when the process returns to user mode (or earlier). + * + * It would probably be better to always call here at spl0(). Callers + * are prepared to give up control to another process, so they must + * be prepared to be interrupted. The clock stuff here may not + * actually need splstatclock(). 
+ */ + x = splstatclock(); + +#ifdef SIMPLELOCK_DEBUG if (p->p_simple_locks) - panic("sleep: holding simple lock"); + printf("sleep: holding simple lock"); #endif /* * Compute the amount of time during which the current @@ -574,23 +572,20 @@ mi_switch() /* * Check if the process exceeds its cpu resource allocation. - * If over max, kill it. In any case, if it has run for more - * than 10 minutes, reduce priority to give others a chance. + * If over max, kill it. */ - rlim = &p->p_rlimit[RLIMIT_CPU]; - if (s >= rlim->rlim_cur) { - if (s >= rlim->rlim_max) - psignal(p, SIGKILL); - else { - psignal(p, SIGXCPU); - if (rlim->rlim_cur < rlim->rlim_max) - rlim->rlim_cur += 5; + if (p->p_stat != SZOMB) { + rlim = &p->p_rlimit[RLIMIT_CPU]; + if (s >= rlim->rlim_cur) { + if (s >= rlim->rlim_max) + killproc(p, "exceeded maximum CPU limit"); + else { + psignal(p, SIGXCPU); + if (rlim->rlim_cur < rlim->rlim_max) + rlim->rlim_cur += 5; + } } } - if (s > 10 * 60 && p->p_ucred->cr_uid && p->p_nice == NZERO) { - p->p_nice = NZERO + 4; - resetpriority(p); - } /* * Pick a new current process and record its start time. @@ -598,19 +593,25 @@ mi_switch() cnt.v_swtch++; cpu_switch(p); microtime(&runtime); + splx(x); } /* * Initialize the (doubly-linked) run queues * to be empty. 
*/ -void -rqinit() +/* ARGSUSED*/ +static void +rqinit(dummy) + void *dummy; { register int i; - for (i = 0; i < NQS; i++) + for (i = 0; i < NQS; i++) { qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i]; + rtqs[i].ph_link = rtqs[i].ph_rlink = (struct proc *)&rtqs[i]; + idqs[i].ph_link = idqs[i].ph_rlink = (struct proc *)&idqs[i]; + } } /* @@ -646,8 +647,10 @@ setrunnable(p) if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; - if ((p->p_flag & P_INMEM) == 0) + if ((p->p_flag & P_INMEM) == 0) { + p->p_flag |= P_SWAPINREQ; wakeup((caddr_t)&proc0); + } else if (p->p_priority < curpriority) need_resched(); } @@ -663,9 +666,13 @@ resetpriority(p) { register unsigned int newpriority; - newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice; - newpriority = min(newpriority, MAXPRI); - p->p_usrpri = newpriority; - if (newpriority < curpriority) + if (p->p_rtprio.type == RTP_PRIO_NORMAL) { + newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice; + newpriority = min(newpriority, MAXPRI); + p->p_usrpri = newpriority; + if (newpriority < curpriority) + need_resched(); + } else { need_resched(); + } } diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c index b178da3..fb07f18 100644 --- a/sys/kern/kern_sysctl.c +++ b/sys/kern/kern_sysctl.c @@ -5,6 +5,9 @@ * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * + * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD + * project, to make these variables more userfriendly. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -33,39 +36,20 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_sysctl.c 8.9 (Berkeley) 5/20/95 - */ - -/* - * sysctl system call. 
+ * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 + * $Id$ */ #include <sys/param.h> -#include <sys/systm.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/malloc.h> #include <sys/proc.h> -#include <sys/file.h> -#include <sys/vnode.h> -#include <sys/unistd.h> -#include <sys/buf.h> -#include <sys/ioctl.h> -#include <sys/tty.h> +#include <sys/systm.h> +#include <sys/sysproto.h> #include <vm/vm.h> -#include <sys/sysctl.h> - -#include <sys/mount.h> -#include <sys/syscallargs.h> - -sysctlfn kern_sysctl; -sysctlfn hw_sysctl; -#ifdef DEBUG -sysctlfn debug_sysctl; -#endif -extern sysctlfn vm_sysctl; -extern sysctlfn vfs_sysctl; -extern sysctlfn net_sysctl; -extern sysctlfn cpu_sysctl; +#include <vm/vm_extern.h> +#include <sys/vnode.h> /* * Locking and stats @@ -76,634 +60,818 @@ static struct sysctl_lock { int sl_locked; } memlock; -int -__sysctl(p, uap, retval) - struct proc *p; - register struct __sysctl_args /* { - syscallarg(int *) name; - syscallarg(u_int) namelen; - syscallarg(void *) old; - syscallarg(size_t *) oldlenp; - syscallarg(void *) new; - syscallarg(size_t) newlen; - } */ *uap; - register_t *retval; +static int sysctl_root SYSCTL_HANDLER_ARGS; + +extern struct linker_set sysctl_; + +/* + * Initialization of the MIB tree. + * + * Order by number in each linker_set. 
+ */ + +static int +sysctl_order_cmp(const void *a, const void *b) { - int error, dolock = 1; - size_t savelen, oldlen = 0; - sysctlfn *fn; - int name[CTL_MAXNAME]; + struct sysctl_oid const * const *pa; + struct sysctl_oid const * const *pb; - if (SCARG(uap, new) != NULL && - (error = suser(p->p_ucred, &p->p_acflag))) - return (error); - /* - * all top-level sysctl names are non-terminal - */ - if (SCARG(uap, namelen) > CTL_MAXNAME || SCARG(uap, namelen) < 2) - return (EINVAL); - if (error = - copyin(SCARG(uap, name), &name, SCARG(uap, namelen) * sizeof(int))) - return (error); + pa = (struct sysctl_oid const * const *)a; + pb = (struct sysctl_oid const * const *)b; + if (*pa == NULL) + return (1); + if (*pb == NULL) + return (-1); + return ((*pa)->oid_number - (*pb)->oid_number); +} - switch (name[0]) { - case CTL_KERN: - fn = kern_sysctl; - if (name[2] == KERN_VNODE) /* XXX */ - dolock = 0; - break; - case CTL_HW: - fn = hw_sysctl; - break; - case CTL_VM: - fn = vm_sysctl; - break; - case CTL_NET: - fn = net_sysctl; - break; - case CTL_VFS: - fn = vfs_sysctl; - break; - case CTL_MACHDEP: - fn = cpu_sysctl; - break; -#ifdef DEBUG - case CTL_DEBUG: - fn = debug_sysctl; - break; -#endif - default: - return (EOPNOTSUPP); - } +static void +sysctl_order(void *arg) +{ + int j, k; + struct linker_set *l = (struct linker_set *) arg; + struct sysctl_oid **oidpp; - if (SCARG(uap, oldlenp) && - (error = copyin(SCARG(uap, oldlenp), &oldlen, sizeof(oldlen)))) - return (error); - if (SCARG(uap, old) != NULL) { - if (!useracc(SCARG(uap, old), oldlen, B_WRITE)) - return (EFAULT); - while (memlock.sl_lock) { - memlock.sl_want = 1; - sleep((caddr_t)&memlock, PRIBIO+1); - memlock.sl_locked++; + /* First, find the highest oid we have */ + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + for (k = 0; j--; oidpp++) { + if ((*oidpp)->oid_arg1 == arg) { + *oidpp = 0; + continue; } - memlock.sl_lock = 1; - if (dolock) - vslock(SCARG(uap, old), oldlen); - savelen = oldlen; 
+ if (*oidpp && (*oidpp)->oid_number > k) + k = (*oidpp)->oid_number; } - error = (*fn)(name + 1, SCARG(uap, namelen) - 1, SCARG(uap, old), - &oldlen, SCARG(uap, new), SCARG(uap, newlen), p); - if (SCARG(uap, old) != NULL) { - if (dolock) - vsunlock(SCARG(uap, old), savelen, B_WRITE); - memlock.sl_lock = 0; - if (memlock.sl_want) { - memlock.sl_want = 0; - wakeup((caddr_t)&memlock); - } + + /* Next, replace all OID_AUTO oids with new numbers */ + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + k += 100; + for (; j--; oidpp++) + if (*oidpp && (*oidpp)->oid_number == OID_AUTO) + (*oidpp)->oid_number = k++; + + /* Finally: sort by oid */ + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + for (; j--; oidpp++) { + if (!*oidpp) + continue; + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) + if (!(*oidpp)->oid_handler) + sysctl_order((*oidpp)->oid_arg1); } - if (error) - return (error); - if (SCARG(uap, oldlenp)) - error = copyout(&oldlen, SCARG(uap, oldlenp), sizeof(oldlen)); - *retval = oldlen; - return (0); + qsort(l->ls_items, l->ls_length, sizeof l->ls_items[0], + sysctl_order_cmp); } -/* - * Attributes stored in the kernel. - */ -char hostname[MAXHOSTNAMELEN]; -int hostnamelen; -long hostid; -int securelevel; +SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_order, &sysctl_); /* - * kernel related system variables. + * "Staff-functions" + * + * These functions implement a presently undocumented interface + * used by the sysctl program to walk the tree, and get the type + * so it can print the value. + * This interface is under work and consideration, and should probably + * be killed with a big axe by the first person who can find the time. + * (be aware though, that the proper interface isn't as obvious as it + * may seem, there are various conflicting requirements. + * + * {0,0} printf the entire MIB-tree. + * {0,1,...} return the name of the "..." OID. + * {0,2,...} return the next OID. 
+ * {0,3} return the OID of the name in "new" + * {0,4,...} return the kind & format info for the "..." OID. */ -kern_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; + +static void +sysctl_sysctl_debug_dump_node(struct linker_set *l, int i) { - int error, level, inthostid; - extern char ostype[], osrelease[], version[]; - - /* all sysctl names at this level are terminal */ - if (namelen != 1 && !(name[0] == KERN_PROC || name[0] == KERN_PROF)) - return (ENOTDIR); /* overloaded */ - - switch (name[0]) { - case KERN_OSTYPE: - return (sysctl_rdstring(oldp, oldlenp, newp, ostype)); - case KERN_OSRELEASE: - return (sysctl_rdstring(oldp, oldlenp, newp, osrelease)); - case KERN_OSREV: - return (sysctl_rdint(oldp, oldlenp, newp, BSD)); - case KERN_VERSION: - return (sysctl_rdstring(oldp, oldlenp, newp, version)); - case KERN_MAXVNODES: - return(sysctl_int(oldp, oldlenp, newp, newlen, &desiredvnodes)); - case KERN_MAXPROC: - return (sysctl_int(oldp, oldlenp, newp, newlen, &maxproc)); - case KERN_MAXFILES: - return (sysctl_int(oldp, oldlenp, newp, newlen, &maxfiles)); - case KERN_ARGMAX: - return (sysctl_rdint(oldp, oldlenp, newp, ARG_MAX)); - case KERN_SECURELVL: - level = securelevel; - if ((error = sysctl_int(oldp, oldlenp, newp, newlen, &level)) || - newp == NULL) - return (error); - if (level < securelevel && p->p_pid != 1) - return (EPERM); - securelevel = level; - return (0); - case KERN_HOSTNAME: - error = sysctl_string(oldp, oldlenp, newp, newlen, - hostname, sizeof(hostname)); - if (newp && !error) - hostnamelen = newlen; - return (error); - case KERN_HOSTID: - inthostid = hostid; /* XXX assumes sizeof long <= sizeof int */ - error = sysctl_int(oldp, oldlenp, newp, newlen, &inthostid); - hostid = inthostid; - return (error); - case KERN_CLOCKRATE: - return (sysctl_clockrate(oldp, oldlenp)); - case KERN_BOOTTIME: - return (sysctl_rdstruct(oldp, 
oldlenp, newp, &boottime, - sizeof(struct timeval))); - case KERN_VNODE: - return (sysctl_vnode(oldp, oldlenp, p)); - case KERN_PROC: - return (sysctl_doproc(name + 1, namelen - 1, oldp, oldlenp)); - case KERN_FILE: - return (sysctl_file(oldp, oldlenp)); -#ifdef GPROF - case KERN_PROF: - return (sysctl_doprof(name + 1, namelen - 1, oldp, oldlenp, - newp, newlen)); -#endif - case KERN_POSIX1: - return (sysctl_rdint(oldp, oldlenp, newp, _POSIX_VERSION)); - case KERN_NGROUPS: - return (sysctl_rdint(oldp, oldlenp, newp, NGROUPS_MAX)); - case KERN_JOB_CONTROL: - return (sysctl_rdint(oldp, oldlenp, newp, 1)); - case KERN_SAVED_IDS: -#ifdef _POSIX_SAVED_IDS - return (sysctl_rdint(oldp, oldlenp, newp, 1)); -#else - return (sysctl_rdint(oldp, oldlenp, newp, 0)); -#endif - default: - return (EOPNOTSUPP); + int j, k; + struct sysctl_oid **oidpp; + + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + for (; j--; oidpp++) { + + if (!*oidpp) + continue; + + for (k=0; k<i; k++) + printf(" "); + + printf("%d %s ", (*oidpp)->oid_number, (*oidpp)->oid_name); + + printf("%c%c", + (*oidpp)->oid_kind & CTLFLAG_RD ? 'R':' ', + (*oidpp)->oid_kind & CTLFLAG_WR ? 'W':' '); + + if ((*oidpp)->oid_handler) + printf(" *Handler"); + + switch ((*oidpp)->oid_kind & CTLTYPE) { + case CTLTYPE_NODE: + printf(" Node\n"); + if (!(*oidpp)->oid_handler) { + sysctl_sysctl_debug_dump_node( + (*oidpp)->oid_arg1, i+2); + } + break; + case CTLTYPE_INT: printf(" Int\n"); break; + case CTLTYPE_STRING: printf(" String\n"); break; + case CTLTYPE_QUAD: printf(" Quad\n"); break; + case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; + default: printf("\n"); + } + } - /* NOTREACHED */ } -/* - * hardware related system variables. 
- */ -hw_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +static int +sysctl_sysctl_debug SYSCTL_HANDLER_ARGS { - extern char machine[], cpu_model[]; - - /* all sysctl names at this level are terminal */ - if (namelen != 1) - return (ENOTDIR); /* overloaded */ - - switch (name[0]) { - case HW_MACHINE: - return (sysctl_rdstring(oldp, oldlenp, newp, machine)); - case HW_MODEL: - return (sysctl_rdstring(oldp, oldlenp, newp, cpu_model)); - case HW_NCPU: - return (sysctl_rdint(oldp, oldlenp, newp, 1)); /* XXX */ - case HW_BYTEORDER: - return (sysctl_rdint(oldp, oldlenp, newp, BYTE_ORDER)); - case HW_PHYSMEM: - return (sysctl_rdint(oldp, oldlenp, newp, ctob(physmem))); - case HW_USERMEM: - return (sysctl_rdint(oldp, oldlenp, newp, - ctob(physmem - cnt.v_wire_count))); - case HW_PAGESIZE: - return (sysctl_rdint(oldp, oldlenp, newp, PAGE_SIZE)); - default: - return (EOPNOTSUPP); + sysctl_sysctl_debug_dump_node(&sysctl_, 0); + return ENOENT; +} + +SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD, + 0, 0, sysctl_sysctl_debug, "-", ""); + +static int +sysctl_sysctl_name SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1; + u_int namelen = arg2; + int i, j, error = 0; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + char buf[10]; + + while (namelen) { + if (!lsp) { + sprintf(buf,"%d",*name); + if (req->oldidx) + error = SYSCTL_OUT(req, ".", 1); + if (!error) + error = SYSCTL_OUT(req, buf, strlen(buf)); + if (error) + return (error); + namelen--; + name++; + continue; + } + oidpp = (struct sysctl_oid **) lsp->ls_items; + j = lsp->ls_length; + lsp = 0; + for (i = 0; i < j; i++, oidpp++) { + if (*oidpp && ((*oidpp)->oid_number != *name)) + continue; + + if (req->oldidx) + error = SYSCTL_OUT(req, ".", 1); + if (!error) + error = SYSCTL_OUT(req, (*oidpp)->oid_name, + strlen((*oidpp)->oid_name)); + if (error) + return (error); + + namelen--; 
+ name++; + + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + break; + + if ((*oidpp)->oid_handler) + break; + + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + break; + } } - /* NOTREACHED */ + return (SYSCTL_OUT(req, "", 1)); } -#ifdef DEBUG -/* - * Debugging related system variables. - */ -struct ctldebug debug0, debug1, debug2, debug3, debug4; -struct ctldebug debug5, debug6, debug7, debug8, debug9; -struct ctldebug debug10, debug11, debug12, debug13, debug14; -struct ctldebug debug15, debug16, debug17, debug18, debug19; -static struct ctldebug *debugvars[CTL_DEBUG_MAXID] = { - &debug0, &debug1, &debug2, &debug3, &debug4, - &debug5, &debug6, &debug7, &debug8, &debug9, - &debug10, &debug11, &debug12, &debug13, &debug14, - &debug15, &debug16, &debug17, &debug18, &debug19, -}; -int -debug_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, ""); + +static int +sysctl_sysctl_next_ls (struct linker_set *lsp, int *name, u_int namelen, + int *next, int *len, int level, struct sysctl_oid **oidp) { - struct ctldebug *cdp; + int i, j; + struct sysctl_oid **oidpp; - /* all sysctl names at this level are name and field */ - if (namelen != 2) - return (ENOTDIR); /* overloaded */ - cdp = debugvars[name[0]]; - if (name[0] >= CTL_DEBUG_MAXID || cdp->debugname == 0) - return (EOPNOTSUPP); - switch (name[1]) { - case CTL_DEBUG_NAME: - return (sysctl_rdstring(oldp, oldlenp, newp, cdp->debugname)); - case CTL_DEBUG_VALUE: - return (sysctl_int(oldp, oldlenp, newp, newlen, cdp->debugvar)); - default: - return (EOPNOTSUPP); + oidpp = (struct sysctl_oid **) lsp->ls_items; + j = lsp->ls_length; + *len = level; + for (i = 0; i < j; i++, oidpp++) { + if (!*oidpp) + continue; + + *next = (*oidpp)->oid_number; + *oidp = *oidpp; + + if (!namelen) { + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + return 
0; + if ((*oidpp)->oid_handler) + /* We really should call the handler here...*/ + return 0; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + if (!sysctl_sysctl_next_ls (lsp, 0, 0, next+1, + len, level+1, oidp)) + return 0; + goto next; + } + + if ((*oidpp)->oid_number < *name) + continue; + + if ((*oidpp)->oid_number > *name) { + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + return 0; + if ((*oidpp)->oid_handler) + return 0; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + if (!sysctl_sysctl_next_ls (lsp, name+1, namelen-1, + next+1, len, level+1, oidp)) + return (0); + goto next; + } + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + continue; + + if ((*oidpp)->oid_handler) + continue; + + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + if (!sysctl_sysctl_next_ls (lsp, name+1, namelen-1, next+1, + len, level+1, oidp)) + return (0); + next: + namelen = 1; + *len = level; } - /* NOTREACHED */ + return 1; } -#endif /* DEBUG */ -/* - * Validate parameters and get old / set new parameters - * for an integer-valued sysctl function. 
- */ -sysctl_int(oldp, oldlenp, newp, newlen, valp) - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - int *valp; +static int +sysctl_sysctl_next SYSCTL_HANDLER_ARGS { - int error = 0; + int *name = (int *) arg1; + u_int namelen = arg2; + int i, j, error; + struct sysctl_oid *oid; + struct linker_set *lsp = &sysctl_; + int newoid[CTL_MAXNAME]; - if (oldp && *oldlenp < sizeof(int)) - return (ENOMEM); - if (newp && newlen != sizeof(int)) - return (EINVAL); - *oldlenp = sizeof(int); - if (oldp) - error = copyout(valp, oldp, sizeof(int)); - if (error == 0 && newp) - error = copyin(newp, valp, sizeof(int)); + i = sysctl_sysctl_next_ls (lsp, name, namelen, newoid, &j, 1, &oid); + if (i) + return ENOENT; + error = SYSCTL_OUT(req, newoid, j * sizeof (int)); return (error); } +SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, ""); + +static int +name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidp) +{ + int i, j; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + char *p; + + if (!*name) + return ENOENT; + + p = name + strlen(name) - 1 ; + if (*p == '.') + *p = '\0'; + + *len = 0; + + for (p = name; *p && *p != '.'; p++) + ; + i = *p; + if (i == '.') + *p = '\0'; + + j = lsp->ls_length; + oidpp = (struct sysctl_oid **) lsp->ls_items; + + while (j-- && *len < CTL_MAXNAME) { + if (!*oidpp) + continue; + if (strcmp(name, (*oidpp)->oid_name)) { + oidpp++; + continue; + } + *oid++ = (*oidpp)->oid_number; + (*len)++; + + if (!i) { + if (oidp) + *oidp = *oidpp; + return (0); + } + + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + break; + + if ((*oidpp)->oid_handler) + break; + + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + j = lsp->ls_length; + oidpp = (struct sysctl_oid **)lsp->ls_items; + name = p+1; + for (p = name; *p && *p != '.'; p++) + ; + i = *p; + if (i == '.') + *p = '\0'; + } + return ENOENT; +} + +static int +sysctl_sysctl_name2oid SYSCTL_HANDLER_ARGS +{ + char *p; + int error, oid[CTL_MAXNAME], len; + 
struct sysctl_oid *op = 0; + + if (!req->newlen) + return ENOENT; + + p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); + + error = SYSCTL_IN(req, p, req->newlen); + if (error) { + free(p, M_SYSCTL); + return (error); + } + + p [req->newlen] = '\0'; + + error = name2oid(p, oid, &len, &op); + + free(p, M_SYSCTL); + + if (error) + return (error); + + error = SYSCTL_OUT(req, oid, len * sizeof *oid); + return (error); +} + +SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0, + sysctl_sysctl_name2oid, "I", ""); + +static int +sysctl_sysctl_oidfmt SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1, error; + u_int namelen = arg2; + int indx, j; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + + j = lsp->ls_length; + oidpp = (struct sysctl_oid **) lsp->ls_items; + + indx = 0; + while (j-- && indx < CTL_MAXNAME) { + if (*oidpp && ((*oidpp)->oid_number == name[indx])) { + indx++; + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + if ((*oidpp)->oid_handler) + goto found; + if (indx == namelen) + goto found; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + j = lsp->ls_length; + oidpp = (struct sysctl_oid **)lsp->ls_items; + } else { + if (indx != namelen) + return EISDIR; + goto found; + } + } else { + oidpp++; + } + } + return ENOENT; +found: + if (!(*oidpp)->oid_fmt) + return ENOENT; + error = SYSCTL_OUT(req, + &(*oidpp)->oid_kind, sizeof((*oidpp)->oid_kind)); + if (!error) + error = SYSCTL_OUT(req, (*oidpp)->oid_fmt, + strlen((*oidpp)->oid_fmt)+1); + return (error); +} + + +SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, ""); + +/* + * Default "handler" functions. + */ + /* - * As above, but read-only. + * Handle an integer, signed or unsigned. + * Two cases: + * a variable: point arg1 at it. + * a constant: pass it in arg2. 
*/ -sysctl_rdint(oldp, oldlenp, newp, val) - void *oldp; - size_t *oldlenp; - void *newp; - int val; + +int +sysctl_handle_int SYSCTL_HANDLER_ARGS { int error = 0; - if (oldp && *oldlenp < sizeof(int)) - return (ENOMEM); - if (newp) - return (EPERM); - *oldlenp = sizeof(int); - if (oldp) - error = copyout((caddr_t)&val, oldp, sizeof(int)); + if (arg1) + error = SYSCTL_OUT(req, arg1, sizeof(int)); + else + error = SYSCTL_OUT(req, &arg2, sizeof(int)); + + if (error || !req->newptr) + return (error); + + if (!arg1) + error = EPERM; + else + error = SYSCTL_IN(req, arg1, sizeof(int)); return (error); } /* - * Validate parameters and get old / set new parameters - * for a string-valued sysctl function. + * Handle our generic '\0' terminated 'C' string. + * Two cases: + * a variable string: point arg1 at it, arg2 is max length. + * a constant string: point arg1 at it, arg2 is zero. */ -sysctl_string(oldp, oldlenp, newp, newlen, str, maxlen) - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - char *str; - int maxlen; + +int +sysctl_handle_string SYSCTL_HANDLER_ARGS { - int len, error = 0; + int error=0; - len = strlen(str) + 1; - if (oldp && *oldlenp < len) - return (ENOMEM); - if (newp && newlen >= maxlen) - return (EINVAL); - if (oldp) { - *oldlenp = len; - error = copyout(str, oldp, len); - } - if (error == 0 && newp) { - error = copyin(newp, str, newlen); - str[newlen] = 0; + error = SYSCTL_OUT(req, arg1, strlen((char *)arg1)+1); + + if (error || !req->newptr || !arg2) + return (error); + + if ((req->newlen - req->newidx) > arg2) { + error = E2BIG; + } else { + arg2 = (req->newlen - req->newidx); + error = SYSCTL_IN(req, arg1, arg2); + ((char *)arg1)[arg2] = '\0'; } + return (error); } /* - * As above, but read-only. + * Handle any kind of opaque data. + * arg1 points to it, arg2 is the size. 
*/ -sysctl_rdstring(oldp, oldlenp, newp, str) - void *oldp; - size_t *oldlenp; - void *newp; - char *str; + +int +sysctl_handle_opaque SYSCTL_HANDLER_ARGS { - int len, error = 0; + int error; + + error = SYSCTL_OUT(req, arg1, arg2); + + if (error || !req->newptr) + return (error); + + error = SYSCTL_IN(req, arg1, arg2); - len = strlen(str) + 1; - if (oldp && *oldlenp < len) - return (ENOMEM); - if (newp) - return (EPERM); - *oldlenp = len; - if (oldp) - error = copyout(str, oldp, len); return (error); } /* - * Validate parameters and get old / set new parameters - * for a structure oriented sysctl function. + * Transfer functions to/from kernel space. + * XXX: rather untested at this point */ -sysctl_struct(oldp, oldlenp, newp, newlen, sp, len) - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - void *sp; - int len; +static int +sysctl_old_kernel(struct sysctl_req *req, const void *p, int l) { - int error = 0; + int i = 0; - if (oldp && *oldlenp < len) + if (req->oldptr) { + i = min(req->oldlen - req->oldidx, l); + if (i > 0) + bcopy(p, (char *)req->oldptr + req->oldidx, i); + } + req->oldidx += l; + if (req->oldptr && i != l) return (ENOMEM); - if (newp && newlen > len) + return (0); +} + +static int +sysctl_new_kernel(struct sysctl_req *req, void *p, int l) +{ + if (!req->newptr) + return 0; + if (req->newlen - req->newidx < l) return (EINVAL); - if (oldp) { - *oldlenp = len; - error = copyout(sp, oldp, len); + bcopy((char *)req->newptr + req->newidx, p, l); + req->newidx += l; + return (0); +} + +int +kernel_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, int *retval) +{ + int error = 0; + struct sysctl_req req; + + bzero(&req, sizeof req); + + req.p = p; + + if (oldlenp) { + req.oldlen = *oldlenp; + } + + if (old) { + req.oldptr= old; + } + + if (newlen) { + req.newlen = newlen; + req.newptr = new; + } + + req.oldfunc = sysctl_old_kernel; + req.newfunc = sysctl_new_kernel; + req.lock = 1; + + 
/* XXX this should probably be done in a general way */ + while (memlock.sl_lock) { + memlock.sl_want = 1; + (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0); + memlock.sl_locked++; + } + memlock.sl_lock = 1; + + error = sysctl_root(0, name, namelen, &req); + + if (req.lock == 2) + vsunlock(req.oldptr, req.oldlen, B_WRITE); + + memlock.sl_lock = 0; + + if (memlock.sl_want) { + memlock.sl_want = 0; + wakeup((caddr_t)&memlock); + } + + if (error && error != ENOMEM) + return (error); + + if (retval) { + if (req.oldptr && req.oldidx > req.oldlen) + *retval = req.oldlen; + else + *retval = req.oldidx; } - if (error == 0 && newp) - error = copyin(newp, sp, len); return (error); } /* - * Validate parameters and get old parameters - * for a structure oriented sysctl function. + * Transfer function to/from user space. */ -sysctl_rdstruct(oldp, oldlenp, newp, sp, len) - void *oldp; - size_t *oldlenp; - void *newp, *sp; - int len; +static int +sysctl_old_user(struct sysctl_req *req, const void *p, int l) { - int error = 0; + int error = 0, i = 0; - if (oldp && *oldlenp < len) + if (req->lock == 1 && req->oldptr) { + vslock(req->oldptr, req->oldlen); + req->lock = 2; + } + if (req->oldptr) { + i = min(req->oldlen - req->oldidx, l); + if (i > 0) + error = copyout(p, (char *)req->oldptr + req->oldidx, + i); + } + req->oldidx += l; + if (error) + return (error); + if (req->oldptr && i < l) return (ENOMEM); - if (newp) - return (EPERM); - *oldlenp = len; - if (oldp) - error = copyout(sp, oldp, len); + return (0); +} + +static int +sysctl_new_user(struct sysctl_req *req, void *p, int l) +{ + int error; + + if (!req->newptr) + return 0; + if (req->newlen - req->newidx < l) + return (EINVAL); + error = copyin((char *)req->newptr + req->newidx, p, l); + req->newidx += l; return (error); } /* - * Get file structures. + * Traverse our tree, and find the right node, execute whatever it points + * at, and return the resulting error code. 
*/ -sysctl_file(where, sizep) - char *where; - size_t *sizep; + +int +sysctl_root SYSCTL_HANDLER_ARGS { - int buflen, error; - struct file *fp; - char *start = where; + int *name = (int *) arg1; + u_int namelen = arg2; + int indx, i, j; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; - buflen = *sizep; - if (where == NULL) { - /* - * overestimate by 10 files - */ - *sizep = sizeof(filehead) + (nfiles + 10) * sizeof(struct file); - return (0); - } + j = lsp->ls_length; + oidpp = (struct sysctl_oid **) lsp->ls_items; - /* - * first copyout filehead - */ - if (buflen < sizeof(filehead)) { - *sizep = 0; - return (0); - } - if (error = copyout((caddr_t)&filehead, where, sizeof(filehead))) - return (error); - buflen -= sizeof(filehead); - where += sizeof(filehead); - - /* - * followed by an array of file structures - */ - for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) { - if (buflen < sizeof(struct file)) { - *sizep = where - start; - return (ENOMEM); + indx = 0; + while (j-- && indx < CTL_MAXNAME) { + if (*oidpp && ((*oidpp)->oid_number == name[indx])) { + indx++; + if ((*oidpp)->oid_kind & CTLFLAG_NOLOCK) + req->lock = 0; + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + if ((*oidpp)->oid_handler) + goto found; + if (indx == namelen) + return ENOENT; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + j = lsp->ls_length; + oidpp = (struct sysctl_oid **)lsp->ls_items; + } else { + if (indx != namelen) + return EISDIR; + goto found; + } + } else { + oidpp++; } - if (error = copyout((caddr_t)fp, where, sizeof (struct file))) - return (error); - buflen -= sizeof(struct file); - where += sizeof(struct file); } - *sizep = where - start; - return (0); + return ENOENT; +found: + /* If writing isn't allowed */ + if (req->newptr && !((*oidpp)->oid_kind & CTLFLAG_WR)) + return (EPERM); + + /* Most likely only root can write */ + if (!((*oidpp)->oid_kind & CTLFLAG_ANYBODY) && + req->newptr && req->p && + (i = suser(req->p->p_ucred, 
&req->p->p_acflag))) + return (i); + + if (!(*oidpp)->oid_handler) + return EINVAL; + + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + i = ((*oidpp)->oid_handler) (*oidpp, + name + indx, namelen - indx, + req); + } else { + i = ((*oidpp)->oid_handler) (*oidpp, + (*oidpp)->oid_arg1, (*oidpp)->oid_arg2, + req); + } + return (i); } -/* - * try over estimating by 5 procs - */ -#define KERN_PROCSLOP (5 * sizeof (struct kinfo_proc)) +#ifndef _SYS_SYSPROTO_H_ +struct sysctl_args { + int *name; + u_int namelen; + void *old; + size_t *oldlenp; + void *new; + size_t newlen; +}; +#endif -sysctl_doproc(name, namelen, where, sizep) - int *name; - u_int namelen; - char *where; - size_t *sizep; +int +__sysctl(struct proc *p, struct sysctl_args *uap, int *retval) { - register struct proc *p; - register struct kinfo_proc *dp = (struct kinfo_proc *)where; - register int needed = 0; - int buflen = where != NULL ? *sizep : 0; - int doingzomb; - struct eproc eproc; - int error = 0; + int error, i, j, name[CTL_MAXNAME]; - if (namelen != 2 && !(namelen == 1 && name[0] == KERN_PROC_ALL)) + if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) return (EINVAL); - p = allproc.lh_first; - doingzomb = 0; -again: - for (; p != 0; p = p->p_list.le_next) { - /* - * Skip embryonic processes. - */ - if (p->p_stat == SIDL) - continue; - /* - * TODO - make more efficient (see notes below). - * do by session. 
- */ - switch (name[0]) { - case KERN_PROC_PID: - /* could do this with just a lookup */ - if (p->p_pid != (pid_t)name[1]) - continue; - break; + error = copyin(uap->name, &name, uap->namelen * sizeof(int)); + if (error) + return (error); - case KERN_PROC_PGRP: - /* could do this by traversing pgrp */ - if (p->p_pgrp->pg_id != (pid_t)name[1]) - continue; - break; + error = userland_sysctl(p, name, uap->namelen, + uap->old, uap->oldlenp, 0, + uap->new, uap->newlen, &j); + if (error && error != ENOMEM) + return (error); + if (uap->oldlenp) { + i = copyout(&j, uap->oldlenp, sizeof(j)); + if (i) + return (i); + } + return (error); +} - case KERN_PROC_TTY: - if ((p->p_flag & P_CONTROLT) == 0 || - p->p_session->s_ttyp == NULL || - p->p_session->s_ttyp->t_dev != (dev_t)name[1]) - continue; - break; +/* + * This is used from various compatibility syscalls too. That's why name + * must be in kernel space. + */ +int +userland_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, void *new, size_t newlen, int *retval) +{ + int error = 0; + struct sysctl_req req, req2; - case KERN_PROC_UID: - if (p->p_ucred->cr_uid != (uid_t)name[1]) - continue; - break; + bzero(&req, sizeof req); - case KERN_PROC_RUID: - if (p->p_cred->p_ruid != (uid_t)name[1]) - continue; - break; - } - if (buflen >= sizeof(struct kinfo_proc)) { - fill_eproc(p, &eproc); - if (error = copyout((caddr_t)p, &dp->kp_proc, - sizeof(struct proc))) - return (error); - if (error = copyout((caddr_t)&eproc, &dp->kp_eproc, - sizeof(eproc))) + req.p = p; + + if (oldlenp) { + if (inkernel) { + req.oldlen = *oldlenp; + } else { + error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); + if (error) return (error); - dp++; - buflen -= sizeof(struct kinfo_proc); } - needed += sizeof(struct kinfo_proc); } - if (doingzomb == 0) { - p = zombproc.lh_first; - doingzomb++; - goto again; + + if (old) { + if (!useracc(old, req.oldlen, B_WRITE)) + return (EFAULT); + req.oldptr= old; + } + + if 
(newlen) { + if (!useracc(new, req.newlen, B_READ)) + return (EFAULT); + req.newlen = newlen; + req.newptr = new; } - if (where != NULL) { - *sizep = (caddr_t)dp - where; - if (needed > *sizep) - return (ENOMEM); - } else { - needed += KERN_PROCSLOP; - *sizep = needed; + + req.oldfunc = sysctl_old_user; + req.newfunc = sysctl_new_user; + req.lock = 1; + + /* XXX this should probably be done in a general way */ + while (memlock.sl_lock) { + memlock.sl_want = 1; + (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0); + memlock.sl_locked++; } - return (0); -} + memlock.sl_lock = 1; -/* - * Fill in an eproc structure for the specified process. - */ -void -fill_eproc(p, ep) - register struct proc *p; - register struct eproc *ep; -{ - register struct tty *tp; - - ep->e_paddr = p; - ep->e_sess = p->p_pgrp->pg_session; - ep->e_pcred = *p->p_cred; - ep->e_ucred = *p->p_ucred; - if (p->p_stat == SIDL || p->p_stat == SZOMB) { - ep->e_vm.vm_rssize = 0; - ep->e_vm.vm_tsize = 0; - ep->e_vm.vm_dsize = 0; - ep->e_vm.vm_ssize = 0; -#ifndef sparc - /* ep->e_vm.vm_pmap = XXX; */ -#endif - } else { - register struct vmspace *vm = p->p_vmspace; + do { + req2 = req; + error = sysctl_root(0, name, namelen, &req2); + } while (error == EAGAIN); -#ifdef pmap_resident_count - ep->e_vm.vm_rssize = pmap_resident_count(&vm->vm_pmap); /*XXX*/ -#else - ep->e_vm.vm_rssize = vm->vm_rssize; -#endif - ep->e_vm.vm_tsize = vm->vm_tsize; - ep->e_vm.vm_dsize = vm->vm_dsize; - ep->e_vm.vm_ssize = vm->vm_ssize; -#ifndef sparc - ep->e_vm.vm_pmap = vm->vm_pmap; -#endif + req = req2; + if (req.lock == 2) + vsunlock(req.oldptr, req.oldlen, B_WRITE); + + memlock.sl_lock = 0; + + if (memlock.sl_want) { + memlock.sl_want = 0; + wakeup((caddr_t)&memlock); } - if (p->p_pptr) - ep->e_ppid = p->p_pptr->p_pid; - else - ep->e_ppid = 0; - ep->e_pgid = p->p_pgrp->pg_id; - ep->e_jobc = p->p_pgrp->pg_jobc; - if ((p->p_flag & P_CONTROLT) && - (tp = ep->e_sess->s_ttyp)) { - ep->e_tdev = tp->t_dev; - ep->e_tpgid = 
tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; - ep->e_tsess = tp->t_session; - } else - ep->e_tdev = NODEV; - ep->e_flag = ep->e_sess->s_ttyvp ? EPROC_CTTY : 0; - if (SESS_LEADER(p)) - ep->e_flag |= EPROC_SLEADER; - if (p->p_wmesg) - strncpy(ep->e_wmesg, p->p_wmesg, WMESGLEN); - ep->e_xsize = ep->e_xrssize = 0; - ep->e_xccount = ep->e_xswrss = 0; + + if (error && error != ENOMEM) + return (error); + + if (retval) { + if (req.oldptr && req.oldidx > req.oldlen) + *retval = req.oldlen; + else + *retval = req.oldidx; + } + return (error); } #ifdef COMPAT_43 #include <sys/socket.h> +#include <vm/vm_param.h> + #define KINFO_PROC (0<<8) #define KINFO_RT (1<<8) #define KINFO_VNODE (2<<8) @@ -712,81 +880,197 @@ fill_eproc(p, ep) #define KINFO_LOADAVG (5<<8) #define KINFO_CLOCKRATE (6<<8) -compat_43_getkerninfo(p, uap, retval) - struct proc *p; - register struct compat_43_getkerninfo_args /* { - syscallarg(int) op; - syscallarg(char *) where; - syscallarg(int *) size; - syscallarg(int) arg; - } */ *uap; - register_t *retval; -{ - int error, name[5]; - size_t size; +/* Non-standard BSDI extension - only present on their 4.3 net-2 releases */ +#define KINFO_BSDI_SYSINFO (101<<8) - if (SCARG(uap, size) && (error = copyin((caddr_t)SCARG(uap, size), - (caddr_t)&size, sizeof(size)))) - return (error); +/* + * XXX this is bloat, but I hope it's better here than on the potentially + * limited kernel stack... 
-Peter + */ + +static struct { + int bsdi_machine; /* "i386" on BSD/386 */ +/* ^^^ this is an offset to the string, relative to the struct start */ + char *pad0; + long pad1; + long pad2; + long pad3; + u_long pad4; + u_long pad5; + u_long pad6; + + int bsdi_ostype; /* "BSD/386" on BSD/386 */ + int bsdi_osrelease; /* "1.1" on BSD/386 */ + long pad7; + long pad8; + char *pad9; + + long pad10; + long pad11; + int pad12; + long pad13; + quad_t pad14; + long pad15; + + struct timeval pad16; + /* we dont set this, because BSDI's uname used gethostname() instead */ + int bsdi_hostname; /* hostname on BSD/386 */ - switch (SCARG(uap, op) & 0xff00) { + /* the actual string data is appended here */ + +} bsdi_si; +/* + * this data is appended to the end of the bsdi_si structure during copyout. + * The "char *" offsets are relative to the base of the bsdi_si struct. + * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings + * should not exceed the length of the buffer here... (or else!! :-) + */ +static char bsdi_strings[80]; /* It had better be less than this! 
*/ + +#ifndef _SYS_SYSPROTO_H_ +struct getkerninfo_args { + int op; + char *where; + int *size; + int arg; +}; +#endif + +int +ogetkerninfo(struct proc *p, struct getkerninfo_args *uap, int *retval) +{ + int error, name[6]; + u_int size; + + switch (uap->op & 0xff00) { case KINFO_RT: - name[0] = PF_ROUTE; - name[1] = 0; - name[2] = (SCARG(uap, op) & 0xff0000) >> 16; - name[3] = SCARG(uap, op) & 0xff; - name[4] = SCARG(uap, arg); - error = - net_sysctl(name, 5, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_NET; + name[1] = PF_ROUTE; + name[2] = 0; + name[3] = (uap->op & 0xff0000) >> 16; + name[4] = uap->op & 0xff; + name[5] = uap->arg; + error = userland_sysctl(p, name, 6, uap->where, uap->size, + 0, 0, 0, &size); break; case KINFO_VNODE: - name[0] = KERN_VNODE; - error = - kern_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_KERN; + name[1] = KERN_VNODE; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); break; case KINFO_PROC: - name[0] = KERN_PROC; - name[1] = SCARG(uap, op) & 0xff; - name[2] = SCARG(uap, arg); - error = - kern_sysctl(name, 3, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_KERN; + name[1] = KERN_PROC; + name[2] = uap->op & 0xff; + name[3] = uap->arg; + error = userland_sysctl(p, name, 4, uap->where, uap->size, + 0, 0, 0, &size); break; case KINFO_FILE: - name[0] = KERN_FILE; - error = - kern_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_KERN; + name[1] = KERN_FILE; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); break; case KINFO_METER: - name[0] = VM_METER; - error = - vm_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_VM; + name[1] = VM_METER; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); break; case KINFO_LOADAVG: - name[0] = VM_LOADAVG; - error = - vm_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_VM; + name[1] = VM_LOADAVG; + error = 
userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); break; case KINFO_CLOCKRATE: - name[0] = KERN_CLOCKRATE; - error = - kern_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_KERN; + name[1] = KERN_CLOCKRATE; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); break; + case KINFO_BSDI_SYSINFO: { + /* + * this is pretty crude, but it's just enough for uname() + * from BSDI's 1.x libc to work. + * + * In particular, it doesn't return the same results when + * the supplied buffer is too small. BSDI's version apparently + * will return the amount copied, and set the *size to how + * much was needed. The emulation framework here isn't capable + * of that, so we just set both to the amount copied. + * BSDI's 2.x product apparently fails with ENOMEM in this + * scenario. + */ + + u_int needed; + u_int left; + char *s; + + bzero((char *)&bsdi_si, sizeof(bsdi_si)); + bzero(bsdi_strings, sizeof(bsdi_strings)); + + s = bsdi_strings; + + bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, ostype); + s += strlen(s) + 1; + + bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, osrelease); + s += strlen(s) + 1; + + bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, machine); + s += strlen(s) + 1; + + needed = sizeof(bsdi_si) + (s - bsdi_strings); + + if (uap->where == NULL) { + /* process is asking how much buffer to supply.. */ + size = needed; + error = 0; + break; + } + + + /* if too much buffer supplied, trim it down */ + if (size > needed) + size = needed; + + /* how much of the buffer is remaining */ + left = size; + + if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0) + break; + + /* is there any point in continuing? 
*/ + if (left > sizeof(bsdi_si)) { + left -= sizeof(bsdi_si); + error = copyout(&bsdi_strings, + uap->where + sizeof(bsdi_si), left); + } + break; + } + default: return (EOPNOTSUPP); } if (error) return (error); *retval = size; - if (SCARG(uap, size)) - error = copyout((caddr_t)&size, (caddr_t)SCARG(uap, size), + if (uap->size) + error = copyout((caddr_t)&size, (caddr_t)uap->size, sizeof(size)); return (error); } diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c new file mode 100644 index 0000000..171ed0e --- /dev/null +++ b/sys/kern/kern_tc.c @@ -0,0 +1,1303 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_clock.c,v 1.34 1997/03/22 16:52:19 mpp Exp $ + */ + +/* Portions of this software are covered by the following: */ +/****************************************************************************** + * * + * Copyright (c) David L. Mills 1993, 1994 * + * * + * Permission to use, copy, modify, and distribute this software and its * + * documentation for any purpose and without fee is hereby granted, provided * + * that the above copyright notice appears in all copies and that both the * + * copyright notice and this permission notice appear in supporting * + * documentation, and that the name University of Delaware not be used in * + * advertising or publicity pertaining to distribution of the software * + * without specific, written prior permission. The University of Delaware * + * makes no representations about the suitability this software for any * + * purpose. It is provided "as is" without express or implied warranty. 
* + * * + *****************************************************************************/ + +#include "opt_cpu.h" /* XXX */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/dkstat.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/timex.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/sysctl.h> + +#include <machine/cpu.h> +#define CLOCK_HAIR /* XXX */ +#include <machine/clock.h> + +#ifdef GPROF +#include <sys/gmon.h> +#endif + +static void initclocks __P((void *dummy)); +SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) + +/* Exported to machdep.c. */ +struct callout *callfree, *callout; + +static struct callout calltodo; + +/* Some of these don't belong here, but it's easiest to concentrate them. */ +static long cp_time[CPUSTATES]; +long dk_seek[DK_NDRIVE]; +static long dk_time[DK_NDRIVE]; +long dk_wds[DK_NDRIVE]; +long dk_wpms[DK_NDRIVE]; +long dk_xfer[DK_NDRIVE]; + +int dk_busy; +int dk_ndrive = 0; +char dk_names[DK_NDRIVE][DK_NAMELEN]; + +long tk_cancc; +long tk_nin; +long tk_nout; +long tk_rawcc; + +/* + * Clock handling routines. + * + * This code is written to operate with two timers that run independently of + * each other. The main clock, running hz times per second, is used to keep + * track of real time. The second timer handles kernel and user profiling, + * and does resource use estimation. If the second timer is programmable, + * it is randomized to avoid aliasing between the two clocks. For example, + * the randomization prevents an adversary from always giving up the cpu + * just before its quantum expires. Otherwise, it would never accumulate + * cpu ticks. The mean frequency of the second timer is stathz. 
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
 */

/*
 * TODO:
 *	allocate more timeout table slots when table overflows.
 */

/*
 * Bump a timeval by a small number of usec's.
 *
 * NOTE(review): the update is not atomic with respect to readers; callers
 * (hardclock) run at clock interrupt priority -- confirm all readers of the
 * bumped timeval mask the clock interrupt while sampling.
 */
#define BUMPTIME(t, usec) { \
	register volatile struct timeval *tp = (t); \
	register long us; \
 \
	tp->tv_usec = us = tp->tv_usec + (usec); \
	if (us >= 1000000) { \
		tp->tv_usec = us - 1000000; \
		tp->tv_sec++; \
	} \
}

int	stathz;			/* statistics clock rate (Hz); 0 if none */
int	profhz;			/* profiling clock rate (Hz) */
static int profprocs;		/* number of processes using the prof clock */
int	ticks;			/* hardclock() ticks elapsed since boot */
static int psdiv, pscnt;	/* prof => stat divider */
int	psratio;		/* ratio: prof / stat */

volatile struct	timeval time;		/* wall-clock time of day */
volatile struct	timeval mono_time;	/* monotonic time (never stepped back) */

/*
 * Phase/frequency-lock loop (PLL/FLL) definitions
 *
 * The following variables are read and set by the ntp_adjtime() system
 * call.
 *
 * time_state shows the state of the system clock, with values defined
 * in the timex.h header file.
 *
 * time_status shows the status of the system clock, with bits defined
 * in the timex.h header file.
 *
 * time_offset is used by the PLL/FLL to adjust the system time in small
 * increments.
 *
 * time_constant determines the bandwidth or "stiffness" of the PLL.
 *
 * time_tolerance determines maximum frequency error or tolerance of the
 * CPU clock oscillator and is a property of the architecture; however,
 * in principle it could change as result of the presence of external
 * discipline signals, for instance.
 *
 * time_precision is usually equal to the kernel tick variable; however,
 * in cases where a precision clock counter or external clock is
 * available, the resolution can be much less than this and depend on
 * whether the external clock is working or not.
 *
 * time_maxerror is initialized by a ntp_adjtime() call and increased by
 * the kernel once each second to reflect the maximum error bound
 * growth.
 *
 * time_esterror is set and read by the ntp_adjtime() call, but
 * otherwise not used by the kernel.
 */
int time_status = STA_UNSYNC;	/* clock status bits */
int time_state = TIME_OK;	/* clock state */
long time_offset = 0;		/* time offset (us) */
long time_constant = 0;		/* pll time constant */
long time_tolerance = MAXFREQ;	/* frequency tolerance (scaled ppm) */
long time_precision = 1;	/* clock precision (us) */
long time_maxerror = MAXPHASE;	/* maximum error (us) */
long time_esterror = MAXPHASE;	/* estimated error (us) */

/*
 * The following variables establish the state of the PLL/FLL and the
 * residual time and frequency offset of the local clock. The scale
 * factors are defined in the timex.h header file.
 *
 * time_phase and time_freq are the phase increment and the frequency
 * increment, respectively, of the kernel time variable at each tick of
 * the clock.
 *
 * time_freq is set via ntp_adjtime() from a value stored in a file when
 * the synchronization daemon is first started. Its value is retrieved
 * via ntp_adjtime() and written to the file about once per hour by the
 * daemon.
 *
 * time_adj is the adjustment added to the value of tick at each timer
 * interrupt and is recomputed from time_phase and time_freq at each
 * seconds rollover.
 *
 * time_reftime is the second's portion of the system time on the last
 * call to ntp_adjtime(). It is used to adjust the time_freq variable
 * and to increase the time_maxerror as the time since last update
 * increases.
 */
static long time_phase = 0;	/* phase offset (scaled us) */
long time_freq = 0;		/* frequency offset (scaled ppm) */
static long time_adj = 0;	/* tick adjust (scaled 1 / hz) */
static long time_reftime = 0;	/* time at last adjustment (s) */

#ifdef PPS_SYNC
/*
 * The following variables are used only if the kernel PPS discipline
 * code is configured (PPS_SYNC). The scale factors are defined in the
 * timex.h header file.
 *
 * pps_time contains the time at each calibration interval, as read by
 * microtime(). pps_count counts the seconds of the calibration
 * interval, the duration of which is nominally pps_shift in powers of
 * two.
 *
 * pps_offset is the time offset produced by the time median filter
 * pps_tf[], while pps_jitter is the dispersion (jitter) measured by
 * this filter.
 *
 * pps_freq is the frequency offset produced by the frequency median
 * filter pps_ff[], while pps_stabil is the dispersion (wander) measured
 * by this filter.
 *
 * pps_usec is latched from a high resolution counter or external clock
 * at pps_time. Here we want the hardware counter contents only, not the
 * contents plus the time_tv.usec as usual.
 *
 * pps_valid counts the number of seconds since the last PPS update. It
 * is used as a watchdog timer to disable the PPS discipline should the
 * PPS signal be lost.
 *
 * pps_glitch counts the number of seconds since the beginning of an
 * offset burst more than tick/2 from current nominal offset. It is used
 * mainly to suppress error bursts due to priority conflicts between the
 * PPS interrupt and timer interrupt.
 *
 * pps_intcnt counts the calibration intervals for use in the interval-
 * adaptation algorithm. It's just too complicated for words.
 */
struct timeval pps_time;	/* kernel time at last interval */
long pps_offset = 0;		/* pps time offset (us) */
long pps_jitter = MAXTIME;	/* pps time dispersion (jitter) (us) */
long pps_tf[] = {0, 0, 0};	/* pps time offset median filter (us) */
long pps_freq = 0;		/* frequency offset (scaled ppm) */
long pps_stabil = MAXFREQ;	/* frequency dispersion (scaled ppm) */
long pps_ff[] = {0, 0, 0};	/* frequency offset median filter */
long pps_usec = 0;		/* microsec counter at last interval */
long pps_valid = PPS_VALID;	/* pps signal watchdog counter */
int pps_glitch = 0;		/* pps signal glitch counter */
int pps_count = 0;		/* calibration interval counter (s) */
int pps_shift = PPS_SHIFT;	/* interval duration (s) (shift) */
int pps_intcnt = 0;		/* intervals at current duration */

/*
 * PPS signal quality monitors
 *
 * pps_jitcnt counts the seconds that have been discarded because the
 * jitter measured by the time median filter exceeds the limit MAXTIME
 * (100 us).
 *
 * pps_calcnt counts the frequency calibration intervals, which are
 * variable from 4 s to 256 s.
 *
 * pps_errcnt counts the calibration intervals which have been discarded
 * because the wander exceeds the limit MAXFREQ (100 ppm) or where the
 * calibration interval jitter exceeds two ticks.
 *
 * pps_stbcnt counts the calibration intervals that have been discarded
 * because the frequency wander exceeds the limit MAXFREQ / 4 (25 us).
 */
long pps_jitcnt = 0;		/* jitter limit exceeded */
long pps_calcnt = 0;		/* calibration intervals */
long pps_errcnt = 0;		/* calibration errors */
long pps_stbcnt = 0;		/* stability limit exceeded */
#endif /* PPS_SYNC */

/* XXX none of this stuff works under FreeBSD */
#ifdef EXT_CLOCK
/*
 * External clock definitions
 *
 * The following definitions and declarations are used only if an
 * external clock (HIGHBALL or TPRO) is configured on the system.
 */
#define CLOCK_INTERVAL 30	/* CPU clock update interval (s) */

/*
 * The clock_count variable is set to CLOCK_INTERVAL at each PPS
 * interrupt and decremented once each second.
 */
int clock_count = 0;		/* CPU clock counter */

#ifdef HIGHBALL
/*
 * The clock_offset and clock_cpu variables are used by the HIGHBALL
 * interface. The clock_offset variable defines the offset between
 * system time and the HIGHBALL counters. The clock_cpu variable contains
 * the offset between the system clock and the HIGHBALL clock for use in
 * disciplining the kernel time variable.
 */
extern struct timeval clock_offset;	/* Highball clock offset */
long clock_cpu = 0;		/* CPU clock adjust */
#endif /* HIGHBALL */
#endif /* EXT_CLOCK */

/*
 * hardupdate() - local clock update
 *
 * This routine is called by ntp_adjtime() to update the local clock
 * phase and frequency. The implementation is of an adaptive-parameter,
 * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
 * time and frequency offset estimates for each call. If the kernel PPS
 * discipline code is configured (PPS_SYNC), the PPS signal itself
 * determines the new time offset, instead of the calling argument.
 * Presumably, calls to ntp_adjtime() occur only when the caller
 * believes the local clock is valid within some bound (+-128 ms with
 * NTP). If the caller's time is far different than the PPS time, an
 * argument will ensue, and it's not clear who will lose.
 *
 * For uncompensated quartz crystal oscillators and nominal update
 * intervals less than 1024 s, operation should be in phase-lock mode
 * (STA_FLL = 0), where the loop is disciplined to phase. For update
 * intervals greater than this, operation should be in frequency-lock
 * mode (STA_FLL = 1), where the loop is disciplined to frequency.
 *
 * Note: splclock() is in effect.
 */
void
hardupdate(offset)
	long offset;
{
	long ltemp, mtemp;

	/* Nothing to do unless PLL or PPS-time discipline is enabled. */
	if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME))
		return;
	ltemp = offset;
#ifdef PPS_SYNC
	/* A live PPS signal overrides the caller-supplied offset. */
	if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
		ltemp = pps_offset;
#endif /* PPS_SYNC */

	/*
	 * Scale the phase adjustment and clamp to the operating range.
	 */
	if (ltemp > MAXPHASE)
		time_offset = MAXPHASE << SHIFT_UPDATE;
	else if (ltemp < -MAXPHASE)
		time_offset = -(MAXPHASE << SHIFT_UPDATE);
	else
		time_offset = ltemp << SHIFT_UPDATE;

	/*
	 * Select whether the frequency is to be controlled and in which
	 * mode (PLL or FLL). Clamp to the operating range. Ugly
	 * multiply/divide should be replaced someday.
	 */
	if (time_status & STA_FREQHOLD || time_reftime == 0)
		time_reftime = time.tv_sec;
	mtemp = time.tv_sec - time_reftime;	/* seconds since last update */
	time_reftime = time.tv_sec;
	if (time_status & STA_FLL) {
		if (mtemp >= MINSEC) {
			ltemp = ((time_offset / mtemp) << (SHIFT_USEC -
			    SHIFT_UPDATE));
			if (ltemp < 0)
				time_freq -= -ltemp >> SHIFT_KH;
			else
				time_freq += ltemp >> SHIFT_KH;
		}
	} else {
		if (mtemp < MAXSEC) {
			ltemp *= mtemp;
			if (ltemp < 0)
				time_freq -= -ltemp >> (time_constant +
				    time_constant + SHIFT_KF -
				    SHIFT_USEC);
			else
				time_freq += ltemp >> (time_constant +
				    time_constant + SHIFT_KF -
				    SHIFT_USEC);
		}
	}
	/* Clamp the frequency estimate to the oscillator tolerance. */
	if (time_freq > time_tolerance)
		time_freq = time_tolerance;
	else if (time_freq < -time_tolerance)
		time_freq = -time_tolerance;
}


/*
 * Initialize clock frequencies and start both clocks running.
 * Run once at boot via SYSINIT; the dummy argument is unused.
 */
/* ARGSUSED*/
static void
initclocks(dummy)
	void *dummy;
{
	register int i;

	/*
	 * Set divisors to 1 (normal case) and let the machine-specific
	 * code do its bit.
	 */
	psdiv = pscnt = 1;
	cpu_initclocks();

	/*
	 * Compute profhz/stathz, and fix profhz if needed.
	 */
	i = stathz ? stathz : hz;
	if (profhz == 0)
		profhz = i;
	psratio = profhz / i;
}

/*
 * The real-time timer, interrupting hz times per second.
 */
void
hardclock(frame)
	register struct clockframe *frame;
{
	register struct callout *p1;
	register struct proc *p;
	register int needsoft;

	/*
	 * Update real-time timeout queue.
	 * At front of queue are some number of events which are ``due''.
	 * The time to these is <= 0 and if negative represents the
	 * number of ticks which have passed since it was supposed to happen.
	 * The rest of the q elements (times > 0) are events yet to happen,
	 * where the time for each is given as a delta from the previous.
	 * Decrementing just the first of these serves to decrement the time
	 * to all events.
	 */
	needsoft = 0;
	for (p1 = calltodo.c_next; p1 != NULL; p1 = p1->c_next) {
		if (--p1->c_time > 0)
			break;
		needsoft = 1;
		if (p1->c_time == 0)
			break;
	}

	p = curproc;
	if (p) {
		register struct pstats *pstats;

		/*
		 * Run current process's virtual and profile time, as needed.
		 * ITIMER_VIRTUAL ticks only while in user mode; ITIMER_PROF
		 * ticks in both modes.
		 */
		pstats = p->p_stats;
		if (CLKF_USERMODE(frame) &&
		    timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
			psignal(p, SIGVTALRM);
		if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
			psignal(p, SIGPROF);
	}

	/*
	 * If no separate statistics clock is available, run it from here.
	 */
	if (stathz == 0)
		statclock(frame);

	/*
	 * Increment the time-of-day.
	 */
	ticks++;
	{
		int time_update;
		struct timeval newtime = time;
		long ltemp;

		/*
		 * timedelta/tickdelta are presumably maintained by adjtime()
		 * elsewhere (kern_time.c) -- NOTE(review): confirm.
		 */
		if (timedelta == 0) {
			time_update = CPU_THISTICKLEN(tick);
		} else {
			time_update = CPU_THISTICKLEN(tick) + tickdelta;
			timedelta -= tickdelta;
		}
		BUMPTIME(&mono_time, time_update);

		/*
		 * Compute the phase adjustment. If the low-order bits
		 * (time_phase) of the update overflow, bump the high-order bits
		 * (time_update).
		 */
		time_phase += time_adj;
		if (time_phase <= -FINEUSEC) {
			ltemp = -time_phase >> SHIFT_SCALE;
			time_phase += ltemp << SHIFT_SCALE;
			time_update -= ltemp;
		}
		else if (time_phase >= FINEUSEC) {
			ltemp = time_phase >> SHIFT_SCALE;
			time_phase -= ltemp << SHIFT_SCALE;
			time_update += ltemp;
		}

		newtime.tv_usec += time_update;
		/*
		 * On rollover of the second the phase adjustment to be used for
		 * the next second is calculated. Also, the maximum error is
		 * increased by the tolerance. If the PPS frequency discipline
		 * code is present, the phase is increased to compensate for the
		 * CPU clock oscillator frequency error.
		 *
		 * On a 32-bit machine and given parameters in the timex.h
		 * header file, the maximum phase adjustment is +-512 ms and
		 * maximum frequency offset is a tad less than) +-512 ppm. On a
		 * 64-bit machine, you shouldn't need to ask.
		 */
		if (newtime.tv_usec >= 1000000) {
			newtime.tv_usec -= 1000000;
			newtime.tv_sec++;
			time_maxerror += time_tolerance >> SHIFT_USEC;

			/*
			 * Compute the phase adjustment for the next second. In
			 * PLL mode, the offset is reduced by a fixed factor
			 * times the time constant. In FLL mode the offset is
			 * used directly. In either mode, the maximum phase
			 * adjustment for each second is clamped so as to spread
			 * the adjustment over not more than the number of
			 * seconds between updates.
			 */
			if (time_offset < 0) {
				ltemp = -time_offset;
				if (!(time_status & STA_FLL))
					ltemp >>= SHIFT_KG + time_constant;
				if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
					ltemp = (MAXPHASE / MINSEC) <<
					    SHIFT_UPDATE;
				time_offset += ltemp;
				time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ -
				    SHIFT_UPDATE);
			} else {
				ltemp = time_offset;
				if (!(time_status & STA_FLL))
					ltemp >>= SHIFT_KG + time_constant;
				if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
					ltemp = (MAXPHASE / MINSEC) <<
					    SHIFT_UPDATE;
				time_offset -= ltemp;
				time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ -
				    SHIFT_UPDATE);
			}

			/*
			 * Compute the frequency estimate and additional phase
			 * adjustment due to frequency error for the next
			 * second. When the PPS signal is engaged, gnaw on the
			 * watchdog counter and update the frequency computed by
			 * the pll and the PPS signal.
			 */
#ifdef PPS_SYNC
			pps_valid++;
			if (pps_valid == PPS_VALID) {
				/* PPS signal lost: reset the PPS discipline. */
				pps_jitter = MAXTIME;
				pps_stabil = MAXFREQ;
				time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
				    STA_PPSWANDER | STA_PPSERROR);
			}
			ltemp = time_freq + pps_freq;
#else
			ltemp = time_freq;
#endif /* PPS_SYNC */
			if (ltemp < 0)
				time_adj -= -ltemp >>
				    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
			else
				time_adj += ltemp >>
				    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);

#if SHIFT_HZ == 7
			/*
			 * When the CPU clock oscillator frequency is not a
			 * power of two in Hz, the SHIFT_HZ is only an
			 * approximate scale factor. In the SunOS kernel, this
			 * results in a PLL gain factor of 1/1.28 = 0.78 what it
			 * should be. In the following code the overall gain is
			 * increased by a factor of 1.25, which results in a
			 * residual error less than 3 percent.
			 */
			/* Same thing applies for FreeBSD --GAW */
			if (hz == 100) {
				if (time_adj < 0)
					time_adj -= -time_adj >> 2;
				else
					time_adj += time_adj >> 2;
			}
#endif /* SHIFT_HZ */

			/* XXX - this is really bogus, but can't be fixed until
			   xntpd's idea of the system clock is fixed to know how
			   the user wants leap seconds handled; in the mean time,
			   we assume that users of NTP are running without proper
			   leap second support (this is now the default anyway) */
			/*
			 * Leap second processing. If in leap-insert state at
			 * the end of the day, the system clock is set back one
			 * second; if in leap-delete state, the system clock is
			 * set ahead one second. The microtime() routine or
			 * external clock driver will ensure that reported time
			 * is always monotonic. The ugly divides should be
			 * replaced.
			 */
			switch (time_state) {

			case TIME_OK:
				if (time_status & STA_INS)
					time_state = TIME_INS;
				else if (time_status & STA_DEL)
					time_state = TIME_DEL;
				break;

			case TIME_INS:
				if (newtime.tv_sec % 86400 == 0) {
					newtime.tv_sec--;
					time_state = TIME_OOP;
				}
				break;

			case TIME_DEL:
				if ((newtime.tv_sec + 1) % 86400 == 0) {
					newtime.tv_sec++;
					time_state = TIME_WAIT;
				}
				break;

			case TIME_OOP:
				time_state = TIME_WAIT;
				break;

			case TIME_WAIT:
				if (!(time_status & (STA_INS | STA_DEL)))
					time_state = TIME_OK;
			}
		}
		CPU_CLOCKUPDATE(&time, &newtime);
	}

	/*
	 * Process callouts at a very low cpu priority, so we don't keep the
	 * relatively high clock interrupt priority any longer than necessary.
	 */
	if (needsoft) {
		if (CLKF_BASEPRI(frame)) {
			/*
			 * Save the overhead of a software interrupt;
			 * it will happen as soon as we return, so do it now.
			 */
			(void)splsoftclock();
			softclock();
		} else
			setsoftclock();
	}
}

/*
 * Software (low priority) clock interrupt.
 * Run periodic events from timeout queue.
 */
/*ARGSUSED*/
void
softclock()
{
	register struct callout *c;
	register void *arg;
	register int s;
	register void (*func) __P((void *));

	/*
	 * Pop each expired entry (c_time <= 0) off the front of the queue,
	 * return it to the free list, and invoke its handler with interrupts
	 * unblocked (splx before the call, splhigh again after).
	 */
	s = splhigh();
	while ((c = calltodo.c_next) != NULL && c->c_time <= 0) {
		func = c->c_func;
		arg = c->c_arg;
		calltodo.c_next = c->c_next;
		c->c_next = callfree;
		callfree = c;
		splx(s);
		(*func)(arg);
		(void) splhigh();
	}
	splx(s);
}

/*
 * timeout --
 *	Execute a function after a specified length of time.
 *
 * untimeout --
 *	Cancel previous timeout function call.
 *
 *	See AT&T BCI Driver Reference Manual for specification.  This
 *	implementation differs from that one in that no identification
 *	value is returned from timeout, rather, the original arguments
 *	to timeout are used to identify entries for untimeout.
 */
void
timeout(ftn, arg, ticks)
	timeout_t ftn;
	void *arg;
	register int ticks;
{
	register struct callout *new, *p, *t;
	register int s;

	if (ticks <= 0)
		ticks = 1;

	/* Lock out the clock. */
	s = splhigh();

	/* Fill in the next free callout structure. */
	if (callfree == NULL)
		panic("timeout table full");
	new = callfree;
	callfree = new->c_next;
	new->c_arg = arg;
	new->c_func = ftn;

	/*
	 * The time for each event is stored as a difference from the time
	 * of the previous event on the queue.  Walk the queue, correcting
	 * the ticks argument for queue entries passed.  Correct the ticks
	 * value for the queue entry immediately after the insertion point
	 * as well.  Watch out for negative c_time values; these represent
	 * overdue events.
	 */
	for (p = &calltodo;
	    (t = p->c_next) != NULL && ticks > t->c_time; p = t)
		if (t->c_time > 0)
			ticks -= t->c_time;
	new->c_time = ticks;
	if (t != NULL)
		t->c_time -= ticks;

	/* Insert the new entry into the queue. */
	p->c_next = new;
	new->c_next = t;
	splx(s);
}

void
untimeout(ftn, arg)
	timeout_t ftn;
	void *arg;
{
	register struct callout *p, *t;
	register int s;

	/* Cancel only the first matching (ftn, arg) entry, if any. */
	s = splhigh();
	for (p = &calltodo; (t = p->c_next) != NULL; p = t)
		if (t->c_func == ftn && t->c_arg == arg) {
			/* Increment next entry's tick count. */
			if (t->c_next && t->c_time > 0)
				t->c_next->c_time += t->c_time;

			/* Move entry from callout queue to callfree queue. */
			p->c_next = t->c_next;
			t->c_next = callfree;
			callfree = t;
			break;
		}
	splx(s);
}

/*
 * Return a consistent snapshot of the current time-of-day, sampled at
 * splclock().  Resolution is one tick.
 */
void
gettime(struct timeval *tvp)
{
	int s;

	s = splclock();
	/* XXX should use microtime() iff tv_usec is used. */
	*tvp = time;
	splx(s);
}

/*
 * Compute number of hz until specified time.  Used to
 * compute third argument to timeout() from an absolute time.
 */
int
hzto(tv)
	struct timeval *tv;
{
	register unsigned long ticks;
	register long sec, usec;
	int s;

	/*
	 * If the number of usecs in the whole seconds part of the time
	 * difference fits in a long, then the total number of usecs will
	 * fit in an unsigned long.  Compute the total and convert it to
	 * ticks, rounding up and adding 1 to allow for the current tick
	 * to expire.  Rounding also depends on unsigned long arithmetic
	 * to avoid overflow.
	 *
	 * Otherwise, if the number of ticks in the whole seconds part of
	 * the time difference fits in a long, then convert the parts to
	 * ticks separately and add, using similar rounding methods and
	 * overflow avoidance.  This method would work in the previous
	 * case but it is slightly slower and assumes that hz is integral.
	 *
	 * Otherwise, round the time difference down to the maximum
	 * representable value.
	 *
	 * If ints have 32 bits, then the maximum value for any timeout in
	 * 10ms ticks is 248 days.
	 */
	s = splclock();
	sec = tv->tv_sec - time.tv_sec;
	usec = tv->tv_usec - time.tv_usec;
	splx(s);
	if (usec < 0) {
		sec--;
		usec += 1000000;
	}
	if (sec < 0) {
#ifdef DIAGNOSTIC
		printf("hzto: negative time difference %ld sec %ld usec\n",
		    sec, usec);
#endif
		ticks = 1;
	} else if (sec <= LONG_MAX / 1000000)
		ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
		    / tick + 1;
	else if (sec <= LONG_MAX / hz)
		ticks = sec * hz
		    + ((unsigned long)usec + (tick - 1)) / tick + 1;
	else
		ticks = LONG_MAX;
	/* Result is returned as int; clamp so the caller can't overflow. */
	if (ticks > INT_MAX)
		ticks = INT_MAX;
	return (ticks);
}

/*
 * Start profiling on a process.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
void
startprofclock(p)
	register struct proc *p;
{
	int s;

	if ((p->p_flag & P_PROFIL) == 0) {
		p->p_flag |= P_PROFIL;
		/* First profiled process: speed the stat clock up to profhz. */
		if (++profprocs == 1 && stathz != 0) {
			s = splstatclock();
			psdiv = pscnt = psratio;
			setstatclockrate(profhz);
			splx(s);
		}
	}
}

/*
 * Stop profiling on a process.
 */
void
stopprofclock(p)
	register struct proc *p;
{
	int s;

	if (p->p_flag & P_PROFIL) {
		p->p_flag &= ~P_PROFIL;
		/* Last profiled process: drop the stat clock back to stathz. */
		if (--profprocs == 0 && stathz != 0) {
			s = splstatclock();
			psdiv = pscnt = 1;
			setstatclockrate(stathz);
			splx(s);
		}
	}
}

/*
 * Statistics clock.  Grab profile sample, and if divider reaches 0,
 * do process and kernel statistics.
 */
void
statclock(frame)
	register struct clockframe *frame;
{
#ifdef GPROF
	register struct gmonparam *g;
#endif
	register struct proc *p;
	register int i;
	struct pstats *pstats;
	long rss;
	struct rusage *ru;
	struct vmspace *vm;

	if (CLKF_USERMODE(frame)) {
		p = curproc;
		if (p->p_flag & P_PROFIL)
			addupc_intr(p, CLKF_PC(frame), 1);
		if (--pscnt > 0)
			return;
		/*
		 * Came from user mode; CPU was in user state.
		 * If this process is being profiled record the tick.
		 */
		p->p_uticks++;
		if (p->p_nice > NZERO)
			cp_time[CP_NICE]++;
		else
			cp_time[CP_USER]++;
	} else {
#ifdef GPROF
		/*
		 * Kernel statistics are just like addupc_intr, only easier.
		 */
		g = &_gmonparam;
		if (g->state == GMON_PROF_ON) {
			i = CLKF_PC(frame) - g->lowpc;
			if (i < g->textsize) {
				i /= HISTFRACTION * sizeof(*g->kcount);
				g->kcount[i]++;
			}
		}
#endif
		if (--pscnt > 0)
			return;
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		p = curproc;
		if (CLKF_INTR(frame)) {
			if (p != NULL)
				p->p_iticks++;
			cp_time[CP_INTR]++;
		} else if (p != NULL) {
			p->p_sticks++;
			cp_time[CP_SYS]++;
		} else
			cp_time[CP_IDLE]++;
	}
	pscnt = psdiv;

	/*
	 * We maintain statistics shown by user-level statistics
	 * programs:  the amount of time in each cpu state, and
	 * the amount of time each of DK_NDRIVE ``drives'' is busy.
	 *
	 * XXX	should either run linked list of drives, or (better)
	 *	grab timestamps in the start & done code.
	 */
	for (i = 0; i < DK_NDRIVE; i++)
		if (dk_busy & (1 << i))
			dk_time[i]++;

	/*
	 * We adjust the priority of the current process.  The priority of
	 * a process gets worse as it accumulates CPU time.  The cpu usage
	 * estimator (p_estcpu) is increased here.  The formula for computing
	 * priorities (in kern_synch.c) will compute a different value each
	 * time p_estcpu increases by 4.  The cpu usage estimator ramps up
	 * quite quickly when the process is running (linearly), and decays
	 * away exponentially, at a rate which is proportionally slower when
	 * the system is busy.  The basic principle is that the system will
	 * 90% forget that the process used a lot of CPU time in 5 * loadav
	 * seconds.  This causes the system to favor processes which haven't
	 * run much recently, and to round-robin among other processes.
	 */
	if (p != NULL) {
		p->p_cpticks++;
		/* ++p->p_estcpu == 0 means the counter wrapped; pin at max. */
		if (++p->p_estcpu == 0)
			p->p_estcpu--;
		if ((p->p_estcpu & 3) == 0) {
			resetpriority(p);
			if (p->p_priority >= PUSER)
				p->p_priority = p->p_usrpri;
		}

		/* Update resource usage integrals and maximums. */
		if ((pstats = p->p_stats) != NULL &&
		    (ru = &pstats->p_ru) != NULL &&
		    (vm = p->p_vmspace) != NULL) {
			ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024;
			ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024;
			ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024;
			rss = vm->vm_pmap.pm_stats.resident_count *
			    PAGE_SIZE / 1024;
			if (ru->ru_maxrss < rss)
				ru->ru_maxrss = rss;
		}
	}
}

/*
 * Return information about system clocks.
 */
static int
sysctl_kern_clockrate SYSCTL_HANDLER_ARGS
{
	struct clockinfo clkinfo;
	/*
	 * Construct clockinfo structure.
	 */
	clkinfo.hz = hz;
	clkinfo.tick = tick;
	clkinfo.profhz = profhz;
	clkinfo.stathz = stathz ? stathz : hz;
	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
}

SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
	0, 0, sysctl_kern_clockrate, "S,clockinfo","");

#ifdef PPS_SYNC
/*
 * hardpps() - discipline CPU clock oscillator to external PPS signal
 *
 * This routine is called at each PPS interrupt in order to discipline
 * the CPU clock oscillator to the PPS signal. It measures the PPS phase
 * and leaves it in a handy spot for the hardclock() routine. It
 * integrates successive PPS phase differences and calculates the
 * frequency offset. This is used in hardclock() to discipline the CPU
 * clock oscillator so that intrinsic frequency error is cancelled out.
+ * The code requires the caller to capture the time and hardware counter + * value at the on-time PPS signal transition. + * + * Note that, on some Unix systems, this routine runs at an interrupt + * priority level higher than the timer interrupt routine hardclock(). + * Therefore, the variables used are distinct from the hardclock() + * variables, except for certain exceptions: The PPS frequency pps_freq + * and phase pps_offset variables are determined by this routine and + * updated atomically. The time_tolerance variable can be considered a + * constant, since it is infrequently changed, and then only when the + * PPS signal is disabled. The watchdog counter pps_valid is updated + * once per second by hardclock() and is atomically cleared in this + * routine. + */ +void +hardpps(tvp, usec) + struct timeval *tvp; /* time at PPS */ + long usec; /* hardware counter at PPS */ +{ + long u_usec, v_usec, bigtick; + long cal_sec, cal_usec; + + /* + * An occasional glitch can be produced when the PPS interrupt + * occurs in the hardclock() routine before the time variable is + * updated. Here the offset is discarded when the difference + * between it and the last one is greater than tick/2, but not + * if the interval since the first discard exceeds 30 s. + */ + time_status |= STA_PPSSIGNAL; + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + pps_valid = 0; + u_usec = -tvp->tv_usec; + if (u_usec < -500000) + u_usec += 1000000; + v_usec = pps_offset - u_usec; + if (v_usec < 0) + v_usec = -v_usec; + if (v_usec > (tick >> 1)) { + if (pps_glitch > MAXGLITCH) { + pps_glitch = 0; + pps_tf[2] = u_usec; + pps_tf[1] = u_usec; + } else { + pps_glitch++; + u_usec = pps_offset; + } + } else + pps_glitch = 0; + + /* + * A three-stage median filter is used to help deglitch the pps + * time. The median sample becomes the time offset estimate; the + * difference between the other two samples becomes the time + * dispersion (jitter) estimate. 
+ */ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = u_usec; + if (pps_tf[0] > pps_tf[1]) { + if (pps_tf[1] > pps_tf[2]) { + pps_offset = pps_tf[1]; /* 0 1 2 */ + v_usec = pps_tf[0] - pps_tf[2]; + } else if (pps_tf[2] > pps_tf[0]) { + pps_offset = pps_tf[0]; /* 2 0 1 */ + v_usec = pps_tf[2] - pps_tf[1]; + } else { + pps_offset = pps_tf[2]; /* 0 2 1 */ + v_usec = pps_tf[0] - pps_tf[1]; + } + } else { + if (pps_tf[1] < pps_tf[2]) { + pps_offset = pps_tf[1]; /* 2 1 0 */ + v_usec = pps_tf[2] - pps_tf[0]; + } else if (pps_tf[2] < pps_tf[0]) { + pps_offset = pps_tf[0]; /* 1 0 2 */ + v_usec = pps_tf[1] - pps_tf[2]; + } else { + pps_offset = pps_tf[2]; /* 1 2 0 */ + v_usec = pps_tf[1] - pps_tf[0]; + } + } + if (v_usec > MAXTIME) + pps_jitcnt++; + v_usec = (v_usec << PPS_AVG) - pps_jitter; + if (v_usec < 0) + pps_jitter -= -v_usec >> PPS_AVG; + else + pps_jitter += v_usec >> PPS_AVG; + if (pps_jitter > (MAXTIME >> 1)) + time_status |= STA_PPSJITTER; + + /* + * During the calibration interval adjust the starting time when + * the tick overflows. At the end of the interval compute the + * duration of the interval and the difference of the hardware + * counters at the beginning and end of the interval. This code + * is deliciously complicated by the fact valid differences may + * exceed the value of tick when using long calibration + * intervals and small ticks. Note that the counter can be + * greater than tick if caught at just the wrong instant, but + * the values returned and used here are correct. 
+ */ + bigtick = (long)tick << SHIFT_USEC; + pps_usec -= pps_freq; + if (pps_usec >= bigtick) + pps_usec -= bigtick; + if (pps_usec < 0) + pps_usec += bigtick; + pps_time.tv_sec++; + pps_count++; + if (pps_count < (1 << pps_shift)) + return; + pps_count = 0; + pps_calcnt++; + u_usec = usec << SHIFT_USEC; + v_usec = pps_usec - u_usec; + if (v_usec >= bigtick >> 1) + v_usec -= bigtick; + if (v_usec < -(bigtick >> 1)) + v_usec += bigtick; + if (v_usec < 0) + v_usec = -(-v_usec >> pps_shift); + else + v_usec = v_usec >> pps_shift; + pps_usec = u_usec; + cal_sec = tvp->tv_sec; + cal_usec = tvp->tv_usec; + cal_sec -= pps_time.tv_sec; + cal_usec -= pps_time.tv_usec; + if (cal_usec < 0) { + cal_usec += 1000000; + cal_sec--; + } + pps_time = *tvp; + + /* + * Check for lost interrupts, noise, excessive jitter and + * excessive frequency error. The number of timer ticks during + * the interval may vary +-1 tick. Add to this a margin of one + * tick for the PPS signal jitter and maximum frequency + * deviation. If the limits are exceeded, the calibration + * interval is reset to the minimum and we start over. + */ + u_usec = (long)tick << 1; + if (!((cal_sec == -1 && cal_usec > (1000000 - u_usec)) + || (cal_sec == 0 && cal_usec < u_usec)) + || v_usec > time_tolerance || v_usec < -time_tolerance) { + pps_errcnt++; + pps_shift = PPS_SHIFT; + pps_intcnt = 0; + time_status |= STA_PPSERROR; + return; + } + + /* + * A three-stage median filter is used to help deglitch the pps + * frequency. The median sample becomes the frequency offset + * estimate; the difference between the other two samples + * becomes the frequency dispersion (stability) estimate. 
+ */ + pps_ff[2] = pps_ff[1]; + pps_ff[1] = pps_ff[0]; + pps_ff[0] = v_usec; + if (pps_ff[0] > pps_ff[1]) { + if (pps_ff[1] > pps_ff[2]) { + u_usec = pps_ff[1]; /* 0 1 2 */ + v_usec = pps_ff[0] - pps_ff[2]; + } else if (pps_ff[2] > pps_ff[0]) { + u_usec = pps_ff[0]; /* 2 0 1 */ + v_usec = pps_ff[2] - pps_ff[1]; + } else { + u_usec = pps_ff[2]; /* 0 2 1 */ + v_usec = pps_ff[0] - pps_ff[1]; + } + } else { + if (pps_ff[1] < pps_ff[2]) { + u_usec = pps_ff[1]; /* 2 1 0 */ + v_usec = pps_ff[2] - pps_ff[0]; + } else if (pps_ff[2] < pps_ff[0]) { + u_usec = pps_ff[0]; /* 1 0 2 */ + v_usec = pps_ff[1] - pps_ff[2]; + } else { + u_usec = pps_ff[2]; /* 1 2 0 */ + v_usec = pps_ff[1] - pps_ff[0]; + } + } + + /* + * Here the frequency dispersion (stability) is updated. If it + * is less than one-fourth the maximum (MAXFREQ), the frequency + * offset is updated as well, but clamped to the tolerance. It + * will be processed later by the hardclock() routine. + */ + v_usec = (v_usec >> 1) - pps_stabil; + if (v_usec < 0) + pps_stabil -= -v_usec >> PPS_AVG; + else + pps_stabil += v_usec >> PPS_AVG; + if (pps_stabil > MAXFREQ >> 2) { + pps_stbcnt++; + time_status |= STA_PPSWANDER; + return; + } + if (time_status & STA_PPSFREQ) { + if (u_usec < 0) { + pps_freq -= -u_usec >> PPS_AVG; + if (pps_freq < -time_tolerance) + pps_freq = -time_tolerance; + u_usec = -u_usec; + } else { + pps_freq += u_usec >> PPS_AVG; + if (pps_freq > time_tolerance) + pps_freq = time_tolerance; + } + } + + /* + * Here the calibration interval is adjusted. If the maximum + * time difference is greater than tick / 4, reduce the interval + * by half. If this is not the case for four consecutive + * intervals, double the interval. 
+ */ + if (u_usec << pps_shift > bigtick >> 2) { + pps_intcnt = 0; + if (pps_shift > PPS_SHIFT) + pps_shift--; + } else if (pps_intcnt >= 4) { + pps_intcnt = 0; + if (pps_shift < PPS_SHIFTMAX) + pps_shift++; + } else + pps_intcnt++; +} +#endif /* PPS_SYNC */ diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c index f4facf6..797ea2c 100644 --- a/sys/kern/kern_time.c +++ b/sys/kern/kern_time.c @@ -30,22 +30,22 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_time.c 8.4 (Berkeley) 5/26/95 + * @(#)kern_time.c 8.1 (Berkeley) 6/10/93 + * $Id: kern_time.c,v 1.21 1997/02/22 09:39:13 peter Exp $ */ #include <sys/param.h> +#include <sys/sysproto.h> #include <sys/resourcevar.h> +#include <sys/signalvar.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/vnode.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> +struct timezone tz; -#include <machine/cpu.h> - -/* +/* * Time of day and interval timer support. * * These routines provide the kernel entry points to get and set @@ -55,81 +55,97 @@ * timers when they expire. 
*/ +static void timevalfix __P((struct timeval *)); + +#ifndef _SYS_SYSPROTO_H_ +struct gettimeofday_args { + struct timeval *tp; + struct timezone *tzp; +}; +#endif /* ARGSUSED */ int gettimeofday(p, uap, retval) struct proc *p; - register struct gettimeofday_args /* { - syscallarg(struct timeval *) tp; - syscallarg(struct timezone *) tzp; - } */ *uap; - register_t *retval; + register struct gettimeofday_args *uap; + int *retval; { struct timeval atv; int error = 0; - if (SCARG(uap, tp)) { + if (uap->tp) { microtime(&atv); - if (error = copyout((caddr_t)&atv, (caddr_t)SCARG(uap, tp), - sizeof (atv))) + if ((error = copyout((caddr_t)&atv, (caddr_t)uap->tp, + sizeof (atv)))) return (error); } - if (SCARG(uap, tzp)) - error = copyout((caddr_t)&tz, (caddr_t)SCARG(uap, tzp), + if (uap->tzp) + error = copyout((caddr_t)&tz, (caddr_t)uap->tzp, sizeof (tz)); return (error); } +#ifndef _SYS_SYSPROTO_H_ +struct settimeofday_args { + struct timeval *tv; + struct timezone *tzp; +}; +#endif /* ARGSUSED */ int settimeofday(p, uap, retval) struct proc *p; - struct settimeofday_args /* { - syscallarg(struct timeval *) tv; - syscallarg(struct timezone *) tzp; - } */ *uap; - register_t *retval; + struct settimeofday_args *uap; + int *retval; { struct timeval atv, delta; struct timezone atz; int error, s; - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); /* Verify all parameters before changing time. 
*/ - if (SCARG(uap, tv) && (error = copyin((caddr_t)SCARG(uap, tv), - (caddr_t)&atv, sizeof(atv)))) + if (uap->tv && + (error = copyin((caddr_t)uap->tv, (caddr_t)&atv, sizeof(atv)))) return (error); - if (SCARG(uap, tzp) && (error = copyin((caddr_t)SCARG(uap, tzp), - (caddr_t)&atz, sizeof(atz)))) + if (atv.tv_usec < 0 || atv.tv_usec >= 1000000) + return (EINVAL); + if (uap->tzp && + (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz)))) return (error); - if (SCARG(uap, tv)) { + if (uap->tv) { + s = splclock(); /* - * If the system is secure, we do not allow the time to be - * set to an earlier value (it may be slowed using adjtime, - * but not set back). This feature prevent interlopers from - * setting arbitrary time stamps on files. + * Calculate delta directly to minimize clock interrupt + * latency. Fix it after the ipl has been lowered. */ - if (securelevel > 0 && timercmp(&atv, &time, <)) - return (EPERM); - /* WHAT DO WE DO ABOUT PENDING REAL-TIME TIMEOUTS??? */ - s = splclock(); - /* nb. delta.tv_usec may be < 0, but this is OK here */ delta.tv_sec = atv.tv_sec - time.tv_sec; delta.tv_usec = atv.tv_usec - time.tv_usec; time = atv; + /* + * XXX should arrange for microtime() to agree with atv if + * it is called now. As it is, it may add up to about + * `tick' unwanted usec. + * Another problem is that clock interrupts may occur at + * other than multiples of `tick'. It's not worth fixing + * this here, since the problem is also caused by tick + * adjustments. 
+ */ (void) splsoftclock(); + timevalfix(&delta); timevaladd(&boottime, &delta); - timevalfix(&boottime); timevaladd(&runtime, &delta); - timevalfix(&runtime); + /* re-use 'p' */ + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) + if (timerisset(&p->p_realtimer.it_value)) + timevaladd(&p->p_realtimer.it_value, &delta); # ifdef NFS lease_updatetime(delta.tv_sec); # endif splx(s); resettodr(); } - if (SCARG(uap, tzp)) + if (uap->tzp) tz = atz; return (0); } @@ -137,26 +153,29 @@ settimeofday(p, uap, retval) extern int tickadj; /* "standard" clock skew, us./tick */ int tickdelta; /* current clock skew, us. per tick */ long timedelta; /* unapplied time correction, us. */ -long bigadj = 1000000; /* use 10x skew above bigadj us. */ +static long bigadj = 1000000; /* use 10x skew above bigadj us. */ +#ifndef _SYS_SYSPROTO_H_ +struct adjtime_args { + struct timeval *delta; + struct timeval *olddelta; +}; +#endif /* ARGSUSED */ int adjtime(p, uap, retval) struct proc *p; - register struct adjtime_args /* { - syscallarg(struct timeval *) delta; - syscallarg(struct timeval *) olddelta; - } */ *uap; - register_t *retval; + register struct adjtime_args *uap; + int *retval; { struct timeval atv; register long ndelta, ntickdelta, odelta; int s, error; - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); - if (error = copyin((caddr_t)SCARG(uap, delta), (caddr_t)&atv, - sizeof(struct timeval))) + if ((error = + copyin((caddr_t)uap->delta, (caddr_t)&atv, sizeof(struct timeval)))) return (error); /* @@ -167,7 +186,7 @@ adjtime(p, uap, retval) * overshoot and start taking us away from the desired final time. 
*/ ndelta = atv.tv_sec * 1000000 + atv.tv_usec; - if (ndelta > bigadj) + if (ndelta > bigadj || ndelta < -bigadj) ntickdelta = 10 * tickadj; else ntickdelta = tickadj; @@ -187,10 +206,10 @@ adjtime(p, uap, retval) tickdelta = ntickdelta; splx(s); - if (SCARG(uap, olddelta)) { + if (uap->olddelta) { atv.tv_sec = odelta / 1000000; atv.tv_usec = odelta % 1000000; - (void) copyout((caddr_t)&atv, (caddr_t)SCARG(uap, olddelta), + (void) copyout((caddr_t)&atv, (caddr_t)uap->olddelta, sizeof(struct timeval)); } return (0); @@ -217,25 +236,28 @@ adjtime(p, uap, retval) * real time timers .it_interval. Rather, we compute the next time in * absolute time the timer should go off. */ +#ifndef _SYS_SYSPROTO_H_ +struct getitimer_args { + u_int which; + struct itimerval *itv; +}; +#endif /* ARGSUSED */ int getitimer(p, uap, retval) struct proc *p; - register struct getitimer_args /* { - syscallarg(u_int) which; - syscallarg(struct itimerval *) itv; - } */ *uap; - register_t *retval; + register struct getitimer_args *uap; + int *retval; { struct itimerval aitv; int s; - if (SCARG(uap, which) > ITIMER_PROF) + if (uap->which > ITIMER_PROF) return (EINVAL); s = splclock(); - if (SCARG(uap, which) == ITIMER_REAL) { + if (uap->which == ITIMER_REAL) { /* - * Convert from absolute to relative time in .it_value + * Convert from absoulte to relative time in .it_value * part of real time timer. If time for real time timer * has passed return 0, else return difference between * current time and time for the timer to go off. 
@@ -245,53 +267,54 @@ getitimer(p, uap, retval) if (timercmp(&aitv.it_value, &time, <)) timerclear(&aitv.it_value); else - timevalsub(&aitv.it_value, - (struct timeval *)&time); + timevalsub(&aitv.it_value, &time); } else - aitv = p->p_stats->p_timer[SCARG(uap, which)]; + aitv = p->p_stats->p_timer[uap->which]; splx(s); - return (copyout((caddr_t)&aitv, (caddr_t)SCARG(uap, itv), + return (copyout((caddr_t)&aitv, (caddr_t)uap->itv, sizeof (struct itimerval))); } +#ifndef _SYS_SYSPROTO_H_ +struct setitimer_args { + u_int which; + struct itimerval *itv, *oitv; +}; +#endif /* ARGSUSED */ int setitimer(p, uap, retval) struct proc *p; - register struct setitimer_args /* { - syscallarg(u_int) which; - syscallarg(struct itimerval *) itv; - syscallarg(struct itimerval *) oitv; - } */ *uap; - register_t *retval; + register struct setitimer_args *uap; + int *retval; { struct itimerval aitv; register struct itimerval *itvp; int s, error; - if (SCARG(uap, which) > ITIMER_PROF) + if (uap->which > ITIMER_PROF) return (EINVAL); - itvp = SCARG(uap, itv); + itvp = uap->itv; if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv, sizeof(struct itimerval)))) return (error); - if ((SCARG(uap, itv) = SCARG(uap, oitv)) && - (error = getitimer(p, uap, retval))) + if ((uap->itv = uap->oitv) && + (error = getitimer(p, (struct getitimer_args *)uap, retval))) return (error); if (itvp == 0) return (0); if (itimerfix(&aitv.it_value) || itimerfix(&aitv.it_interval)) return (EINVAL); s = splclock(); - if (SCARG(uap, which) == ITIMER_REAL) { + if (uap->which == ITIMER_REAL) { untimeout(realitexpire, (caddr_t)p); if (timerisset(&aitv.it_value)) { - timevaladd(&aitv.it_value, (struct timeval *)&time); + timevaladd(&aitv.it_value, &time); timeout(realitexpire, (caddr_t)p, hzto(&aitv.it_value)); } p->p_realtimer = aitv; } else - p->p_stats->p_timer[SCARG(uap, which)] = aitv; + p->p_stats->p_timer[uap->which] = aitv; splx(s); return (0); } @@ -303,6 +326,10 @@ setitimer(p, uap, retval) * Else 
compute next time timer should go off which is > current time. * This is where delay in processing this timeout causes multiple * SIGALRM calls to be compressed into one. + * hzto() always adds 1 to allow for the time until the next clock + * interrupt being strictly less than 1 clock tick, but we don't want + * that here since we want to appear to be in sync with the clock + * interrupt even when we're delayed. */ void realitexpire(arg) @@ -323,7 +350,7 @@ realitexpire(arg) &p->p_realtimer.it_interval); if (timercmp(&p->p_realtimer.it_value, &time, >)) { timeout(realitexpire, (caddr_t)p, - hzto(&p->p_realtimer.it_value)); + hzto(&p->p_realtimer.it_value) - 1); splx(s); return; } @@ -400,6 +427,7 @@ expire: * it just gets very confused in this case. * Caveat emptor. */ +void timevaladd(t1, t2) struct timeval *t1, *t2; { @@ -409,6 +437,7 @@ timevaladd(t1, t2) timevalfix(t1); } +void timevalsub(t1, t2) struct timeval *t1, *t2; { @@ -418,6 +447,7 @@ timevalsub(t1, t2) timevalfix(t1); } +static void timevalfix(t1) struct timeval *t1; { diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c new file mode 100644 index 0000000..171ed0e --- /dev/null +++ b/sys/kern/kern_timeout.c @@ -0,0 +1,1303 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_clock.c,v 1.34 1997/03/22 16:52:19 mpp Exp $ + */ + +/* Portions of this software are covered by the following: */ +/****************************************************************************** + * * + * Copyright (c) David L. 
Mills 1993, 1994 * + * * + * Permission to use, copy, modify, and distribute this software and its * + * documentation for any purpose and without fee is hereby granted, provided * + * that the above copyright notice appears in all copies and that both the * + * copyright notice and this permission notice appear in supporting * + * documentation, and that the name University of Delaware not be used in * + * advertising or publicity pertaining to distribution of the software * + * without specific, written prior permission. The University of Delaware * + * makes no representations about the suitability this software for any * + * purpose. It is provided "as is" without express or implied warranty. * + * * + *****************************************************************************/ + +#include "opt_cpu.h" /* XXX */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/dkstat.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/timex.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/sysctl.h> + +#include <machine/cpu.h> +#define CLOCK_HAIR /* XXX */ +#include <machine/clock.h> + +#ifdef GPROF +#include <sys/gmon.h> +#endif + +static void initclocks __P((void *dummy)); +SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) + +/* Exported to machdep.c. */ +struct callout *callfree, *callout; + +static struct callout calltodo; + +/* Some of these don't belong here, but it's easiest to concentrate them. */ +static long cp_time[CPUSTATES]; +long dk_seek[DK_NDRIVE]; +static long dk_time[DK_NDRIVE]; +long dk_wds[DK_NDRIVE]; +long dk_wpms[DK_NDRIVE]; +long dk_xfer[DK_NDRIVE]; + +int dk_busy; +int dk_ndrive = 0; +char dk_names[DK_NDRIVE][DK_NAMELEN]; + +long tk_cancc; +long tk_nin; +long tk_nout; +long tk_rawcc; + +/* + * Clock handling routines. 
+ * + * This code is written to operate with two timers that run independently of + * each other. The main clock, running hz times per second, is used to keep + * track of real time. The second timer handles kernel and user profiling, + * and does resource use estimation. If the second timer is programmable, + * it is randomized to avoid aliasing between the two clocks. For example, + * the randomization prevents an adversary from always giving up the cpu + * just before its quantum expires. Otherwise, it would never accumulate + * cpu ticks. The mean frequency of the second timer is stathz. + * + * If no second timer exists, stathz will be zero; in this case we drive + * profiling and statistics off the main clock. This WILL NOT be accurate; + * do not do it unless absolutely necessary. + * + * The statistics clock may (or may not) be run at a higher rate while + * profiling. This profile clock runs at profhz. We require that profhz + * be an integral multiple of stathz. + * + * If the statistics clock is running fast, it must be divided by the ratio + * profhz/stathz for statistics. (For profiling, every tick counts.) + */ + +/* + * TODO: + * allocate more timeout table slots when table overflows. + */ + +/* + * Bump a timeval by a small number of usec's. + */ +#define BUMPTIME(t, usec) { \ + register volatile struct timeval *tp = (t); \ + register long us; \ + \ + tp->tv_usec = us = tp->tv_usec + (usec); \ + if (us >= 1000000) { \ + tp->tv_usec = us - 1000000; \ + tp->tv_sec++; \ + } \ +} + +int stathz; +int profhz; +static int profprocs; +int ticks; +static int psdiv, pscnt; /* prof => stat divider */ +int psratio; /* ratio: prof / stat */ + +volatile struct timeval time; +volatile struct timeval mono_time; + +/* + * Phase/frequency-lock loop (PLL/FLL) definitions + * + * The following variables are read and set by the ntp_adjtime() system + * call. + * + * time_state shows the state of the system clock, with values defined + * in the timex.h header file. 
+ * + * time_status shows the status of the system clock, with bits defined + * in the timex.h header file. + * + * time_offset is used by the PLL/FLL to adjust the system time in small + * increments. + * + * time_constant determines the bandwidth or "stiffness" of the PLL. + * + * time_tolerance determines maximum frequency error or tolerance of the + * CPU clock oscillator and is a property of the architecture; however, + * in principle it could change as result of the presence of external + * discipline signals, for instance. + * + * time_precision is usually equal to the kernel tick variable; however, + * in cases where a precision clock counter or external clock is + * available, the resolution can be much less than this and depend on + * whether the external clock is working or not. + * + * time_maxerror is initialized by a ntp_adjtime() call and increased by + * the kernel once each second to reflect the maximum error + * bound growth. + * + * time_esterror is set and read by the ntp_adjtime() call, but + * otherwise not used by the kernel. + */ +int time_status = STA_UNSYNC; /* clock status bits */ +int time_state = TIME_OK; /* clock state */ +long time_offset = 0; /* time offset (us) */ +long time_constant = 0; /* pll time constant */ +long time_tolerance = MAXFREQ; /* frequency tolerance (scaled ppm) */ +long time_precision = 1; /* clock precision (us) */ +long time_maxerror = MAXPHASE; /* maximum error (us) */ +long time_esterror = MAXPHASE; /* estimated error (us) */ + +/* + * The following variables establish the state of the PLL/FLL and the + * residual time and frequency offset of the local clock. The scale + * factors are defined in the timex.h header file. + * + * time_phase and time_freq are the phase increment and the frequency + * increment, respectively, of the kernel time variable at each tick of + * the clock. + * + * time_freq is set via ntp_adjtime() from a value stored in a file when + * the synchronization daemon is first started. 
Its value is retrieved + * via ntp_adjtime() and written to the file about once per hour by the + * daemon. + * + * time_adj is the adjustment added to the value of tick at each timer + * interrupt and is recomputed from time_phase and time_freq at each + * seconds rollover. + * + * time_reftime is the second's portion of the system time on the last + * call to ntp_adjtime(). It is used to adjust the time_freq variable + * and to increase the time_maxerror as the time since last update + * increases. + */ +static long time_phase = 0; /* phase offset (scaled us) */ +long time_freq = 0; /* frequency offset (scaled ppm) */ +static long time_adj = 0; /* tick adjust (scaled 1 / hz) */ +static long time_reftime = 0; /* time at last adjustment (s) */ + +#ifdef PPS_SYNC +/* + * The following variables are used only if the kernel PPS discipline + * code is configured (PPS_SYNC). The scale factors are defined in the + * timex.h header file. + * + * pps_time contains the time at each calibration interval, as read by + * microtime(). pps_count counts the seconds of the calibration + * interval, the duration of which is nominally pps_shift in powers of + * two. + * + * pps_offset is the time offset produced by the time median filter + * pps_tf[], while pps_jitter is the dispersion (jitter) measured by + * this filter. + * + * pps_freq is the frequency offset produced by the frequency median + * filter pps_ff[], while pps_stabil is the dispersion (wander) measured + * by this filter. + * + * pps_usec is latched from a high resolution counter or external clock + * at pps_time. Here we want the hardware counter contents only, not the + * contents plus the time_tv.usec as usual. + * + * pps_valid counts the number of seconds since the last PPS update. It + * is used as a watchdog timer to disable the PPS discipline should the + * PPS signal be lost. 
+ * + * pps_glitch counts the number of seconds since the beginning of an + * offset burst more than tick/2 from current nominal offset. It is used + * mainly to suppress error bursts due to priority conflicts between the + * PPS interrupt and timer interrupt. + * + * pps_intcnt counts the calibration intervals for use in the interval- + * adaptation algorithm. It's just too complicated for words. + */ +struct timeval pps_time; /* kernel time at last interval */ +long pps_offset = 0; /* pps time offset (us) */ +long pps_jitter = MAXTIME; /* pps time dispersion (jitter) (us) */ +long pps_tf[] = {0, 0, 0}; /* pps time offset median filter (us) */ +long pps_freq = 0; /* frequency offset (scaled ppm) */ +long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */ +long pps_ff[] = {0, 0, 0}; /* frequency offset median filter */ +long pps_usec = 0; /* microsec counter at last interval */ +long pps_valid = PPS_VALID; /* pps signal watchdog counter */ +int pps_glitch = 0; /* pps signal glitch counter */ +int pps_count = 0; /* calibration interval counter (s) */ +int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */ +int pps_intcnt = 0; /* intervals at current duration */ + +/* + * PPS signal quality monitors + * + * pps_jitcnt counts the seconds that have been discarded because the + * jitter measured by the time median filter exceeds the limit MAXTIME + * (100 us). + * + * pps_calcnt counts the frequency calibration intervals, which are + * variable from 4 s to 256 s. + * + * pps_errcnt counts the calibration intervals which have been discarded + * because the wander exceeds the limit MAXFREQ (100 ppm) or where the + * calibration interval jitter exceeds two ticks. + * + * pps_stbcnt counts the calibration intervals that have been discarded + * because the frequency wander exceeds the limit MAXFREQ / 4 (25 us). 
+ */ +long pps_jitcnt = 0; /* jitter limit exceeded */ +long pps_calcnt = 0; /* calibration intervals */ +long pps_errcnt = 0; /* calibration errors */ +long pps_stbcnt = 0; /* stability limit exceeded */ +#endif /* PPS_SYNC */ + +/* XXX none of this stuff works under FreeBSD */ +#ifdef EXT_CLOCK +/* + * External clock definitions + * + * The following definitions and declarations are used only if an + * external clock (HIGHBALL or TPRO) is configured on the system. + */ +#define CLOCK_INTERVAL 30 /* CPU clock update interval (s) */ + +/* + * The clock_count variable is set to CLOCK_INTERVAL at each PPS + * interrupt and decremented once each second. + */ +int clock_count = 0; /* CPU clock counter */ + +#ifdef HIGHBALL +/* + * The clock_offset and clock_cpu variables are used by the HIGHBALL + * interface. The clock_offset variable defines the offset between + * system time and the HIGBALL counters. The clock_cpu variable contains + * the offset between the system clock and the HIGHBALL clock for use in + * disciplining the kernel time variable. + */ +extern struct timeval clock_offset; /* Highball clock offset */ +long clock_cpu = 0; /* CPU clock adjust */ +#endif /* HIGHBALL */ +#endif /* EXT_CLOCK */ + +/* + * hardupdate() - local clock update + * + * This routine is called by ntp_adjtime() to update the local clock + * phase and frequency. The implementation is of an adaptive-parameter, + * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new + * time and frequency offset estimates for each call. If the kernel PPS + * discipline code is configured (PPS_SYNC), the PPS signal itself + * determines the new time offset, instead of the calling argument. + * Presumably, calls to ntp_adjtime() occur only when the caller + * believes the local clock is valid within some bound (+-128 ms with + * NTP). If the caller's time is far different than the PPS time, an + * argument will ensue, and it's not clear who will lose. 
+ * + * For uncompensated quartz crystal oscillatores and nominal update + * intervals less than 1024 s, operation should be in phase-lock mode + * (STA_FLL = 0), where the loop is disciplined to phase. For update + * intervals greater than thiss, operation should be in frequency-lock + * mode (STA_FLL = 1), where the loop is disciplined to frequency. + * + * Note: splclock() is in effect. + */ +void +hardupdate(offset) + long offset; +{ + long ltemp, mtemp; + + if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME)) + return; + ltemp = offset; +#ifdef PPS_SYNC + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + ltemp = pps_offset; +#endif /* PPS_SYNC */ + + /* + * Scale the phase adjustment and clamp to the operating range. + */ + if (ltemp > MAXPHASE) + time_offset = MAXPHASE << SHIFT_UPDATE; + else if (ltemp < -MAXPHASE) + time_offset = -(MAXPHASE << SHIFT_UPDATE); + else + time_offset = ltemp << SHIFT_UPDATE; + + /* + * Select whether the frequency is to be controlled and in which + * mode (PLL or FLL). Clamp to the operating range. Ugly + * multiply/divide should be replaced someday. + */ + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = time.tv_sec; + mtemp = time.tv_sec - time_reftime; + time_reftime = time.tv_sec; + if (time_status & STA_FLL) { + if (mtemp >= MINSEC) { + ltemp = ((time_offset / mtemp) << (SHIFT_USEC - + SHIFT_UPDATE)); + if (ltemp < 0) + time_freq -= -ltemp >> SHIFT_KH; + else + time_freq += ltemp >> SHIFT_KH; + } + } else { + if (mtemp < MAXSEC) { + ltemp *= mtemp; + if (ltemp < 0) + time_freq -= -ltemp >> (time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC); + else + time_freq += ltemp >> (time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC); + } + } + if (time_freq > time_tolerance) + time_freq = time_tolerance; + else if (time_freq < -time_tolerance) + time_freq = -time_tolerance; +} + + + +/* + * Initialize clock frequencies and start both clocks running. 
+ */ +/* ARGSUSED*/ +static void +initclocks(dummy) + void *dummy; +{ + register int i; + + /* + * Set divisors to 1 (normal case) and let the machine-specific + * code do its bit. + */ + psdiv = pscnt = 1; + cpu_initclocks(); + + /* + * Compute profhz/stathz, and fix profhz if needed. + */ + i = stathz ? stathz : hz; + if (profhz == 0) + profhz = i; + psratio = profhz / i; +} + +/* + * The real-time timer, interrupting hz times per second. + */ +void +hardclock(frame) + register struct clockframe *frame; +{ + register struct callout *p1; + register struct proc *p; + register int needsoft; + + /* + * Update real-time timeout queue. + * At front of queue are some number of events which are ``due''. + * The time to these is <= 0 and if negative represents the + * number of ticks which have passed since it was supposed to happen. + * The rest of the q elements (times > 0) are events yet to happen, + * where the time for each is given as a delta from the previous. + * Decrementing just the first of these serves to decrement the time + * to all events. + */ + needsoft = 0; + for (p1 = calltodo.c_next; p1 != NULL; p1 = p1->c_next) { + if (--p1->c_time > 0) + break; + needsoft = 1; + if (p1->c_time == 0) + break; + } + + p = curproc; + if (p) { + register struct pstats *pstats; + + /* + * Run current process's virtual and profile time, as needed. + */ + pstats = p->p_stats; + if (CLKF_USERMODE(frame) && + timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) + psignal(p, SIGVTALRM); + if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) + psignal(p, SIGPROF); + } + + /* + * If no separate statistics clock is available, run it from here. + */ + if (stathz == 0) + statclock(frame); + + /* + * Increment the time-of-day. 
+ */ + ticks++; + { + int time_update; + struct timeval newtime = time; + long ltemp; + + if (timedelta == 0) { + time_update = CPU_THISTICKLEN(tick); + } else { + time_update = CPU_THISTICKLEN(tick) + tickdelta; + timedelta -= tickdelta; + } + BUMPTIME(&mono_time, time_update); + + /* + * Compute the phase adjustment. If the low-order bits + * (time_phase) of the update overflow, bump the high-order bits + * (time_update). + */ + time_phase += time_adj; + if (time_phase <= -FINEUSEC) { + ltemp = -time_phase >> SHIFT_SCALE; + time_phase += ltemp << SHIFT_SCALE; + time_update -= ltemp; + } + else if (time_phase >= FINEUSEC) { + ltemp = time_phase >> SHIFT_SCALE; + time_phase -= ltemp << SHIFT_SCALE; + time_update += ltemp; + } + + newtime.tv_usec += time_update; + /* + * On rollover of the second the phase adjustment to be used for + * the next second is calculated. Also, the maximum error is + * increased by the tolerance. If the PPS frequency discipline + * code is present, the phase is increased to compensate for the + * CPU clock oscillator frequency error. + * + * On a 32-bit machine and given parameters in the timex.h + * header file, the maximum phase adjustment is +-512 ms and + * maximum frequency offset is a tad less than) +-512 ppm. On a + * 64-bit machine, you shouldn't need to ask. + */ + if (newtime.tv_usec >= 1000000) { + newtime.tv_usec -= 1000000; + newtime.tv_sec++; + time_maxerror += time_tolerance >> SHIFT_USEC; + + /* + * Compute the phase adjustment for the next second. In + * PLL mode, the offset is reduced by a fixed factor + * times the time constant. In FLL mode the offset is + * used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread + * the adjustment over not more than the number of + * seconds between updates. 
+ */ + if (time_offset < 0) { + ltemp = -time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << + SHIFT_UPDATE; + time_offset += ltemp; + time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - + SHIFT_UPDATE); + } else { + ltemp = time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << + SHIFT_UPDATE; + time_offset -= ltemp; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - + SHIFT_UPDATE); + } + + /* + * Compute the frequency estimate and additional phase + * adjustment due to frequency error for the next + * second. When the PPS signal is engaged, gnaw on the + * watchdog counter and update the frequency computed by + * the pll and the PPS signal. + */ +#ifdef PPS_SYNC + pps_valid++; + if (pps_valid == PPS_VALID) { + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; +#else + ltemp = time_freq; +#endif /* PPS_SYNC */ + if (ltemp < 0) + time_adj -= -ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + else + time_adj += ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + +#if SHIFT_HZ == 7 + /* + * When the CPU clock oscillator frequency is not a + * power of two in Hz, the SHIFT_HZ is only an + * approximate scale factor. In the SunOS kernel, this + * results in a PLL gain factor of 1/1.28 = 0.78 what it + * should be. In the following code the overall gain is + * increased by a factor of 1.25, which results in a + * residual error less than 3 percent. 
+ */ + /* Same thing applies for FreeBSD --GAW */ + if (hz == 100) { + if (time_adj < 0) + time_adj -= -time_adj >> 2; + else + time_adj += time_adj >> 2; + } +#endif /* SHIFT_HZ */ + + /* XXX - this is really bogus, but can't be fixed until + xntpd's idea of the system clock is fixed to know how + the user wants leap seconds handled; in the mean time, + we assume that users of NTP are running without proper + leap second support (this is now the default anyway) */ + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. The microtime() routine or + * external clock driver will insure that reported time + * is always monotonic. The ugly divides should be + * replaced. + */ + switch (time_state) { + + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + + case TIME_INS: + if (newtime.tv_sec % 86400 == 0) { + newtime.tv_sec--; + time_state = TIME_OOP; + } + break; + + case TIME_DEL: + if ((newtime.tv_sec + 1) % 86400 == 0) { + newtime.tv_sec++; + time_state = TIME_WAIT; + } + break; + + case TIME_OOP: + time_state = TIME_WAIT; + break; + + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + } + CPU_CLOCKUPDATE(&time, &newtime); + } + + /* + * Process callouts at a very low cpu priority, so we don't keep the + * relatively high clock interrupt priority any longer than necessary. + */ + if (needsoft) { + if (CLKF_BASEPRI(frame)) { + /* + * Save the overhead of a software interrupt; + * it will happen as soon as we return, so do it now. + */ + (void)splsoftclock(); + softclock(); + } else + setsoftclock(); + } +} + +/* + * Software (low priority) clock interrupt. + * Run periodic events from timeout queue. 
+ */ +/*ARGSUSED*/ +void +softclock() +{ + register struct callout *c; + register void *arg; + register void (*func) __P((void *)); + register int s; + + s = splhigh(); + while ((c = calltodo.c_next) != NULL && c->c_time <= 0) { + func = c->c_func; + arg = c->c_arg; + calltodo.c_next = c->c_next; + c->c_next = callfree; + callfree = c; + splx(s); + (*func)(arg); + (void) splhigh(); + } + splx(s); +} + +/* + * timeout -- + * Execute a function after a specified length of time. + * + * untimeout -- + * Cancel previous timeout function call. + * + * See AT&T BCI Driver Reference Manual for specification. This + * implementation differs from that one in that no identification + * value is returned from timeout, rather, the original arguments + * to timeout are used to identify entries for untimeout. + */ +void +timeout(ftn, arg, ticks) + timeout_t ftn; + void *arg; + register int ticks; +{ + register struct callout *new, *p, *t; + register int s; + + if (ticks <= 0) + ticks = 1; + + /* Lock out the clock. */ + s = splhigh(); + + /* Fill in the next free callout structure. */ + if (callfree == NULL) + panic("timeout table full"); + new = callfree; + callfree = new->c_next; + new->c_arg = arg; + new->c_func = ftn; + + /* + * The time for each event is stored as a difference from the time + * of the previous event on the queue. Walk the queue, correcting + * the ticks argument for queue entries passed. Correct the ticks + * value for the queue entry immediately after the insertion point + * as well. Watch out for negative c_time values; these represent + * overdue events. + */ + for (p = &calltodo; + (t = p->c_next) != NULL && ticks > t->c_time; p = t) + if (t->c_time > 0) + ticks -= t->c_time; + new->c_time = ticks; + if (t != NULL) + t->c_time -= ticks; + + /* Insert the new entry into the queue. 
*/ + p->c_next = new; + new->c_next = t; + splx(s); +} + +void +untimeout(ftn, arg) + timeout_t ftn; + void *arg; +{ + register struct callout *p, *t; + register int s; + + s = splhigh(); + for (p = &calltodo; (t = p->c_next) != NULL; p = t) + if (t->c_func == ftn && t->c_arg == arg) { + /* Increment next entry's tick count. */ + if (t->c_next && t->c_time > 0) + t->c_next->c_time += t->c_time; + + /* Move entry from callout queue to callfree queue. */ + p->c_next = t->c_next; + t->c_next = callfree; + callfree = t; + break; + } + splx(s); +} + +void +gettime(struct timeval *tvp) +{ + int s; + + s = splclock(); + /* XXX should use microtime() iff tv_usec is used. */ + *tvp = time; + splx(s); +} + +/* + * Compute number of hz until specified time. Used to + * compute third argument to timeout() from an absolute time. + */ +int +hzto(tv) + struct timeval *tv; +{ + register unsigned long ticks; + register long sec, usec; + int s; + + /* + * If the number of usecs in the whole seconds part of the time + * difference fits in a long, then the total number of usecs will + * fit in an unsigned long. Compute the total and convert it to + * ticks, rounding up and adding 1 to allow for the current tick + * to expire. Rounding also depends on unsigned long arithmetic + * to avoid overflow. + * + * Otherwise, if the number of ticks in the whole seconds part of + * the time difference fits in a long, then convert the parts to + * ticks separately and add, using similar rounding methods and + * overflow avoidance. This method would work in the previous + * case but it is slightly slower and assumes that hz is integral. + * + * Otherwise, round the time difference down to the maximum + * representable value. + * + * If ints have 32 bits, then the maximum value for any timeout in + * 10ms ticks is 248 days. 
+ */ + s = splclock(); + sec = tv->tv_sec - time.tv_sec; + usec = tv->tv_usec - time.tv_usec; + splx(s); + if (usec < 0) { + sec--; + usec += 1000000; + } + if (sec < 0) { +#ifdef DIAGNOSTIC + printf("hzto: negative time difference %ld sec %ld usec\n", + sec, usec); +#endif + ticks = 1; + } else if (sec <= LONG_MAX / 1000000) + ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) + / tick + 1; + else if (sec <= LONG_MAX / hz) + ticks = sec * hz + + ((unsigned long)usec + (tick - 1)) / tick + 1; + else + ticks = LONG_MAX; + if (ticks > INT_MAX) + ticks = INT_MAX; + return (ticks); +} + +/* + * Start profiling on a process. + * + * Kernel profiling passes proc0 which never exits and hence + * keeps the profile clock running constantly. + */ +void +startprofclock(p) + register struct proc *p; +{ + int s; + + if ((p->p_flag & P_PROFIL) == 0) { + p->p_flag |= P_PROFIL; + if (++profprocs == 1 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = psratio; + setstatclockrate(profhz); + splx(s); + } + } +} + +/* + * Stop profiling on a process. + */ +void +stopprofclock(p) + register struct proc *p; +{ + int s; + + if (p->p_flag & P_PROFIL) { + p->p_flag &= ~P_PROFIL; + if (--profprocs == 0 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = 1; + setstatclockrate(stathz); + splx(s); + } + } +} + +/* + * Statistics clock. Grab profile sample, and if divider reaches 0, + * do process and kernel statistics. + */ +void +statclock(frame) + register struct clockframe *frame; +{ +#ifdef GPROF + register struct gmonparam *g; +#endif + register struct proc *p; + register int i; + struct pstats *pstats; + long rss; + struct rusage *ru; + struct vmspace *vm; + + if (CLKF_USERMODE(frame)) { + p = curproc; + if (p->p_flag & P_PROFIL) + addupc_intr(p, CLKF_PC(frame), 1); + if (--pscnt > 0) + return; + /* + * Came from user mode; CPU was in user state. + * If this process is being profiled record the tick. 
+ */ + p->p_uticks++; + if (p->p_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + } else { +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = CLKF_PC(frame) - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif + if (--pscnt > 0) + return; + /* + * Came from kernel mode, so we were: + * - handling an interrupt, + * - doing syscall or trap work on behalf of the current + * user process, or + * - spinning in the idle loop. + * Whichever it is, charge the time as appropriate. + * Note that we charge interrupts to the current process, + * regardless of whether they are ``for'' that process, + * so that we know how much of its real time was spent + * in ``non-process'' (i.e., interrupt) work. + */ + p = curproc; + if (CLKF_INTR(frame)) { + if (p != NULL) + p->p_iticks++; + cp_time[CP_INTR]++; + } else if (p != NULL) { + p->p_sticks++; + cp_time[CP_SYS]++; + } else + cp_time[CP_IDLE]++; + } + pscnt = psdiv; + + /* + * We maintain statistics shown by user-level statistics + * programs: the amount of time in each cpu state, and + * the amount of time each of DK_NDRIVE ``drives'' is busy. + * + * XXX should either run linked list of drives, or (better) + * grab timestamps in the start & done code. + */ + for (i = 0; i < DK_NDRIVE; i++) + if (dk_busy & (1 << i)) + dk_time[i]++; + + /* + * We adjust the priority of the current process. The priority of + * a process gets worse as it accumulates CPU time. The cpu usage + * estimator (p_estcpu) is increased here. The formula for computing + * priorities (in kern_synch.c) will compute a different value each + * time p_estcpu increases by 4. The cpu usage estimator ramps up + * quite quickly when the process is running (linearly), and decays + * away exponentially, at a rate which is proportionally slower when + * the system is busy. 
The basic principal is that the system will + * 90% forget that the process used a lot of CPU time in 5 * loadav + * seconds. This causes the system to favor processes which haven't + * run much recently, and to round-robin among other processes. + */ + if (p != NULL) { + p->p_cpticks++; + if (++p->p_estcpu == 0) + p->p_estcpu--; + if ((p->p_estcpu & 3) == 0) { + resetpriority(p); + if (p->p_priority >= PUSER) + p->p_priority = p->p_usrpri; + } + + /* Update resource usage integrals and maximums. */ + if ((pstats = p->p_stats) != NULL && + (ru = &pstats->p_ru) != NULL && + (vm = p->p_vmspace) != NULL) { + ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024; + ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024; + ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024; + rss = vm->vm_pmap.pm_stats.resident_count * + PAGE_SIZE / 1024; + if (ru->ru_maxrss < rss) + ru->ru_maxrss = rss; + } + } +} + +/* + * Return information about system clocks. + */ +static int +sysctl_kern_clockrate SYSCTL_HANDLER_ARGS +{ + struct clockinfo clkinfo; + /* + * Construct clockinfo structure. + */ + clkinfo.hz = hz; + clkinfo.tick = tick; + clkinfo.profhz = profhz; + clkinfo.stathz = stathz ? stathz : hz; + return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); +} + +SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, + 0, 0, sysctl_kern_clockrate, "S,clockinfo",""); + +#ifdef PPS_SYNC +/* + * hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS interrupt in order to discipline + * the CPU clock oscillator to the PPS signal. It measures the PPS phase + * and leaves it in a handy spot for the hardclock() routine. It + * integrates successive PPS phase differences and calculates the + * frequency offset. This is used in hardclock() to discipline the CPU + * clock oscillator so that intrinsic frequency error is cancelled out. 
+ * The code requires the caller to capture the time and hardware counter + * value at the on-time PPS signal transition. + * + * Note that, on some Unix systems, this routine runs at an interrupt + * priority level higher than the timer interrupt routine hardclock(). + * Therefore, the variables used are distinct from the hardclock() + * variables, except for certain exceptions: The PPS frequency pps_freq + * and phase pps_offset variables are determined by this routine and + * updated atomically. The time_tolerance variable can be considered a + * constant, since it is infrequently changed, and then only when the + * PPS signal is disabled. The watchdog counter pps_valid is updated + * once per second by hardclock() and is atomically cleared in this + * routine. + */ +void +hardpps(tvp, usec) + struct timeval *tvp; /* time at PPS */ + long usec; /* hardware counter at PPS */ +{ + long u_usec, v_usec, bigtick; + long cal_sec, cal_usec; + + /* + * An occasional glitch can be produced when the PPS interrupt + * occurs in the hardclock() routine before the time variable is + * updated. Here the offset is discarded when the difference + * between it and the last one is greater than tick/2, but not + * if the interval since the first discard exceeds 30 s. + */ + time_status |= STA_PPSSIGNAL; + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + pps_valid = 0; + u_usec = -tvp->tv_usec; + if (u_usec < -500000) + u_usec += 1000000; + v_usec = pps_offset - u_usec; + if (v_usec < 0) + v_usec = -v_usec; + if (v_usec > (tick >> 1)) { + if (pps_glitch > MAXGLITCH) { + pps_glitch = 0; + pps_tf[2] = u_usec; + pps_tf[1] = u_usec; + } else { + pps_glitch++; + u_usec = pps_offset; + } + } else + pps_glitch = 0; + + /* + * A three-stage median filter is used to help deglitch the pps + * time. The median sample becomes the time offset estimate; the + * difference between the other two samples becomes the time + * dispersion (jitter) estimate. 
+ */ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = u_usec; + if (pps_tf[0] > pps_tf[1]) { + if (pps_tf[1] > pps_tf[2]) { + pps_offset = pps_tf[1]; /* 0 1 2 */ + v_usec = pps_tf[0] - pps_tf[2]; + } else if (pps_tf[2] > pps_tf[0]) { + pps_offset = pps_tf[0]; /* 2 0 1 */ + v_usec = pps_tf[2] - pps_tf[1]; + } else { + pps_offset = pps_tf[2]; /* 0 2 1 */ + v_usec = pps_tf[0] - pps_tf[1]; + } + } else { + if (pps_tf[1] < pps_tf[2]) { + pps_offset = pps_tf[1]; /* 2 1 0 */ + v_usec = pps_tf[2] - pps_tf[0]; + } else if (pps_tf[2] < pps_tf[0]) { + pps_offset = pps_tf[0]; /* 1 0 2 */ + v_usec = pps_tf[1] - pps_tf[2]; + } else { + pps_offset = pps_tf[2]; /* 1 2 0 */ + v_usec = pps_tf[1] - pps_tf[0]; + } + } + if (v_usec > MAXTIME) + pps_jitcnt++; + v_usec = (v_usec << PPS_AVG) - pps_jitter; + if (v_usec < 0) + pps_jitter -= -v_usec >> PPS_AVG; + else + pps_jitter += v_usec >> PPS_AVG; + if (pps_jitter > (MAXTIME >> 1)) + time_status |= STA_PPSJITTER; + + /* + * During the calibration interval adjust the starting time when + * the tick overflows. At the end of the interval compute the + * duration of the interval and the difference of the hardware + * counters at the beginning and end of the interval. This code + * is deliciously complicated by the fact valid differences may + * exceed the value of tick when using long calibration + * intervals and small ticks. Note that the counter can be + * greater than tick if caught at just the wrong instant, but + * the values returned and used here are correct. 
+ */ + bigtick = (long)tick << SHIFT_USEC; + pps_usec -= pps_freq; + if (pps_usec >= bigtick) + pps_usec -= bigtick; + if (pps_usec < 0) + pps_usec += bigtick; + pps_time.tv_sec++; + pps_count++; + if (pps_count < (1 << pps_shift)) + return; + pps_count = 0; + pps_calcnt++; + u_usec = usec << SHIFT_USEC; + v_usec = pps_usec - u_usec; + if (v_usec >= bigtick >> 1) + v_usec -= bigtick; + if (v_usec < -(bigtick >> 1)) + v_usec += bigtick; + if (v_usec < 0) + v_usec = -(-v_usec >> pps_shift); + else + v_usec = v_usec >> pps_shift; + pps_usec = u_usec; + cal_sec = tvp->tv_sec; + cal_usec = tvp->tv_usec; + cal_sec -= pps_time.tv_sec; + cal_usec -= pps_time.tv_usec; + if (cal_usec < 0) { + cal_usec += 1000000; + cal_sec--; + } + pps_time = *tvp; + + /* + * Check for lost interrupts, noise, excessive jitter and + * excessive frequency error. The number of timer ticks during + * the interval may vary +-1 tick. Add to this a margin of one + * tick for the PPS signal jitter and maximum frequency + * deviation. If the limits are exceeded, the calibration + * interval is reset to the minimum and we start over. + */ + u_usec = (long)tick << 1; + if (!((cal_sec == -1 && cal_usec > (1000000 - u_usec)) + || (cal_sec == 0 && cal_usec < u_usec)) + || v_usec > time_tolerance || v_usec < -time_tolerance) { + pps_errcnt++; + pps_shift = PPS_SHIFT; + pps_intcnt = 0; + time_status |= STA_PPSERROR; + return; + } + + /* + * A three-stage median filter is used to help deglitch the pps + * frequency. The median sample becomes the frequency offset + * estimate; the difference between the other two samples + * becomes the frequency dispersion (stability) estimate. 
+ */ + pps_ff[2] = pps_ff[1]; + pps_ff[1] = pps_ff[0]; + pps_ff[0] = v_usec; + if (pps_ff[0] > pps_ff[1]) { + if (pps_ff[1] > pps_ff[2]) { + u_usec = pps_ff[1]; /* 0 1 2 */ + v_usec = pps_ff[0] - pps_ff[2]; + } else if (pps_ff[2] > pps_ff[0]) { + u_usec = pps_ff[0]; /* 2 0 1 */ + v_usec = pps_ff[2] - pps_ff[1]; + } else { + u_usec = pps_ff[2]; /* 0 2 1 */ + v_usec = pps_ff[0] - pps_ff[1]; + } + } else { + if (pps_ff[1] < pps_ff[2]) { + u_usec = pps_ff[1]; /* 2 1 0 */ + v_usec = pps_ff[2] - pps_ff[0]; + } else if (pps_ff[2] < pps_ff[0]) { + u_usec = pps_ff[0]; /* 1 0 2 */ + v_usec = pps_ff[1] - pps_ff[2]; + } else { + u_usec = pps_ff[2]; /* 1 2 0 */ + v_usec = pps_ff[1] - pps_ff[0]; + } + } + + /* + * Here the frequency dispersion (stability) is updated. If it + * is less than one-fourth the maximum (MAXFREQ), the frequency + * offset is updated as well, but clamped to the tolerance. It + * will be processed later by the hardclock() routine. + */ + v_usec = (v_usec >> 1) - pps_stabil; + if (v_usec < 0) + pps_stabil -= -v_usec >> PPS_AVG; + else + pps_stabil += v_usec >> PPS_AVG; + if (pps_stabil > MAXFREQ >> 2) { + pps_stbcnt++; + time_status |= STA_PPSWANDER; + return; + } + if (time_status & STA_PPSFREQ) { + if (u_usec < 0) { + pps_freq -= -u_usec >> PPS_AVG; + if (pps_freq < -time_tolerance) + pps_freq = -time_tolerance; + u_usec = -u_usec; + } else { + pps_freq += u_usec >> PPS_AVG; + if (pps_freq > time_tolerance) + pps_freq = time_tolerance; + } + } + + /* + * Here the calibration interval is adjusted. If the maximum + * time difference is greater than tick / 4, reduce the interval + * by half. If this is not the case for four consecutive + * intervals, double the interval. 
+ */ + if (u_usec << pps_shift > bigtick >> 2) { + pps_intcnt = 0; + if (pps_shift > PPS_SHIFT) + pps_shift--; + } else if (pps_intcnt >= 4) { + pps_intcnt = 0; + if (pps_shift < PPS_SHIFTMAX) + pps_shift++; + } else + pps_intcnt++; +} +#endif /* PPS_SYNC */ diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c index caa1cdd..17550b6 100644 --- a/sys/kern/kern_xxx.c +++ b/sys/kern/kern_xxx.c @@ -30,114 +30,230 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_xxx.c 8.3 (Berkeley) 2/14/95 + * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93 + * $Id$ */ #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/kernel.h> #include <sys/proc.h> -#include <sys/reboot.h> -#include <vm/vm.h> #include <sys/sysctl.h> +#include <sys/utsname.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> - -/* ARGSUSED */ -int -reboot(p, uap, retval) - struct proc *p; - struct reboot_args /* { - syscallarg(int) opt; - } */ *uap; - register_t *retval; -{ - int error; - - if (error = suser(p->p_ucred, &p->p_acflag)) - return (error); - boot(SCARG(uap, opt)); - return (0); -} #if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct gethostname_args { + char *hostname; + u_int len; +}; +#endif /* ARGSUSED */ int -compat_43_gethostname(p, uap, retval) +ogethostname(p, uap, retval) struct proc *p; - struct compat_43_gethostname_args /* { - syscallarg(char *) hostname; - syscallarg(u_int) len; - } */ *uap; - register_t *retval; + struct gethostname_args *uap; + int *retval; { - int name; + int name[2]; - name = KERN_HOSTNAME; - return (kern_sysctl(&name, 1, SCARG(uap, hostname), &SCARG(uap, len), - 0, 0)); + name[0] = CTL_KERN; + name[1] = KERN_HOSTNAME; + return (userland_sysctl(p, name, 2, uap->hostname, &uap->len, + 1, 0, 0, 0)); } +#ifndef _SYS_SYSPROTO_H_ +struct sethostname_args { + char *hostname; + u_int len; +}; +#endif /* ARGSUSED */ int -compat_43_sethostname(p, uap, retval) 
+osethostname(p, uap, retval) struct proc *p; - register struct compat_43_sethostname_args /* { - syscallarg(char *) hostname; - syscallarg(u_int) len; - } */ *uap; - register_t *retval; + register struct sethostname_args *uap; + int *retval; { - int name; + int name[2]; int error; - if (error = suser(p->p_ucred, &p->p_acflag)) + name[0] = CTL_KERN; + name[1] = KERN_HOSTNAME; + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); - name = KERN_HOSTNAME; - return (kern_sysctl(&name, 1, 0, 0, SCARG(uap, hostname), - SCARG(uap, len))); + return (userland_sysctl(p, name, 2, 0, 0, 0, + uap->hostname, uap->len, 0)); } +#ifndef _SYS_SYSPROTO_H_ +struct ogethostid_args { + int dummy; +}; +#endif /* ARGSUSED */ int -compat_43_gethostid(p, uap, retval) +ogethostid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct ogethostid_args *uap; + int *retval; { - *(int32_t *)retval = hostid; + *(long *)retval = hostid; return (0); } #endif /* COMPAT_43 || COMPAT_SUNOS */ #ifdef COMPAT_43 +#ifndef _SYS_SYSPROTO_H_ +struct osethostid_args { + long hostid; +}; +#endif /* ARGSUSED */ int -compat_43_sethostid(p, uap, retval) +osethostid(p, uap, retval) struct proc *p; - struct compat_43_sethostid_args /* { - syscallarg(int32_t) hostid; - } */ *uap; - register_t *retval; + struct osethostid_args *uap; + int *retval; { int error; - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); - hostid = SCARG(uap, hostid); + hostid = uap->hostid; return (0); } int -compat_43_quota(p, uap, retval) +oquota(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct oquota_args *uap; + int *retval; { return (ENOSYS); } #endif /* COMPAT_43 */ + +#ifndef _SYS_SYSPROTO_H_ +struct uname_args { + struct utsname *name; +}; +#endif + +/* ARGSUSED */ +int +uname(p, uap, retval) + struct proc *p; + struct uname_args *uap; + int *retval; +{ + int name[2], len, rtval; + char *s, *us; + + name[0] = CTL_KERN; 
+ name[1] = KERN_OSTYPE; + len = sizeof uap->name->sysname; + rtval = userland_sysctl(p, name, 2, uap->name->sysname, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0); + + name[1] = KERN_HOSTNAME; + len = sizeof uap->name->nodename; + rtval = userland_sysctl(p, name, 2, uap->name->nodename, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0); + + name[1] = KERN_OSRELEASE; + len = sizeof uap->name->release; + rtval = userland_sysctl(p, name, 2, uap->name->release, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->release + sizeof(uap->name->release) - 1, 0); + +/* + name = KERN_VERSION; + len = sizeof uap->name->version; + rtval = userland_sysctl(p, name, 2, uap->name->version, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->version + sizeof(uap->name->version) - 1, 0); +*/ + +/* + * this stupid hackery to make the version field look like FreeBSD 1.1 + */ + for(s = version; *s && *s != '#'; s++); + + for(us = uap->name->version; *s && *s != ':'; s++) { + rtval = subyte( us++, *s); + if( rtval) + return rtval; + } + rtval = subyte( us++, 0); + if( rtval) + return rtval; + + name[1] = HW_MACHINE; + len = sizeof uap->name->machine; + rtval = userland_sysctl(p, name, 2, uap->name->machine, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0); + + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct getdomainname_args { + char *domainname; + int len; +}; +#endif + +/* ARGSUSED */ +int +getdomainname(p, uap, retval) + struct proc *p; + struct getdomainname_args *uap; + int *retval; +{ + int domainnamelen = strlen(domainname) + 1; + if ((u_int)uap->len > domainnamelen + 1) + uap->len = domainnamelen + 1; + return (copyout((caddr_t)domainname, (caddr_t)uap->domainname, uap->len)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setdomainname_args { + char 
*domainname; + int len; +}; +#endif + +/* ARGSUSED */ +int +setdomainname(p, uap, retval) + struct proc *p; + struct setdomainname_args *uap; + int *retval; +{ + int error, domainnamelen; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + if ((u_int)uap->len > sizeof (domainname) - 1) + return EINVAL; + domainnamelen = uap->len; + error = copyin((caddr_t)uap->domainname, domainname, uap->len); + domainname[domainnamelen] = 0; + return (error); +} + diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh index 4e2c28c..dc78413 100644 --- a/sys/kern/makesyscalls.sh +++ b/sys/kern/makesyscalls.sh @@ -1,72 +1,43 @@ #! /bin/sh - -# -# @(#)makesyscalls.sh 8.2 (Berkeley) 2/14/95 +# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93 +# $Id$ set -e -case $# in - 2) ;; - *) echo "Usage: $0 config-file input-file" 1>&2 - exit 1 - ;; -esac - -# source the config file. -. $1 +# name of compat option: +compat=COMPAT_43 -# the config file sets the following variables: -# sysnames the syscall names file -# sysnumhdr the syscall numbers file -# syssw the syscall switch file -# sysarghdr the syscall argument struct definitions -# compatopts those syscall types that are for 'compat' syscalls -# switchname the name for the 'struct sysent' we define -# namesname the name for the 'char *[]' we define -# constprefix the prefix for the system call constants -# -# NOTE THAT THIS makesyscalls.sh DOES NOT SUPPORT 'LIBCOMPAT'. +# output files: +sysnames="syscalls.c" +sysproto="../sys/sysproto.h" +sysproto_h=_SYS_SYSPROTO_H_ +syshdr="../sys/syscall.h" +syssw="init_sysent.c" +syshide="../sys/syscall-hide.h" +syscallprefix="SYS_" +switchname="sysent" +namesname="syscallnames" # tmp files: sysdcl="sysent.dcl" -syscompat_pref="sysent." 
+syscompat="sysent.compat" +syscompatdcl="sysent.compatdcl" sysent="sysent.switch" +sysinc="sysinc.switch" +sysarg="sysarg.switch" -syscompat_files="" -for file in $compatopts; do - syscompat_files="$syscompat_files $syscompat_pref$file" -done +trap "rm $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg" 0 -trap "rm $sysdcl $syscompat_files $sysent" 0 - -# Awk program (must support nawk extensions) -# Use "awk" at Berkeley, "nawk" or "gawk" elsewhere. -awk=${AWK:-awk} - -# Does this awk have a "toupper" function? (i.e. is it GNU awk) -isgawk=`$awk 'BEGIN { print toupper("true"); exit; }' 2>/dev/null` +case $# in + 0) echo "Usage: $0 input-file <config-file>" 1>&2 + exit 1 + ;; +esac -# If this awk does not define "toupper" then define our own. -if [ "$isgawk" = TRUE ] ; then - # GNU awk provides it. - toupper= -else - # Provide our own toupper() - toupper=' -function toupper(str) { - _toupper_cmd = "echo "str" |tr a-z A-Z" - _toupper_cmd | getline _toupper_str; - close(_toupper_cmd); - return _toupper_str; -}' +if [ -f $2 ]; then + . $2 fi -# before handing it off to awk, make a few adjustments: -# (1) insert spaces around {, }, (, ), *, and commas. -# (2) get rid of any and all dollar signs (so that rcs id use safe) -# -# The awk script will deal with blank lines and lines that -# start with the comment character (';'). 
- sed -e ' s/\$//g :join @@ -79,287 +50,311 @@ s/\$//g 2,${ /^#/!s/\([{}()*,]\)/ \1 /g } -' < $2 | $awk " -$toupper -BEGIN { - sysnames = \"$sysnames\" - sysnumhdr = \"$sysnumhdr\" - sysarghdr = \"$sysarghdr\" - switchname = \"$switchname\" - namesname = \"$namesname\" - constprefix = \"$constprefix\" - - sysdcl = \"$sysdcl\" - syscompat_pref = \"$syscompat_pref\" - sysent = \"$sysent\" - infile = \"$2\" - - compatopts = \"$compatopts\" - "' - - printf "/*\n * System call switch table.\n *\n" > sysdcl - printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysdcl - - ncompat = split(compatopts,compat) - for (i = 1; i <= ncompat; i++) { - compat_upper[i] = toupper(compat[i]) - compat_file[i] = sprintf("%s%s", syscompat_pref, compat[i]) - - printf "\n#ifdef %s\n", compat_upper[i] > compat_file[i] - printf "#define %s(func) __CONCAT(%s_,func)\n\n", \ - compat[i], compat[i] > compat_file[i] +' < $1 | awk " + BEGIN { + sysdcl = \"$sysdcl\" + sysproto = \"$sysproto\" + sysproto_h = \"$sysproto_h\" + syscompat = \"$syscompat\" + syscompatdcl = \"$syscompatdcl\" + sysent = \"$sysent\" + sysinc = \"$sysinc\" + sysarg = \"$sysarg\" + sysnames = \"$sysnames\" + syshdr = \"$syshdr\" + compat = \"$compat\" + syshide = \"$syshide\" + syscallprefix = \"$syscallprefix\" + switchname = \"$switchname\" + namesname = \"$namesname\" + infile = \"$1\" + "' + + printf "/*\n * System call switch table.\n *\n" > sysinc + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysinc + + printf "/*\n * System call prototypes.\n *\n" > sysarg + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarg + + printf "\n#ifdef %s\n\n", compat > syscompat + + printf "/*\n * System call names.\n *\n" > sysnames + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames + + printf "/*\n * System call numbers.\n *\n" > syshdr + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr + printf "/*\n * System call 
hiders.\n *\n" > syshide + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshide } + NR == 1 { + gsub("[$]Id: ", "", $0) + gsub(" [$]", "", $0) - printf "/*\n * System call names.\n *\n" > sysnames - printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames + printf " * created from%s\n */\n\n", $0 > sysinc - printf "/*\n * System call numbers.\n *\n" > sysnumhdr - printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnumhdr + printf "\n#ifdef %s\n", compat > sysent + printf "#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)\n" > sysent + printf("#else\n") > sysent + printf("#define compat(n, name) 0, (sy_call_t *)nosys\n") > sysent + printf("#endif\n\n") > sysent + printf("/* The casts are bogus but will do for now. */\n") > sysent + printf "struct sysent %s[] = {\n",switchname > sysent - printf "/*\n * System call argument lists.\n *\n" > sysarghdr - printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarghdr -} -NR == 1 { - printf " * created from%s\n */\n\n", $0 > sysdcl - - printf "#define\ts(type)\tsizeof(type)\n\n" > sysent - printf "struct sysent %s[] = {\n",switchname > sysent + printf " * created from%s\n */\n\n", $0 > sysarg + printf("#ifndef %s\n", sysproto_h) > sysarg + printf("#define\t%s\n\n", sysproto_h) > sysarg + printf "#include <sys/signal.h>\n\n", $0 > sysarg - printf " * created from%s\n */\n\n", $0 > sysnames - printf "char *%s[] = {\n",namesname > sysnames + printf " * created from%s\n */\n\n", $0 > sysnames + printf "char *%s[] = {\n", namesname > sysnames - printf " * created from%s\n */\n\n", $0 > sysnumhdr + printf " * created from%s\n */\n\n", $0 > syshdr - printf " * created from%s\n */\n\n", $0 > sysarghdr - printf "#define\tsyscallarg(x)\tunion { x datum; register_t pad; }\n" \ - > sysarghdr - next -} -NF == 0 || $1 ~ /^;/ { - next -} -$1 ~ /^#[ ]*include/ { - print > sysdcl - next -} -$1 ~ /^#[ ]*if/ { - print > sysent - print > sysdcl - for (i = 1; i 
<= ncompat; i++) - print > compat_file[i] - print > sysnames - savesyscall = syscall - next -} -$1 ~ /^#[ ]*else/ { - print > sysent - print > sysdcl - for (i = 1; i <= ncompat; i++) - print > compat_file[i] - print > sysnames - syscall = savesyscall - next -} -$1 ~ /^#/ { - print > sysent - print > sysdcl - for (i = 1; i <= ncompat; i++) - print > compat_file[i] - print > sysnames - next -} -syscall != $1 { - printf "%s: line %d: syscall number out of sync at %d\n", \ - infile, NR, syscall - printf "line is:\n" - print - exit 1 -} -function parserr(was, wanted) { - printf "%s: line %d: unexpected %s (expected %s)\n", \ - infile, NR, was, wanted - exit 1 -} -function parseline() { - f=3 # toss number and type - if ($NF != "}") { - funcalias=$NF - end=NF-1 - } else { - funcalias="" - end=NF + printf " * created from%s\n */\n\n", $0 > syshide + next } - if ($f != "{") - parserr($f, "{") - f++ - if ($end != "}") - parserr($end, "}") - end-- - if ($end != ";") - parserr($end, ";") - end-- - if ($end != ")") - parserr($end, ")") - end-- - - f++ # toss return type - - funcname=$f - if (funcalias == "") - funcalias=funcname - f++ + NF == 0 || $1 ~ /^;/ { + next + } + $1 ~ /^#[ ]*include/ { + print > sysinc + next + } + $1 ~ /^#[ ]*if/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + print > syshide + savesyscall = syscall + next + } + $1 ~ /^#[ ]*else/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + print > syshide + syscall = savesyscall + next + } + $1 ~ /^#/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + print > syshide + next + } + syscall != $1 { + printf "%s: line %d: syscall number out of sync at %d\n", \ + infile, NR, syscall + printf "line is:\n" + print + exit 1 + } + function parserr(was, wanted) { + printf "%s: line %d: unexpected %s (expected %s)\n", \ + infile, NR, was, wanted + exit 1 + } + function parseline() { + f=4 # 
toss number and type + argc= 0; + bigargc = 0; + if ($NF != "}") { + funcalias=$(NF-2) + argalias=$(NF-1) + rettype=$NF + end=NF-3 + } else { + funcalias="" + argalias="" + rettype="int" + end=NF + } + if ($2 == "NODEF") { + funcname=$4 + return + } + if ($f != "{") + parserr($f, "{") + f++ + if ($end != "}") + parserr($end, "}") + end-- + if ($end != ";") + parserr($end, ";") + end-- + if ($end != ")") + parserr($end, ")") + end-- + + f++ #function return type + + funcname=$f + if (funcalias == "") + funcalias = funcname + if (argalias == "") { + argalias = funcname "_args" + if ($2 == "COMPAT") + argalias = "o" argalias + } + f++ - if ($f != "(") - parserr($f, ")") - f++ + if ($f != "(") + parserr($f, ")") + f++ - argc= 0; - if (f == end) { - if ($f != "void") - parserr($f, "argument definition") - return - } + if (f == end) { + if ($f != "void") + parserr($f, "argument definition") + return + } - while (f <= end) { - argc++ - argtype[argc]="" - oldf="" - while (f < end && $(f+1) != ",") { - if (argtype[argc] != "" && oldf != "*") - argtype[argc] = argtype[argc]" "; - argtype[argc] = argtype[argc]$f; - oldf = $f; - f++ + while (f <= end) { + argc++ + argtype[argc]="" + oldf="" + while (f < end && $(f+1) != ",") { + if (argtype[argc] != "" && oldf != "*") + argtype[argc] = argtype[argc]" "; + argtype[argc] = argtype[argc]$f; + oldf = $f; + f++ + } + if (argtype[argc] == "") + parserr($f, "argument definition") + if (argtype[argc] == "off_t") + bigargc++ + argname[argc]=$f; + f += 2; # skip name, and any comma } - if (argtype[argc] == "") - parserr($f, "argument definition") - argname[argc]=$f; - f += 2; # skip name, and any comma } -} -function putent(nodefs, declfile, compatwrap) { - # output syscall declaration for switch table - if (compatwrap == "") - printf("int\t%s();\n", funcname) > declfile - else - printf("int\t%s(%s)();\n", compatwrap, funcname) > declfile - - # output syscall switch entry -# printf("\t{ { %d", argc) > sysent -# for (i = 1; i <= argc; 
i++) { -# if (i == 5) # wrap the line -# printf(",\n\t ") > sysent -# else -# printf(", ") > sysent -# printf("s(%s)", argtypenospc[i]) > sysent -# } - printf("\t{ %d, ", argc) > sysent - if (argc == 0) - printf("0") > sysent - else if (compatwrap == "") - printf("s(struct %s_args)", funcname) > sysent - else - printf("s(struct %s_%s_args)", compatwrap, funcname) > sysent - if (compatwrap == "") - wfn = sprintf("%s", funcname); - else - wfn = sprintf("%s(%s)", compatwrap, funcname); - printf(",\n\t %s },", wfn) > sysent - for (i = 0; i < (33 - length(wfn)) / 8; i++) - printf("\t") > sysent - if (compatwrap == "") + { comment = $4 + if (NF < 7) + for (i = 5; i <= NF; i++) + comment = comment " " $i + } + $2 == "STD" || $2 == "NODEF" || $2 == "NOARGS" || $2 == "NOPROTO" { + parseline() + if ((!nosys || funcname != "nosys") && \ + (funcname != "lkmnosys")) { + if (argc != 0 && $2 != "NOARGS" && $2 != "NOPROTO") { + printf("struct\t%s {\n", argalias) > sysarg + for (i = 1; i <= argc; i++) + printf("\t%s %s;\n", argtype[i], + argname[i]) > sysarg + printf("};\n") > sysarg + } + else if($2 != "NOARGS" && $2 != "NOPROTO") + printf("struct\t%s {\n\tint dummy;\n};\n", \ + argalias) > sysarg + } + if ($2 != "NOPROTO" && (!nosys || funcname != "nosys") && \ + (!lkmnosys || funcname != "lkmnosys")) { + printf("%s\t%s __P((struct proc *, struct %s *, int []))", \ + rettype, funcname, argalias) > sysdcl + if (funcname == "exit") + printf(" __dead2") > sysdcl + printf(";\n") > sysdcl + } + if (funcname == "nosys") + nosys = 1 + if (funcname == "lkmnosys") + lkmnosys = 1 + printf("\t{ %d, (sy_call_t *)%s },\t\t", \ + argc+bigargc, funcname) > sysent + if(length(funcname) < 11) + printf("\t") > sysent printf("/* %d = %s */\n", syscall, funcalias) > sysent - else - printf("/* %d = %s %s */\n", syscall, compatwrap, - funcalias) > sysent - - # output syscall name for names table - if (compatwrap == "") - printf("\t\"%s\",\t\t\t/* %d = %s */\n", funcalias, syscall, - funcalias) > 
sysnames - else - printf("\t\"%s_%s\",\t/* %d = %s %s */\n", compatwrap, - funcalias, syscall, compatwrap, funcalias) > sysnames - - # output syscall number of header, if appropriate - if (nodefs == "" || nodefs == "NOARGS") - printf("#define\t%s%s\t%d\n", constprefix, funcalias, - syscall) > sysnumhdr - else if (nodefs != "NODEF") - printf("\t\t\t\t/* %d is %s %s */\n", syscall, - compatwrap, funcalias) > sysnumhdr - - # output syscall argument structure, if it has arguments - if (argc != 0 && nodefs != "NOARGS") { - if (compatwrap == "") - printf("\nstruct %s_args {\n", funcname) > sysarghdr - else - printf("\nstruct %s_%s_args {\n", compatwrap, - funcname) > sysarghdr - for (i = 1; i <= argc; i++) - printf("\tsyscallarg(%s) %s;\n", argtype[i], - argname[i]) > sysarghdr - printf("};\n") > sysarghdr + printf("\t\"%s\",\t\t\t/* %d = %s */\n", \ + funcalias, syscall, funcalias) > sysnames + if ($2 != "NODEF") + printf("#define\t%s%s\t%d\n", syscallprefix, \ + funcalias, syscall) > syshdr + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, funcname) > syshide + syscall++ + next } -} -$2 == "STD" { - parseline() - putent("", sysdcl, "") - syscall++ - next -} -$2 == "NODEF" || $2 == "NOARGS" { - parseline() - putent($2, sysdcl, "") - syscall++ - next -} -$2 == "OBSOL" || $2 == "UNIMPL" { - if ($2 == "OBSOL") - comment="obsolete" - else - comment="unimplemented" - for (i = 3; i <= NF; i++) - comment=comment " " $i - - printf("\t{ 0, 0,\n\t nosys },\t\t\t\t/* %d = %s */\n", \ - syscall, comment) > sysent - printf("\t\"#%d (%s)\",\t\t/* %d = %s */\n", \ - syscall, comment, syscall, comment) > sysnames - if ($2 != "UNIMPL") - printf("\t\t\t\t/* %d is %s */\n", syscall, comment) > sysnumhdr - syscall++ - next -} -{ - for (i = 1; i <= ncompat; i++) { - if ($2 == compat_upper[i]) { - parseline(); - putent("COMMENT", compat_file[i], compat[i]) - syscall++ - next + $2 == "COMPAT" || $2 == "CPT_NOA" { + parseline() + if (argc != 0 && $2 != "CPT_NOA") { + printf("struct\t%s {\n", 
argalias) > syscompat + for (i = 1; i <= argc; i++) + printf("\t%s %s;\n", argtype[i], + argname[i]) > syscompat + printf("};\n") > syscompat } + else if($2 != "CPT_NOA") + printf("struct\t%s {\n\tint dummy;\n};\n", \ + argalias) > sysarg + printf("%s\to%s __P((struct proc *, struct %s *, int []));\n", \ + rettype, funcname, argalias) > syscompatdcl + printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n", \ + argc+bigargc, funcname, syscall, funcalias) > sysent + printf("\t\"old.%s\",\t\t/* %d = old %s */\n", \ + funcalias, syscall, funcalias) > sysnames + printf("\t\t\t\t/* %d is old %s */\n", \ + syscall, funcalias) > syshdr + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, funcname) > syshide + syscall++ + next } - printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2 - exit 1 -} -END { - printf "\n#undef\tsyscallarg\n" > sysarghdr - - for (i = 1; i <= ncompat; i++) { - printf("\n#else /* %s */\n", compat_upper[i]) > compat_file[i] - printf("#define %s(func) nosys\n", compat[i]) > \ - compat_file[i] - printf("#endif /* %s */\n\n", compat_upper[i]) > compat_file[i] - } - - printf("};\n\n") > sysent - printf("int\tn%s= sizeof(%s) / sizeof(%s[0]);\n", switchname, - switchname, switchname) > sysent - - printf("};\n") > sysnames -} ' - -cat $sysdcl $syscompat_files $sysent > $syssw - -#chmod 444 $sysnames $syshdr $syssw + $2 == "LIBCOMPAT" { + parseline() + printf("%s\to%s();\n", rettype, funcname) > syscompatdcl + printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n", \ + argc+bigargc, funcname, syscall, funcalias) > sysent + printf("\t\"old.%s\",\t\t/* %d = old %s */\n", \ + funcalias, syscall, funcalias) > sysnames + printf("#define\t%s%s\t%d\t/* compatibility; still used by libc */\n", \ + syscallprefix, funcalias, syscall) > syshdr + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, funcname) > syshide + syscall++ + next + } + $2 == "OBSOL" { + printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = obsolete %s */\n", \ + syscall, comment) > sysent + 
printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n", \ + $4, syscall, comment) > sysnames + printf("\t\t\t\t/* %d is obsolete %s */\n", \ + syscall, comment) > syshdr + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, $4) > syshide + syscall++ + next + } + $2 == "UNIMPL" { + printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = %s */\n", \ + syscall, comment) > sysent + printf("\t\"#%d\",\t\t\t/* %d = %s */\n", \ + syscall, syscall, comment) > sysnames + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, $4) > syshide + syscall++ + next + } + { + printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2 + exit 1 + } + END { + printf("\n#endif /* %s */\n", compat) > syscompatdcl + printf("\n#endif /* !%s */\n", sysproto_h) > syscompatdcl + + printf("};\n") > sysent + printf("};\n") > sysnames + printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \ + > syshdr + } ' + +cat $sysinc $sysent >$syssw +cat $sysarg $sysdcl $syscompat $syscompatdcl > $sysproto diff --git a/sys/kern/md5c.c b/sys/kern/md5c.c new file mode 100644 index 0000000..583d009 --- /dev/null +++ b/sys/kern/md5c.c @@ -0,0 +1,331 @@ +/* + * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm + * + * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All + * rights reserved. + * + * License to copy and use this software is granted provided that it + * is identified as the "RSA Data Security, Inc. MD5 Message-Digest + * Algorithm" in all material mentioning or referencing this software + * or this function. + * + * License is also granted to make and use derivative works provided + * that such works are identified as "derived from the RSA Data + * Security, Inc. MD5 Message-Digest Algorithm" in all material + * mentioning or referencing the derived work. + * + * RSA Data Security, Inc. makes no representations concerning either + * the merchantability of this software or the suitability of this + * software for any particular purpose. 
It is provided "as is" + * without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this + * documentation and/or software. + * + * $Id$ + * + * This code is the same as the code published by RSA Inc. It has been + * edited for clarity and style only. + */ + +#include <sys/types.h> + +#ifdef KERNEL +#include <sys/param.h> +#include <sys/systm.h> +#else +#include <string.h> +#endif + +#include <sys/md5.h> + +static void MD5Transform __P((u_int32_t [4], const unsigned char [64])); + +#ifdef KERNEL +#define memset(x,y,z) bzero(x,z); +#define memcpy(x,y,z) bcopy(y, x, z) +#endif + +#ifdef i386 +#define Encode memcpy +#define Decode memcpy +#else /* i386 */ + +/* + * Encodes input (u_int32_t) into output (unsigned char). Assumes len is + * a multiple of 4. + */ + +static void +Encode (output, input, len) + unsigned char *output; + u_int32_t *input; + unsigned int len; +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (unsigned char)(input[i] & 0xff); + output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); + output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); + output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); + } +} + +/* + * Decodes input (unsigned char) into output (u_int32_t). Assumes len is + * a multiple of 4. + */ + +static void +Decode (output, input, len) + u_int32_t *output; + const unsigned char *input; + unsigned int len; +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((u_int32_t)input[j]) | (((u_int32_t)input[j+1]) << 8) | + (((u_int32_t)input[j+2]) << 16) | (((u_int32_t)input[j+3]) << 24); +} +#endif /* i386 */ + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* F, G, H and I are basic MD5 functions. 
*/ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits. */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* + * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. + * Rotation is separate from addition to prevent recomputation. + */ +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +/* MD5 initialization. Begins an MD5 operation, writing a new context. */ + +void +MD5Init (context) + MD5_CTX *context; +{ + + context->count[0] = context->count[1] = 0; + + /* Load magic initialization constants. */ + context->state[0] = 0x67452301; + context->state[1] = 0xefcdab89; + context->state[2] = 0x98badcfe; + context->state[3] = 0x10325476; +} + +/* + * MD5 block update operation. Continues an MD5 message-digest + * operation, processing another message block, and updating the + * context. 
+ */ + +void +MD5Update (context, input, inputLen) + MD5_CTX *context; + const unsigned char *input; + unsigned int inputLen; +{ + unsigned int i, index, partLen; + + /* Compute number of bytes mod 64 */ + index = (unsigned int)((context->count[0] >> 3) & 0x3F); + + /* Update number of bits */ + if ((context->count[0] += ((u_int32_t)inputLen << 3)) + < ((u_int32_t)inputLen << 3)) + context->count[1]++; + context->count[1] += ((u_int32_t)inputLen >> 29); + + partLen = 64 - index; + + /* Transform as many times as possible. */ + if (inputLen >= partLen) { + memcpy((void *)&context->buffer[index], (void *)input, + partLen); + MD5Transform (context->state, context->buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD5Transform (context->state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + memcpy ((void *)&context->buffer[index], (void *)&input[i], + inputLen-i); +} + +/* + * MD5 finalization. Ends an MD5 message-digest operation, writing the + * the message digest and zeroizing the context. + */ + +void +MD5Final (digest, context) + unsigned char digest[16]; + MD5_CTX *context; +{ + unsigned char bits[8]; + unsigned int index, padLen; + + /* Save number of bits */ + Encode (bits, context->count, 8); + + /* Pad out to 56 mod 64. */ + index = (unsigned int)((context->count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + MD5Update (context, PADDING, padLen); + + /* Append length (before padding) */ + MD5Update (context, bits, 8); + + /* Store state in digest */ + Encode (digest, context->state, 16); + + /* Zeroize sensitive information. */ + memset ((void *)context, 0, sizeof (*context)); +} + +/* MD5 basic transformation. Transforms state based on block. 
*/ + +static void +MD5Transform (state, block) + u_int32_t state[4]; + const unsigned char block[64]; +{ + u_int32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 
0x8d2a4c8a); /* 32 */ + + /* Round 3 */ +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + /* Zeroize sensitive information. 
*/ + memset ((void *)x, 0, sizeof (x)); +} diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c index 7281339..f48ce99 100644 --- a/sys/kern/subr_autoconf.c +++ b/sys/kern/subr_autoconf.c @@ -39,15 +39,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)subr_autoconf.c 8.3 (Berkeley) 5/17/94 + * @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93 * - * from: $Header: subr_autoconf.c,v 1.12 93/02/01 19:31:48 torek Exp $ (LBL) + * $Id$ */ #include <sys/param.h> #include <sys/device.h> #include <sys/malloc.h> -#include <libkern/libkern.h> /* * Autoconfiguration subroutines. @@ -284,16 +283,15 @@ config_attach(parent, cf, aux, print) void **nsp; if (old == 0) { - new = max(MINALLOCSIZE / sizeof(void *), - dev->dv_unit + 1); - newbytes = new * sizeof(void *); - nsp = malloc(newbytes, M_DEVBUF, M_WAITOK); /*XXX*/ - bzero(nsp, newbytes); + nsp = malloc(MINALLOCSIZE, M_DEVBUF, M_WAITOK); /*XXX*/ + bzero(nsp, MINALLOCSIZE); + cd->cd_ndevs = MINALLOCSIZE / sizeof(void *); } else { new = cd->cd_ndevs; do { new *= 2; } while (new <= dev->dv_unit); + cd->cd_ndevs = new; oldbytes = old * sizeof(void *); newbytes = new * sizeof(void *); nsp = malloc(newbytes, M_DEVBUF, M_WAITOK); /*XXX*/ @@ -301,7 +299,6 @@ config_attach(parent, cf, aux, print) bzero(&nsp[old], newbytes - oldbytes); free(cd->cd_devs, M_DEVBUF); } - cd->cd_ndevs = new; cd->cd_devs = nsp; } if (cd->cd_devs[dev->dv_unit]) diff --git a/sys/kern/subr_clist.c b/sys/kern/subr_clist.c new file mode 100644 index 0000000..d907b47 --- /dev/null +++ b/sys/kern/subr_clist.c @@ -0,0 +1,694 @@ +/* + * Copyright (c) 1994, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: tty_subr.c,v 1.26 1997/03/05 16:45:01 bde Exp $ + */ + +/* + * clist support routines + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/tty.h> +#include <sys/clist.h> +#include <sys/malloc.h> + +static void clist_init __P((void *)); +SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL) + +static struct cblock *cfreelist = 0; +int cfreecount = 0; +static int cslushcount; +static int ctotcount; + +#ifndef INITIAL_CBLOCKS +#define INITIAL_CBLOCKS 50 +#endif + +static struct cblock *cblock_alloc __P((void)); +static void cblock_alloc_cblocks __P((int number)); +static void cblock_free __P((struct cblock *cblockp)); +static void cblock_free_cblocks __P((int number)); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(cbstat, cbstat) +{ + printf( + "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n", + ctotcount * CBSIZE, ctotcount * CBSIZE - cfreecount, cfreecount, + cfreecount - 
cslushcount * CBSIZE, cslushcount * CBSIZE); +} +#endif /* DDB */ + +/* + * Called from init_main.c + */ +/* ARGSUSED*/ +static void +clist_init(dummy) + void *dummy; +{ + /* + * Allocate an initial base set of cblocks as a 'slush'. + * We allocate non-slush cblocks with each initial ttyopen() and + * deallocate them with each ttyclose(). + * We should adjust the slush allocation. This can't be done in + * the i/o routines because they are sometimes called from + * interrupt handlers when it may be unsafe to call malloc(). + */ + cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS); +} + +/* + * Remove a cblock from the cfreelist queue and return a pointer + * to it. + */ +static inline struct cblock * +cblock_alloc() +{ + struct cblock *cblockp; + + cblockp = cfreelist; + if (cblockp == NULL) + panic("clist reservation botch"); + cfreelist = cblockp->c_next; + cblockp->c_next = NULL; + cfreecount -= CBSIZE; + return (cblockp); +} + +/* + * Add a cblock to the cfreelist queue. + */ +static inline void +cblock_free(cblockp) + struct cblock *cblockp; +{ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) + bzero(cblockp->c_quote, sizeof cblockp->c_quote); + cblockp->c_next = cfreelist; + cfreelist = cblockp; + cfreecount += CBSIZE; +} + +/* + * Allocate some cblocks for the cfreelist queue. + */ +static void +cblock_alloc_cblocks(number) + int number; +{ + int i; + struct cblock *cbp; + + for (i = 0; i < number; ++i) { + cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT); + if (cbp == NULL) { + printf( +"clist_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n"); + cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK); + } + /* + * Freed cblocks have zero quotes and garbage elsewhere. + * Set the may-have-quote bit to force zeroing the quotes. + */ + setbit(cbp->c_quote, CBQSIZE * NBBY - 1); + cblock_free(cbp); + } + ctotcount += number; +} + +/* + * Set the cblock allocation policy for a a clist. + * Must be called in process context at spltty(). 
+ */ +void +clist_alloc_cblocks(clistp, ccmax, ccreserved) + struct clist *clistp; + int ccmax; + int ccreserved; +{ + int dcbr; + + /* + * Allow for wasted space at the head. + */ + if (ccmax != 0) + ccmax += CBSIZE - 1; + if (ccreserved != 0) + ccreserved += CBSIZE - 1; + + clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE; + dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved; + if (dcbr >= 0) + cblock_alloc_cblocks(dcbr); + else { + if (clistp->c_cbreserved + dcbr < clistp->c_cbcount) + dcbr = clistp->c_cbcount - clistp->c_cbreserved; + cblock_free_cblocks(-dcbr); + } + clistp->c_cbreserved += dcbr; +} + +/* + * Free some cblocks from the cfreelist queue back to the + * system malloc pool. + */ +static void +cblock_free_cblocks(number) + int number; +{ + int i; + + for (i = 0; i < number; ++i) + free(cblock_alloc(), M_TTYS); + ctotcount -= number; +} + +/* + * Free the cblocks reserved for a clist. + * Must be called at spltty(). + */ +void +clist_free_cblocks(clistp) + struct clist *clistp; +{ + if (clistp->c_cbcount != 0) + panic("freeing active clist cblocks"); + cblock_free_cblocks(clistp->c_cbreserved); + clistp->c_cbmax = 0; + clistp->c_cbreserved = 0; +} + +/* + * Get a character from the head of a clist. + */ +int +getc(clistp) + struct clist *clistp; +{ + int chr = -1; + int s; + struct cblock *cblockp; + + s = spltty(); + + /* If there are characters in the list, get one */ + if (clistp->c_cc) { + cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + chr = (u_char)*clistp->c_cf; + + /* + * If this char is quoted, set the flag. + */ + if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * Advance to next character. + */ + clistp->c_cf++; + clistp->c_cc--; + /* + * If we have advanced the 'first' character pointer + * past the end of this cblock, advance to the next one. + * If there are no more characters, set the first and + * last pointers to NULL. 
In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (chr); +} + +/* + * Copy 'amount' of chars, beginning at head of clist 'clistp' to + * destination linear buffer 'dest'. Return number of characters + * actually copied. + */ +int +q_to_b(clistp, dest, amount) + struct clist *clistp; + char *dest; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + char *dest_orig = dest; + int numc; + int s; + + s = spltty(); + + while (clistp && amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + bcopy(clistp->c_cf, dest, numc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + dest += numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (dest - dest_orig); +} + +/* + * Flush 'amount' of chars, beginning at head of clist 'clistp'. 
+ */ +void +ndflush(clistp, amount) + struct clist *clistp; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + int numc; + int s; + + s = spltty(); + + while (amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); +} + +/* + * Add a character to the end of a clist. Return -1 is no + * more clists, or 0 for success. 
+ */ +int +putc(chr, clistp) + int chr; + struct clist *clistp; +{ + struct cblock *cblockp; + int s; + + s = spltty(); + + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("putc to a clist with no reserved cblocks\n"); + return (-1); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + if (((long)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = (cblockp - 1); + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (-1); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + } + + /* + * If this character is quoted, set the quote bit, if not, clear it. + */ + if (chr & TTY_QUOTE) { + setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + /* + * Use one of the spare quote bits to record that something + * may be quoted. + */ + setbit(cblockp->c_quote, CBQSIZE * NBBY - 1); + } else + clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + + *clistp->c_cl++ = chr; + clistp->c_cc++; + + splx(s); + return (0); +} + +/* + * Copy data from linear buffer to clist chain. Return the + * number of characters not copied. + */ +int +b_to_q(src, amount, clistp) + char *src; + int amount; + struct clist *clistp; +{ + struct cblock *cblockp; + char *firstbyte, *lastbyte; + u_char startmask, endmask; + int startbit, endbit, num_between, numc; + int s; + + /* + * Avoid allocating an initial cblock and then not using it. + * c_cc == 0 must imply c_cbount == 0. + */ + if (amount <= 0) + return (amount); + + s = spltty(); + + /* + * If there are no cblocks assigned to this clist yet, + * then get one. 
+ */ + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("b_to_q to a clist with no reserved cblocks.\n"); + return (amount); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + } + + while (amount) { + /* + * Get another cblock if needed. + */ + if (((long)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = cblockp - 1; + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (amount); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + + /* + * Copy a chunk of the linear buffer up to the end + * of this cblock. + */ + numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl); + bcopy(src, clistp->c_cl, numc); + + /* + * Clear quote bits if they aren't known to be clear. + * The following could probably be made into a seperate + * "bitzero()" routine, but why bother? + */ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) { + startbit = clistp->c_cl - (char *)cblockp->c_info; + endbit = startbit + numc - 1; + + firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY); + lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY); + + /* + * Calculate mask of bits to preserve in first and + * last bytes. + */ + startmask = NBBY - (startbit % NBBY); + startmask = 0xff >> startmask; + endmask = (endbit % NBBY); + endmask = 0xff << (endmask + 1); + + if (firstbyte != lastbyte) { + *firstbyte &= startmask; + *lastbyte &= endmask; + + num_between = lastbyte - firstbyte - 1; + if (num_between) + bzero(firstbyte + 1, num_between); + } else { + *firstbyte &= (startmask | endmask); + } + } + + /* + * ...and update pointer for the next chunk. 
+ */ + src += numc; + clistp->c_cl += numc; + clistp->c_cc += numc; + amount -= numc; + /* + * If we go through the loop again, it's always + * for data in the next cblock, so by adding one (cblock), + * (which makes the pointer 1 beyond the end of this + * cblock) we prepare for the assignment of 'prev' + * above. + */ + cblockp += 1; + + } + + splx(s); + return (amount); +} + +/* + * Get the next character in the clist. Store it at dst. Don't + * advance any clist pointers, but return a pointer to the next + * character position. + */ +char * +nextc(clistp, cp, dst) + struct clist *clistp; + char *cp; + int *dst; +{ + struct cblock *cblockp; + + ++cp; + /* + * See if the next character is beyond the end of + * the clist. + */ + if (clistp->c_cc && (cp != clistp->c_cl)) { + /* + * If the next character is beyond the end of this + * cblock, advance to the next cblock. + */ + if (((long)cp & CROUND) == 0) + cp = ((struct cblock *)cp - 1)->c_next->c_info; + cblockp = (struct cblock *)((long)cp & ~CROUND); + + /* + * Get the character. Set the quote flag if this character + * is quoted. + */ + *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0); + + return (cp); + } + + return (NULL); +} + +/* + * "Unput" a character from a clist. + */ +int +unputc(clistp) + struct clist *clistp; +{ + struct cblock *cblockp = 0, *cbp = 0; + int s; + int chr = -1; + + + s = spltty(); + + if (clistp->c_cc) { + --clistp->c_cc; + --clistp->c_cl; + + chr = (u_char)*clistp->c_cl; + + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + + /* + * Set quote flag if this character was quoted. + */ + if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * If all of the characters have been unput in this + * cblock, then find the previous one and free this + * one. 
+ */ + if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) { + cbp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + + while (cbp->c_next != cblockp) + cbp = cbp->c_next; + + /* + * When the previous cblock is at the end, the 'last' + * pointer always points (invalidly) one past. + */ + clistp->c_cl = (char *)(cbp+1); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + cbp->c_next = NULL; + } + } + + /* + * If there are no more characters on the list, then + * free the last cblock. + */ + if ((clistp->c_cc == 0) && clistp->c_cl) { + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + clistp->c_cf = clistp->c_cl = NULL; + } + + splx(s); + return (chr); +} + +/* + * Move characters in source clist to destination clist, + * preserving quote bits. + */ +void +catq(src_clistp, dest_clistp) + struct clist *src_clistp, *dest_clistp; +{ + int chr, s; + + s = spltty(); + /* + * If the destination clist is empty (has no cblocks atttached), + * and there are no possible complications with the resource counters, + * then we simply assign the current clist to the destination. + */ + if (!dest_clistp->c_cf + && src_clistp->c_cbcount <= src_clistp->c_cbmax + && src_clistp->c_cbcount <= dest_clistp->c_cbmax) { + dest_clistp->c_cf = src_clistp->c_cf; + dest_clistp->c_cl = src_clistp->c_cl; + src_clistp->c_cf = src_clistp->c_cl = NULL; + + dest_clistp->c_cc = src_clistp->c_cc; + src_clistp->c_cc = 0; + dest_clistp->c_cbcount = src_clistp->c_cbcount; + src_clistp->c_cbcount = 0; + + splx(s); + return; + } + + splx(s); + + /* + * XXX This should probably be optimized to more than one + * character at a time. 
+ */ + while ((chr = getc(src_clistp)) != -1) + putc(chr, dest_clistp); +} diff --git a/sys/kern/subr_disklabel.c b/sys/kern/subr_disklabel.c new file mode 100644 index 0000000..94315de --- /dev/null +++ b/sys/kern/subr_disklabel.c @@ -0,0 +1,406 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 + * $Id$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/disklabel.h> +#include <sys/diskslice.h> +#include <sys/syslog.h> + +/* + * Seek sort for disks. + * + * The argument ap structure holds a b_actf activity chain pointer on which we + * keep two queues, sorted in ascending block order. The first queue holds + * those requests which are positioned after the current block (in the first + * request); the second holds requests which came in after their block number + * was passed. Thus we implement a one way scan, retracting after reaching the + * end of the drive to the first request on the second queue, at which time it + * becomes the first queue. + * + * A one-way scan is natural because of the way UNIX read-ahead blocks are + * allocated. + */ + +void +tqdisksort(ap, bp) + struct buf_queue_head *ap; + register struct buf *bp; +{ + register struct buf *bq; + struct buf *bn; + + /* If the queue is empty, then it's easy. 
*/ + if ((bq = ap->tqh_first) == NULL) { + TAILQ_INSERT_HEAD(ap, bp, b_act); + return; + } + +#if 1 + /* Put new writes after all reads */ + if ((bp->b_flags & B_READ) == 0) { + while (bn = bq->b_act.tqe_next) { + if ((bq->b_flags & B_READ) == 0) + break; + bq = bn; + } + } else { + while (bn = bq->b_act.tqe_next) { + if ((bq->b_flags & B_READ) == 0) { + if (ap->tqh_first != bq) { + bq = *bq->b_act.tqe_prev; + } + break; + } + bq = bn; + } + goto insert; + } +#endif + + /* + * If we lie after the first (currently active) request, then we + * must locate the second request list and add ourselves to it. + */ + if (bp->b_pblkno < bq->b_pblkno) { + while (bn = bq->b_act.tqe_next) { + /* + * Check for an ``inversion'' in the normally ascending + * cylinder numbers, indicating the start of the second + * request list. + */ + if (bn->b_pblkno < bq->b_pblkno) { + /* + * Search the second request list for the first + * request at a larger cylinder number. We go + * before that; if there is no such request, we + * go at end. + */ + do { + if (bp->b_pblkno < bn->b_pblkno) + goto insert; + bq = bn; + } while (bn = bq->b_act.tqe_next); + goto insert; /* after last */ + } + bq = bn; + } + /* + * No inversions... we will go after the last, and + * be the first request in the second request list. + */ + goto insert; + } + /* + * Request is at/after the current request... + * sort in the first request list. + */ + while (bn = bq->b_act.tqe_next) { + /* + * We want to go after the current request if there is an + * inversion after it (i.e. it is the end of the first + * request list), or if the next request is a larger cylinder + * than our request. + */ + if (bn->b_pblkno < bq->b_pblkno || + bp->b_pblkno < bn->b_pblkno) + goto insert; + bq = bn; + } + /* + * Neither a second list nor a larger request... we go at the end of + * the first list, which is the same as the end of the whole schebang. 
+ */ +insert: + TAILQ_INSERT_AFTER(ap, bq, bp, b_act); +} + + +/* + * Attempt to read a disk label from a device using the indicated strategy + * routine. The label must be partly set up before this: secpercyl, secsize + * and anything required in the strategy routine (e.g., dummy bounds for the + * partition containing the label) must be * filled in before calling us. + * Returns NULL on success and an error string on failure. + */ +char * +readdisklabel(dev, strat, lp) + dev_t dev; + d_strategy_t *strat; + register struct disklabel *lp; +{ + register struct buf *bp; + struct disklabel *dlp; + char *msg = NULL; + + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE); + bp->b_bcount = lp->d_secsize; + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + if (biowait(bp)) + msg = "I/O error"; + else for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *)((char *)bp->b_data + + DEV_BSIZE - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC) { + if (msg == NULL) + msg = "no disk label"; + } else if (dlp->d_npartitions > MAXPARTITIONS || + dkcksum(dlp) != 0) + msg = "disk label corrupted"; + else { + *lp = *dlp; + msg = NULL; + break; + } + } + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (msg); +} + +/* + * Check new disk label for sensibility before setting it. + */ +int +setdisklabel(olp, nlp, openmask) + register struct disklabel *olp, *nlp; + u_long openmask; +{ + register i; + register struct partition *opp, *npp; + + /* + * Check it is actually a disklabel we are looking at. + */ + if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC || + dkcksum(nlp) != 0) + return (EINVAL); + /* + * For each partition that we think is open, + */ + while ((i = ffs((long)openmask)) != 0) { + i--; + /* + * Check it is not changing.... 
+ */ + openmask &= ~(1 << i); + if (nlp->d_npartitions <= i) + return (EBUSY); + opp = &olp->d_partitions[i]; + npp = &nlp->d_partitions[i]; + if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size) + return (EBUSY); + /* + * Copy internally-set partition information + * if new label doesn't include it. XXX + * (If we are using it then we had better stay the same type) + * This is possibly dubious, as someone else noted (XXX) + */ + if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) { + npp->p_fstype = opp->p_fstype; + npp->p_fsize = opp->p_fsize; + npp->p_frag = opp->p_frag; + npp->p_cpg = opp->p_cpg; + } + } + nlp->d_checksum = 0; + nlp->d_checksum = dkcksum(nlp); + *olp = *nlp; + return (0); +} + +/* + * Write disk label back to device after modification. + */ +int +writedisklabel(dev, strat, lp) + dev_t dev; + d_strategy_t *strat; + register struct disklabel *lp; +{ + struct buf *bp; + struct disklabel *dlp; + int labelpart; + int error = 0; + + labelpart = dkpart(dev); + if (lp->d_partitions[labelpart].p_offset != 0) { + if (lp->d_partitions[0].p_offset != 0) + return (EXDEV); /* not quite right */ + labelpart = 0; + } + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dkmodpart(dev, labelpart); + bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE); + bp->b_bcount = lp->d_secsize; +#if 1 + /* + * We read the label first to see if it's there, + * in which case we will put ours at the same offset into the block.. + * (I think this is stupid [Julian]) + * Note that you can't write a label out over a corrupted label! + * (also stupid.. how do you write the first one? by raw writes?) 
+ */ + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + error = biowait(bp); + if (error) + goto done; + for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *) + ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC && + dkcksum(dlp) == 0) { + *dlp = *lp; + bp->b_flags &= ~(B_DONE | B_READ); + bp->b_flags |= B_BUSY | B_WRITE; + (*strat)(bp); + error = biowait(bp); + goto done; + } + } + error = ESRCH; +done: +#else + bzero(bp->b_data, lp->d_secsize); + dlp = (struct disklabel *)bp->b_data; + *dlp = *lp; + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_BUSY | B_WRITE; + (*strat)(bp); + error = biowait(bp); +#endif + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (error); +} + +/* + * Compute checksum for disk label. + */ +u_int +dkcksum(lp) + register struct disklabel *lp; +{ + register u_short *start, *end; + register u_short sum = 0; + + start = (u_short *)lp; + end = (u_short *)&lp->d_partitions[lp->d_npartitions]; + while (start < end) + sum ^= *start++; + return (sum); +} + +/* + * Disk error is the preface to plaintive error messages + * about failing disk transfers. It prints messages of the form + +hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) + + * if the offset of the error in the transfer and a disk label + * are both available. blkdone should be -1 if the position of the error + * is unknown; the disklabel pointer may be null from drivers that have not + * been converted to use them. The message is printed with printf + * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. + * The message should be completed (with at least a newline) with printf + * or addlog, respectively. There is no trailing space. 
+ */ +void +diskerr(bp, dname, what, pri, blkdone, lp) + register struct buf *bp; + char *dname, *what; + int pri, blkdone; + register struct disklabel *lp; +{ + int unit = dkunit(bp->b_dev); + int slice = dkslice(bp->b_dev); + int part = dkpart(bp->b_dev); + register int (*pr) __P((const char *, ...)); + char partname[2]; + char *sname; + int sn; + + if (pri != LOG_PRINTF) { + log(pri, ""); + pr = addlog; + } else + pr = printf; + sname = dsname(dname, unit, slice, part, partname); + (*pr)("%s%s: %s %sing fsbn ", sname, partname, what, + bp->b_flags & B_READ ? "read" : "writ"); + sn = bp->b_blkno; + if (bp->b_bcount <= DEV_BSIZE) + (*pr)("%d", sn); + else { + if (blkdone >= 0) { + sn += blkdone; + (*pr)("%d of ", sn); + } + (*pr)("%d-%d", bp->b_blkno, + bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); + } + if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { +#ifdef tahoe + sn *= DEV_BSIZE / lp->d_secsize; /* XXX */ +#endif + sn += lp->d_partitions[part].p_offset; + /* + * XXX should add slice offset and not print the slice, + * but we don't know the slice pointer. + * XXX should print bp->b_pblkno so that this will work + * independent of slices, labels and bad sector remapping, + * but some drivers don't set bp->b_pblkno. + */ + (*pr)(" (%s bn %d; cn %d", sname, sn, sn / lp->d_secpercyl); + sn %= lp->d_secpercyl; + (*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors); + } +} diff --git a/sys/kern/subr_diskmbr.c b/sys/kern/subr_diskmbr.c new file mode 100644 index 0000000..8983e950c --- /dev/null +++ b/sys/kern/subr_diskmbr.c @@ -0,0 +1,456 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $Id$ + */ + +#include <stddef.h> +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/disklabel.h> +#define DOSPTYP_EXTENDED 5 +#define DOSPTYP_ONTRACK 84 +#include <sys/diskslice.h> +#include <sys/malloc.h> +#include <sys/syslog.h> +#include <sys/systm.h> + +#define TRACE(str) do { if (dsi_debug) printf str; } while (0) + +static volatile u_char dsi_debug; + +static struct dos_partition historical_bogus_partition_table[NDOSPART] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0x80, 0, 1, 0, DOSPTYP_386BSD, 255, 255, 255, 0, 50000, }, +}; + +static int check_part __P((char *sname, struct dos_partition *dp, + u_long offset, int nsectors, int ntracks, + u_long mbr_offset)); +static void extended __P((char *dname, dev_t dev, d_strategy_t *strat, + struct disklabel *lp, struct diskslices *ssp, + u_long ext_offset, u_long ext_size, + u_long base_ext_offset, int nsectors, int ntracks, + u_long mbr_offset)); + +static int +check_part(sname, dp, offset, nsectors, ntracks, mbr_offset ) + char *sname; + struct dos_partition *dp; + u_long offset; + int nsectors; + int ntracks; + u_long mbr_offset; +{ + int chs_ecyl; + int chs_esect; + int chs_scyl; + int chs_ssect; + int error; + u_long esector; + u_long esector1; + u_long secpercyl; + u_long ssector; + u_long ssector1; + + secpercyl = (u_long)nsectors * ntracks; + chs_scyl = DPCYL(dp->dp_scyl, dp->dp_ssect); + chs_ssect = DPSECT(dp->dp_ssect); + ssector = chs_ssect - 1 + dp->dp_shd * nsectors + chs_scyl * secpercyl + + mbr_offset; + ssector1 = offset + dp->dp_start; + + /* + * If ssector1 is on a cylinder >= 1024, then ssector can't be right. + * Allow the C/H/S for it to be 1023/ntracks-1/nsectors, or correct + * apart from the cylinder being reduced modulo 1024. 
+ */ + if (ssector < ssector1 + && ((chs_ssect == nsectors && dp->dp_shd == ntracks - 1 + && chs_scyl == 1023) + || (ssector1 - ssector) % (1024 * secpercyl) == 0) + || (dp->dp_scyl == 255 && dp->dp_shd == 255 + && dp->dp_ssect == 255)) { + TRACE(("%s: C/H/S start %d/%d/%d, start %lu: allow\n", + sname, chs_scyl, dp->dp_shd, chs_ssect, ssector1)); + ssector = ssector1; + } + + chs_ecyl = DPCYL(dp->dp_ecyl, dp->dp_esect); + chs_esect = DPSECT(dp->dp_esect); + esector = chs_esect - 1 + dp->dp_ehd * nsectors + chs_ecyl * secpercyl + + mbr_offset; + esector1 = ssector1 + dp->dp_size - 1; + + /* Allow certain bogus C/H/S values for esector, as above. */ + if (esector < esector1 + && ((chs_esect == nsectors && dp->dp_ehd == ntracks - 1 + && chs_ecyl == 1023) + || (esector1 - esector) % (1024 * secpercyl) == 0) + || (dp->dp_ecyl == 255 && dp->dp_ehd == 255 + && dp->dp_esect == 255)) { + TRACE(("%s: C/H/S end %d/%d/%d, end %lu: allow\n", + sname, chs_ecyl, dp->dp_ehd, chs_esect, esector1)); + esector = esector1; + } + + error = (ssector == ssector1 && esector == esector1) ? 0 : EINVAL; + if (bootverbose) + printf("%s: type 0x%x, start %lu, end = %lu, size %lu %s\n", + sname, dp->dp_typ, ssector1, esector1, dp->dp_size, + error ? 
"" : ": OK"); + if (ssector != ssector1 && bootverbose) + printf("%s: C/H/S start %d/%d/%d (%lu) != start %lu: invalid\n", + sname, chs_scyl, dp->dp_shd, chs_ssect, + ssector, ssector1); + if (esector != esector1 && bootverbose) + printf("%s: C/H/S end %d/%d/%d (%lu) != end %lu: invalid\n", + sname, chs_ecyl, dp->dp_ehd, chs_esect, + esector, esector1); + return (error); +} + +int +dsinit(dname, dev, strat, lp, sspp) + char *dname; + dev_t dev; + d_strategy_t *strat; + struct disklabel *lp; + struct diskslices **sspp; +{ + struct buf *bp; + u_char *cp; + int dospart; + struct dos_partition *dp; + struct dos_partition *dp0; + int error; + int max_ncyls; + int max_nsectors; + int max_ntracks; + u_long mbr_offset; + char partname[2]; + u_long secpercyl; + char *sname; + struct diskslice *sp; + struct diskslices *ssp; + + /* + * Allocate a dummy slices "struct" and initialize it to contain + * only an empty compatibility slice (pointing to itself) and a + * whole disk slice (covering the disk as described by the label). + * If there is an error, then the dummy struct becomes final. + */ + ssp = malloc(offsetof(struct diskslices, dss_slices) + + BASE_SLICE * sizeof *sp, M_DEVBUF, M_WAITOK); + *sspp = ssp; + ssp->dss_first_bsd_slice = COMPATIBILITY_SLICE; + ssp->dss_nslices = BASE_SLICE; + sp = &ssp->dss_slices[0]; + bzero(sp, BASE_SLICE * sizeof *sp); + sp[WHOLE_DISK_SLICE].ds_size = lp->d_secperunit; + + mbr_offset = DOSBBSECTOR; +reread_mbr: + /* Read master boot record. */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + bp->b_blkno = mbr_offset; + bp->b_bcount = lp->d_secsize; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + if (biowait(bp) != 0) { + diskerr(bp, dname, "error reading primary partition table", + LOG_PRINTF, 0, lp); + printf("\n"); + error = EIO; + goto done; + } + + /* Weakly verify it. 
*/ + cp = bp->b_un.b_addr; + sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART, + partname); + if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) { + printf("%s: invalid primary partition table: no magic\n", + sname); + error = EINVAL; + goto done; + } + dp0 = (struct dos_partition *)(cp + DOSPARTOFF); + + /* Check for "Ontrack Diskmanager". */ + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + if (dp->dp_typ == DOSPTYP_ONTRACK) { + if (bootverbose) + printf( + "%s: Found \"Ontrack Disk Manager\" on this disk.\n", sname); + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + mbr_offset = 63; + goto reread_mbr; + } + } + + if (bcmp(dp0, historical_bogus_partition_table, + sizeof historical_bogus_partition_table) == 0) { + TRACE(("%s: invalid primary partition table: historical\n", + sname)); + error = EINVAL; + goto done; + } + + /* Guess the geometry. */ + /* + * TODO: + * Perhaps skip entries with 0 size. + * Perhaps only look at entries of type DOSPTYP_386BSD. + */ + max_ncyls = 0; + max_nsectors = 0; + max_ntracks = 0; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + int ncyls; + int nsectors; + int ntracks; + + ncyls = DPCYL(dp->dp_ecyl, dp->dp_esect) + 1; + if (max_ncyls < ncyls) + max_ncyls = ncyls; + nsectors = DPSECT(dp->dp_esect); + if (max_nsectors < nsectors) + max_nsectors = nsectors; + ntracks = dp->dp_ehd + 1; + if (max_ntracks < ntracks) + max_ntracks = ntracks; + } + + /* + * Check that we have guessed the geometry right by checking the + * partition entries. + */ + /* + * TODO: + * As above. + * Check for overlaps. + * Check against d_secperunit if the latter is reliable. 
+ */ + error = 0; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0 + && dp->dp_start == 0 && dp->dp_size == 0) + continue; + sname = dsname(dname, dkunit(dev), BASE_SLICE + dospart, + RAW_PART, partname); + + /* + * Temporarily ignore errors from this check. We could + * simplify things by accepting the table eariler if we + * always ignore errors here. Perhaps we should always + * accept the table if the magic is right but not let + * bad entries affect the geometry. + */ + check_part(sname, dp, mbr_offset, max_nsectors, max_ntracks, + mbr_offset); + } + if (error != 0) + goto done; + + /* + * Accept the DOS partition table. + * First adjust the label (we have been careful not to change it + * before we can guarantee success). + */ + secpercyl = (u_long)max_nsectors * max_ntracks; + if (secpercyl != 0) { + u_long secperunit; + + lp->d_nsectors = max_nsectors; + lp->d_ntracks = max_ntracks; + lp->d_secpercyl = secpercyl; + secperunit = secpercyl * max_ncyls; + if (lp->d_secperunit < secperunit) + lp->d_secperunit = secperunit; + lp->d_ncylinders = lp->d_secperunit / secpercyl; + } + + /* + * Free the dummy slices "struct" and allocate a real new one. + * Initialize special slices as above. + */ + free(ssp, M_DEVBUF); + ssp = malloc(offsetof(struct diskslices, dss_slices) +#define MAX_SLICES_SUPPORTED MAX_SLICES /* was (BASE_SLICE + NDOSPART) */ + + MAX_SLICES_SUPPORTED * sizeof *sp, M_DEVBUF, M_WAITOK); + *sspp = ssp; + ssp->dss_first_bsd_slice = COMPATIBILITY_SLICE; + sp = &ssp->dss_slices[0]; + bzero(sp, MAX_SLICES_SUPPORTED * sizeof *sp); + sp[WHOLE_DISK_SLICE].ds_size = lp->d_secperunit; + + /* Initialize normal slices. 
*/ + sp += BASE_SLICE; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++, sp++) { + sp->ds_offset = mbr_offset + dp->dp_start; + sp->ds_size = dp->dp_size; + sp->ds_type = dp->dp_typ; +#if 0 + lp->d_subtype |= (lp->d_subtype & 3) | dospart + | DSTYPE_INDOSPART; +#endif + } + ssp->dss_nslices = BASE_SLICE + NDOSPART; + + /* Handle extended partitions. */ + sp -= NDOSPART; + for (dospart = 0; dospart < NDOSPART; dospart++, sp++) + if (sp->ds_type == DOSPTYP_EXTENDED) + extended(dname, bp->b_dev, strat, lp, ssp, + sp->ds_offset, sp->ds_size, sp->ds_offset, + max_nsectors, max_ntracks, mbr_offset); + +done: + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + if (error == EINVAL) + error = 0; + return (error); +} + +void +extended(dname, dev, strat, lp, ssp, ext_offset, ext_size, base_ext_offset, + nsectors, ntracks, mbr_offset) + char *dname; + dev_t dev; + struct disklabel *lp; + d_strategy_t *strat; + struct diskslices *ssp; + u_long ext_offset; + u_long ext_size; + u_long base_ext_offset; + int nsectors; + int ntracks; + u_long mbr_offset; +{ + struct buf *bp; + u_char *cp; + int dospart; + struct dos_partition *dp; + u_long ext_offsets[NDOSPART]; + u_long ext_sizes[NDOSPART]; + char partname[2]; + int slice; + char *sname; + struct diskslice *sp; + + /* Read extended boot record. */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = ext_offset; + bp->b_bcount = lp->d_secsize; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + if (biowait(bp) != 0) { + diskerr(bp, dname, "error reading extended partition table", + LOG_PRINTF, 0, lp); + printf("\n"); + goto done; + } + + /* Weakly verify it. 
*/ + cp = bp->b_un.b_addr; + if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) { + sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART, + partname); + printf("%s: invalid extended partition table: no magic\n", + sname); + goto done; + } + + for (dospart = 0, + dp = (struct dos_partition *)(bp->b_un.b_addr + DOSPARTOFF), + slice = ssp->dss_nslices, sp = &ssp->dss_slices[slice]; + dospart < NDOSPART; dospart++, dp++) { + ext_sizes[dospart] = 0; + if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0 + && dp->dp_start == 0 && dp->dp_size == 0) + continue; + if (dp->dp_typ == DOSPTYP_EXTENDED) { + char buf[32]; + + sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, + RAW_PART, partname); + strcpy(buf, sname); + if (strlen(buf) < sizeof buf - 11) + strcat(buf, "<extended>"); + check_part(buf, dp, base_ext_offset, nsectors, + ntracks, mbr_offset); + ext_offsets[dospart] = base_ext_offset + dp->dp_start; + ext_sizes[dospart] = dp->dp_size; + } else { + sname = dsname(dname, dkunit(dev), slice, RAW_PART, + partname); + check_part(sname, dp, ext_offset, nsectors, ntracks, + mbr_offset); + if (slice >= MAX_SLICES) { + printf("%s: too many slices\n", sname); + slice++; + continue; + } + sp->ds_offset = ext_offset + dp->dp_start; + sp->ds_size = dp->dp_size; + sp->ds_type = dp->dp_typ; + ssp->dss_nslices++; + slice++; + sp++; + } + } + + /* If we found any more slices, recursively find all the subslices. */ + for (dospart = 0; dospart < NDOSPART; dospart++) + if (ext_sizes[dospart] != 0) + extended(dname, dev, strat, lp, ssp, + ext_offsets[dospart], ext_sizes[dospart], + base_ext_offset, nsectors, ntracks, + mbr_offset); + +done: + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); +} diff --git a/sys/kern/subr_diskslice.c b/sys/kern/subr_diskslice.c new file mode 100644 index 0000000..44e01b0 --- /dev/null +++ b/sys/kern/subr_diskslice.c @@ -0,0 +1,1066 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. 
+ * + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)wd.c 7.2 (Berkeley) 5/9/91 + * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $ + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $Id: subr_diskslice.c,v 1.35 1997/02/22 09:39:15 peter Exp $ + */ + +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/conf.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif +#include <sys/disklabel.h> +#include <sys/diskslice.h> +#include <sys/dkbad.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/stat.h> +#include <sys/syslog.h> +#include <sys/systm.h> +#include <sys/vnode.h> + +#include <ufs/ufs/dinode.h> +#include <ufs/ffs/fs.h> + +#define TRACE(str) do { if (ds_debug) printf str; } while (0) + +typedef u_char bool_t; + +static volatile bool_t ds_debug; + +static void dsiodone __P((struct buf *bp)); +static char *fixlabel __P((char *sname, struct diskslice *sp, + struct disklabel *lp, int writeflag)); +static void free_ds_label __P((struct diskslices *ssp, int slice)); +#ifdef DEVFS +static void free_ds_labeldevs __P((struct diskslices *ssp, int slice)); +#endif +static void partition_info __P((char *sname, int part, struct partition *pp)); +static void slice_info __P((char *sname, struct diskslice *sp)); +static void set_ds_bad __P((struct diskslices *ssp, int slice, + struct dkbad_intern *btp)); +static void set_ds_label __P((struct diskslices *ssp, int slice, + struct disklabel *lp)); +#ifdef 
DEVFS
static void set_ds_labeldevs __P((char *dname, dev_t dev,
				  struct diskslices *ssp));
static void set_ds_labeldevs_unaliased __P((char *dname, dev_t dev,
					    struct diskslices *ssp));
#endif
static void set_ds_wlabel __P((struct diskslices *ssp, int slice,
			       int wlabel));

/*
 * Determine the size of the transfer, and make sure it is
 * within the boundaries of the partition. Adjust transfer
 * if needed, and signal errors or early completion.
 *
 * Returns 1 when the (possibly adjusted) transfer should proceed,
 * 0 for early completion (EOF at end of partition), and -1 on error
 * (B_ERROR set and b_error filled in).
 *
 * XXX TODO:
 *	o Do bad sector remapping.  May need to split buffer.
 *	o Split buffers that are too big for the device.
 *	o Check for overflow.
 *	o Finish cleaning this up.
 */
int
dscheck(bp, ssp)
	struct buf *bp;
	struct diskslices *ssp;
{
	daddr_t	blkno;
	daddr_t	labelsect;
	struct disklabel *lp;
	u_long	maxsz;
	char	*msg;
	struct partition *pp;
	struct diskslice *sp;
	long	sz;

	if (bp->b_blkno < 0) {
		Debugger("Slice code got negative blocknumber");
		bp->b_error = EINVAL;
		goto bad;
	}

	sp = &ssp->dss_slices[dkslice(bp->b_dev)];
	lp = sp->ds_label;
	/* Transfer size in DEV_BSIZE blocks, rounded up. */
	sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT;
	if (lp == NULL) {
		/*
		 * No label: address raw within the slice; pick an
		 * impossible label sector so the label-protection and
		 * label-snooping tests below can never match.
		 */
		blkno = bp->b_blkno;
		labelsect = -LABELSECTOR - 1;
		maxsz = sp->ds_size;
	} else {
		labelsect = lp->d_partitions[LABEL_PART].p_offset;
if (labelsect != 0) Debugger("labelsect != 0 in dscheck()");
		pp = &lp->d_partitions[dkpart(bp->b_dev)];
		/* Slice-relative block number of the start of the i/o. */
		blkno = pp->p_offset + bp->b_blkno;
		maxsz = pp->p_size;
		if (sp->ds_bad != NULL && ds_debug) {
			daddr_t	newblkno;

			/*
			 * NOTE(review): remapping is only reported here,
			 * never applied (see the XXX TODO above).
			 */
			newblkno = transbad144(sp->ds_bad, blkno);
			if (newblkno != blkno)
				printf("should map bad block %lu -> %lu\n",
				       blkno, newblkno);
		}
	}

	/* overwriting disk label ?
 */
	/* XXX should also protect bootstrap in first 8K */
	if (blkno <= LABELSECTOR + labelsect &&
#if LABELSECTOR != 0
	    bp->b_blkno + sz > LABELSECTOR + labelsect &&
#endif
	    (bp->b_flags & B_READ) == 0 && sp->ds_wlabel == 0) {
		bp->b_error = EROFS;
		goto bad;
	}

#if defined(DOSBBSECTOR) && defined(notyet)
	/* overwriting master boot record? */
	if (blkno <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 &&
	    sp->ds_wlabel == 0) {
		bp->b_error = EROFS;
		goto bad;
	}
#endif

	/* beyond partition? */
	if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) {
		/* if exactly at end of disk, return an EOF */
		if (bp->b_blkno == maxsz) {
			bp->b_resid = bp->b_bcount;
			return (0);
		}
		/* or truncate if part of it fits */
		sz = maxsz - bp->b_blkno;
		if (sz <= 0) {
			bp->b_error = EINVAL;
			goto bad;
		}
		bp->b_bcount = sz << DEV_BSHIFT;
	}

	/* Physical block number: bias by the slice's absolute offset. */
	bp->b_pblkno = blkno + sp->ds_offset;

	/*
	 * Snoop on label accesses if the slice offset is nonzero.  Fudge
	 * offsets in the label to keep the in-core label coherent with
	 * the on-disk one.
	 */
	if (blkno <= LABELSECTOR + labelsect
#if LABELSECTOR != 0
	    && bp->b_blkno + sz > LABELSECTOR + labelsect
#endif
	    && sp->ds_offset != 0) {
		struct iodone_chain *ic;

		/* Save the buf's iodone state; dsiodone() restores it. */
		ic = malloc(sizeof *ic, M_DEVBUF, M_WAITOK);
		ic->ic_prev_flags = bp->b_flags;
		ic->ic_prev_iodone = bp->b_iodone;
		ic->ic_prev_iodone_chain = bp->b_iodone_chain;
		/* Byte offset of the label within this transfer. */
		ic->ic_args[0].ia_long = (LABELSECTOR + labelsect - blkno)
					 << DEV_BSHIFT;
		if (lp)
			ic->ic_args[0].ia_long *= lp->d_secsize / DEV_BSIZE;
		ic->ic_args[1].ia_ptr = sp;
		bp->b_flags |= B_CALL;
		bp->b_iodone = dsiodone;
		bp->b_iodone_chain = ic;
		if (!(bp->b_flags & B_READ)) {
			/*
			 * XXX even disklabel(8) writes directly so we need
			 * to adjust writes.  Perhaps we should drop support
			 * for DIOCWLABEL (always write protect labels) and
			 * require the use of DIOCWDINFO.
			 *
			 * XXX probably need to copy the data to avoid even
			 * temporarily corrupting the in-core copy.
			 */
			if (bp->b_vp != NULL)
				bp->b_vp->v_numoutput++;
			msg = fixlabel((char *)NULL, sp,
				       (struct disklabel *)
				       (bp->b_data + ic->ic_args[0].ia_long),
				       TRUE);
			if (msg != NULL) {
				printf("%s\n", msg);
				bp->b_error = EROFS;
				goto bad;
			}
		}
	}
	return (1);

bad:
	bp->b_flags |= B_ERROR;
	return (-1);
}

/*
 * Close one partition of a slice: clear its bit in the per-mode
 * (block or character) open mask and recompute the combined mask.
 */
void
dsclose(dev, mode, ssp)
	dev_t	dev;
	int	mode;
	struct diskslices *ssp;
{
	u_char	mask;
	struct diskslice *sp;

	sp = &ssp->dss_slices[dkslice(dev)];
	mask = 1 << dkpart(dev);
	switch (mode) {
	case S_IFBLK:
		sp->ds_bopenmask &= ~mask;
		break;
	case S_IFCHR:
		sp->ds_copenmask &= ~mask;
		break;
	}
	/* ds_openmask is always the union of the two per-mode masks. */
	sp->ds_openmask = sp->ds_bopenmask | sp->ds_copenmask;
}

/*
 * Tear down a slices structure: free each slice's bad-sector table,
 * remove its devfs entries, free its in-core label, then free the
 * structure itself and clear the caller's pointer.
 */
void
dsgone(sspp)
	struct diskslices **sspp;
{
	int	slice;
	struct diskslice *sp;
	struct diskslices *ssp;

	for (slice = 0, ssp = *sspp; slice < ssp->dss_nslices; slice++) {
		sp = &ssp->dss_slices[slice];
		if (sp->ds_bad != NULL) {
			free(sp->ds_bad, M_DEVBUF);
			/*
			 * set_ds_bad() also clears the aliased pointer in
			 * the compatibility / first-BSD slice, so the
			 * shared table cannot be freed twice.
			 */
			set_ds_bad(ssp, slice, (struct dkbad_intern *)NULL);
		}
#ifdef DEVFS
		if (sp->ds_bdev != NULL)
			devfs_remove_dev(sp->ds_bdev);
		if (sp->ds_cdev != NULL)
			devfs_remove_dev(sp->ds_cdev);
#endif
		free_ds_label(ssp, slice);
	}
	free(ssp, M_DEVBUF);
	*sspp = NULL;
}

/*
 * For the "write" commands (DIOCSBAD, DIOCSDINFO and DIOCWDINFO), this
 * is subject to the same restriction as dsopen().
 */
/*
 * Slice/label ioctl handler.  Returns 0 on success, an errno on
 * failure, or -1 for commands not handled here (note: -1 is a
 * sentinel, not an errno; presumably the calling driver then tries
 * the command itself — confirm against driver usage).
 */
int
dsioctl(dname, dev, cmd, data, flags, sspp, strat, setgeom)
	char	*dname;
	dev_t	dev;
	int	cmd;
	caddr_t	data;
	int	flags;
	struct diskslices **sspp;
	d_strategy_t *strat;
	ds_setgeom_t *setgeom;
{
	int	error;
	struct disklabel *lp;
	int	old_wlabel;
	int	slice;
	struct diskslice *sp;
	struct diskslices *ssp;

	slice = dkslice(dev);
	ssp = *sspp;
	sp = &ssp->dss_slices[slice];
	lp = sp->ds_label;
	switch (cmd) {

	case DIOCGDINFO:
		/* Copy out the in-core label, if any. */
		if (lp == NULL)
			return (EINVAL);
		*(struct disklabel *)data = *lp;
		return (0);

#ifdef notyet
	case DIOCGDINFOP:
		if (lp == NULL)
			return (EINVAL);
		*(struct disklabel **)data = lp;
		return (0);
#endif

	case DIOCGPART:
		/* Hand back pointers into the in-core label. */
		if (lp == NULL)
			return (EINVAL);
		((struct partinfo *)data)->disklab = lp;
		((struct partinfo *)data)->part
			= &lp->d_partitions[dkpart(dev)];
		return (0);

	case DIOCGSLICEINFO:
		*(struct diskslices *)data = *ssp;
		return (0);

	case DIOCSBAD:
		/* Replace the slice's internalized bad-sector table. */
		if (slice == WHOLE_DISK_SLICE)
			return (ENODEV);
		if (!(flags & FWRITE))
			return (EBADF);
		if (lp == NULL)
			return (EINVAL);
		if (sp->ds_bad != NULL)
			free(sp->ds_bad, M_DEVBUF);
		set_ds_bad(ssp, slice, internbad144((struct dkbad *)data, lp));
		return (0);

	case DIOCSDINFO:
		/* Set the in-core label (no write to disk). */
		if (slice == WHOLE_DISK_SLICE)
			return (ENODEV);
		if (!(flags & FWRITE))
			return (EBADF);
		lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK);
		if (sp->ds_label == NULL)
			bzero(lp, sizeof *lp);
		else
			bcopy(sp->ds_label, lp, sizeof *lp);
		error = setdisklabel(lp, (struct disklabel *)data,
				     sp->ds_label != NULL
				     ? sp->ds_openmask : (u_long)0);
		/* XXX why doesn't setdisklabel() check this? */
		if (error == 0 && lp->d_partitions[RAW_PART].p_offset != 0)
			error = EINVAL;
#if 0 /* XXX */
		if (error != 0 && setgeom != NULL)
			error = setgeom(lp);
#endif
		if (error != 0) {
			free(lp, M_DEVBUF);
			return (error);
		}
		free_ds_label(ssp, slice);
		set_ds_label(ssp, slice, lp);
#ifdef DEVFS
		set_ds_labeldevs(dname, dev, ssp);
#endif
		return (0);

	case DIOCSYNCSLICEINFO:
		/*
		 * Rebuild the slice table from the media.  Only allowed
		 * on the raw partition of the whole-disk slice; unless
		 * forced (*data nonzero), refuse while anything else on
		 * the unit is open.
		 */
		if (slice != WHOLE_DISK_SLICE || dkpart(dev) != RAW_PART)
			return (EINVAL);
		if (!*(int *)data)
			for (slice = 0; slice < ssp->dss_nslices; slice++) {
				u_char openmask;

				openmask = ssp->dss_slices[slice].ds_openmask;
				if (openmask
				    && (slice != WHOLE_DISK_SLICE
					|| openmask & ~(1 << RAW_PART)))
					return (EBUSY);
			}

		/*
		 * Temporarily forget the current slices struct and read
		 * the current one.
		 * XXX should wait for current accesses on this disk to
		 * complete, then lock out future accesses and opens.
		 */
		*sspp = NULL;
		lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK);
		*lp = *ssp->dss_slices[WHOLE_DISK_SLICE].ds_label;
		error = dsopen(dname, dev,
			       ssp->dss_slices[WHOLE_DISK_SLICE].ds_copenmask
			       & (1 << RAW_PART) ? S_IFCHR : S_IFBLK,
			       sspp, lp, strat, setgeom, ssp->dss_bdevsw,
			       ssp->dss_cdevsw);
		if (error != 0) {
			free(lp, M_DEVBUF);
			*sspp = ssp;
			return (error);
		}

		/*
		 * Reopen everything.  This is a no-op except in the "force"
		 * case and when the raw bdev and cdev are both open.  Abort
		 * if anything fails.
		 */
		for (slice = 0; slice < ssp->dss_nslices; slice++) {
			u_char openmask;
			int part;

			for (openmask = ssp->dss_slices[slice].ds_bopenmask,
			     part = 0; openmask; openmask >>= 1, part++) {
				if (!(openmask & 1))
					continue;
				error = dsopen(dname,
					       dkmodslice(dkmodpart(dev, part),
							  slice),
					       S_IFBLK, sspp, lp, strat,
					       setgeom, ssp->dss_bdevsw,
					       ssp->dss_cdevsw);
				if (error != 0) {
					/* XXX should free devfs toks. */
					free(lp, M_DEVBUF);
					/* XXX should restore devfs toks. */
					*sspp = ssp;
					return (EBUSY);
				}
			}
			for (openmask = ssp->dss_slices[slice].ds_copenmask,
			     part = 0; openmask; openmask >>= 1, part++) {
				if (!(openmask & 1))
					continue;
				error = dsopen(dname,
					       dkmodslice(dkmodpart(dev, part),
							  slice),
					       S_IFCHR, sspp, lp, strat,
					       setgeom, ssp->dss_bdevsw,
					       ssp->dss_cdevsw);
				if (error != 0) {
					/* XXX should free devfs toks. */
					free(lp, M_DEVBUF);
					/* XXX should restore devfs toks. */
					*sspp = ssp;
					return (EBUSY);
				}
			}
		}

		/* XXX devfs tokens? */
		free(lp, M_DEVBUF);
		/* Success: discard the old slices struct. */
		dsgone(&ssp);
		return (0);

	case DIOCWDINFO:
		/* Set the in-core label, then write it to disk. */
		error = dsioctl(dname, dev, DIOCSDINFO, data, flags, &ssp,
				strat, setgeom);
		if (error != 0)
			return (error);
		/*
		 * XXX this used to hack on dk_openpart to fake opening
		 * partition 0 in case that is used instead of dkpart(dev).
		 */
		old_wlabel = sp->ds_wlabel;
		set_ds_wlabel(ssp, slice, TRUE);
		error = writedisklabel(dev, strat, sp->ds_label);
		/* XXX should invalidate in-core label if write failed. */
		set_ds_wlabel(ssp, slice, old_wlabel);
		return (error);

	case DIOCWLABEL:
		/* Allow or forbid writes to the label sector. */
		if (slice == WHOLE_DISK_SLICE)
			return (ENODEV);
		if (!(flags & FWRITE))
			return (EBADF);
		set_ds_wlabel(ssp, slice, *(int *)data != 0);
		return (0);

	default:
		return (-1);
	}
}

/*
 * Iodone hook installed by dscheck() for transfers that overlap the
 * label sector of an offset slice: restore the buf's saved iodone
 * state, un-fudge the label copy in the buffer (after any write, or
 * after a successful read), free the chain record and finish the i/o.
 */
static void
dsiodone(bp)
	struct buf *bp;
{
	struct iodone_chain *ic;
	char	*msg;

	ic = bp->b_iodone_chain;
	bp->b_flags = (ic->ic_prev_flags & B_CALL)
		      | (bp->b_flags & ~(B_CALL | B_DONE));
	bp->b_iodone = ic->ic_prev_iodone;
	bp->b_iodone_chain = ic->ic_prev_iodone_chain;
	if (!(bp->b_flags & B_READ)
	    || (!(bp->b_flags & B_ERROR) && bp->b_error == 0)) {
		msg = fixlabel((char *)NULL, ic->ic_args[1].ia_ptr,
			       (struct disklabel *)
			       (bp->b_data + ic->ic_args[0].ia_long),
			       FALSE);
		if (msg != NULL)
			printf("%s\n", msg);
	}
	free(ic, M_DEVBUF);
	biodone(bp);
}

/*
 * Return 1 if any partition of any slice is open, else 0.
 */
int
dsisopen(ssp)
	struct diskslices *ssp;
{
	int	slice;

	if (ssp == NULL)
		return (0);
	for (slice = 0; slice < ssp->dss_nslices; slice++)
		if (ssp->dss_slices[slice].ds_openmask)
			return (1);
	return (0);
}

/*
 * Build the name for a unit/slice/partition ("wd0s1a" style) and store
 * the partition suffix ('a' + part, or empty) in *partname, which must
 * have room for 2 bytes.
 *
 * NOTE(review): returns a pointer to a static buffer, so this is not
 * reentrant — callers must consume or copy the result immediately.
 * NOTE(review): name[32] with a 16-char dname plus "%d" and "s%d" could
 * in principle overflow for absurd unit/slice numbers — confirm the
 * callers' ranges.
 */
char *
dsname(dname, unit, slice, part, partname)
	char	*dname;
	int	unit;
	int	slice;
	int	part;
	char	*partname;
{
	static char name[32];

	if (strlen(dname) > 16)
		dname = "nametoolong";
	sprintf(name, "%s%d", dname, unit);
	partname[0] = '\0';
	if (slice != WHOLE_DISK_SLICE || part != RAW_PART) {
		partname[0] = 'a' + part;
		partname[1] = '\0';
		/* Slice names are numbered from the slice index minus 1. */
		if (slice != COMPATIBILITY_SLICE)
			sprintf(name + strlen(name), "s%d", slice - 1);
	}
	return (name);
}

/*
 * This should only be called when the unit is inactive and the strategy
 * routine should not allow it to become active unless we call it.  Our
 * strategy routine must be special to allow activity.
 */
/*
 * Open one partition of a slice.  When nothing on the unit is open,
 * first (re)build the slice table from the media, read and fix up the
 * disklabels for all slices, and create devfs entries; then record the
 * partition as open in the slice's block or character open mask.
 */
int
dsopen(dname, dev, mode, sspp, lp, strat, setgeom, bdevsw, cdevsw)
	char	*dname;
	dev_t	dev;
	int	mode;
	struct diskslices **sspp;
	struct disklabel *lp;
	d_strategy_t *strat;
	ds_setgeom_t *setgeom;
	struct bdevsw *bdevsw;
	struct cdevsw *cdevsw;
{
	struct dkbad *btp;
	dev_t	dev1;
	int	error;
	struct disklabel *lp1;
	char	*msg;
	u_char	mask;
#ifdef DEVFS
	int	mynor;
#endif
	bool_t	need_init;
	int	part;
	char	partname[2];
	int	slice;
	char	*sname;
	struct diskslice *sp;
	struct diskslices *ssp;
	int	unit;

	/*
	 * XXX reinitialize the slice table unless there is an open device
	 * on the unit.  This should only be done if the media has changed.
	 */
	ssp = *sspp;
	need_init = !dsisopen(ssp);
	if (ssp != NULL && need_init)
		dsgone(sspp);
	if (need_init) {
		TRACE(("dsinit\n"));
		error = dsinit(dname, dev, strat, lp, sspp);
		if (error != 0) {
			dsgone(sspp);
			return (error);
		}
		lp->d_npartitions = RAW_PART + 1;
		lp->d_partitions[RAW_PART].p_size = lp->d_secperunit;
		ssp = *sspp;
#ifdef DEVFS
		ssp->dss_bdevsw = bdevsw;
		ssp->dss_cdevsw = cdevsw;
#endif

		/*
		 * If there are no real slices, then make the compatibility
		 * slice cover the whole disk.
		 */
		if (ssp->dss_nslices == BASE_SLICE)
			ssp->dss_slices[COMPATIBILITY_SLICE].ds_size
				= lp->d_secperunit;

		/* Point the compatibility slice at the BSD slice, if any. */
		for (slice = BASE_SLICE; slice < ssp->dss_nslices; slice++) {
			sp = &ssp->dss_slices[slice];
			if (sp->ds_type == DOSPTYP_386BSD /* XXX */) {
				ssp->dss_first_bsd_slice = slice;
				ssp->dss_slices[COMPATIBILITY_SLICE].ds_offset
					= sp->ds_offset;
				ssp->dss_slices[COMPATIBILITY_SLICE].ds_size
					= sp->ds_size;
				ssp->dss_slices[COMPATIBILITY_SLICE].ds_type
					= sp->ds_type;
				break;
			}
		}

		lp1 = malloc(sizeof *lp1, M_DEVBUF, M_WAITOK);
		*lp1 = *lp;

		/*
		 * Initialize defaults for the label for the whole disk so
		 * that it can be used as a template for disklabel(8).
		 * d_rpm = 3600 is unlikely to be correct for a modern
		 * disk, but d_rpm is normally irrelevant.
		 */
		if (lp1->d_rpm == 0)
			lp1->d_rpm = 3600;
		if (lp1->d_interleave == 0)
			lp1->d_interleave = 1;
		if (lp1->d_npartitions == 0)
			lp1->d_npartitions = MAXPARTITIONS;
		if (lp1->d_bbsize == 0)
			lp1->d_bbsize = BBSIZE;
		if (lp1->d_sbsize == 0)
			lp1->d_sbsize = SBSIZE;

		ssp->dss_slices[WHOLE_DISK_SLICE].ds_label = lp1;
		ssp->dss_slices[WHOLE_DISK_SLICE].ds_wlabel = TRUE;
		if (setgeom != NULL) {
			error = setgeom(lp);
			if (error != 0) {
				dsgone(sspp);
				return (error);
			}
		}
	}

	unit = dkunit(dev);

	/*
	 * Initialize secondary info for all slices.  It is needed for more
	 * than the current slice in the DEVFS case.
	 */
	for (slice = 0; slice < ssp->dss_nslices; slice++) {
		sp = &ssp->dss_slices[slice];
		/* Already initialized (label read) on a previous pass. */
		if (sp->ds_label != NULL)
			continue;
		dev1 = dkmodslice(dkmodpart(dev, RAW_PART), slice);
		sname = dsname(dname, unit, slice, RAW_PART, partname);
#ifdef DEVFS
		if (slice != COMPATIBILITY_SLICE && sp->ds_bdev == NULL
		    && sp->ds_size != 0) {
			mynor = minor(dev1);
			sp->ds_bdev =
				devfs_add_devswf(bdevsw, mynor, DV_BLK,
						 UID_ROOT, GID_OPERATOR, 0640,
						 "%s", sname);
			sp->ds_cdev =
				devfs_add_devswf(cdevsw, mynor, DV_CHR,
						 UID_ROOT, GID_OPERATOR, 0640,
						 "r%s", sname);
		}
#endif
		/*
		 * XXX this should probably only be done for the need_init
		 * case, but there may be a problem with DIOCSYNCSLICEINFO.
		 */
		set_ds_wlabel(ssp, slice, TRUE);	/* XXX invert */
		lp1 = malloc(sizeof *lp1, M_DEVBUF, M_WAITOK);
		*lp1 = *lp;
		TRACE(("readdisklabel\n"));
		msg = readdisklabel(dev1, strat, lp1);
#if 0 /* XXX */
		if (msg == NULL && setgeom != NULL && setgeom(lp1) != 0)
			msg = "setgeom failed";
#endif
		/* Convert the on-disk label to in-core (slice-biased) form. */
		if (msg == NULL)
			msg = fixlabel(sname, sp, lp1, FALSE);
		if (msg != NULL) {
			/* No usable label; only warn for BSD slices. */
			free(lp1, M_DEVBUF);
			if (sp->ds_type == DOSPTYP_386BSD /* XXX */)
				log(LOG_WARNING, "%s: cannot find label (%s)\n",
				    sname, msg);
			continue;
		}
		if (lp1->d_flags & D_BADSECT) {
			/* Label says there is a bad144 bad-sector table. */
			btp = malloc(sizeof *btp, M_DEVBUF, M_WAITOK);
			TRACE(("readbad144\n"));
			msg = readbad144(dev1, strat, lp1, btp);
			if (msg != NULL) {
				log(LOG_WARNING,
				    "%s: cannot find bad sector table (%s)\n",
				    sname, msg);
				free(btp, M_DEVBUF);
				free(lp1, M_DEVBUF);
				continue;
			}
			set_ds_bad(ssp, slice, internbad144(btp, lp1));
			free(btp, M_DEVBUF);
			if (sp->ds_bad == NULL) {
				free(lp1, M_DEVBUF);
				continue;
			}
		}
		set_ds_label(ssp, slice, lp1);
#ifdef DEVFS
		set_ds_labeldevs(dname, dev1, ssp);
#endif
		set_ds_wlabel(ssp, slice, FALSE);
	}

	/* Now validate and record the actual open request. */
	slice = dkslice(dev);
	if (slice >= ssp->dss_nslices)
		return (ENXIO);
	sp = &ssp->dss_slices[slice];
	part = dkpart(dev);
	if (part != RAW_PART
	    && (sp->ds_label == NULL || part >= sp->ds_label->d_npartitions))
		return (EINVAL);	/* XXX needs translation */
	mask = 1 << part;
	switch (mode) {
	case S_IFBLK:
		sp->ds_bopenmask |= mask;
		break;
	case S_IFCHR:
		sp->ds_copenmask |= mask;
		break;
	}
	sp->ds_openmask = sp->ds_bopenmask | sp->ds_copenmask;
	return (0);
}

/*
 * Return the size of a partition in sectors, or -1 if it cannot be
 * determined.  If the partition is not already open, it is briefly
 * opened (and closed again) via the supplied dopen/dclose so its
 * label gets read.
 */
int
dssize(dev, sspp, dopen, dclose)
	dev_t	dev;
	struct diskslices **sspp;
	d_open_t dopen;
	d_close_t dclose;
{
	struct disklabel *lp;
	int	part;
	int	slice;
	struct diskslices *ssp;

	slice = dkslice(dev);
	part = dkpart(dev);
	ssp = *sspp;
	if (ssp == NULL || slice >= ssp->dss_nslices
	    || !(ssp->dss_slices[slice].ds_bopenmask & (1 << part))) {
		if
(dopen(dev, FREAD, S_IFBLK, (struct proc *)NULL) != 0)
			return (-1);
		dclose(dev, FREAD, S_IFBLK, (struct proc *)NULL);
		/* The open may have replaced the slices struct. */
		ssp = *sspp;
	}
	lp = ssp->dss_slices[slice].ds_label;
	if (lp == NULL)
		return (-1);
	return ((int)lp->d_partitions[part].p_size);
}

/*
 * Free the in-core label of a slice and clear every reference to it,
 * including the compatibility / first-BSD-slice alias maintained by
 * set_ds_label().  Devfs entries for the label's partitions are
 * removed first.
 */
static void
free_ds_label(ssp, slice)
	struct diskslices *ssp;
	int	slice;
{
	struct disklabel *lp;
	struct diskslice *sp;

	sp = &ssp->dss_slices[slice];
	lp = sp->ds_label;
	if (lp == NULL)
		return;
#ifdef DEVFS
	free_ds_labeldevs(ssp, slice);
	if (slice == COMPATIBILITY_SLICE)
		free_ds_labeldevs(ssp, ssp->dss_first_bsd_slice);
	else if (slice == ssp->dss_first_bsd_slice)
		free_ds_labeldevs(ssp, COMPATIBILITY_SLICE);
#endif
	free(lp, M_DEVBUF);
	set_ds_label(ssp, slice, (struct disklabel *)NULL);
}

#ifdef DEVFS
/*
 * Remove the devfs entries for every partition in the slice's label.
 * No-op when the slice has no label.
 */
static void
free_ds_labeldevs(ssp, slice)
	struct diskslices *ssp;
	int	slice;
{
	struct disklabel *lp;
	int	part;
	struct diskslice *sp;

	sp = &ssp->dss_slices[slice];
	lp = sp->ds_label;
	if (lp == NULL)
		return;
	for (part = 0; part < lp->d_npartitions; part++) {
		if (sp->ds_bdevs[part] != NULL) {
			devfs_remove_dev(sp->ds_bdevs[part]);
			sp->ds_bdevs[part] = NULL;
		}
		if (sp->ds_cdevs[part] != NULL) {
			devfs_remove_dev(sp->ds_cdevs[part]);
			sp->ds_cdevs[part] = NULL;
		}
	}
}
#endif

/*
 * Validate a BSD label and translate its partition offsets between
 * on-disk (absolute) and in-core (slice-relative) form.  writeflag
 * selects the direction: TRUE adds the slice offset (preparing an
 * in-core label for writing), FALSE subtracts it (after reading).
 * Returns NULL on success or a static error string; when sname is
 * non-NULL, problems are also logged with details.
 */
static char *
fixlabel(sname, sp, lp, writeflag)
	char	*sname;
	struct diskslice *sp;
	struct disklabel *lp;
	int	writeflag;
{
	u_long	end;
	u_long	offset;
	int	part;
	struct partition *pp;
	u_long	start;
	bool_t	warned;

	/* These errors "can't happen" so don't bother reporting details.
	 */
	if (lp->d_magic != DISKMAGIC || lp->d_magic2 != DISKMAGIC)
		return ("fixlabel: invalid magic");
	if (dkcksum(lp) != 0)
		return ("fixlabel: invalid checksum");

	pp = &lp->d_partitions[RAW_PART];
	if (writeflag) {
		/* In-core offsets start at 0; bias by the slice offset. */
		start = 0;
		offset = sp->ds_offset;
	} else {
		/* On-disk offsets start at the slice; remove the bias. */
		start = sp->ds_offset;
		offset = -sp->ds_offset;
	}
	if (pp->p_offset != start) {
		if (sname != NULL) {
			printf(
"%s: rejecting BSD label: raw partition offset != slice offset\n",
			       sname);
			slice_info(sname, sp);
			partition_info(sname, RAW_PART, pp);
		}
		return ("fixlabel: raw partition offset != slice offset");
	}
	if (pp->p_size != sp->ds_size) {
		if (sname != NULL) {
			printf("%s: raw partition size != slice size\n", sname);
			slice_info(sname, sp);
			partition_info(sname, RAW_PART, pp);
		}
		if (pp->p_size > sp->ds_size) {
			/* Too-large raw partitions are clamped, not fatal. */
			if (sname == NULL)
				return ("fixlabel: raw partition size > slice size");
			printf("%s: truncating raw partition\n", sname);
			pp->p_size = sp->ds_size;
		}
	}
	end = start + sp->ds_size;
	/* end < start here means start + size wrapped around. */
	if (start > end)
		return ("fixlabel: slice wraps");
	if (lp->d_secpercyl <= 0)
		return ("fixlabel: d_secpercyl <= 0");
	pp -= RAW_PART;
	warned = FALSE;
	for (part = 0; part < lp->d_npartitions; part++, pp++) {
		if (pp->p_offset != 0 || pp->p_size != 0) {
			if (pp->p_offset < start
			    || pp->p_offset + pp->p_size > end
			    || pp->p_offset + pp->p_size < pp->p_offset) {
				if (sname != NULL) {
					printf(
"%s: rejecting partition in BSD label: it isn't entirely within the slice\n",
					       sname);
					if (!warned) {
						slice_info(sname, sp);
						warned = TRUE;
					}
					partition_info(sname, part, pp);
				}
				/* XXX else silently discard junk. */
				bzero(pp, sizeof *pp);
			} else
				pp->p_offset += offset;
		}
	}
	/* Make the geometry summary fields match the slice. */
	lp->d_ncylinders = sp->ds_size / lp->d_secpercyl;
	lp->d_secperunit = sp->ds_size;
	lp->d_checksum = 0;
	lp->d_checksum = dkcksum(lp);
	return (NULL);
}

/* Log one partition's start/end/size (values are in sectors). */
static void
partition_info(sname, part, pp)
	char	*sname;
	int	part;
	struct partition *pp;
{
	printf("%s%c: start %lu, end %lu, size %lu\n", sname, 'a' + part,
	       pp->p_offset, pp->p_offset + pp->p_size - 1, pp->p_size);
}

/* Log a slice's start/end/size (values are in sectors). */
static void
slice_info(sname, sp)
	char	*sname;
	struct diskslice *sp;
{
	printf("%s: start %lu, end %lu, size %lu\n", sname,
	       sp->ds_offset, sp->ds_offset + sp->ds_size - 1, sp->ds_size);
}

/*
 * Most changes to ds_bad, ds_label and ds_wlabel are made using the
 * following functions to ensure coherency of the compatibility slice
 * with the first BSD slice.  The openmask fields are _not_ shared and
 * the other fields (ds_offset and ds_size) aren't changed after they
 * are initialized.
 */
static void
set_ds_bad(ssp, slice, btp)
	struct diskslices *ssp;
	int	slice;
	struct dkbad_intern *btp;
{
	ssp->dss_slices[slice].ds_bad = btp;
	if (slice == COMPATIBILITY_SLICE)
		ssp->dss_slices[ssp->dss_first_bsd_slice].ds_bad = btp;
	else if (slice == ssp->dss_first_bsd_slice)
		ssp->dss_slices[COMPATIBILITY_SLICE].ds_bad = btp;
}

static void
set_ds_label(ssp, slice, lp)
	struct diskslices *ssp;
	int	slice;
	struct disklabel *lp;
{
	ssp->dss_slices[slice].ds_label = lp;
	if (slice == COMPATIBILITY_SLICE)
		ssp->dss_slices[ssp->dss_first_bsd_slice].ds_label = lp;
	else if (slice == ssp->dss_first_bsd_slice)
		ssp->dss_slices[COMPATIBILITY_SLICE].ds_label = lp;
}

#ifdef DEVFS
/*
 * Create devfs entries for the partitions of the device's label, for
 * both the named slice and its compatibility / first-BSD alias.
 */
static void
set_ds_labeldevs(dname, dev, ssp)
	char	*dname;
	dev_t	dev;
	struct diskslices *ssp;
{
	int	slice;

	set_ds_labeldevs_unaliased(dname, dev, ssp);
	if (ssp->dss_first_bsd_slice == COMPATIBILITY_SLICE)
		return;
	slice = dkslice(dev);
	if (slice == COMPATIBILITY_SLICE)
		set_ds_labeldevs_unaliased(dname,
		    dkmodslice(dev, ssp->dss_first_bsd_slice), ssp);
	else if (slice == ssp->dss_first_bsd_slice)
		set_ds_labeldevs_unaliased(dname,
		    dkmodslice(dev, COMPATIBILITY_SLICE), ssp);
}

/*
 * Create devfs block and character entries for every nonempty
 * partition in one slice's label.  For the raw partition of a slice
 * that already has whole-slice devfs nodes, link to those instead of
 * creating new device entries.
 */
static void
set_ds_labeldevs_unaliased(dname, dev, ssp)
	char	*dname;
	dev_t	dev;
	struct diskslices *ssp;
{
	struct disklabel *lp;
	int	mynor;
	int	part;
	char	partname[2];
	struct partition *pp;
	int	slice;
	char	*sname;
	struct diskslice *sp;

	slice = dkslice(dev);
	sp = &ssp->dss_slices[slice];
	if (sp->ds_size == 0)
		return;
	lp = sp->ds_label;
	for (part = 0; part < lp->d_npartitions; part++) {
		pp = &lp->d_partitions[part];
		if (pp->p_size == 0)
			continue;
		sname = dsname(dname, dkunit(dev), slice, part, partname);
		if (part == RAW_PART && sp->ds_bdev != NULL) {
			sp->ds_bdevs[part] =
				devfs_link(sp->ds_bdev,
					   "%s%s", sname, partname);
			sp->ds_cdevs[part] =
				devfs_link(sp->ds_cdev,
					   "r%s%s", sname, partname);
		} else {
			mynor = minor(dkmodpart(dev, part));
			sp->ds_bdevs[part] =
				devfs_add_devswf(ssp->dss_bdevsw, mynor, DV_BLK,
						 UID_ROOT, GID_OPERATOR, 0640,
						 "%s%s", sname, partname);
			sp->ds_cdevs[part] =
				devfs_add_devswf(ssp->dss_cdevsw, mynor, DV_CHR,
						 UID_ROOT, GID_OPERATOR, 0640,
						 "r%s%s", sname, partname);
		}
	}
}
#endif /* DEVFS */

/*
 * Set the write-label flag for a slice, keeping the compatibility
 * slice and the first BSD slice coherent.
 */
static void
set_ds_wlabel(ssp, slice, wlabel)
	struct diskslices *ssp;
	int	slice;
	int	wlabel;
{
	ssp->dss_slices[slice].ds_wlabel = wlabel;
	if (slice == COMPATIBILITY_SLICE)
		ssp->dss_slices[ssp->dss_first_bsd_slice].ds_wlabel = wlabel;
	else if (slice == ssp->dss_first_bsd_slice)
		ssp->dss_slices[COMPATIBILITY_SLICE].ds_wlabel = wlabel;
}
diff --git a/sys/kern/subr_dkbad.c b/sys/kern/subr_dkbad.c
new file mode 100644
index 0000000..8fef863
--- /dev/null
+++ b/sys/kern/subr_dkbad.c
@@ -0,0 +1,159 @@
/*-
 * Copyright (c) 1994 Bruce D. Evans.
 * All rights reserved.
 *
 * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 *
 * from: @(#)wd.c	7.2 (Berkeley) 5/9/91
 * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $
 * from: @(#)ufs_disksubr.c	7.16 (Berkeley) 5/4/91
 * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $
 * $Id$
 */

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/disklabel.h>
#include <sys/dkbad.h>
#include <sys/malloc.h>

/*
 * Internalize the bad sector table: convert each (cyl, trk, sec)
 * entry to an absolute sector number and terminate the list with a
 * -1 sentinel.
 * TODO:
 *	o Fix types.
 *	  Type long should be daddr_t since we compare with blkno's.
 *	  Sentinel -1 should be ((daddr_t)-1).
 *	o Can remove explicit test for sentinel if it is a positive
 *	  (unsigned or not) value larger than all possible blkno's.
 *	o Check that the table is sorted.
 *	o Use faster searches.
 *	o Use the internal table in wddump().
 *	o Don't duplicate so much code.
 *	o Do all bad block handing in a driver-independent file.
 *	o Remove limit of 126 spare sectors.
 */
struct dkbad_intern *
internbad144(btp, lp)
	struct dkbad *btp;
	struct disklabel *lp;
{
	struct dkbad_intern *bip;
	int	i;

	bip = malloc(sizeof *bip, M_DEVBUF, M_WAITOK);
	/*
	 * Spare sectors are allocated beginning with the last sector of
	 * the second last track of the disk (the last track is used for
	 * the bad sector list).
	 */
	bip->bi_maxspare = lp->d_secperunit - lp->d_nsectors - 1;
	/*
	 * NOTE(review): bi_nbad is left at the maximum rather than the
	 * actual entry count; lookups rely on the -1 sentinel stored
	 * below — confirm nothing reads bi_nbad as a real count.
	 */
	bip->bi_nbad = DKBAD_MAXBAD;
	i = 0;
	for (; i < DKBAD_MAXBAD && btp->bt_bad[i].bt_cyl != DKBAD_NOCYL; i++)
		/* bt_trksec packs track in the high byte, sector in the low. */
		bip->bi_bad[i] = btp->bt_bad[i].bt_cyl * lp->d_secpercyl
				 + (btp->bt_bad[i].bt_trksec >> 8)
				   * lp->d_nsectors
				 + (btp->bt_bad[i].bt_trksec & 0x00ff);
	bip->bi_bad[i] = -1;
	return (bip);
}

/*
 * Read and validate the on-disk bad144 bad sector table into *bdp.
 * Candidate copies live at even sector offsets at the start of the
 * disk's last track; on an i/o error up to four alternates are tried.
 * Returns NULL on success, else a static error string.
 *
 * NOTE(review): the loop condition requires B_ERROR, so a copy that
 * reads successfully but fails validation ("corrupted") is not
 * retried at the next offset — confirm whether that is intended.
 */
char *
readbad144(dev, strat, lp, bdp)
	dev_t	dev;
	d_strategy_t *strat;
	struct disklabel *lp;
	struct dkbad *bdp;
{
	struct buf *bp;
	struct dkbad *db;
	int	i;
	char	*msg;

	bp = geteblk((int)lp->d_secsize);
	i = 0;
	do {
		/* Read a bad sector table. */
		bp->b_dev = dev;
		bp->b_blkno = lp->d_secperunit - lp->d_nsectors + i;
		/* Scale the sector number to DEV_BSIZE units. */
		if (lp->d_secsize > DEV_BSIZE)
			bp->b_blkno *= lp->d_secsize / DEV_BSIZE;
		else
			bp->b_blkno /= DEV_BSIZE / lp->d_secsize;
		bp->b_bcount = lp->d_secsize;
		bp->b_flags = B_BUSY | B_READ;
		(*strat)(bp);

		/* If successful, validate, otherwise try another. */
		if (biowait(bp) == 0) {
			db = (struct dkbad *)(bp->b_un.b_addr);
			if (db->bt_mbz == 0 && db->bt_flag == DKBAD_MAGIC) {
				msg = NULL;
				*bdp = *db;
				break;
			}
			msg = "bad sector table corrupted";
		} else
			msg = "bad sector table I/O error";
	} while ((bp->b_flags & B_ERROR) && (i += 2) < 10 &&
		 i < lp->d_nsectors);
	bp->b_flags |= B_INVAL | B_AGE;
	brelse(bp);
	return (msg);
}

/*
 * Map a block number through the internalized bad sector table:
 * return the replacement (spare) sector if blkno is listed as bad,
 * otherwise return blkno unchanged.
 */
daddr_t
transbad144(bip, blkno)
	struct dkbad_intern *bip;
	daddr_t	blkno;
{
	int	i;

	/*
	 * List is sorted, so the search can terminate when it is past our
	 * sector.
	 */
	for (i = 0; bip->bi_bad[i] != -1 && bip->bi_bad[i] <= blkno; i++)
		if (bip->bi_bad[i] == blkno)
			/*
			 * Spare sectors are allocated in decreasing order.
			 */
			return (bip->bi_maxspare - i);
	return (blkno);
}
diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c
index 792a1ce..1418709 100644
--- a/sys/kern/subr_log.c
+++ b/sys/kern/subr_log.c
@@ -30,7 +30,8 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* - * @(#)subr_log.c 8.3 (Berkeley) 2/14/95 + * @(#)subr_log.c 8.1 (Berkeley) 6/10/93 + * $Id: subr_log.c,v 1.21 1997/03/23 03:36:22 bde Exp $ */ /* @@ -39,18 +40,37 @@ #include <sys/param.h> #include <sys/systm.h> +#include <sys/conf.h> #include <sys/proc.h> #include <sys/vnode.h> -#include <sys/ioctl.h> +#include <sys/filio.h> +#include <sys/ttycom.h> #include <sys/msgbuf.h> -#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ #define LOG_RDPRI (PZERO + 1) #define LOG_ASYNC 0x04 #define LOG_RDWAIT 0x08 -struct logsoftc { +static d_open_t logopen; +static d_close_t logclose; +static d_read_t logread; +static d_ioctl_t logioctl; +static d_select_t logselect; + +#define CDEV_MAJOR 7 +static struct cdevsw log_cdevsw = + { logopen, logclose, logread, nowrite, /*7*/ + logioctl, nostop, nullreset, nodevtotty,/* klog */ + logselect, nommap, NULL, "log", NULL, -1 }; + +static struct logsoftc { int sc_state; /* see above for possibilities */ struct selinfo sc_selp; /* process waiting on select call */ int sc_pgid; /* process/group for async I/O */ @@ -59,36 +79,21 @@ struct logsoftc { int log_open; /* also used in log() */ /*ARGSUSED*/ -int +static int logopen(dev, flags, mode, p) dev_t dev; int flags, mode; struct proc *p; { - register struct msgbuf *mbp = msgbufp; - if (log_open) return (EBUSY); log_open = 1; logsoftc.sc_pgid = p->p_pid; /* signal process only */ - /* - * Potential race here with putchar() but since putchar should be - * called by autoconf, msg_magic should be initialized by the time - * we get here. 
- */ - if (mbp->msg_magic != MSG_MAGIC) { - register int i; - - mbp->msg_magic = MSG_MAGIC; - mbp->msg_bufx = mbp->msg_bufr = 0; - for (i=0; i < MSG_BSIZE; i++) - mbp->msg_bufc[i] = 0; - } return (0); } /*ARGSUSED*/ -int +static int logclose(dev, flag, mode, p) dev_t dev; int flag, mode; @@ -101,7 +106,7 @@ logclose(dev, flag, mode, p) } /*ARGSUSED*/ -int +static int logread(dev, uio, flag) dev_t dev; struct uio *uio; @@ -119,8 +124,8 @@ logread(dev, uio, flag) return (EWOULDBLOCK); } logsoftc.sc_state |= LOG_RDWAIT; - if (error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH, - "klog", 0)) { + if ((error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH, + "klog", 0))) { splx(s); return (error); } @@ -140,14 +145,14 @@ logread(dev, uio, flag) if (error) break; mbp->msg_bufr += l; - if (mbp->msg_bufr < 0 || mbp->msg_bufr >= MSG_BSIZE) + if (mbp->msg_bufr >= MSG_BSIZE) mbp->msg_bufr = 0; } return (error); } /*ARGSUSED*/ -int +static int logselect(dev, rw, p) dev_t dev; int rw; @@ -179,8 +184,8 @@ logwakeup() selwakeup(&logsoftc.sc_selp); if (logsoftc.sc_state & LOG_ASYNC) { if (logsoftc.sc_pgid < 0) - gsignal(-logsoftc.sc_pgid, SIGIO); - else if (p = pfind(logsoftc.sc_pgid)) + gsignal(-logsoftc.sc_pgid, SIGIO); + else if ((p = pfind(logsoftc.sc_pgid))) psignal(p, SIGIO); } if (logsoftc.sc_state & LOG_RDWAIT) { @@ -190,10 +195,10 @@ logwakeup() } /*ARGSUSED*/ -int +static int logioctl(dev, com, data, flag, p) dev_t dev; - u_long com; + int com; caddr_t data; int flag; struct proc *p; @@ -232,7 +237,33 @@ logioctl(dev, com, data, flag, p) break; default: - return (-1); + return (ENOTTY); } return (0); } + +static log_devsw_installed = 0; +#ifdef DEVFS +static void *log_devfs_token; +#endif + +static void +log_drvinit(void *unused) +{ + dev_t dev; + + if( ! 
log_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&log_cdevsw,NULL); + log_devsw_installed = 1; +#ifdef DEVFS + log_devfs_token = devfs_add_devswf(&log_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0600, + "klog"); +#endif + } +} + +SYSINIT(logdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,log_drvinit,NULL) + + diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c new file mode 100644 index 0000000..f7d41bf --- /dev/null +++ b/sys/kern/subr_param.c @@ -0,0 +1,186 @@ +/* + * Copyright (c) 1980, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)param.c 8.3 (Berkeley) 8/20/94 + * $Id$ + */ + +#include "opt_sysvipc.h" +#include "opt_param.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/socket.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/callout.h> +#include <sys/clist.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> + +#include <ufs/ufs/quota.h> + +#ifdef SYSVSHM +#include <machine/vmparam.h> +#include <sys/shm.h> +#endif +#ifdef SYSVSEM +#include <sys/sem.h> +#endif +#ifdef SYSVMSG +#include <sys/msg.h> +#endif + +/* + * System parameter formulae. + * + * This file is copied into each directory where we compile + * the kernel; it should be modified there to suit local taste + * if necessary. 
+ * + * Compiled with -DMAXUSERS=xx + */ + +#ifndef HZ +#define HZ 100 +#endif +int hz = HZ; +int tick = 1000000 / HZ; +int tickadj = 30000 / (60 * HZ); /* can adjust 30ms in 60s */ +#define NPROC (20 + 16 * MAXUSERS) +int maxproc = NPROC; /* maximum # of processes */ +int maxprocperuid = NPROC-1; /* maximum # of processes per user */ +int maxfiles = NPROC*2; /* system wide open files limit */ +int maxfilesperproc = NPROC*2; /* per-process open files limit */ +int ncallout = 16 + NPROC; /* maximum # of timer events */ + +/* maximum # of mbuf clusters */ +#ifndef NMBCLUSTERS +#define NMBCLUSTERS (512 + MAXUSERS * 16) +#endif +int nmbclusters = NMBCLUSTERS; + +/* allocate 1/4th amount of virtual address space for mbufs XXX */ +int nmbufs = NMBCLUSTERS * 4; + +int fscale = FSCALE; /* kernel uses `FSCALE', user uses `fscale' */ + +/* + * Values in support of System V compatible shared memory. XXX + */ +#ifdef SYSVSHM +#ifndef SHMMAX +#define SHMMAX (SHMMAXPGS*PAGE_SIZE) +#endif +#ifndef SHMMIN +#define SHMMIN 1 +#endif +#ifndef SHMMNI +#define SHMMNI 32 /* <= SHMMMNI in shm.h */ +#endif +#ifndef SHMSEG +#define SHMSEG 8 +#endif +#ifndef SHMALL +#define SHMALL (SHMMAXPGS) +#endif + +struct shminfo shminfo = { + SHMMAX, + SHMMIN, + SHMMNI, + SHMSEG, + SHMALL +}; +#endif + +/* + * Values in support of System V compatible semaphores. + */ + +#ifdef SYSVSEM + +struct seminfo seminfo = { + SEMMAP, /* # of entries in semaphore map */ + SEMMNI, /* # of semaphore identifiers */ + SEMMNS, /* # of semaphores in system */ + SEMMNU, /* # of undo structures in system */ + SEMMSL, /* max # of semaphores per id */ + SEMOPM, /* max # of operations per semop call */ + SEMUME, /* max # of undo entries per process */ + SEMUSZ, /* size in bytes of undo structure */ + SEMVMX, /* semaphore maximum value */ + SEMAEM /* adjust on exit max value */ +}; +#endif + +/* + * Values in support of System V compatible messages. 
+ */ + +#ifdef SYSVMSG + +struct msginfo msginfo = { + MSGMAX, /* max chars in a message */ + MSGMNI, /* # of message queue identifiers */ + MSGMNB, /* max chars in a queue */ + MSGTQL, /* max messages in system */ + MSGSSZ, /* size of a message segment */ + /* (must be small power of 2 greater than 4) */ + MSGSEG /* number of message segments */ +}; +#endif + +/* + * These may be set to nonzero here or by patching. + * If they are nonzero at bootstrap time then they are + * initialized to values dependent on the memory size. + */ +#ifdef NBUF +int nbuf = NBUF; +#else +int nbuf = 0; +#endif +int nswbuf = 0; + +/* + * These have to be allocated somewhere; allocating + * them here forces loader errors if this file is omitted + * (if they've been externed everywhere else; hah!). + */ +struct buf *swbuf; diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c index 8a9a44e..4b3ed36 100644 --- a/sys/kern/subr_prf.c +++ b/sys/kern/subr_prf.c @@ -35,23 +35,21 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)subr_prf.c 8.4 (Berkeley) 5/4/95 + * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94 + * $Id$ */ +#include "opt_ddb.h" + #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> -#include <sys/conf.h> -#include <sys/reboot.h> #include <sys/msgbuf.h> #include <sys/proc.h> -#include <sys/ioctl.h> -#include <sys/vnode.h> -#include <sys/file.h> #include <sys/tty.h> #include <sys/tprintf.h> #include <sys/syslog.h> #include <sys/malloc.h> +#include <machine/cons.h> /* * Note that stdarg.h and the ANSI style va_start macro is used for both @@ -59,71 +57,20 @@ */ #include <machine/stdarg.h> -#ifdef KADB -#include <machine/kdbparam.h> -#endif - #define TOCONS 0x01 #define TOTTY 0x02 #define TOLOG 0x04 struct tty *constty; /* pointer to console "window" tty */ -extern cnputc(); /* standard console putc */ -int (*v_putc)() = cnputc; /* routine to putc on virtual console */ - -void logpri __P((int level)); -static void putchar __P((int ch, int flags, struct tty *tp)); +static void (*v_putc)(int) = cnputc; /* routine to putc on virtual console */ +static void logpri __P((int level)); +static void msglogchar(int c, void *dummyarg); +struct putchar_arg {int flags; struct tty *tty; }; +static void putchar __P((int ch, void *arg)); static char *ksprintn __P((u_long num, int base, int *len)); -void kprintf __P((const char *fmt, int flags, struct tty *tp, va_list ap)); - -int consintr = 1; /* Ok to handle console interrupts? */ -/* - * Variable panicstr contains argument to first call to panic; used as flag - * to indicate that the kernel has already called panic. - */ -const char *panicstr; - -/* - * Panic is called on unresolvable fatal errors. It prints "panic: mesg", - * and then reboots. If we are called twice, then we avoid trying to sync - * the disks as this often leads to recursive panics. 
- */ -#ifdef __GNUC__ -volatile void boot(int flags); /* boot() does not return */ -volatile /* panic() does not return */ -#endif -void -#ifdef __STDC__ -panic(const char *fmt, ...) -#else -panic(fmt, va_alist) - char *fmt; -#endif -{ - int bootopt; - va_list ap; - - bootopt = RB_AUTOBOOT | RB_DUMP; - if (panicstr) - bootopt |= RB_NOSYNC; - else - panicstr = fmt; - - va_start(ap, fmt); - printf("panic: %r\n", fmt, ap); - va_end(ap); - -#ifdef KGDB - kgdb_panic(); -#endif -#ifdef KADB - if (boothowto & RB_KDB) - kdbpanic(); -#endif - boot(bootopt); -} +static int consintr = 1; /* Ok to handle console interrupts? */ /* * Warn that a system table is full. @@ -142,19 +89,17 @@ tablefull(tab) * the queue does not clear in a reasonable time. */ void -#ifdef __STDC__ uprintf(const char *fmt, ...) -#else -uprintf(fmt, va_alist) - char *fmt; -#endif { - register struct proc *p = curproc; + struct proc *p = curproc; va_list ap; + struct putchar_arg pca; if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { va_start(ap, fmt); - kprintf(fmt, TOTTY, p->p_session->s_ttyp, ap); + pca.tty = p->p_session->s_ttyp; + pca.flags = TOTTY; + kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); } } @@ -185,18 +130,13 @@ tprintf_close(sess) * with the given session. */ void -#ifdef __STDC__ tprintf(tpr_t tpr, const char *fmt, ...) -#else -tprintf(tpr, fmt, va_alist) - tpr_t tpr; - char *fmt; -#endif { register struct session *sess = (struct session *)tpr; struct tty *tp = NULL; int flags = TOLOG; va_list ap; + struct putchar_arg pca; logpri(LOG_INFO); if (sess && sess->s_ttyvp && ttycheckoutq(sess->s_ttyp, 0)) { @@ -204,7 +144,9 @@ tprintf(tpr, fmt, va_alist) tp = sess->s_ttyp; } va_start(ap, fmt); - kprintf(fmt, flags, tp, ap); + pca.tty = tp; + pca.flags = flags; + kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); logwakeup(); } @@ -215,18 +157,14 @@ tprintf(tpr, fmt, va_alist) * be revoke(2)'d away. Other callers should use tprintf. 
*/ void -#ifdef __STDC__ ttyprintf(struct tty *tp, const char *fmt, ...) -#else -ttyprintf(tp, fmt, va_alist) - struct tty *tp; - char *fmt; -#endif { va_list ap; - + struct putchar_arg pca; va_start(ap, fmt); - kprintf(fmt, TOTTY, tp, ap); + pca.tty = tp; + pca.flags = TOTTY; + kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); } @@ -238,13 +176,7 @@ extern int log_open; * log yet, it writes to the console also. */ void -#ifdef __STDC__ log(int level, const char *fmt, ...) -#else -log(level, fmt, va_alist) - int level; - char *fmt; -#endif { register int s; va_list ap; @@ -252,73 +184,157 @@ log(level, fmt, va_alist) s = splhigh(); logpri(level); va_start(ap, fmt); - kprintf(fmt, TOLOG, NULL, ap); - splx(s); + + kvprintf(fmt, msglogchar, NULL, 10, ap); va_end(ap); + + splx(s); if (!log_open) { + struct putchar_arg pca; va_start(ap, fmt); - kprintf(fmt, TOCONS, NULL, ap); + pca.tty = NULL; + pca.flags = TOCONS; + kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); } logwakeup(); } -void +static void logpri(level) int level; { - register int ch; register char *p; - putchar('<', TOLOG, NULL); - for (p = ksprintn((u_long)level, 10, NULL); ch = *p--;) - putchar(ch, TOLOG, NULL); - putchar('>', TOLOG, NULL); + msglogchar('<', NULL); + for (p = ksprintn((u_long)level, 10, NULL); *p;) + msglogchar(*p--, NULL); + msglogchar('>', NULL); } -void -#ifdef __STDC__ +int addlog(const char *fmt, ...) -#else -addlog(fmt, va_alist) - char *fmt; -#endif { register int s; va_list ap; + int retval; s = splhigh(); va_start(ap, fmt); - kprintf(fmt, TOLOG, NULL, ap); + retval = kvprintf(fmt, msglogchar, NULL, 10, ap); splx(s); va_end(ap); if (!log_open) { + struct putchar_arg pca; va_start(ap, fmt); - kprintf(fmt, TOCONS, NULL, ap); + pca.tty = NULL; + pca.flags = TOCONS; + kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); } logwakeup(); + return (retval); } -void -#ifdef __STDC__ +int printf(const char *fmt, ...) 
-#else -printf(fmt, va_alist) - char *fmt; -#endif { va_list ap; register int savintr; + struct putchar_arg pca; + int retval; savintr = consintr; /* disable interrupts */ consintr = 0; va_start(ap, fmt); - kprintf(fmt, TOCONS | TOLOG, NULL, ap); + pca.tty = NULL; + pca.flags = TOCONS | TOLOG; + retval = kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); if (!panicstr) logwakeup(); consintr = savintr; /* reenable interrupts */ + return retval; +} + +void +vprintf(const char *fmt, va_list ap) +{ + register int savintr; + struct putchar_arg pca; + + savintr = consintr; /* disable interrupts */ + consintr = 0; + pca.tty = NULL; + pca.flags = TOCONS | TOLOG; + kvprintf(fmt, putchar, &pca, 10, ap); + if (!panicstr) + logwakeup(); + consintr = savintr; /* reenable interrupts */ +} + +/* + * Print a character on console or users terminal. If destination is + * the console then the last MSGBUFS characters are saved in msgbuf for + * inspection later. + */ +static void +putchar(int c, void *arg) +{ + struct putchar_arg *ap = (struct putchar_arg*) arg; + int flags = ap->flags; + struct tty *tp = ap->tty; + if (panicstr) + constty = NULL; + if ((flags & TOCONS) && tp == NULL && constty) { + tp = constty; + flags |= TOTTY; + } + if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 && + (flags & TOCONS) && tp == constty) + constty = NULL; + if ((flags & TOLOG)) + msglogchar(c, NULL); + if ((flags & TOCONS) && constty == NULL && c != '\0') + (*v_putc)(c); +} + +/* + * Scaled down version of sprintf(3). + */ +int +sprintf(char *buf, const char *cfmt, ...) +{ + int retval; + va_list ap; + + va_start(ap, cfmt); + retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); + buf[retval] = '\0'; + va_end(ap); + return retval; +} + +/* + * Put a number (base <= 16) in a buffer in reverse order; return an + * optional length and a pointer to the NULL terminated (preceded?) + * buffer. 
+ */ +static char * +ksprintn(ul, base, lenp) + register u_long ul; + register int base, *lenp; +{ /* A long in base 8, plus NULL. */ + static char buf[sizeof(long) * NBBY / 3 + 2]; + register char *p; + + p = buf; + do { + *++p = hex2ascii(ul % base); + } while (ul /= base); + if (lenp) + *lenp = p - buf; + return (p); } /* @@ -337,110 +353,178 @@ printf(fmt, va_alist) * the next characters (up to a control character, i.e. a character <= 32), * give the name of the register. Thus: * - * kprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n"); + * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n"); * * would produce output: * * reg=3<BITTWO,BITONE> * - * The format %r passes an additional format string and argument list - * recursively. Its usage is: - * - * fn(char *fmt, ...) - * { - * va_list ap; - * va_start(ap, fmt); - * printf("prefix: %r: suffix\n", fmt, ap); - * va_end(ap); - * } - * - * Space or zero padding and a field width are supported for the numeric - * formats only. + * XXX: %D -- Hexdump, takes pointer and separator string: + * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX + * ("%*D", len, ptr, " " -> XX XX XX XX ... 
*/ -void -kprintf(fmt, flags, tp, ap) - register const char *fmt; - int flags; - struct tty *tp; - va_list ap; +int +kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap) { - register char *p, *q; - register int ch, n; +#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; } + char *p, *q, *d; + u_char *up; + int ch, n; u_long ul; - int base, lflag, tmp, width; + int base, lflag, tmp, width, ladjust, sharpflag, neg, sign, dot; + int dwidth; char padc; + int retval = 0; + + if (!func) + d = (char *) arg; + else + d = NULL; + + if (fmt == NULL) + fmt = "(fmt null)\n"; + + if (radix < 2 || radix > 36) + radix = 10; for (;;) { padc = ' '; width = 0; - while ((ch = *(u_char *)fmt++) != '%') { - if (ch == '\0') - return; - putchar(ch, flags, tp); + while ((ch = (u_char)*fmt++) != '%') { + if (ch == '\0') + return retval; + PCHAR(ch); } - lflag = 0; -reswitch: switch (ch = *(u_char *)fmt++) { - case '0': - padc = '0'; + lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; + sign = 0; dot = 0; dwidth = 0; +reswitch: switch (ch = (u_char)*fmt++) { + case '.': + dot = 1; goto reswitch; - case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - for (width = 0;; ++fmt) { - width = width * 10 + ch - '0'; - ch = *fmt; - if (ch < '0' || ch > '9') - break; + case '#': + sharpflag = 1; + goto reswitch; + case '+': + sign = 1; + goto reswitch; + case '-': + ladjust = 1; + goto reswitch; + case '%': + PCHAR(ch); + break; + case '*': + if (!dot) { + width = va_arg(ap, int); + if (width < 0) { + ladjust = !ladjust; + width = -width; + } + } else { + dwidth = va_arg(ap, int); } goto reswitch; - case 'l': - lflag = 1; + case '0': + if (!dot) { + padc = '0'; + goto reswitch; + } + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + for (n = 0;; ++fmt) { + n = n * 10 + ch - '0'; + ch = *fmt; + if (ch < '0' || ch > '9') + break; + } + if (dot) + dwidth = n; 
+ else + width = n; goto reswitch; case 'b': ul = va_arg(ap, int); p = va_arg(ap, char *); - for (q = ksprintn(ul, *p++, NULL); ch = *q--;) - putchar(ch, flags, tp); + for (q = ksprintn(ul, *p++, NULL); *q;) + PCHAR(*q--); if (!ul) break; - for (tmp = 0; n = *p++;) { + for (tmp = 0; *p;) { + n = *p++; if (ul & (1 << (n - 1))) { - putchar(tmp ? ',' : '<', flags, tp); + PCHAR(tmp ? ',' : '<'); for (; (n = *p) > ' '; ++p) - putchar(n, flags, tp); + PCHAR(n); tmp = 1; } else for (; *p > ' '; ++p) continue; } if (tmp) - putchar('>', flags, tp); + PCHAR('>'); break; case 'c': - putchar(va_arg(ap, int), flags, tp); - break; - case 'r': - p = va_arg(ap, char *); - kprintf(p, flags, tp, va_arg(ap, va_list)); + PCHAR(va_arg(ap, int)); break; - case 's': + case 'D': + up = va_arg(ap, u_char *); p = va_arg(ap, char *); - while (ch = *p++) - putchar(ch, flags, tp); + if (!width) + width = 16; + while(width--) { + PCHAR(hex2ascii(*up >> 4)); + PCHAR(hex2ascii(*up & 0x0f)); + up++; + if (width) + for (q=p;*q;q++) + PCHAR(*q); + } break; case 'd': ul = lflag ? va_arg(ap, long) : va_arg(ap, int); - if ((long)ul < 0) { - putchar('-', flags, tp); - ul = -(long)ul; - } + sign = 1; base = 10; goto number; + case 'l': + lflag = 1; + goto reswitch; + case 'n': + ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); + base = radix; + goto number; case 'o': ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); base = 8; goto number; + case 'p': + ul = (u_long)va_arg(ap, void *); + base = 16; + PCHAR('0'); + PCHAR('x'); + goto number; + case 's': + p = va_arg(ap, char *); + if (p == NULL) + p = "(null)"; + if (!dot) + n = strlen (p); + else + for (n = 0; n < dwidth && p[n]; n++) + continue; + + width -= n; + + if (!ladjust && width > 0) + while (width--) + PCHAR(padc); + while (n--) + PCHAR(*p++); + if (ladjust && width > 0) + while (width--) + PCHAR(padc); + break; case 'u': ul = lflag ? 
va_arg(ap, u_long) : va_arg(ap, u_int); base = 10; @@ -448,56 +532,71 @@ reswitch: switch (ch = *(u_char *)fmt++) { case 'x': ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); base = 16; -number: p = ksprintn(ul, base, &tmp); - if (width && (width -= tmp) > 0) +number: if (sign && (long)ul < 0L) { + neg = 1; + ul = -(long)ul; + } + p = ksprintn(ul, base, &tmp); + if (sharpflag && ul != 0) { + if (base == 8) + tmp++; + else if (base == 16) + tmp += 2; + } + if (neg) + tmp++; + + if (!ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + if (neg) + PCHAR('-'); + if (sharpflag && ul != 0) { + if (base == 8) { + PCHAR('0'); + } else if (base == 16) { + PCHAR('0'); + PCHAR('x'); + } + } + + while (*p) + PCHAR(*p--); + + if (ladjust && width && (width -= tmp) > 0) while (width--) - putchar(padc, flags, tp); - while (ch = *p--) - putchar(ch, flags, tp); + PCHAR(padc); + break; default: - putchar('%', flags, tp); + PCHAR('%'); if (lflag) - putchar('l', flags, tp); - /* FALLTHROUGH */ - case '%': - putchar(ch, flags, tp); + PCHAR('l'); + PCHAR(ch); + break; } } +#undef PCHAR } /* - * Print a character on console or users terminal. If destination is - * the console then the last MSGBUFS characters are saved in msgbuf for - * inspection later. + * Put character in log buffer. 
*/ static void -putchar(c, flags, tp) - register int c; - int flags; - struct tty *tp; +msglogchar(int c, void *dummyarg) { - extern int msgbufmapped; - register struct msgbuf *mbp; + struct msgbuf *mbp; - if (panicstr) - constty = NULL; - if ((flags & TOCONS) && tp == NULL && constty) { - tp = constty; - flags |= TOTTY; - } - if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 && - (flags & TOCONS) && tp == constty) - constty = NULL; - if ((flags & TOLOG) && - c != '\0' && c != '\r' && c != 0177 && msgbufmapped) { + if (c != '\0' && c != '\r' && c != 0177 && msgbufmapped) { mbp = msgbufp; - if (mbp->msg_magic != MSG_MAGIC) { - bzero((caddr_t)mbp, sizeof(*mbp)); + if (mbp->msg_magic != MSG_MAGIC || + mbp->msg_bufx >= MSG_BSIZE || + mbp->msg_bufr >= MSG_BSIZE) { + bzero(mbp, sizeof(struct msgbuf)); mbp->msg_magic = MSG_MAGIC; } mbp->msg_bufc[mbp->msg_bufx++] = c; - if (mbp->msg_bufx < 0 || mbp->msg_bufx >= MSG_BSIZE) + if (mbp->msg_bufx >= MSG_BSIZE) mbp->msg_bufx = 0; /* If the buffer is full, keep the most recent data. */ if (mbp->msg_bufr == mbp->msg_bufx) { @@ -505,102 +604,4 @@ putchar(c, flags, tp) mbp->msg_bufr = 0; } } - if ((flags & TOCONS) && constty == NULL && c != '\0') - (*v_putc)(c); -} - -/* - * Scaled down version of sprintf(3). - */ -#ifdef __STDC__ -sprintf(char *buf, const char *cfmt, ...) -#else -sprintf(buf, cfmt, va_alist) - char *buf, *cfmt; -#endif -{ - register const char *fmt = cfmt; - register char *p, *bp; - register int ch, base; - u_long ul; - int lflag; - va_list ap; - - va_start(ap, cfmt); - for (bp = buf; ; ) { - while ((ch = *(u_char *)fmt++) != '%') - if ((*bp++ = ch) == '\0') - return ((bp - buf) - 1); - - lflag = 0; -reswitch: switch (ch = *(u_char *)fmt++) { - case 'l': - lflag = 1; - goto reswitch; - case 'c': - *bp++ = va_arg(ap, int); - break; - case 's': - p = va_arg(ap, char *); - while (*bp++ = *p++) - continue; - --bp; - break; - case 'd': - ul = lflag ? 
va_arg(ap, long) : va_arg(ap, int); - if ((long)ul < 0) { - *bp++ = '-'; - ul = -(long)ul; - } - base = 10; - goto number; - break; - case 'o': - ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); - base = 8; - goto number; - break; - case 'u': - ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); - base = 10; - goto number; - break; - case 'x': - ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); - base = 16; -number: for (p = ksprintn(ul, base, NULL); ch = *p--;) - *bp++ = ch; - break; - default: - *bp++ = '%'; - if (lflag) - *bp++ = 'l'; - /* FALLTHROUGH */ - case '%': - *bp++ = ch; - } - } - va_end(ap); -} - -/* - * Put a number (base <= 16) in a buffer in reverse order; return an - * optional length and a pointer to the NULL terminated (preceded?) - * buffer. - */ -static char * -ksprintn(ul, base, lenp) - register u_long ul; - register int base, *lenp; -{ /* A long in base 8, plus NULL. */ - static char buf[sizeof(long) * NBBY / 3 + 2]; - register char *p; - - p = buf; - do { - *++p = "0123456789abcdef"[ul % base]; - } while (ul /= base); - if (lenp) - *lenp = p - buf; - return (p); } diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c index 237553d..08ba35f 100644 --- a/sys/kern/subr_prof.c +++ b/sys/kern/subr_prof.c @@ -30,17 +30,17 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)subr_prof.c 8.4 (Berkeley) 2/14/95 + * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93 + * $Id$ */ #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/kernel.h> #include <sys/proc.h> -#include <sys/user.h> - -#include <sys/mount.h> -#include <sys/syscallargs.h> +#include <sys/resourcevar.h> +#include <sys/sysctl.h> #include <machine/cpu.h> @@ -48,26 +48,57 @@ #include <sys/malloc.h> #include <sys/gmon.h> -/* - * Froms is actually a bunch of unsigned shorts indexing tos - */ +static void kmstartup __P((void *)); +SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL) + struct gmonparam _gmonparam = { GMON_PROF_OFF }; +extern char btext[]; extern char etext[]; +#ifdef GUPROF +void +nullfunc_loop_profiled() +{ + int i; + + for (i = 0; i < CALIB_SCALE; i++) + nullfunc_profiled(); +} + +#define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */ + void -kmstartup() +nullfunc_profiled() +{ +} +#endif /* GUPROF */ + +static void +kmstartup(dummy) + void *dummy; { char *cp; struct gmonparam *p = &_gmonparam; +#ifdef GUPROF + int cputime_overhead; + int empty_loop_time; + int i; + int mcount_overhead; + int mexitcount_overhead; + int nullfunc_loop_overhead; + int nullfunc_loop_profiled_time; + fptrint_t tmp_addr; +#endif + /* * Round lowpc and highpc to multiples of the density we're using * so the rest of the scaling (here and in gprof) stays in ints. 
*/ - p->lowpc = ROUNDDOWN(KERNBASE, HISTFRACTION * sizeof(HISTCOUNTER)); + p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER)); p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER)); p->textsize = p->highpc - p->lowpc; - printf("Profiling kernel, textsize=%d [%x..%x]\n", + printf("Profiling kernel, textsize=%lu [%x..%x]\n", p->textsize, p->lowpc, p->highpc); p->kcountsize = p->textsize / HISTFRACTION; p->hashfraction = HASHFRACTION; @@ -87,25 +118,168 @@ kmstartup() bzero(cp, p->kcountsize + p->tossize + p->fromssize); p->tos = (struct tostruct *)cp; cp += p->tossize; - p->kcount = (u_short *)cp; + p->kcount = (HISTCOUNTER *)cp; cp += p->kcountsize; p->froms = (u_short *)cp; + +#ifdef GUPROF + /* Initialize pointers to overhead counters. */ + p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime)); + p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount)); + p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount)); + + /* + * Disable interrupts to avoid interference while we calibrate + * things. + */ + disable_intr(); + + /* + * Determine overheads. + * XXX this needs to be repeated for each useful timer/counter. + */ + cputime_overhead = 0; + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) + cputime_overhead += cputime(); + + empty_loop(); + startguprof(p); + empty_loop(); + empty_loop_time = cputime(); + + nullfunc_loop_profiled(); + + /* + * Start profiling. There won't be any normal function calls since + * interrupts are disabled, but we will call the profiling routines + * directly to determine their overheads. 
+ */ + p->state = GMON_PROF_HIRES; + + startguprof(p); + nullfunc_loop_profiled(); + + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) +#if defined(i386) && __GNUC__ >= 2 + asm("pushl %0; call __mcount; popl %%ecx" + : + : "i" (profil) + : "ax", "bx", "cx", "dx", "memory"); +#else +#error +#endif + mcount_overhead = KCOUNT(p, PC_TO_I(p, profil)); + + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) +#if defined(i386) && __GNUC__ >= 2 + asm("call mexitcount; 1:" + : : : "ax", "bx", "cx", "dx", "memory"); + asm("movl $1b,%0" : "=rm" (tmp_addr)); +#else +#error +#endif + mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr)); + + p->state = GMON_PROF_OFF; + stopguprof(p); + + enable_intr(); + + nullfunc_loop_profiled_time = 0; + for (tmp_addr = (fptrint_t)nullfunc_loop_profiled; + tmp_addr < (fptrint_t)nullfunc_loop_profiled_end; + tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER)) + nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr)); +#define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE) +#define c2n(count, freq) ((int)((count) * 1000000000LL / freq)) + printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n", + CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)), + CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)), + CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)), + CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)), + CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate))); + cputime_overhead -= empty_loop_time; + mcount_overhead -= empty_loop_time; + mexitcount_overhead -= empty_loop_time; + + /*- + * Profiling overheads are determined by the times between the + * following events: + * MC1: mcount() is called + * MC2: cputime() (called from mcount()) latches the timer + * MC3: mcount() completes + * ME1: mexitcount() is called + * ME2: cputime() (called from mexitcount()) latches the timer + * ME3: mexitcount() completes. 
+ * The times between the events vary slightly depending on instruction + * combination and cache misses, etc. Attempt to determine the + * minimum times. These can be subtracted from the profiling times + * without much risk of reducing the profiling times below what they + * would be when profiling is not configured. Abbreviate: + * ab = minimum time between MC1 and MC3 + * a = minumum time between MC1 and MC2 + * b = minimum time between MC2 and MC3 + * cd = minimum time between ME1 and ME3 + * c = minimum time between ME1 and ME2 + * d = minimum time between ME2 and ME3. + * These satisfy the relations: + * ab <= mcount_overhead (just measured) + * a + b <= ab + * cd <= mexitcount_overhead (just measured) + * c + d <= cd + * a + d <= nullfunc_loop_profiled_time (just measured) + * a >= 0, b >= 0, c >= 0, d >= 0. + * Assume that ab and cd are equal to the minimums. + */ + p->cputime_overhead = CALIB_DOSCALE(cputime_overhead); + p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead); + p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead + - cputime_overhead); + nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time; + p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead + - nullfunc_loop_overhead) + / 4); + p->mexitcount_pre_overhead = p->mexitcount_overhead + + p->cputime_overhead + - p->mexitcount_post_overhead; + p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead) + - p->mexitcount_post_overhead; + p->mcount_post_overhead = p->mcount_overhead + + p->cputime_overhead + - p->mcount_pre_overhead; + printf( +"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n", + c2n(p->cputime_overhead, p->profrate), + c2n(p->mcount_overhead, p->profrate), + c2n(p->mcount_pre_overhead, p->profrate), + c2n(p->mcount_post_overhead, p->profrate), + c2n(p->cputime_overhead, p->profrate), + c2n(p->mexitcount_overhead, p->profrate), + c2n(p->mexitcount_pre_overhead, p->profrate), + 
c2n(p->mexitcount_post_overhead, p->profrate)); + printf( +"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n", + p->cputime_overhead, p->mcount_overhead, + p->mcount_pre_overhead, p->mcount_post_overhead, + p->cputime_overhead, p->mexitcount_overhead, + p->mexitcount_pre_overhead, p->mexitcount_post_overhead); +#endif /* GUPROF */ } /* * Return kernel profiling information. */ -int -sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; +static int +sysctl_kern_prof SYSCTL_HANDLER_ARGS { + int *name = (int *) arg1; + u_int namelen = arg2; struct gmonparam *gp = &_gmonparam; int error; + int state; /* all sysctl names at this level are terminal */ if (namelen != 1) @@ -113,30 +287,50 @@ sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen, p) switch (name[0]) { case GPROF_STATE: - error = sysctl_int(oldp, oldlenp, newp, newlen, &gp->state); + state = gp->state; + error = sysctl_handle_int(oidp, &state, 0, req); if (error) return (error); - if (gp->state == GMON_PROF_OFF) + if (!req->newptr) + return (0); + if (state == GMON_PROF_OFF) { + gp->state = state; stopprofclock(&proc0); - else + stopguprof(gp); + } else if (state == GMON_PROF_ON) { + gp->state = GMON_PROF_OFF; + stopguprof(gp); + gp->profrate = profhz; startprofclock(&proc0); + gp->state = state; +#ifdef GUPROF + } else if (state == GMON_PROF_HIRES) { + gp->state = GMON_PROF_OFF; + stopprofclock(&proc0); + startguprof(gp); + gp->state = state; +#endif + } else if (state != gp->state) + return (EINVAL); return (0); case GPROF_COUNT: - return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->kcount, gp->kcountsize)); + return (sysctl_handle_opaque(oidp, + gp->kcount, gp->kcountsize, req)); case GPROF_FROMS: - return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->froms, gp->fromssize)); + return (sysctl_handle_opaque(oidp, + gp->froms, gp->fromssize, req)); case GPROF_TOS: - 
return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->tos, gp->tossize)); + return (sysctl_handle_opaque(oidp, + gp->tos, gp->tossize, req)); case GPROF_GMONPARAM: - return (sysctl_rdstruct(oldp, oldlenp, newp, gp, sizeof *gp)); + return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req)); default: return (EOPNOTSUPP); } /* NOTREACHED */ } + +SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, ""); #endif /* GPROF */ /* @@ -145,24 +339,27 @@ sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen, p) * The scale factor is a fixed point number with 16 bits of fraction, so that * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling. */ +#ifndef _SYS_SYSPROTO_H_ +struct profil_args { + caddr_t samples; + u_int size; + u_int offset; + u_int scale; +}; +#endif /* ARGSUSED */ int profil(p, uap, retval) struct proc *p; - register struct profil_args /* { - syscallarg(caddr_t) samples; - syscallarg(u_int) size; - syscallarg(u_int) offset; - syscallarg(u_int) scale; - } */ *uap; - register_t *retval; + register struct profil_args *uap; + int *retval; { register struct uprof *upp; int s; - if (SCARG(uap, scale) > (1 << 16)) + if (uap->scale > (1 << 16)) return (EINVAL); - if (SCARG(uap, scale) == 0) { + if (uap->scale == 0) { stopprofclock(p); return (0); } @@ -170,10 +367,10 @@ profil(p, uap, retval) /* Block profile interrupts while changing state. */ s = splstatclock(); - upp->pr_off = SCARG(uap, offset); - upp->pr_scale = SCARG(uap, scale); - upp->pr_base = SCARG(uap, samples); - upp->pr_size = SCARG(uap, size); + upp->pr_off = uap->offset; + upp->pr_scale = uap->scale; + upp->pr_base = uap->samples; + upp->pr_size = uap->size; startprofclock(p); splx(s); diff --git a/sys/kern/subr_rlist.c b/sys/kern/subr_rlist.c new file mode 100644 index 0000000..ef29ce3 --- /dev/null +++ b/sys/kern/subr_rlist.c @@ -0,0 +1,311 @@ +/* + * Copyright (c) 1992 William F. Jolitz, TeleMuse + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This software is a component of "386BSD" developed by + William F. Jolitz, TeleMuse. + * 4. Neither the name of the developer nor the name "386BSD" + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS A COMPONENT OF 386BSD DEVELOPED BY WILLIAM F. JOLITZ + * AND IS INTENDED FOR RESEARCH AND EDUCATIONAL PURPOSES ONLY. THIS + * SOFTWARE SHOULD NOT BE CONSIDERED TO BE A COMMERCIAL PRODUCT. + * THE DEVELOPER URGES THAT USERS WHO REQUIRE A COMMERCIAL PRODUCT + * NOT MAKE USE THIS WORK. + * + * FOR USERS WHO WISH TO UNDERSTAND THE 386BSD SYSTEM DEVELOPED + * BY WILLIAM F. JOLITZ, WE RECOMMEND THE USER STUDY WRITTEN + * REFERENCES SUCH AS THE "PORTING UNIX TO THE 386" SERIES + * (BEGINNING JANUARY 1991 "DR. DOBBS JOURNAL", USA AND BEGINNING + * JUNE 1991 "UNIX MAGAZIN", GERMANY) BY WILLIAM F. JOLITZ AND + * LYNNE GREER JOLITZ, AS WELL AS OTHER BOOKS ON UNIX AND THE + * ON-LINE 386BSD USER MANUAL BEFORE USE. A BOOK DISCUSSING THE INTERNALS + * OF 386BSD ENTITLED "386BSD FROM THE INSIDE OUT" WILL BE AVAILABLE LATE 1992. + * + * THIS SOFTWARE IS PROVIDED BY THE DEVELOPER ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE DEVELOPER BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * Changes Copyright (C) 1995, David Greenman & John Dyson; This software may + * be used, modified, copied, distributed, and sold, in both source and + * binary form provided that the above copyright and these terms are + * retained. Under no circumstances is the author responsible for the proper + * functioning of this software, nor does the author assume any responsibility + * for damages incurred with its use. + * + * $Id$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/rlist.h> +#include <sys/proc.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +/* + * Resource lists. 
+ */ + +#define RLIST_MIN 128 +static int rlist_count=0; +static struct rlist *rlfree; + +static struct rlist *rlist_malloc __P((void)); + +static struct rlist * +rlist_malloc() +{ + struct rlist *rl; + int i; + while( rlist_count < RLIST_MIN) { + int s = splhigh(); + rl = (struct rlist *)kmem_alloc(kernel_map, PAGE_SIZE); + splx(s); + if( !rl) + break; + + for(i=0;i<(PAGE_SIZE/(sizeof *rl));i++) { + rl->rl_next = rlfree; + rlfree = rl; + rlist_count++; + rl++; + } + } + + if( (rl = rlfree) == 0 ) + panic("Cannot get an rlist entry"); + + --rlist_count; + rlfree = rl->rl_next; + return rl; +} + +inline static void +rlist_mfree( struct rlist *rl) +{ + rl->rl_next = rlfree; + rlfree = rl; + ++rlist_count; +} + +void +rlist_free(rlh, start, end) + struct rlisthdr *rlh; + u_int start, end; +{ + struct rlist **rlp = &rlh->rlh_list; + struct rlist *prev_rlp = NULL, *cur_rlp = *rlp, *next_rlp = NULL; + int s; + + s = splhigh(); + while (rlh->rlh_lock & RLH_LOCKED) { + rlh->rlh_lock |= RLH_DESIRED; + tsleep(rlh, PSWP, "rlistf", 0); + } + rlh->rlh_lock |= RLH_LOCKED; + splx(s); + + /* + * Traverse the list looking for an entry after the one we want + * to insert. 
+ */ + while (cur_rlp != NULL) { + if (start < cur_rlp->rl_start) + break; +#ifdef DIAGNOSTIC + if (prev_rlp) { + if (prev_rlp->rl_end + 1 == cur_rlp->rl_start) + panic("rlist_free: missed coalesce opportunity"); + if (prev_rlp->rl_end == cur_rlp->rl_start) + panic("rlist_free: entries overlap"); + if (prev_rlp->rl_end > cur_rlp->rl_start) + panic("entries out of order"); + } +#endif + prev_rlp = cur_rlp; + cur_rlp = cur_rlp->rl_next; + } + + if (cur_rlp != NULL) { + + if (end >= cur_rlp->rl_start) + panic("rlist_free: free end overlaps already freed area"); + + if (prev_rlp) { + if (start <= prev_rlp->rl_end) + panic("rlist_free: free start overlaps already freed area"); + /* + * Attempt to append + */ + if (prev_rlp->rl_end + 1 == start) { + prev_rlp->rl_end = end; + /* + * Attempt to prepend and coalesce + */ + if (end + 1 == cur_rlp->rl_start) { + prev_rlp->rl_end = cur_rlp->rl_end; + prev_rlp->rl_next = cur_rlp->rl_next; + rlist_mfree(cur_rlp); + } + goto done; + } + } + /* + * Attempt to prepend + */ + if (end + 1 == cur_rlp->rl_start) { + cur_rlp->rl_start = start; + goto done; + } + } + /* + * Reached the end of the list without finding a larger entry. + * Append to last entry if there is one and it's adjacent. + */ + if (prev_rlp) { + if (start <= prev_rlp->rl_end) + panic("rlist_free: free start overlaps already freed area at list tail"); + /* + * Attempt to append + */ + if (prev_rlp->rl_end + 1 == start) { + prev_rlp->rl_end = end; + goto done; + } + } + + /* + * Could neither append nor prepend; allocate a new entry. + */ + next_rlp = cur_rlp; + cur_rlp = rlist_malloc(); + cur_rlp->rl_start = start; + cur_rlp->rl_end = end; + cur_rlp->rl_next = next_rlp; + if (prev_rlp) { + prev_rlp->rl_next = cur_rlp; + } else { + /* + * No previous - this entry is the new list head. 
+ */ + *rlp = cur_rlp; + } + +done: + rlh->rlh_lock &= ~RLH_LOCKED; + if (rlh->rlh_lock & RLH_DESIRED) { + wakeup(rlh); + rlh->rlh_lock &= ~RLH_DESIRED; + } + return; +} + +/* + * Obtain a region of desired size from a resource list. + * If nothing available of that size, return 0. Otherwise, + * return a value of 1 and set resource start location with + * "*loc". (Note: loc can be zero if we don't wish the value) + */ +int +rlist_alloc (rlh, size, loc) + struct rlisthdr *rlh; + unsigned size, *loc; +{ + struct rlist **rlp = &rlh->rlh_list; + register struct rlist *lp; + int s; + register struct rlist *olp = 0; + + s = splhigh(); + while (rlh->rlh_lock & RLH_LOCKED) { + rlh->rlh_lock |= RLH_DESIRED; + tsleep(rlh, PSWP, "rlistf", 0); + } + rlh->rlh_lock |= RLH_LOCKED; + splx(s); + + /* walk list, allocating first thing that's big enough (first fit) */ + for (; *rlp; rlp = &((*rlp)->rl_next)) + if(size <= (*rlp)->rl_end - (*rlp)->rl_start + 1) { + + /* hand it to the caller */ + if (loc) *loc = (*rlp)->rl_start; + (*rlp)->rl_start += size; + + /* did we eat this element entirely? */ + if ((*rlp)->rl_start > (*rlp)->rl_end) { + lp = (*rlp)->rl_next; + rlist_mfree(*rlp); + /* + * if the deleted element was in fromt + * of the list, adjust *rlp, else don't. + */ + if (olp) { + olp->rl_next = lp; + } else { + *rlp = lp; + } + } + + rlh->rlh_lock &= ~RLH_LOCKED; + if (rlh->rlh_lock & RLH_DESIRED) { + wakeup(rlh); + rlh->rlh_lock &= ~RLH_DESIRED; + } + return (1); + } else { + olp = *rlp; + } + + rlh->rlh_lock &= ~RLH_LOCKED; + if (rlh->rlh_lock & RLH_DESIRED) { + wakeup(rlh); + rlh->rlh_lock &= ~RLH_DESIRED; + } + /* nothing in list that's big enough */ + return (0); +} + +/* + * Finished with this resource list, reclaim all space and + * mark it as being empty. 
+ */ +void +rlist_destroy (rlh) + struct rlisthdr *rlh; +{ + struct rlist **rlp = &rlh->rlh_list; + struct rlist *lp, *nlp; + + lp = *rlp; + *rlp = 0; + for (; lp; lp = nlp) { + nlp = lp->rl_next; + rlist_mfree(lp); + } +} diff --git a/sys/kern/subr_rmap.c b/sys/kern/subr_rmap.c deleted file mode 100644 index 2f31173..0000000 --- a/sys/kern/subr_rmap.c +++ /dev/null @@ -1,81 +0,0 @@ -/*- - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. - * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * from: @(#)subr_rmap.c 8.1 (Berkeley) 6/10/93 - */ - -#include <sys/param.h> -#include <sys/map.h> -#include <sys/proc.h> - -void -rminit(a1, a2, a3, a4, a5) - struct map *a1; - long a2, a3; - char *a4; - int a5; -{ - - /* - * Body deleted. - */ - return; -} - -long -rmalloc(a1, a2) - struct map *a1; - long a2; -{ - - /* - * Body deleted. - */ - return (0); -} - -void -rmfree(a1, a2, a3) - struct map *a1; - long a2, a3; -{ - - /* - * Body deleted. - */ - return; -} diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c new file mode 100644 index 0000000..9dca842 --- /dev/null +++ b/sys/kern/subr_trap.c @@ -0,0 +1,940 @@ +/*- + * Copyright (C) 1994, David Greenman + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the University of Utah, and William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 + * $Id$ + */ + +/* + * 386 Trap and System call handling + */ + +#include "opt_ktrace.h" +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/acct.h> +#include <sys/kernel.h> +#include <sys/syscall.h> +#include <sys/sysent.h> +#include <sys/queue.h> +#include <sys/vmmeter.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_extern.h> + +#include <sys/user.h> + +#include <machine/cpu.h> +#include <machine/md_var.h> +#include <machine/psl.h> +#include <machine/reg.h> +#include <machine/trap.h> +#include <machine/../isa/isa_device.h> + +#ifdef POWERFAIL_NMI +#include <sys/syslog.h> +#include <machine/clock.h> +#endif + +#include "isa.h" +#include "npx.h" + +int (*pmath_emulate) __P((struct trapframe *)); + +extern void trap __P((struct trapframe frame)); +extern int trapwrite __P((unsigned addr)); +extern void syscall __P((struct trapframe frame)); + +static int trap_pfault __P((struct trapframe *, int)); +static void trap_fatal __P((struct trapframe *)); +void dblfault_handler __P((void)); + +extern inthand_t IDTVEC(syscall); + +#define MAX_TRAP_MSG 28 +static char *trap_msg[] = { + "", /* 0 unused */ + "privileged instruction fault", /* 1 T_PRIVINFLT */ + "", /* 2 unused */ + "breakpoint instruction fault", /* 3 T_BPTFLT */ + "", /* 4 unused */ + "", /* 5 unused */ + "arithmetic trap", /* 6 T_ARITHTRAP */ + "system forced exception", /* 7 T_ASTFLT */ + "", /* 8 unused */ + "general protection fault", /* 9 T_PROTFLT */ + "trace trap", /* 10 T_TRCTRAP */ + "", /* 11 unused */ + "page fault", /* 12 T_PAGEFLT */ + "", /* 13 unused */ + "alignment fault", /* 14 T_ALIGNFLT */ + "", /* 15 unused */ + "", /* 16 unused */ + "", /* 17 unused */ + "integer divide 
fault", /* 18 T_DIVIDE */ + "non-maskable interrupt trap", /* 19 T_NMI */ + "overflow trap", /* 20 T_OFLOW */ + "FPU bounds check fault", /* 21 T_BOUND */ + "FPU device not available", /* 22 T_DNA */ + "double fault", /* 23 T_DOUBLEFLT */ + "FPU operand fetch fault", /* 24 T_FPOPFLT */ + "invalid TSS fault", /* 25 T_TSSFLT */ + "segment not present fault", /* 26 T_SEGNPFLT */ + "stack fault", /* 27 T_STKFLT */ + "machine check trap", /* 28 T_MCHK */ +}; + +static void userret __P((struct proc *p, struct trapframe *frame, + u_quad_t oticks)); + +static inline void +userret(p, frame, oticks) + struct proc *p; + struct trapframe *frame; + u_quad_t oticks; +{ + int sig, s; + + while ((sig = CURSIG(p)) != 0) + postsig(sig); + p->p_priority = p->p_usrpri; + if (want_resched) { + /* + * Since we are curproc, clock will normally just change + * our priority without moving us from one queue to another + * (since the running process is not on a queue.) + * If that happened after we setrunqueue ourselves but before we + * mi_switch()'ed, we might not be on the queue indicated by + * our priority. + */ + s = splhigh(); + setrunqueue(p); + p->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + splx(s); + while ((sig = CURSIG(p)) != 0) + postsig(sig); + } + /* + * Charge system time if profiling. + */ + if (p->p_flag & P_PROFIL) + addupc_task(p, frame->tf_eip, + (u_int)(p->p_sticks - oticks) * psratio); + + curpriority = p->p_priority; +} + +/* + * Exception, fault, and trap interface to the FreeBSD kernel. + * This common code is called from assembly language IDT gate entry + * routines that prepare a suitable stack frame, and restore this + * frame after the exception has been processed. 
+ */ + +void +trap(frame) + struct trapframe frame; +{ + struct proc *p = curproc; + u_quad_t sticks = 0; + int i = 0, ucode = 0, type, code; +#ifdef DEBUG + u_long eva; +#endif + + type = frame.tf_trapno; + code = frame.tf_err; + + if (ISPL(frame.tf_cs) == SEL_UPL) { + /* user trap */ + + sticks = p->p_sticks; + p->p_md.md_regs = (int *)&frame; + + switch (type) { + case T_PRIVINFLT: /* privileged instruction fault */ + ucode = type; + i = SIGILL; + break; + + case T_BPTFLT: /* bpt instruction fault */ + case T_TRCTRAP: /* trace trap */ + frame.tf_eflags &= ~PSL_T; + i = SIGTRAP; + break; + + case T_ARITHTRAP: /* arithmetic trap */ + ucode = code; + i = SIGFPE; + break; + + case T_ASTFLT: /* Allow process switch */ + astoff(); + cnt.v_soft++; + if (p->p_flag & P_OWEUPC) { + p->p_flag &= ~P_OWEUPC; + addupc_task(p, p->p_stats->p_prof.pr_addr, + p->p_stats->p_prof.pr_ticks); + } + goto out; + + case T_PROTFLT: /* general protection fault */ + case T_SEGNPFLT: /* segment not present fault */ + case T_STKFLT: /* stack fault */ + case T_TSSFLT: /* invalid TSS fault */ + case T_DOUBLEFLT: /* double fault */ + default: + ucode = code + BUS_SEGM_FAULT ; + i = SIGBUS; + break; + + case T_PAGEFLT: /* page fault */ + i = trap_pfault(&frame, TRUE); + if (i == -1) + return; + if (i == 0) + goto out; + + ucode = T_PAGEFLT; + break; + + case T_DIVIDE: /* integer divide fault */ + ucode = FPE_INTDIV_TRAP; + i = SIGFPE; + break; + +#if NISA > 0 + case T_NMI: +#ifdef POWERFAIL_NMI + goto handle_powerfail; +#else /* !POWERFAIL_NMI */ +#ifdef DDB + /* NMI can be hooked up to a pushbutton for debugging */ + printf ("NMI ... 
going to debugger\n"); + if (kdb_trap (type, 0, &frame)) + return; +#endif /* DDB */ + /* machine/parity/power fail/"kitchen sink" faults */ + if (isa_nmi(code) == 0) return; + panic("NMI indicates hardware failure"); +#endif /* POWERFAIL_NMI */ +#endif /* NISA > 0 */ + + case T_OFLOW: /* integer overflow fault */ + ucode = FPE_INTOVF_TRAP; + i = SIGFPE; + break; + + case T_BOUND: /* bounds check fault */ + ucode = FPE_SUBRNG_TRAP; + i = SIGFPE; + break; + + case T_DNA: +#if NNPX > 0 + /* if a transparent fault (due to context switch "late") */ + if (npxdna()) + return; +#endif + if (!pmath_emulate) { + i = SIGFPE; + ucode = FPE_FPU_NP_TRAP; + break; + } + i = (*pmath_emulate)(&frame); + if (i == 0) { + if (!(frame.tf_eflags & PSL_T)) + return; + frame.tf_eflags &= ~PSL_T; + i = SIGTRAP; + } + /* else ucode = emulator_only_knows() XXX */ + break; + + case T_FPOPFLT: /* FPU operand fetch fault */ + ucode = T_FPOPFLT; + i = SIGILL; + break; + } + } else { + /* kernel trap */ + + switch (type) { + case T_PAGEFLT: /* page fault */ + (void) trap_pfault(&frame, FALSE); + return; + + case T_DNA: +#if NNPX > 0 + /* + * The kernel is apparently using npx for copying. + * XXX this should be fatal unless the kernel has + * registered such use. + */ + if (npxdna()) + return; +#endif + break; + + case T_PROTFLT: /* general protection fault */ + case T_SEGNPFLT: /* segment not present fault */ + /* + * Invalid segment selectors and out of bounds + * %eip's and %esp's can be set up in user mode. + * This causes a fault in kernel mode when the + * kernel tries to return to user mode. We want + * to get this fault so that we can fix the + * problem here and not have to check all the + * selectors and pointers when the user changes + * them. 
+ */ +#define MAYBE_DORETI_FAULT(where, whereto) \ + do { \ + if (frame.tf_eip == (int)where) { \ + frame.tf_eip = (int)whereto; \ + return; \ + } \ + } while (0) + + if (intr_nesting_level == 0) { + MAYBE_DORETI_FAULT(doreti_iret, + doreti_iret_fault); + MAYBE_DORETI_FAULT(doreti_popl_ds, + doreti_popl_ds_fault); + MAYBE_DORETI_FAULT(doreti_popl_es, + doreti_popl_es_fault); + if (curpcb && curpcb->pcb_onfault) { + frame.tf_eip = (int)curpcb->pcb_onfault; + return; + } + } + break; + + case T_TSSFLT: + /* + * PSL_NT can be set in user mode and isn't cleared + * automatically when the kernel is entered. This + * causes a TSS fault when the kernel attempts to + * `iret' because the TSS link is uninitialized. We + * want to get this fault so that we can fix the + * problem here and not every time the kernel is + * entered. + */ + if (frame.tf_eflags & PSL_NT) { + frame.tf_eflags &= ~PSL_NT; + return; + } + break; + + case T_TRCTRAP: /* trace trap */ + if (frame.tf_eip == (int)IDTVEC(syscall)) { + /* + * We've just entered system mode via the + * syscall lcall. Continue single stepping + * silently until the syscall handler has + * saved the flags. + */ + return; + } + if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { + /* + * The syscall handler has now saved the + * flags. Stop single stepping it. + */ + frame.tf_eflags &= ~PSL_T; + return; + } + /* + * Fall through. + */ + case T_BPTFLT: + /* + * If DDB is enabled, let it handle the debugger trap. + * Otherwise, debugger traps "can't happen". 
+ */ +#ifdef DDB + if (kdb_trap (type, 0, &frame)) + return; +#endif + break; + +#if NISA > 0 + case T_NMI: +#ifdef POWERFAIL_NMI +#ifndef TIMER_FREQ +# define TIMER_FREQ 1193182 +#endif + handle_powerfail: + { + static unsigned lastalert = 0; + + if(time.tv_sec - lastalert > 10) + { + log(LOG_WARNING, "NMI: power fail\n"); + sysbeep(TIMER_FREQ/880, hz); + lastalert = time.tv_sec; + } + return; + } +#else /* !POWERFAIL_NMI */ +#ifdef DDB + /* NMI can be hooked up to a pushbutton for debugging */ + printf ("NMI ... going to debugger\n"); + if (kdb_trap (type, 0, &frame)) + return; +#endif /* DDB */ + /* machine/parity/power fail/"kitchen sink" faults */ + if (isa_nmi(code) == 0) return; + /* FALL THROUGH */ +#endif /* POWERFAIL_NMI */ +#endif /* NISA > 0 */ + } + + trap_fatal(&frame); + return; + } + + trapsignal(p, i, ucode); + +#ifdef DEBUG + eva = rcr2(); + if (type <= MAX_TRAP_MSG) { + uprintf("fatal process exception: %s", + trap_msg[type]); + if ((type == T_PAGEFLT) || (type == T_PROTFLT)) + uprintf(", fault VA = 0x%x", eva); + uprintf("\n"); + } +#endif + +out: + userret(p, &frame, sticks); +} + +#ifdef notyet +/* + * This version doesn't allow a page fault to user space while + * in the kernel. The rest of the kernel needs to be made "safe" + * before this can be used. I think the only things remaining + * to be made safe are the iBCS2 code and the process tracing/ + * debugging code. 
+ */ +static int +trap_pfault(frame, usermode) + struct trapframe *frame; + int usermode; +{ + vm_offset_t va; + struct vmspace *vm = NULL; + vm_map_t map = 0; + int rv = 0; + vm_prot_t ftype; + int eva; + struct proc *p = curproc; + + if (frame->tf_err & PGEX_W) + ftype = VM_PROT_READ | VM_PROT_WRITE; + else + ftype = VM_PROT_READ; + + eva = rcr2(); + va = trunc_page((vm_offset_t)eva); + + if (va < VM_MIN_KERNEL_ADDRESS) { + vm_offset_t v; + vm_page_t mpte; + + if (p == NULL || + (!usermode && va < VM_MAXUSER_ADDRESS && + (intr_nesting_level != 0 || curpcb == NULL || + curpcb->pcb_onfault == NULL))) { + trap_fatal(frame); + return (-1); + } + + /* + * This is a fault on non-kernel virtual memory. + * vm is initialized above to NULL. If curproc is NULL + * or curproc->p_vmspace is NULL the fault is fatal. + */ + vm = p->p_vmspace; + if (vm == NULL) + goto nogo; + + map = &vm->vm_map; + + /* + * Keep swapout from messing with us during this + * critical time. + */ + ++p->p_lock; + + /* + * Grow the stack if necessary + */ + if ((caddr_t)va > vm->vm_maxsaddr + && (caddr_t)va < (caddr_t)USRSTACK) { + if (!grow(p, va)) { + rv = KERN_FAILURE; + --p->p_lock; + goto nogo; + } + } + + /* Fault in the user page: */ + rv = vm_fault(map, va, ftype, FALSE); + + --p->p_lock; + } else { + /* + * Don't allow user-mode faults in kernel address space. + */ + if (usermode) + goto nogo; + + /* + * Since we know that kernel virtual address addresses + * always have pte pages mapped, we just have to fault + * the page. + */ + rv = vm_fault(kernel_map, va, ftype, FALSE); + } + + if (rv == KERN_SUCCESS) + return (0); +nogo: + if (!usermode) { + if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) { + frame->tf_eip = (int)curpcb->pcb_onfault; + return (0); + } + trap_fatal(frame); + return (-1); + } + + /* kludge to pass faulting virtual address to sendsig */ + frame->tf_err = eva; + + return((rv == KERN_PROTECTION_FAILURE) ? 
SIGBUS : SIGSEGV); +} +#endif + +int +trap_pfault(frame, usermode) + struct trapframe *frame; + int usermode; +{ + vm_offset_t va; + struct vmspace *vm = NULL; + vm_map_t map = 0; + int rv = 0; + vm_prot_t ftype; + int eva; + struct proc *p = curproc; + + eva = rcr2(); + va = trunc_page((vm_offset_t)eva); + + if (va >= KERNBASE) { + /* + * Don't allow user-mode faults in kernel address space. + */ + if (usermode) + goto nogo; + + map = kernel_map; + } else { + /* + * This is a fault on non-kernel virtual memory. + * vm is initialized above to NULL. If curproc is NULL + * or curproc->p_vmspace is NULL the fault is fatal. + */ + if (p != NULL) + vm = p->p_vmspace; + + if (vm == NULL) + goto nogo; + + map = &vm->vm_map; + } + + if (frame->tf_err & PGEX_W) + ftype = VM_PROT_READ | VM_PROT_WRITE; + else + ftype = VM_PROT_READ; + + if (map != kernel_map) { + /* + * Keep swapout from messing with us during this + * critical time. + */ + ++p->p_lock; + + /* + * Grow the stack if necessary + */ + if ((caddr_t)va > vm->vm_maxsaddr + && (caddr_t)va < (caddr_t)USRSTACK) { + if (!grow(p, va)) { + rv = KERN_FAILURE; + --p->p_lock; + goto nogo; + } + } + + /* Fault in the user page: */ + rv = vm_fault(map, va, ftype, FALSE); + + --p->p_lock; + } else { + /* + * Since we know that kernel virtual address addresses + * always have pte pages mapped, we just have to fault + * the page. + */ + rv = vm_fault(map, va, ftype, FALSE); + } + + if (rv == KERN_SUCCESS) + return (0); +nogo: + if (!usermode) { + if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) { + frame->tf_eip = (int)curpcb->pcb_onfault; + return (0); + } + trap_fatal(frame); + return (-1); + } + + /* kludge to pass faulting virtual address to sendsig */ + frame->tf_err = eva; + + return((rv == KERN_PROTECTION_FAILURE) ? 
SIGBUS : SIGSEGV); +} + +static void +trap_fatal(frame) + struct trapframe *frame; +{ + int code, type, eva, ss, esp; + struct soft_segment_descriptor softseg; + + code = frame->tf_err; + type = frame->tf_trapno; + eva = rcr2(); + sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); + + if (type <= MAX_TRAP_MSG) + printf("\n\nFatal trap %d: %s while in %s mode\n", + type, trap_msg[type], + ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); + if (type == T_PAGEFLT) { + printf("fault virtual address = 0x%x\n", eva); + printf("fault code = %s %s, %s\n", + code & PGEX_U ? "user" : "supervisor", + code & PGEX_W ? "write" : "read", + code & PGEX_P ? "protection violation" : "page not present"); + } + printf("instruction pointer = 0x%x:0x%x\n", + frame->tf_cs & 0xffff, frame->tf_eip); + if (ISPL(frame->tf_cs) == SEL_UPL) { + ss = frame->tf_ss & 0xffff; + esp = frame->tf_esp; + } else { + ss = GSEL(GDATA_SEL, SEL_KPL); + esp = (int)&frame->tf_esp; + } + printf("stack pointer = 0x%x:0x%x\n", ss, esp); + printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); + printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", + softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); + printf(" = DPL %d, pres %d, def32 %d, gran %d\n", + softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, + softseg.ssd_gran); + printf("processor eflags = "); + if (frame->tf_eflags & PSL_T) + printf("trace trap, "); + if (frame->tf_eflags & PSL_I) + printf("interrupt enabled, "); + if (frame->tf_eflags & PSL_NT) + printf("nested task, "); + if (frame->tf_eflags & PSL_RF) + printf("resume, "); + if (frame->tf_eflags & PSL_VM) + printf("vm86, "); + printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); + printf("current process = "); + if (curproc) { + printf("%lu (%s)\n", + (u_long)curproc->p_pid, curproc->p_comm ? 
+ curproc->p_comm : ""); + } else { + printf("Idle\n"); + } + printf("interrupt mask = "); + if ((cpl & net_imask) == net_imask) + printf("net "); + if ((cpl & tty_imask) == tty_imask) + printf("tty "); + if ((cpl & bio_imask) == bio_imask) + printf("bio "); + if (cpl == 0) + printf("none"); + printf("\n"); + +#ifdef KDB + if (kdb_trap(&psl)) + return; +#endif +#ifdef DDB + if (kdb_trap (type, 0, frame)) + return; +#endif + if (type <= MAX_TRAP_MSG) + panic(trap_msg[type]); + else + panic("unknown/reserved trap"); +} + +/* + * Double fault handler. Called when a fault occurs while writing + * a frame for a trap/exception onto the stack. This usually occurs + * when the stack overflows (such is the case with infinite recursion, + * for example). + * + * XXX Note that the current PTD gets replaced by IdlePTD when the + * task switch occurs. This means that the stack that was active at + * the time of the double fault is not available at <kstack> unless + * the machine was idle when the double fault occurred. The downside + * of this is that "trace <ebp>" in ddb won't work. + */ +void +dblfault_handler() +{ + struct pcb *pcb = curpcb; + + if (pcb != NULL) { + printf("\nFatal double fault:\n"); + printf("eip = 0x%x\n", pcb->pcb_tss.tss_eip); + printf("esp = 0x%x\n", pcb->pcb_tss.tss_esp); + printf("ebp = 0x%x\n", pcb->pcb_tss.tss_ebp); + } + + panic("double fault"); +} + +/* + * Compensate for 386 brain damage (missing URKR). + * This is a little simpler than the pagefault handler in trap() because + * it the page tables have already been faulted in and high addresses + * are thrown out early for other reasons. + */ +int trapwrite(addr) + unsigned addr; +{ + struct proc *p; + vm_offset_t va; + struct vmspace *vm; + int rv; + + va = trunc_page((vm_offset_t)addr); + /* + * XXX - MAX is END. Changed > to >= for temp. fix. 
+ */ + if (va >= VM_MAXUSER_ADDRESS) + return (1); + + p = curproc; + vm = p->p_vmspace; + + ++p->p_lock; + + if ((caddr_t)va >= vm->vm_maxsaddr + && (caddr_t)va < (caddr_t)USRSTACK) { + if (!grow(p, va)) { + --p->p_lock; + return (1); + } + } + + /* + * fault the data page + */ + rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, FALSE); + + --p->p_lock; + + if (rv != KERN_SUCCESS) + return 1; + + return (0); +} + +/* + * System call request from POSIX system call gate interface to kernel. + * Like trap(), argument is call by reference. + */ +void +syscall(frame) + struct trapframe frame; +{ + caddr_t params; + int i; + struct sysent *callp; + struct proc *p = curproc; + u_quad_t sticks; + int error; + int args[8], rval[2]; + u_int code; + + sticks = p->p_sticks; + if (ISPL(frame.tf_cs) != SEL_UPL) + panic("syscall"); + + p->p_md.md_regs = (int *)&frame; + params = (caddr_t)frame.tf_esp + sizeof(int); + code = frame.tf_eax; + if (p->p_sysent->sv_prepsyscall) { + (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); + } else { + /* + * Need to check if this is a 32 bit or 64 bit syscall. + */ + if (code == SYS_syscall) { + /* + * Code is first argument, followed by actual args. + */ + code = fuword(params); + params += sizeof(int); + } else if (code == SYS___syscall) { + /* + * Like syscall, but code is a quad, so as to maintain + * quad alignment for the rest of the arguments. 
+ */ + code = fuword(params); + params += sizeof(quad_t); + } + } + + if (p->p_sysent->sv_mask) + code &= p->p_sysent->sv_mask; + + if (code >= p->p_sysent->sv_size) + callp = &p->p_sysent->sv_table[0]; + else + callp = &p->p_sysent->sv_table[code]; + + if (params && (i = callp->sy_narg * sizeof(int)) && + (error = copyin(params, (caddr_t)args, (u_int)i))) { +#ifdef KTRACE + if (KTRPOINT(p, KTR_SYSCALL)) + ktrsyscall(p->p_tracep, code, callp->sy_narg, args); +#endif + goto bad; + } +#ifdef KTRACE + if (KTRPOINT(p, KTR_SYSCALL)) + ktrsyscall(p->p_tracep, code, callp->sy_narg, args); +#endif + rval[0] = 0; + rval[1] = frame.tf_edx; + + error = (*callp->sy_call)(p, args, rval); + + switch (error) { + + case 0: + /* + * Reinitialize proc pointer `p' as it may be different + * if this is a child returning from fork syscall. + */ + p = curproc; + frame.tf_eax = rval[0]; + frame.tf_edx = rval[1]; + frame.tf_eflags &= ~PSL_C; + break; + + case ERESTART: + /* + * Reconstruct pc, assuming lcall $X,y is 7 bytes, + * int 0x80 is 2 bytes. We saved this in tf_err. + */ + frame.tf_eip -= frame.tf_err; + break; + + case EJUSTRETURN: + break; + + default: +bad: + if (p->p_sysent->sv_errsize) + if (error >= p->p_sysent->sv_errsize) + error = -1; /* XXX */ + else + error = p->p_sysent->sv_errtbl[error]; + frame.tf_eax = error; + frame.tf_eflags |= PSL_C; + break; + } + + if (frame.tf_eflags & PSL_T) { + /* Traced syscall. */ + frame.tf_eflags &= ~PSL_T; + trapsignal(p, SIGTRAP, 0); + } + + userret(p, &frame, sticks); + +#ifdef KTRACE + if (KTRPOINT(p, KTR_SYSRET)) + ktrsysret(p->p_tracep, code, error, rval[0]); +#endif +} diff --git a/sys/kern/subr_xxx.c b/sys/kern/subr_xxx.c index 45b2d64..5ff7dcc 100644 --- a/sys/kern/subr_xxx.c +++ b/sys/kern/subr_xxx.c @@ -30,88 +30,282 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)subr_xxx.c 8.3 (Berkeley) 3/29/95 + * @(#)subr_xxx.c 8.1 (Berkeley) 6/10/93 + * $Id$ */ /* - * Miscellaneous trivial functions, including many - * that are often inline-expanded or done in assembler. + * Miscellaneous trivial functions. */ #include <sys/param.h> #include <sys/systm.h> -#include <machine/cpu.h> - /* - * Unsupported device function (e.g. writing to read-only device). + * Return error for operation not supported + * on a specific object or file type. */ int -enodev() +eopnotsupp() { - return (ENODEV); + return (EOPNOTSUPP); } /* - * Unconfigured device function; driver not configured. + * Return error for an inval operation + * on a specific object or file type. */ int -enxio() +einval() { - return (ENXIO); + return (EINVAL); } /* - * Unsupported ioctl function. + * Generic null operation, always returns success. */ int -enoioctl() +nullop() { - return (ENOTTY); + return (0); } +#include <sys/conf.h> + /* - * Unsupported system function. - * This is used for an otherwise-reasonable operation - * that is not supported by the current system binary. + * Unsupported devswitch functions (e.g. for writing to read-only device). + * XXX may belong elsewhere. 
*/ + +int +noopen(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (ENODEV); +} + +int +noclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (ENODEV); +} + +int +noread(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + return (ENODEV); +} + +int +nowrite(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + return (ENODEV); +} + +int +noioctl(dev, cmd, data, flags, p) + dev_t dev; + int cmd; + caddr_t data; + int flags; + struct proc *p; +{ + + return (ENODEV); +} + +void +nostop(tp, rw) + struct tty *tp; + int rw; +{ + +} + +int +noreset(dev) + dev_t dev; +{ + + printf("noreset(0x%x) called\n", dev); + return (ENODEV); +} + +struct tty * +nodevtotty(dev) + dev_t dev; +{ + + return (NULL); +} + +int +noselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + + /* XXX is this distinguished from 1 for data available? */ + return (ENODEV); +} + +int +nommap(dev, offset, nprot) + dev_t dev; + int offset; + int nprot; +{ + + /* Don't return ENODEV. That would allow mapping address ENODEV! */ + return (-1); +} + int -enosys() +nodump(dev) + dev_t dev; { - return (ENOSYS); + return (ENODEV); } /* - * Return error for operation not supported - * on a specific object or file type. + * Null devswitch functions (for when the operation always succeeds). + * XXX may belong elsewhere. + * XXX not all are here (e.g., seltrue() isn't). + */ + +/* + * XXX this is probably bogus. Any device that uses it isn't checking the + * minor number. */ int -eopnotsupp() +nullopen(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; { - return (EOPNOTSUPP); + return (0); +} + +int +nullclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (0); } /* - * Return error for an inval operation - * on a specific object or file type. + * Unconfigured devswitch functions (for unconfigured drivers). 
+ * XXX may belong elsewhere. */ + int -einval() +nxopen(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; { - return (EINVAL); + return (ENXIO); } /* - * Generic null operation, always returns success. + * XXX all nx functions except nxopen() should probably go away. They + * probably can't be called for non-open devices. */ + int -nullop() +nxclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; { - return (0); + printf("nxclose(0x%x) called\n", dev); + return (ENXIO); +} + +int +nxread(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + printf("nxread(0x%x) called\n", dev); + return (ENXIO); +} + +int +nxwrite(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + printf("nxwrite(0x%x) called\n", dev); + return (ENXIO); +} + +int +nxioctl(dev, cmd, data, flags, p) + dev_t dev; + int cmd; + caddr_t data; + int flags; + struct proc *p; +{ + + printf("nxioctl(0x%x) called\n", dev); + return (ENXIO); +} + +int +nxselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + + printf("nxselect(0x%x) called\n", dev); + + /* XXX is this distinguished from 1 for data available? */ + return (ENXIO); +} + +int +nxdump(dev) + dev_t dev; +{ + + printf("nxdump(0x%x) called\n", dev); + return (ENXIO); } diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 08385b3..2bcfd68 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -35,15 +35,24 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95 + * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 + * $Id: sys_generic.c,v 1.25 1997/03/23 03:36:23 bde Exp $ */ +#include "opt_ktrace.h" + #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/filedesc.h> -#include <sys/ioctl.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/fcntl.h> #include <sys/file.h> #include <sys/proc.h> +#include <sys/stat.h> +#include <sys/signalvar.h> +#include <sys/socket.h> #include <sys/socketvar.h> #include <sys/uio.h> #include <sys/kernel.h> @@ -52,23 +61,26 @@ #ifdef KTRACE #include <sys/ktrace.h> #endif +#include <vm/vm.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> +static int selscan __P((struct proc *, fd_mask **, fd_mask **, int, int *)); /* * Read system call. */ +#ifndef _SYS_SYSPROTO_H_ +struct read_args { + int fd; + char *buf; + u_int nbyte; +}; +#endif /* ARGSUSED */ int read(p, uap, retval) struct proc *p; - register struct read_args /* { - syscallarg(int) fd; - syscallarg(char *) buf; - syscallarg(u_int) nbyte; - } */ *uap; - register_t *retval; + register struct read_args *uap; + int *retval; { register struct file *fp; register struct filedesc *fdp = p->p_fd; @@ -79,15 +91,19 @@ read(p, uap, retval) struct iovec ktriov; #endif - if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL || + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || (fp->f_flag & FREAD) == 0) return (EBADF); - aiov.iov_base = (caddr_t)SCARG(uap, buf); - aiov.iov_len = SCARG(uap, nbyte); + aiov.iov_base = (caddr_t)uap->buf; + aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; - auio.uio_resid = SCARG(uap, nbyte); + + auio.uio_resid = uap->nbyte; + if (auio.uio_resid < 0) + return (EINVAL); + auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_procp = p; @@ -98,16 +114,15 @@ read(p, uap, retval) if (KTRPOINT(p, KTR_GENIO)) ktriov = aiov; 
#endif - cnt = SCARG(uap, nbyte); - if (error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)) + cnt = uap->nbyte; + if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred))) if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; cnt -= auio.uio_resid; #ifdef KTRACE if (KTRPOINT(p, KTR_GENIO) && error == 0) - ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, &ktriov, - cnt, error); + ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktriov, cnt, error); #endif *retval = cnt; return (error); @@ -116,15 +131,18 @@ read(p, uap, retval) /* * Scatter read system call. */ +#ifndef _SYS_SYSPROTO_H_ +struct readv_args { + int fd; + struct iovec *iovp; + u_int iovcnt; +}; +#endif int readv(p, uap, retval) struct proc *p; - register struct readv_args /* { - syscallarg(int) fd; - syscallarg(struct iovec *) iovp; - syscallarg(u_int) iovcnt; - } */ *uap; - register_t *retval; + register struct readv_args *uap; + int *retval; { register struct file *fp; register struct filedesc *fdp = p->p_fd; @@ -138,14 +156,14 @@ readv(p, uap, retval) struct iovec *ktriov = NULL; #endif - if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL || + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || (fp->f_flag & FREAD) == 0) return (EBADF); /* note: can't use iovlen until iovcnt is validated */ - iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec); - if (SCARG(uap, iovcnt) > UIO_SMALLIOV) { - if (SCARG(uap, iovcnt) > UIO_MAXIOV) + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) return (EINVAL); MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; @@ -154,19 +172,19 @@ readv(p, uap, retval) needfree = NULL; } auio.uio_iov = iov; - auio.uio_iovcnt = SCARG(uap, iovcnt); + auio.uio_iovcnt = uap->iovcnt; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_procp = p; - if (error = 
copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen)) + if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) goto done; auio.uio_resid = 0; - for (i = 0; i < SCARG(uap, iovcnt); i++) { - if (auio.uio_resid + iov->iov_len < auio.uio_resid) { + for (i = 0; i < uap->iovcnt; i++) { + auio.uio_resid += iov->iov_len; + if (auio.uio_resid < 0) { error = EINVAL; goto done; } - auio.uio_resid += iov->iov_len; iov++; } #ifdef KTRACE @@ -179,7 +197,7 @@ readv(p, uap, retval) } #endif cnt = auio.uio_resid; - if (error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)) + if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred))) if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; @@ -187,7 +205,7 @@ readv(p, uap, retval) #ifdef KTRACE if (ktriov != NULL) { if (error == 0) - ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, ktriov, + ktrgenio(p->p_tracep, uap->fd, UIO_READ, ktriov, cnt, error); FREE(ktriov, M_TEMP); } @@ -202,15 +220,18 @@ done: /* * Write system call */ +#ifndef _SYS_SYSPROTO_H_ +struct write_args { + int fd; + char *buf; + u_int nbyte; +}; +#endif int write(p, uap, retval) struct proc *p; - register struct write_args /* { - syscallarg(int) fd; - syscallarg(char *) buf; - syscallarg(u_int) nbyte; - } */ *uap; - register_t *retval; + register struct write_args *uap; + int *retval; { register struct file *fp; register struct filedesc *fdp = p->p_fd; @@ -221,15 +242,15 @@ write(p, uap, retval) struct iovec ktriov; #endif - if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL || + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || (fp->f_flag & FWRITE) == 0) return (EBADF); - aiov.iov_base = (caddr_t)SCARG(uap, buf); - aiov.iov_len = SCARG(uap, nbyte); + aiov.iov_base = (caddr_t)uap->buf; + aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; - auio.uio_resid = SCARG(uap, nbyte); + auio.uio_resid = uap->nbyte; 
auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_procp = p; @@ -240,8 +261,8 @@ write(p, uap, retval) if (KTRPOINT(p, KTR_GENIO)) ktriov = aiov; #endif - cnt = SCARG(uap, nbyte); - if (error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred)) { + cnt = uap->nbyte; + if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; @@ -251,7 +272,7 @@ write(p, uap, retval) cnt -= auio.uio_resid; #ifdef KTRACE if (KTRPOINT(p, KTR_GENIO) && error == 0) - ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE, + ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, &ktriov, cnt, error); #endif *retval = cnt; @@ -261,15 +282,18 @@ write(p, uap, retval) /* * Gather write system call */ +#ifndef _SYS_SYSPROTO_H_ +struct writev_args { + int fd; + struct iovec *iovp; + u_int iovcnt; +}; +#endif int writev(p, uap, retval) struct proc *p; - register struct writev_args /* { - syscallarg(int) fd; - syscallarg(struct iovec *) iovp; - syscallarg(u_int) iovcnt; - } */ *uap; - register_t *retval; + register struct writev_args *uap; + int *retval; { register struct file *fp; register struct filedesc *fdp = p->p_fd; @@ -283,14 +307,14 @@ writev(p, uap, retval) struct iovec *ktriov = NULL; #endif - if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL || + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || (fp->f_flag & FWRITE) == 0) return (EBADF); /* note: can't use iovlen until iovcnt is validated */ - iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec); - if (SCARG(uap, iovcnt) > UIO_SMALLIOV) { - if (SCARG(uap, iovcnt) > UIO_MAXIOV) + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) return (EINVAL); MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; @@ -299,19 +323,19 @@ writev(p, uap, retval) needfree = NULL; } auio.uio_iov = 
iov; - auio.uio_iovcnt = SCARG(uap, iovcnt); + auio.uio_iovcnt = uap->iovcnt; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_procp = p; - if (error = copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen)) + if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) goto done; auio.uio_resid = 0; - for (i = 0; i < SCARG(uap, iovcnt); i++) { - if (auio.uio_resid + iov->iov_len < auio.uio_resid) { + for (i = 0; i < uap->iovcnt; i++) { + auio.uio_resid += iov->iov_len; + if (auio.uio_resid < 0) { error = EINVAL; goto done; } - auio.uio_resid += iov->iov_len; iov++; } #ifdef KTRACE @@ -324,7 +348,7 @@ writev(p, uap, retval) } #endif cnt = auio.uio_resid; - if (error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred)) { + if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; @@ -335,7 +359,7 @@ writev(p, uap, retval) #ifdef KTRACE if (ktriov != NULL) { if (error == 0) - ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE, + ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, ktriov, cnt, error); FREE(ktriov, M_TEMP); } @@ -350,21 +374,23 @@ done: /* * Ioctl system call */ +#ifndef _SYS_SYSPROTO_H_ +struct ioctl_args { + int fd; + int com; + caddr_t data; +}; +#endif /* ARGSUSED */ int ioctl(p, uap, retval) struct proc *p; - register struct ioctl_args /* { - syscallarg(int) fd; - syscallarg(u_long) com; - syscallarg(caddr_t) data; - } */ *uap; - register_t *retval; + register struct ioctl_args *uap; + int *retval; { register struct file *fp; register struct filedesc *fdp; - register u_long com; - register int error; + register int com, error; register u_int size; caddr_t data, memp; int tmp; @@ -372,19 +398,19 @@ ioctl(p, uap, retval) char stkbuf[STK_PARAMS]; fdp = p->p_fd; - if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + if ((u_int)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) 
return (EBADF); if ((fp->f_flag & (FREAD | FWRITE)) == 0) return (EBADF); - switch (com = SCARG(uap, com)) { + switch (com = uap->com) { case FIONCLEX: - fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE; + fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; return (0); case FIOCLEX: - fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE; + fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; return (0); } @@ -403,14 +429,14 @@ ioctl(p, uap, retval) data = stkbuf; if (com&IOC_IN) { if (size) { - error = copyin(SCARG(uap, data), data, (u_int)size); + error = copyin(uap->data, data, (u_int)size); if (error) { if (memp) free(memp, M_IOCTLOPS); return (error); } } else - *(caddr_t *)data = SCARG(uap, data); + *(caddr_t *)data = uap->data; } else if ((com&IOC_OUT) && size) /* * Zero the buffer so the user always @@ -418,12 +444,12 @@ ioctl(p, uap, retval) */ bzero(data, size); else if (com&IOC_VOID) - *(caddr_t *)data = SCARG(uap, data); + *(caddr_t *)data = uap->data; switch (com) { case FIONBIO: - if (tmp = *(int *)data) + if ((tmp = *(int *)data)) fp->f_flag |= FNONBLOCK; else fp->f_flag &= ~FNONBLOCK; @@ -431,7 +457,7 @@ ioctl(p, uap, retval) break; case FIOASYNC: - if (tmp = *(int *)data) + if ((tmp = *(int *)data)) fp->f_flag |= FASYNC; else fp->f_flag &= ~FASYNC; @@ -456,7 +482,7 @@ ioctl(p, uap, retval) tmp = p1->p_pgrp->pg_id; } error = (*fp->f_ops->fo_ioctl) - (fp, TIOCSPGRP, (caddr_t)&tmp, p); + (fp, (int)TIOCSPGRP, (caddr_t)&tmp, p); break; case FIOGETOWN: @@ -465,7 +491,7 @@ ioctl(p, uap, retval) *(int *)data = ((struct socket *)fp->f_data)->so_pgid; break; } - error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p); + error = (*fp->f_ops->fo_ioctl)(fp, (int)TIOCGPGRP, data, p); *(int *)data = -*(int *)data; break; @@ -476,7 +502,7 @@ ioctl(p, uap, retval) * already set and checked above. 
*/ if (error == 0 && (com&IOC_OUT) && size) - error = copyout(data, SCARG(uap, data), (u_int)size); + error = copyout(data, uap->data, (u_int)size); break; } if (memp) @@ -484,49 +510,88 @@ ioctl(p, uap, retval) return (error); } -int selwait, nselcoll; +static int nselcoll; +int selwait; /* * Select system call. */ +#ifndef _SYS_SYSPROTO_H_ +struct select_args { + int nd; + fd_set *in, *ou, *ex; + struct timeval *tv; +}; +#endif int select(p, uap, retval) register struct proc *p; - register struct select_args /* { - syscallarg(u_int) nd; - syscallarg(fd_set *) in; - syscallarg(fd_set *) ou; - syscallarg(fd_set *) ex; - syscallarg(struct timeval *) tv; - } */ *uap; - register_t *retval; + register struct select_args *uap; + int *retval; { - fd_set ibits[3], obits[3]; + /* + * The magic 2048 here is chosen to be just enough for FD_SETSIZE + * infds with the new FD_SETSIZE of 1024, and more than enough for + * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE + * of 256. + */ + fd_mask s_selbits[howmany(2048, NFDBITS)]; + fd_mask *ibits[3], *obits[3], *selbits, *sbp; struct timeval atv; - int s, ncoll, error, timo = 0; - u_int ni; + int s, ncoll, error, timo; + u_int nbufbytes, ncpbytes, nfdbits; - bzero((caddr_t)ibits, sizeof(ibits)); - bzero((caddr_t)obits, sizeof(obits)); - if (SCARG(uap, nd) > FD_SETSIZE) + if (uap->nd < 0) return (EINVAL); - if (SCARG(uap, nd) > p->p_fd->fd_nfiles) { - /* forgiving; slightly wrong */ - SCARG(uap, nd) = p->p_fd->fd_nfiles; - } - ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask); + if (uap->nd > p->p_fd->fd_nfiles) + uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */ + + /* + * Allocate just enough bits for the non-null fd_sets. Use the + * preallocated auto buffer if possible. 
+ */ + nfdbits = roundup(uap->nd, NFDBITS); + ncpbytes = nfdbits / NBBY; + nbufbytes = 0; + if (uap->in != NULL) + nbufbytes += 2 * ncpbytes; + if (uap->ou != NULL) + nbufbytes += 2 * ncpbytes; + if (uap->ex != NULL) + nbufbytes += 2 * ncpbytes; + if (nbufbytes <= sizeof s_selbits) + selbits = &s_selbits[0]; + else + selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); + /* + * Assign pointers into the bit buffers and fetch the input bits. + * Put the output buffers together so that they can be bzeroed + * together. + */ + sbp = selbits; #define getbits(name, x) \ - if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \ - (caddr_t)&ibits[x], ni))) \ - goto done; + do { \ + if (uap->name == NULL) \ + ibits[x] = NULL; \ + else { \ + ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ + obits[x] = sbp; \ + sbp += ncpbytes / sizeof *sbp; \ + error = copyin(uap->name, ibits[x], ncpbytes); \ + if (error != 0) \ + goto done; \ + } \ + } while (0) getbits(in, 0); getbits(ou, 1); getbits(ex, 2); #undef getbits + if (nbufbytes != 0) + bzero(selbits, nbufbytes / 2); - if (SCARG(uap, tv)) { - error = copyin((caddr_t)SCARG(uap, tv), (caddr_t)&atv, + if (uap->tv) { + error = copyin((caddr_t)uap->tv, (caddr_t)&atv, sizeof (atv)); if (error) goto done; @@ -535,31 +600,28 @@ select(p, uap, retval) goto done; } s = splclock(); - timevaladd(&atv, (struct timeval *)&time); + timevaladd(&atv, &time); + timo = hzto(&atv); + /* + * Avoid inadvertently sleeping forever. + */ + if (timo == 0) + timo = 1; splx(s); - } + } else + timo = 0; retry: ncoll = nselcoll; p->p_flag |= P_SELECT; - error = selscan(p, ibits, obits, SCARG(uap, nd), retval); + error = selscan(p, ibits, obits, uap->nd, retval); if (error || *retval) goto done; s = splhigh(); - if (SCARG(uap, tv)) { - if (timercmp(&time, &atv, >=)) { - splx(s); - goto done; - } - /* - * If poll wait was tiny, this could be zero; we will - * have to round it up to avoid sleeping forever. 
If - * we retry below, the timercmp above will get us out. - * Note that if wait was 0, the timercmp will prevent - * us from getting here the first time. - */ - timo = hzto(&atv); - if (timo == 0) - timo = 1; + /* this should be timercmp(&time, &atv, >=) */ + if (uap->tv && (time.tv_sec > atv.tv_sec || + (time.tv_sec == atv.tv_sec && time.tv_usec >= atv.tv_usec))) { + splx(s); + goto done; } if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) { splx(s); @@ -578,8 +640,7 @@ done: if (error == EWOULDBLOCK) error = 0; #define putbits(name, x) \ - if (SCARG(uap, name) && (error2 = copyout((caddr_t)&obits[x], \ - (caddr_t)SCARG(uap, name), ni))) \ + if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \ error = error2; if (error == 0) { int error2; @@ -589,15 +650,16 @@ done: putbits(ex, 2); #undef putbits } + if (selbits != &s_selbits[0]) + free(selbits, M_SELECT); return (error); } -int +static int selscan(p, ibits, obits, nfd, retval) struct proc *p; - fd_set *ibits, *obits; - int nfd; - register_t *retval; + fd_mask **ibits, **obits; + int nfd, *retval; { register struct filedesc *fdp = p->p_fd; register int msk, i, j, fd; @@ -607,15 +669,18 @@ selscan(p, ibits, obits, nfd, retval) static int flag[3] = { FREAD, FWRITE, 0 }; for (msk = 0; msk < 3; msk++) { + if (ibits[msk] == NULL) + continue; for (i = 0; i < nfd; i += NFDBITS) { - bits = ibits[msk].fds_bits[i/NFDBITS]; + bits = ibits[msk][i/NFDBITS]; while ((j = ffs(bits)) && (fd = i + --j) < nfd) { bits &= ~(1 << j); fp = fdp->fd_ofiles[fd]; if (fp == NULL) return (EBADF); if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) { - FD_SET(fd, &obits[msk]); + obits[msk][(fd)/NFDBITS] |= + (1 << ((fd) % NFDBITS)); n++; } } diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c new file mode 100644 index 0000000..5beac60 --- /dev/null +++ b/sys/kern/sys_pipe.c @@ -0,0 +1,1107 @@ +/* + * Copyright (c) 1996 John S. Dyson + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. + * + * $Id: sys_pipe.c,v 1.26 1997/03/23 03:36:24 bde Exp $ + */ + +#ifndef OLD_PIPE + +/* + * This file contains a high-performance replacement for the socket-based + * pipes scheme originally used in FreeBSD/4.4Lite. It does not support + * all features of sockets, but does do everything that pipes normally + * do. + */ + +/* + * This code has two modes of operation, a small write mode and a large + * write mode. The small write mode acts like conventional pipes with + * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the + * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT + * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and + * the receiving process can copy it directly from the pages in the sending + * process. + * + * If the sending process receives a signal, it is possible that it will + * go away, and certainly its address space can change, because control + * is returned back to the user-mode side. In that case, the pipe code + * arranges to copy the buffer supplied by the user process, to a pageable + * kernel buffer, and the receiving process will grab the data from the + * pageable kernel buffer. 
Since signals don't happen all that often, + * the copy operation is normally eliminated. + * + * The constant PIPE_MINDIRECT is chosen to make sure that buffering will + * happen for small transfers so that the system will not spend all of + * its time context switching. PIPE_SIZE is constrained by the + * amount of kernel virtual memory. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/protosw.h> +#include <sys/stat.h> +#include <sys/filedesc.h> +#include <sys/malloc.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/stat.h> +#include <sys/select.h> +#include <sys/signalvar.h> +#include <sys/errno.h> +#include <sys/queue.h> +#include <sys/vmmeter.h> +#include <sys/kernel.h> +#include <sys/sysproto.h> +#include <sys/pipe.h> + +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> + +/* + * Use this define if you want to disable *fancy* VM things. Expect an + * approx 30% decrease in transfer rate. This could be useful for + * NetBSD or OpenBSD. + */ +/* #define PIPE_NODIRECT */ + +/* + * interfaces to the outside world + */ +static int pipe_read __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int pipe_write __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int pipe_close __P((struct file *fp, struct proc *p)); +static int pipe_select __P((struct file *fp, int which, struct proc *p)); +static int pipe_ioctl __P((struct file *fp, int cmd, caddr_t data, struct proc *p)); + +static struct fileops pipeops = + { pipe_read, pipe_write, pipe_ioctl, pipe_select, pipe_close }; + +/* + * Default pipe buffer size(s), this can be kind-of large now because pipe + * space is pageable. 
The pipe code will try to maintain locality of + * reference for performance reasons, so small amounts of outstanding I/O + * will not wipe the cache. + */ +#define MINPIPESIZE (PIPE_SIZE/3) +#define MAXPIPESIZE (2*PIPE_SIZE/3) + +/* + * Maximum amount of kva for pipes -- this is kind-of a soft limit, but + * is there so that on large systems, we don't exhaust it. + */ +#define MAXPIPEKVA (8*1024*1024) + +/* + * Limit for direct transfers, we cannot, of course limit + * the amount of kva for pipes in general though. + */ +#define LIMITPIPEKVA (16*1024*1024) + +/* + * Limit the number of "big" pipes + */ +#define LIMITBIGPIPES 32 +int nbigpipe; + +static int amountpipekva; + +static void pipeclose __P((struct pipe *cpipe)); +static void pipeinit __P((struct pipe *cpipe)); +static __inline int pipelock __P((struct pipe *cpipe, int catch)); +static __inline void pipeunlock __P((struct pipe *cpipe)); +static __inline void pipeselwakeup __P((struct pipe *cpipe)); +#ifndef PIPE_NODIRECT +static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio)); +static void pipe_destroy_write_buffer __P((struct pipe *wpipe)); +static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio)); +static void pipe_clone_write_buffer __P((struct pipe *wpipe)); +#endif +static void pipespace __P((struct pipe *cpipe)); + +/* + * The pipe system call for the DTYPE_PIPE type of pipes + */ + +/* ARGSUSED */ +int +pipe(p, uap, retval) + struct proc *p; + struct pipe_args /* { + int dummy; + } */ *uap; + int retval[]; +{ + register struct filedesc *fdp = p->p_fd; + struct file *rf, *wf; + struct pipe *rpipe, *wpipe; + int fd, error; + + rpipe = malloc( sizeof (*rpipe), M_TEMP, M_WAITOK); + pipeinit(rpipe); + rpipe->pipe_state |= PIPE_DIRECTOK; + wpipe = malloc( sizeof (*wpipe), M_TEMP, M_WAITOK); + pipeinit(wpipe); + wpipe->pipe_state |= PIPE_DIRECTOK; + + error = falloc(p, &rf, &fd); + if (error) + goto free2; + retval[0] = fd; + rf->f_flag = FREAD | FWRITE; + rf->f_type 
= DTYPE_PIPE; + rf->f_ops = &pipeops; + rf->f_data = (caddr_t)rpipe; + error = falloc(p, &wf, &fd); + if (error) + goto free3; + wf->f_flag = FREAD | FWRITE; + wf->f_type = DTYPE_PIPE; + wf->f_ops = &pipeops; + wf->f_data = (caddr_t)wpipe; + retval[1] = fd; + + rpipe->pipe_peer = wpipe; + wpipe->pipe_peer = rpipe; + + return (0); +free3: + ffree(rf); + fdp->fd_ofiles[retval[0]] = 0; +free2: + (void)pipeclose(wpipe); + (void)pipeclose(rpipe); + return (error); +} + +/* + * Allocate kva for pipe circular buffer, the space is pageable + */ +static void +pipespace(cpipe) + struct pipe *cpipe; +{ + int npages, error; + + npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE; + /* + * Create an object, I don't like the idea of paging to/from + * kernel_object. + * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. + */ + cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages); + cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map); + + /* + * Insert the object into the kernel map, and allocate kva for it. + * The map entry is, by default, pageable. + * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. 
+ */ + error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0, + (vm_offset_t *) &cpipe->pipe_buffer.buffer, + cpipe->pipe_buffer.size, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + + if (error != KERN_SUCCESS) + panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error); + amountpipekva += cpipe->pipe_buffer.size; +} + +/* + * initialize and allocate VM and memory for pipe + */ +static void +pipeinit(cpipe) + struct pipe *cpipe; +{ + int s; + + cpipe->pipe_buffer.in = 0; + cpipe->pipe_buffer.out = 0; + cpipe->pipe_buffer.cnt = 0; + cpipe->pipe_buffer.size = PIPE_SIZE; + + /* Buffer kva gets dynamically allocated */ + cpipe->pipe_buffer.buffer = NULL; + /* cpipe->pipe_buffer.object = invalid */ + + cpipe->pipe_state = 0; + cpipe->pipe_peer = NULL; + cpipe->pipe_busy = 0; + gettime(&cpipe->pipe_ctime); + cpipe->pipe_atime = cpipe->pipe_ctime; + cpipe->pipe_mtime = cpipe->pipe_ctime; + bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel); + cpipe->pipe_pgid = NO_PID; + +#ifndef PIPE_NODIRECT + /* + * pipe data structure initializations to support direct pipe I/O + */ + cpipe->pipe_map.cnt = 0; + cpipe->pipe_map.kva = 0; + cpipe->pipe_map.pos = 0; + cpipe->pipe_map.npages = 0; + /* cpipe->pipe_map.ms[] = invalid */ +#endif +} + + +/* + * lock a pipe for I/O, blocking other access + */ +static __inline int +pipelock(cpipe, catch) + struct pipe *cpipe; + int catch; +{ + int error; + while (cpipe->pipe_state & PIPE_LOCK) { + cpipe->pipe_state |= PIPE_LWANT; + if (error = tsleep( cpipe, + catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) { + return error; + } + } + cpipe->pipe_state |= PIPE_LOCK; + return 0; +} + +/* + * unlock a pipe I/O lock + */ +static __inline void +pipeunlock(cpipe) + struct pipe *cpipe; +{ + cpipe->pipe_state &= ~PIPE_LOCK; + if (cpipe->pipe_state & PIPE_LWANT) { + cpipe->pipe_state &= ~PIPE_LWANT; + wakeup(cpipe); + } +} + +static __inline void +pipeselwakeup(cpipe) + struct pipe *cpipe; +{ + struct proc *p; + + if (cpipe->pipe_state & PIPE_SEL) 
{ + cpipe->pipe_state &= ~PIPE_SEL; + selwakeup(&cpipe->pipe_sel); + } + if (cpipe->pipe_state & PIPE_ASYNC) { + if (cpipe->pipe_pgid < 0) + gsignal(-cpipe->pipe_pgid, SIGIO); + else if ((p = pfind(cpipe->pipe_pgid)) != NULL) + psignal(p, SIGIO); + } +} + +/* ARGSUSED */ +static int +pipe_read(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + + struct pipe *rpipe = (struct pipe *) fp->f_data; + int error = 0; + int nread = 0; + u_int size; + + ++rpipe->pipe_busy; + while (uio->uio_resid) { + /* + * normal pipe buffer receive + */ + if (rpipe->pipe_buffer.cnt > 0) { + size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; + if (size > rpipe->pipe_buffer.cnt) + size = rpipe->pipe_buffer.cnt; + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + if ((error = pipelock(rpipe,1)) == 0) { + error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], + size, uio); + pipeunlock(rpipe); + } + if (error) { + break; + } + rpipe->pipe_buffer.out += size; + if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) + rpipe->pipe_buffer.out = 0; + + rpipe->pipe_buffer.cnt -= size; + nread += size; +#ifndef PIPE_NODIRECT + /* + * Direct copy, bypassing a kernel buffer. + */ + } else if ((size = rpipe->pipe_map.cnt) && + (rpipe->pipe_state & PIPE_DIRECTW)) { + caddr_t va; + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + if ((error = pipelock(rpipe,1)) == 0) { + va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos; + error = uiomove(va, size, uio); + pipeunlock(rpipe); + } + if (error) + break; + nread += size; + rpipe->pipe_map.pos += size; + rpipe->pipe_map.cnt -= size; + if (rpipe->pipe_map.cnt == 0) { + rpipe->pipe_state &= ~PIPE_DIRECTW; + wakeup(rpipe); + } +#endif + } else { + /* + * detect EOF condition + */ + if (rpipe->pipe_state & PIPE_EOF) { + /* XXX error = ? */ + break; + } + /* + * If the "write-side" has been blocked, wake it up now. 
+ */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + if (nread > 0) + break; + + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + break; + } + + /* + * If there is no more to read in the pipe, reset + * its pointers to the beginning. This improves + * cache hit stats. + */ + + if ((error = pipelock(rpipe,1)) == 0) { + if (rpipe->pipe_buffer.cnt == 0) { + rpipe->pipe_buffer.in = 0; + rpipe->pipe_buffer.out = 0; + } + pipeunlock(rpipe); + } else { + break; + } + + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + + rpipe->pipe_state |= PIPE_WANTR; + if (error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) { + break; + } + } + } + + if (error == 0) + gettime(&rpipe->pipe_atime); + + --rpipe->pipe_busy; + if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { + rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); + wakeup(rpipe); + } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { + /* + * If there is no more to read in the pipe, reset + * its pointers to the beginning. This improves + * cache hit stats. + */ + if (rpipe->pipe_buffer.cnt == 0) { + if ((error == 0) && (error = pipelock(rpipe,1)) == 0) { + rpipe->pipe_buffer.in = 0; + rpipe->pipe_buffer.out = 0; + pipeunlock(rpipe); + } + } + + /* + * If the "write-side" has been blocked, wake it up now. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + } + + if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) + pipeselwakeup(rpipe); + + return error; +} + +#ifndef PIPE_NODIRECT +/* + * Map the sending processes' buffer into kernel space and wire it. + * This is similar to a physical write operation. 
+ */ +static int +pipe_build_write_buffer(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + u_int size; + int i; + vm_offset_t addr, endaddr, paddr; + + size = (u_int) uio->uio_iov->iov_len; + if (size > wpipe->pipe_buffer.size) + size = wpipe->pipe_buffer.size; + + endaddr = round_page(uio->uio_iov->iov_base + size); + for(i = 0, addr = trunc_page(uio->uio_iov->iov_base); + addr < endaddr; + addr += PAGE_SIZE, i+=1) { + + vm_page_t m; + + vm_fault_quick( (caddr_t) addr, VM_PROT_READ); + paddr = pmap_kextract(addr); + if (!paddr) { + int j; + for(j=0;j<i;j++) + vm_page_unwire(wpipe->pipe_map.ms[j]); + return EFAULT; + } + + m = PHYS_TO_VM_PAGE(paddr); + vm_page_wire(m); + wpipe->pipe_map.ms[i] = m; + } + +/* + * set up the control block + */ + wpipe->pipe_map.npages = i; + wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; + wpipe->pipe_map.cnt = size; + +/* + * and map the buffer + */ + if (wpipe->pipe_map.kva == 0) { + /* + * We need to allocate space for an extra page because the + * address range might (will) span pages at times. 
+ */ + wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map, + wpipe->pipe_buffer.size + PAGE_SIZE); + amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE; + } + pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, + wpipe->pipe_map.npages); + +/* + * and update the uio data + */ + + uio->uio_iov->iov_len -= size; + uio->uio_iov->iov_base += size; + if (uio->uio_iov->iov_len == 0) + uio->uio_iov++; + uio->uio_resid -= size; + uio->uio_offset += size; + return 0; +} + +/* + * unmap and unwire the process buffer + */ +static void +pipe_destroy_write_buffer(wpipe) +struct pipe *wpipe; +{ + int i; + if (wpipe->pipe_map.kva) { + pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); + + if (amountpipekva > MAXPIPEKVA) { + vm_offset_t kva = wpipe->pipe_map.kva; + wpipe->pipe_map.kva = 0; + kmem_free(kernel_map, kva, + wpipe->pipe_buffer.size + PAGE_SIZE); + amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; + } + } + for (i=0;i<wpipe->pipe_map.npages;i++) + vm_page_unwire(wpipe->pipe_map.ms[i]); +} + +/* + * In the case of a signal, the writing process might go away. This + * code copies the data into the circular buffer so that the source + * pages can be freed without loss of data. + */ +static void +pipe_clone_write_buffer(wpipe) +struct pipe *wpipe; +{ + int size; + int pos; + + size = wpipe->pipe_map.cnt; + pos = wpipe->pipe_map.pos; + bcopy((caddr_t) wpipe->pipe_map.kva+pos, + (caddr_t) wpipe->pipe_buffer.buffer, + size); + + wpipe->pipe_buffer.in = size; + wpipe->pipe_buffer.out = 0; + wpipe->pipe_buffer.cnt = size; + wpipe->pipe_state &= ~PIPE_DIRECTW; + + pipe_destroy_write_buffer(wpipe); +} + +/* + * This implements the pipe buffer write mechanism. Note that only + * a direct write OR a normal pipe write can be pending at any given time. + * If there are any characters in the pipe buffer, the direct write will + * be deferred until the receiving process grabs all of the bytes from + * the pipe buffer. Then the direct mapping write is set-up. 
+ */ +static int +pipe_direct_write(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + int error; +retry: + while (wpipe->pipe_state & PIPE_DIRECTW) { + if ( wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + wpipe->pipe_state |= PIPE_WANTW; + error = tsleep(wpipe, + PRIBIO|PCATCH, "pipdww", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + } + wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ + if (wpipe->pipe_buffer.cnt > 0) { + if ( wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + wpipe->pipe_state |= PIPE_WANTW; + error = tsleep(wpipe, + PRIBIO|PCATCH, "pipdwc", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + goto retry; + } + + wpipe->pipe_state |= PIPE_DIRECTW; + + error = pipe_build_write_buffer(wpipe, uio); + if (error) { + wpipe->pipe_state &= ~PIPE_DIRECTW; + goto error1; + } + + error = 0; + while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { + if (wpipe->pipe_state & PIPE_EOF) { + pipelock(wpipe, 0); + pipe_destroy_write_buffer(wpipe); + pipeunlock(wpipe); + pipeselwakeup(wpipe); + error = EPIPE; + goto error1; + } + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + pipeselwakeup(wpipe); + error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0); + } + + pipelock(wpipe,0); + if (wpipe->pipe_state & PIPE_DIRECTW) { + /* + * this bit of trickery substitutes a kernel buffer for + * the process that might be going away. 
+ */ + pipe_clone_write_buffer(wpipe); + } else { + pipe_destroy_write_buffer(wpipe); + } + pipeunlock(wpipe); + return error; + +error1: + wakeup(wpipe); + return error; +} +#endif + +static int +pipe_write(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + int error = 0; + int orig_resid; + + struct pipe *wpipe, *rpipe; + + rpipe = (struct pipe *) fp->f_data; + wpipe = rpipe->pipe_peer; + + /* + * detect loss of pipe read side, issue SIGPIPE if lost. + */ + if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + return EPIPE; + } + + /* + * If it is advantageous to resize the pipe buffer, do + * so. + */ + if ((uio->uio_resid > PIPE_SIZE) && + (nbigpipe < LIMITBIGPIPES) && + (wpipe->pipe_state & PIPE_DIRECTW) == 0 && + (wpipe->pipe_buffer.size <= PIPE_SIZE) && + (wpipe->pipe_buffer.cnt == 0)) { + + if (wpipe->pipe_buffer.buffer) { + amountpipekva -= wpipe->pipe_buffer.size; + kmem_free(kernel_map, + (vm_offset_t)wpipe->pipe_buffer.buffer, + wpipe->pipe_buffer.size); + } + +#ifndef PIPE_NODIRECT + if (wpipe->pipe_map.kva) { + amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; + kmem_free(kernel_map, + wpipe->pipe_map.kva, + wpipe->pipe_buffer.size + PAGE_SIZE); + } +#endif + + wpipe->pipe_buffer.in = 0; + wpipe->pipe_buffer.out = 0; + wpipe->pipe_buffer.cnt = 0; + wpipe->pipe_buffer.size = BIG_PIPE_SIZE; + wpipe->pipe_buffer.buffer = NULL; + ++nbigpipe; + +#ifndef PIPE_NODIRECT + wpipe->pipe_map.cnt = 0; + wpipe->pipe_map.kva = 0; + wpipe->pipe_map.pos = 0; + wpipe->pipe_map.npages = 0; +#endif + + } + + + if( wpipe->pipe_buffer.buffer == NULL) { + if ((error = pipelock(wpipe,1)) == 0) { + pipespace(wpipe); + pipeunlock(wpipe); + } else { + return error; + } + } + + ++wpipe->pipe_busy; + orig_resid = uio->uio_resid; + while (uio->uio_resid) { + int space; +#ifndef PIPE_NODIRECT + /* + * If the transfer is large, we can gain performance if + * we do process-to-process copies directly. 
+ * If the write is non-blocking, we don't use the + * direct write mechanism. + */ + if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && + (fp->f_flag & FNONBLOCK) == 0 && + (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) && + (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) { + error = pipe_direct_write( wpipe, uio); + if (error) { + break; + } + continue; + } +#endif + + /* + * Pipe buffered writes cannot be coincidental with + * direct writes. We wait until the currently executing + * direct write is completed before we start filling the + * pipe buffer. + */ + retrywrite: + while (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + error = tsleep(wpipe, + PRIBIO|PCATCH, "pipbww", 0); + if (error) + break; + } + + space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + + /* Writes of size <= PIPE_BUF must be atomic. */ + /* XXX perhaps they need to be contiguous to be atomic? */ + if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) + space = 0; + + if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) { + /* + * This set the maximum transfer as a segment of + * the buffer. + */ + int size = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; + /* + * space is the size left in the buffer + */ + if (size > space) + size = space; + /* + * now limit it to the size of the uio transfer + */ + if (size > uio->uio_resid) + size = uio->uio_resid; + if ((error = pipelock(wpipe,1)) == 0) { + /* + * It is possible for a direct write to + * slip in on us... handle it here... 
+ */ + if (wpipe->pipe_state & PIPE_DIRECTW) { + pipeunlock(wpipe); + goto retrywrite; + } + error = uiomove( &wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], + size, uio); + pipeunlock(wpipe); + } + if (error) + break; + + wpipe->pipe_buffer.in += size; + if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) + wpipe->pipe_buffer.in = 0; + + wpipe->pipe_buffer.cnt += size; + } else { + /* + * If the "read-side" has been blocked, wake it up now. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + /* + * don't block on non-blocking I/O + */ + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + break; + } + + /* + * We have no more space and have something to offer, + * wake up selects. + */ + pipeselwakeup(wpipe); + + wpipe->pipe_state |= PIPE_WANTW; + if (error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) { + break; + } + /* + * If read side wants to go away, we just issue a signal + * to ourselves. + */ + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + break; + } + } + } + + --wpipe->pipe_busy; + if ((wpipe->pipe_busy == 0) && + (wpipe->pipe_state & PIPE_WANT)) { + wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR); + wakeup(wpipe); + } else if (wpipe->pipe_buffer.cnt > 0) { + /* + * If we have put any characters in the buffer, we wake up + * the reader. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + } + + /* + * Don't return EPIPE if I/O was successful + */ + if ((wpipe->pipe_buffer.cnt == 0) && + (uio->uio_resid == 0) && + (error == EPIPE)) + error = 0; + + if (error == 0) + gettime(&wpipe->pipe_mtime); + + /* + * We have something to offer, + * wake up select. + */ + if (wpipe->pipe_buffer.cnt) + pipeselwakeup(wpipe); + + return error; +} + +/* + * we implement a very minimal set of ioctls for compatibility with sockets. 
+ */ +int +pipe_ioctl(fp, cmd, data, p) + struct file *fp; + int cmd; + register caddr_t data; + struct proc *p; +{ + register struct pipe *mpipe = (struct pipe *)fp->f_data; + + switch (cmd) { + + case FIONBIO: + return (0); + + case FIOASYNC: + if (*(int *)data) { + mpipe->pipe_state |= PIPE_ASYNC; + } else { + mpipe->pipe_state &= ~PIPE_ASYNC; + } + return (0); + + case FIONREAD: + if (mpipe->pipe_state & PIPE_DIRECTW) + *(int *)data = mpipe->pipe_map.cnt; + else + *(int *)data = mpipe->pipe_buffer.cnt; + return (0); + + case TIOCSPGRP: + mpipe->pipe_pgid = *(int *)data; + return (0); + + case TIOCGPGRP: + *(int *)data = mpipe->pipe_pgid; + return (0); + + } + return (ENOTTY); +} + +int +pipe_select(fp, which, p) + struct file *fp; + int which; + struct proc *p; +{ + register struct pipe *rpipe = (struct pipe *)fp->f_data; + struct pipe *wpipe; + + wpipe = rpipe->pipe_peer; + switch (which) { + + case FREAD: + if ( (rpipe->pipe_state & PIPE_DIRECTW) || + (rpipe->pipe_buffer.cnt > 0) || + (rpipe->pipe_state & PIPE_EOF)) { + return (1); + } + selrecord(p, &rpipe->pipe_sel); + rpipe->pipe_state |= PIPE_SEL; + break; + + case FWRITE: + if ((wpipe == NULL) || + (wpipe->pipe_state & PIPE_EOF) || + (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && + (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) { + return (1); + } + selrecord(p, &wpipe->pipe_sel); + wpipe->pipe_state |= PIPE_SEL; + break; + + case 0: + if ((rpipe->pipe_state & PIPE_EOF) || + (wpipe == NULL) || + (wpipe->pipe_state & PIPE_EOF)) { + return (1); + } + + selrecord(p, &rpipe->pipe_sel); + rpipe->pipe_state |= PIPE_SEL; + break; + } + return (0); +} + +int +pipe_stat(pipe, ub) + register struct pipe *pipe; + register struct stat *ub; +{ + bzero((caddr_t)ub, sizeof (*ub)); + ub->st_mode = S_IFIFO; + ub->st_blksize = pipe->pipe_buffer.size; + ub->st_size = pipe->pipe_buffer.cnt; + ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; + TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, 
&ub->st_atimespec); + TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec); + TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec); + /* + * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev, + * st_flags, st_gen. + * XXX (st_dev, st_ino) should be unique. + */ + return 0; +} + +/* ARGSUSED */ +static int +pipe_close(fp, p) + struct file *fp; + struct proc *p; +{ + struct pipe *cpipe = (struct pipe *)fp->f_data; + + pipeclose(cpipe); + fp->f_data = NULL; + return 0; +} + +/* + * shutdown the pipe + */ +static void +pipeclose(cpipe) + struct pipe *cpipe; +{ + struct pipe *ppipe; + if (cpipe) { + + pipeselwakeup(cpipe); + + /* + * If the other side is blocked, wake it up saying that + * we want to close it down. + */ + while (cpipe->pipe_busy) { + wakeup(cpipe); + cpipe->pipe_state |= PIPE_WANT|PIPE_EOF; + tsleep(cpipe, PRIBIO, "pipecl", 0); + } + + /* + * Disconnect from peer + */ + if (ppipe = cpipe->pipe_peer) { + pipeselwakeup(ppipe); + + ppipe->pipe_state |= PIPE_EOF; + wakeup(ppipe); + ppipe->pipe_peer = NULL; + } + + /* + * free resources + */ + if (cpipe->pipe_buffer.buffer) { + if (cpipe->pipe_buffer.size > PIPE_SIZE) + --nbigpipe; + amountpipekva -= cpipe->pipe_buffer.size; + kmem_free(kernel_map, + (vm_offset_t)cpipe->pipe_buffer.buffer, + cpipe->pipe_buffer.size); + } +#ifndef PIPE_NODIRECT + if (cpipe->pipe_map.kva) { + amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE; + kmem_free(kernel_map, + cpipe->pipe_map.kva, + cpipe->pipe_buffer.size + PAGE_SIZE); + } +#endif + free(cpipe, M_TEMP); + } +} +#endif diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c index 4cc40ba..7a538b6 100644 --- a/sys/kern/sys_process.c +++ b/sys/kern/sys_process.c @@ -1,11 +1,6 @@ -/*- - * Copyright (c) 1982, 1986, 1989, 1993 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. 
- * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. +/* + * Copyright (c) 1994, Sean Eric Fagan + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -17,16 +12,14 @@ * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. + * This product includes software developed by Sean Eric Fagan. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -35,40 +28,481 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93 + * $Id$ */ #include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/ptrace.h> #include <sys/errno.h> +#include <sys/queue.h> + +#include <machine/reg.h> +#include <machine/psl.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +#include <sys/user.h> +#include <miscfs/procfs/procfs.h> + +/* use the equivalent procfs code */ +#if 0 +static int +pread (struct proc *procp, unsigned int addr, unsigned int *retval) { + int rv; + vm_map_t map, tmap; + vm_object_t object; + vm_offset_t kva = 0; + int page_offset; /* offset into page */ + vm_offset_t pageno; /* page number */ + vm_map_entry_t out_entry; + vm_prot_t out_prot; + boolean_t wired, single_use; + vm_pindex_t pindex; + + /* Map page into kernel space */ + + map = &procp->p_vmspace->vm_map; + + page_offset = addr - trunc_page(addr); + pageno = trunc_page(addr); + + tmap = map; + rv = vm_map_lookup (&tmap, pageno, VM_PROT_READ, &out_entry, + &object, &pindex, &out_prot, &wired, &single_use); + + if (rv != KERN_SUCCESS) + return EINVAL; + + vm_map_lookup_done (tmap, out_entry); + + /* Find space in kernel_map for the page we're interested in */ + rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), + &kva, PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0); + + if (!rv) { + vm_object_reference (object); + + rv = 
vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0); + if (!rv) { + *retval = 0; + bcopy ((caddr_t)kva + page_offset, + retval, sizeof *retval); + } + vm_map_remove (kernel_map, kva, kva + PAGE_SIZE); + } + + return rv; +} + +static int +pwrite (struct proc *procp, unsigned int addr, unsigned int datum) { + int rv; + vm_map_t map, tmap; + vm_object_t object; + vm_offset_t kva = 0; + int page_offset; /* offset into page */ + vm_offset_t pageno; /* page number */ + vm_map_entry_t out_entry; + vm_prot_t out_prot; + boolean_t wired, single_use; + vm_pindex_t pindex; + boolean_t fix_prot = 0; + + /* Map page into kernel space */ + + map = &procp->p_vmspace->vm_map; + + page_offset = addr - trunc_page(addr); + pageno = trunc_page(addr); + + /* + * Check the permissions for the area we're interested in. + */ + + if (vm_map_check_protection (map, pageno, pageno + PAGE_SIZE, + VM_PROT_WRITE) == FALSE) { + /* + * If the page was not writable, we make it so. + * XXX It is possible a page may *not* be read/executable, + * if a process changes that! + */ + fix_prot = 1; + /* The page isn't writable, so let's try making it so... */ + if ((rv = vm_map_protect (map, pageno, pageno + PAGE_SIZE, + VM_PROT_ALL, 0)) != KERN_SUCCESS) + return EFAULT; /* I guess... */ + } + + /* + * Now we need to get the page. out_entry, out_prot, wired, and + * single_use aren't used. One would think the vm code would be + * a *bit* nicer... We use tmap because vm_map_lookup() can + * change the map argument. + */ + + tmap = map; + rv = vm_map_lookup (&tmap, pageno, VM_PROT_WRITE, &out_entry, + &object, &pindex, &out_prot, &wired, &single_use); + if (rv != KERN_SUCCESS) { + return EINVAL; + } + + /* + * Okay, we've got the page. Let's release tmap. + */ + + vm_map_lookup_done (tmap, out_entry); + + /* + * Fault the page in... 
+ */ + + rv = vm_fault(map, pageno, VM_PROT_WRITE|VM_PROT_READ, FALSE); + if (rv != KERN_SUCCESS) + return EFAULT; + + /* Find space in kernel_map for the page we're interested in */ + rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), + &kva, PAGE_SIZE, 0, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (!rv) { + vm_object_reference (object); + + rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0); + if (!rv) { + bcopy (&datum, (caddr_t)kva + page_offset, sizeof datum); + } + vm_map_remove (kernel_map, kva, kva + PAGE_SIZE); + } + + if (fix_prot) + vm_map_protect (map, pageno, pageno + PAGE_SIZE, + VM_PROT_READ|VM_PROT_EXECUTE, 0); + return rv; +} +#endif /* * Process debugging system call. */ +#ifndef _SYS_SYSPROTO_H_ struct ptrace_args { int req; pid_t pid; caddr_t addr; int data; }; -ptrace(a1, a2, a3) - struct proc *a1; - struct ptrace_args *a2; - int *a3; +#endif + +int +ptrace(curp, uap, retval) + struct proc *curp; + struct ptrace_args *uap; + int *retval; { + struct proc *p; + struct iovec iov; + struct uio uio; + int error = 0; + int write; + int s; + + if (uap->req == PT_TRACE_ME) + p = curp; + else { + if ((p = pfind(uap->pid)) == NULL) + return ESRCH; + } /* - * Body deleted. + * Permissions check */ - return (ENOSYS); -} + switch (uap->req) { + case PT_TRACE_ME: + /* Always legal. 
*/ + break; -trace_req(a1) - struct proc *a1; -{ + case PT_ATTACH: + /* Self */ + if (p->p_pid == curp->p_pid) + return EINVAL; + + /* Already traced */ + if (p->p_flag & P_TRACED) + return EBUSY; + + /* not owned by you, has done setuid (unless you're root) */ + if ((p->p_cred->p_ruid != curp->p_cred->p_ruid) || + (p->p_flag & P_SUGID)) { + if (error = suser(curp->p_ucred, &curp->p_acflag)) + return error; + } + + /* OK */ + break; + + case PT_READ_I: + case PT_READ_D: + case PT_READ_U: + case PT_WRITE_I: + case PT_WRITE_D: + case PT_WRITE_U: + case PT_CONTINUE: + case PT_KILL: + case PT_STEP: + case PT_DETACH: +#ifdef PT_GETREGS + case PT_GETREGS: +#endif +#ifdef PT_SETREGS + case PT_SETREGS: +#endif +#ifdef PT_GETFPREGS + case PT_GETFPREGS: +#endif +#ifdef PT_SETFPREGS + case PT_SETFPREGS: +#endif + /* not being traced... */ + if ((p->p_flag & P_TRACED) == 0) + return EPERM; + + /* not being traced by YOU */ + if (p->p_pptr != curp) + return EBUSY; + + /* not currently stopped */ + if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0) + return EBUSY; + + /* OK */ + break; + + default: + return EINVAL; + } + +#ifdef FIX_SSTEP + /* + * Single step fixup ala procfs + */ + FIX_SSTEP(p); +#endif /* - * Body deleted. 
+ * Actually do the requests */ - return (0); + + write = 0; + *retval = 0; + + switch (uap->req) { + case PT_TRACE_ME: + /* set my trace flag and "owner" so it can read/write me */ + p->p_flag |= P_TRACED; + p->p_oppid = p->p_pptr->p_pid; + return 0; + + case PT_ATTACH: + /* security check done above */ + p->p_flag |= P_TRACED; + p->p_oppid = p->p_pptr->p_pid; + if (p->p_pptr != curp) + proc_reparent(p, curp); + uap->data = SIGSTOP; + goto sendsig; /* in PT_CONTINUE below */ + + case PT_STEP: + case PT_CONTINUE: + case PT_DETACH: + if ((unsigned)uap->data >= NSIG) + return EINVAL; + + PHOLD(p); + + if (uap->req == PT_STEP) { + if ((error = ptrace_single_step (p))) { + PRELE(p); + return error; + } + } + + if (uap->addr != (caddr_t)1) { + fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); + if ((error = ptrace_set_pc (p, (u_int)uap->addr))) { + PRELE(p); + return error; + } + } + PRELE(p); + + if (uap->req == PT_DETACH) { + /* reset process parent */ + if (p->p_oppid != p->p_pptr->p_pid) { + struct proc *pp; + + pp = pfind(p->p_oppid); + proc_reparent(p, pp ? pp : initproc); + } + + p->p_flag &= ~(P_TRACED | P_WAITED); + p->p_oppid = 0; + + /* should we send SIGCHLD? */ + + } + + sendsig: + /* deliver or queue signal */ + s = splhigh(); + if (p->p_stat == SSTOP) { + p->p_xstat = uap->data; + setrunnable(p); + } else if (uap->data) { + psignal(p, uap->data); + } + splx(s); + return 0; + + case PT_WRITE_I: + case PT_WRITE_D: + write = 1; + /* fallthrough */ + case PT_READ_I: + case PT_READ_D: + /* write = 0 set above */ + iov.iov_base = write ? (caddr_t)&uap->data : (caddr_t)retval; + iov.iov_len = sizeof(int); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = (off_t)(u_long)uap->addr; + uio.uio_resid = sizeof(int); + uio.uio_segflg = UIO_SYSSPACE; /* ie: the uap */ + uio.uio_rw = write ? 
UIO_WRITE : UIO_READ; + uio.uio_procp = p; + error = procfs_domem(curp, p, NULL, &uio); + if (uio.uio_resid != 0) { + /* + * XXX procfs_domem() doesn't currently return ENOSPC, + * so I think write() can bogusly return 0. + * XXX what happens for short writes? We don't want + * to write partial data. + * XXX procfs_domem() returns EPERM for other invalid + * addresses. Convert this to EINVAL. Does this + * clobber returns of EPERM for other reasons? + */ + if (error == 0 || error == ENOSPC || error == EPERM) + error = EINVAL; /* EOF */ + } + return (error); + + case PT_READ_U: + if ((u_int)uap->addr > (UPAGES * PAGE_SIZE - sizeof(int))) { + return EFAULT; + } + error = 0; + PHOLD(p); /* user had damn well better be incore! */ + if (p->p_flag & P_INMEM) { + p->p_addr->u_kproc.kp_proc = *p; + fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); + *retval = *(int*)((u_int)p->p_addr + (u_int)uap->addr); + } else { + *retval = 0; + error = EFAULT; + } + PRELE(p); + return error; + + case PT_WRITE_U: + PHOLD(p); /* user had damn well better be incore! */ + if (p->p_flag & P_INMEM) { + p->p_addr->u_kproc.kp_proc = *p; + fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); + error = ptrace_write_u(p, (vm_offset_t)uap->addr, uap->data); + } else { + error = EFAULT; + } + PRELE(p); + return error; + + case PT_KILL: + uap->data = SIGKILL; + goto sendsig; /* in PT_CONTINUE above */ + +#ifdef PT_SETREGS + case PT_SETREGS: + write = 1; + /* fallthrough */ +#endif /* PT_SETREGS */ +#ifdef PT_GETREGS + case PT_GETREGS: + /* write = 0 above */ +#endif /* PT_SETREGS */ +#if defined(PT_SETREGS) || defined(PT_GETREGS) + if (!procfs_validregs(p)) /* no P_SYSTEM procs please */ + return EINVAL; + else { + iov.iov_base = uap->addr; + iov.iov_len = sizeof(struct reg); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_resid = sizeof(struct reg); + uio.uio_segflg = UIO_USERSPACE; + uio.uio_rw = write ? 
UIO_WRITE : UIO_READ; + uio.uio_procp = curp; + return (procfs_doregs(curp, p, NULL, &uio)); + } +#endif /* defined(PT_SETREGS) || defined(PT_GETREGS) */ + +#ifdef PT_SETFPREGS + case PT_SETFPREGS: + write = 1; + /* fallthrough */ +#endif /* PT_SETFPREGS */ +#ifdef PT_GETFPREGS + case PT_GETFPREGS: + /* write = 0 above */ +#endif /* PT_SETFPREGS */ +#if defined(PT_SETFPREGS) || defined(PT_GETFPREGS) + if (!procfs_validfpregs(p)) /* no P_SYSTEM procs please */ + return EINVAL; + else { + iov.iov_base = uap->addr; + iov.iov_len = sizeof(struct fpreg); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_resid = sizeof(struct fpreg); + uio.uio_segflg = UIO_USERSPACE; + uio.uio_rw = write ? UIO_WRITE : UIO_READ; + uio.uio_procp = curp; + return (procfs_dofpregs(curp, p, NULL, &uio)); + } +#endif /* defined(PT_SETFPREGS) || defined(PT_GETFPREGS) */ + + default: + break; + } + + return 0; +} + +int +trace_req(p) + struct proc *p; +{ + return 1; } diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c index abc2dc7..c3e6615 100644 --- a/sys/kern/sys_socket.c +++ b/sys/kern/sys_socket.c @@ -30,28 +30,39 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)sys_socket.c 8.3 (Berkeley) 2/14/95 + * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 + * $Id: sys_socket.c,v 1.11 1997/03/23 03:36:25 bde Exp $ */ #include <sys/param.h> +#include <sys/queue.h> #include <sys/systm.h> #include <sys/proc.h> +#include <sys/fcntl.h> #include <sys/file.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> +#include <sys/stat.h> #include <sys/socketvar.h> -#include <sys/ioctl.h> +#include <sys/filio.h> /* XXX */ +#include <sys/sockio.h> #include <sys/stat.h> #include <net/if.h> #include <net/route.h> +static int soo_read __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int soo_write __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int soo_close __P((struct file *fp, struct proc *p)); + struct fileops socketops = { soo_read, soo_write, soo_ioctl, soo_select, soo_close }; /* ARGSUSED */ -int +static int soo_read(fp, uio, cred) struct file *fp; struct uio *uio; @@ -63,7 +74,7 @@ soo_read(fp, uio, cred) } /* ARGSUSED */ -int +static int soo_write(fp, uio, cred) struct file *fp; struct uio *uio; @@ -77,7 +88,7 @@ soo_write(fp, uio, cred) int soo_ioctl(fp, cmd, data, p) struct file *fp; - u_long cmd; + int cmd; register caddr_t data; struct proc *p; { @@ -129,8 +140,7 @@ soo_ioctl(fp, cmd, data, p) return (ifioctl(so, cmd, data, p)); if (IOCGROUP(cmd) == 'r') return (rtioctl(cmd, data, p)); - return ((*so->so_proto->pr_usrreq)(so, PRU_CONTROL, - (struct mbuf *)cmd, (struct mbuf *)data, (struct mbuf *)0)); + return ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, 0)); } int @@ -183,13 +193,11 @@ soo_stat(so, ub) bzero((caddr_t)ub, sizeof (*ub)); ub->st_mode = S_IFSOCK; - return ((*so->so_proto->pr_usrreq)(so, PRU_SENSE, - (struct mbuf *)ub, (struct mbuf *)0, - (struct mbuf *)0)); + return ((*so->so_proto->pr_usrreqs->pru_sense)(so, ub)); } /* ARGSUSED */ -int +static int soo_close(fp, p) struct file *fp; struct proc *p; diff --git a/sys/kern/syscalls.c 
b/sys/kern/syscalls.c index 91cbdc9..e938376 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -2,7 +2,7 @@ * System call names. * * DO NOT EDIT-- this file is automatically generated. - * created from @(#)syscalls.master 8.6 (Berkeley) 3/30/95 + * created from Id: syscalls.master,v 1.33 1997/02/22 09:39:21 peter Exp */ char *syscallnames[] = { @@ -14,10 +14,10 @@ char *syscallnames[] = { "open", /* 5 = open */ "close", /* 6 = close */ "wait4", /* 7 = wait4 */ - "compat_43_creat", /* 8 = compat_43 creat */ + "old.creat", /* 8 = old creat */ "link", /* 9 = link */ "unlink", /* 10 = unlink */ - "#11 (obsolete execv)", /* 11 = obsolete execv */ + "obs_execv", /* 11 = obsolete execv */ "chdir", /* 12 = chdir */ "fchdir", /* 13 = fchdir */ "mknod", /* 14 = mknod */ @@ -25,7 +25,7 @@ char *syscallnames[] = { "chown", /* 16 = chown */ "break", /* 17 = break */ "getfsstat", /* 18 = getfsstat */ - "compat_43_lseek", /* 19 = compat_43 lseek */ + "old.lseek", /* 19 = old lseek */ "getpid", /* 20 = getpid */ "mount", /* 21 = mount */ "unmount", /* 22 = unmount */ @@ -44,18 +44,14 @@ char *syscallnames[] = { "fchflags", /* 35 = fchflags */ "sync", /* 36 = sync */ "kill", /* 37 = kill */ - "compat_43_stat", /* 38 = compat_43 stat */ + "old.stat", /* 38 = old stat */ "getppid", /* 39 = getppid */ - "compat_43_lstat", /* 40 = compat_43 lstat */ + "old.lstat", /* 40 = old lstat */ "dup", /* 41 = dup */ "pipe", /* 42 = pipe */ "getegid", /* 43 = getegid */ "profil", /* 44 = profil */ -#ifdef KTRACE "ktrace", /* 45 = ktrace */ -#else - "#45 (unimplemented ktrace)", /* 45 = unimplemented ktrace */ -#endif "sigaction", /* 46 = sigaction */ "getgid", /* 47 = getgid */ "sigprocmask", /* 48 = sigprocmask */ @@ -72,83 +68,75 @@ char *syscallnames[] = { "execve", /* 59 = execve */ "umask", /* 60 = umask */ "chroot", /* 61 = chroot */ - "compat_43_fstat", /* 62 = compat_43 fstat */ - "compat_43_getkerninfo", /* 63 = compat_43 getkerninfo */ - "compat_43_getpagesize", /* 64 = 
compat_43 getpagesize */ + "old.fstat", /* 62 = old fstat */ + "old.getkerninfo", /* 63 = old getkerninfo */ + "old.getpagesize", /* 64 = old getpagesize */ "msync", /* 65 = msync */ "vfork", /* 66 = vfork */ - "#67 (obsolete vread)", /* 67 = obsolete vread */ - "#68 (obsolete vwrite)", /* 68 = obsolete vwrite */ + "obs_vread", /* 67 = obsolete vread */ + "obs_vwrite", /* 68 = obsolete vwrite */ "sbrk", /* 69 = sbrk */ "sstk", /* 70 = sstk */ - "compat_43_mmap", /* 71 = compat_43 mmap */ + "old.mmap", /* 71 = old mmap */ "vadvise", /* 72 = vadvise */ "munmap", /* 73 = munmap */ "mprotect", /* 74 = mprotect */ "madvise", /* 75 = madvise */ - "#76 (obsolete vhangup)", /* 76 = obsolete vhangup */ - "#77 (obsolete vlimit)", /* 77 = obsolete vlimit */ + "obs_vhangup", /* 76 = obsolete vhangup */ + "obs_vlimit", /* 77 = obsolete vlimit */ "mincore", /* 78 = mincore */ "getgroups", /* 79 = getgroups */ "setgroups", /* 80 = setgroups */ "getpgrp", /* 81 = getpgrp */ "setpgid", /* 82 = setpgid */ "setitimer", /* 83 = setitimer */ - "compat_43_wait", /* 84 = compat_43 wait */ + "old.wait", /* 84 = old wait */ "swapon", /* 85 = swapon */ "getitimer", /* 86 = getitimer */ - "compat_43_gethostname", /* 87 = compat_43 gethostname */ - "compat_43_sethostname", /* 88 = compat_43 sethostname */ + "old.gethostname", /* 87 = old gethostname */ + "old.sethostname", /* 88 = old sethostname */ "getdtablesize", /* 89 = getdtablesize */ "dup2", /* 90 = dup2 */ - "#91 (unimplemented getdopt)", /* 91 = unimplemented getdopt */ + "#91", /* 91 = getdopt */ "fcntl", /* 92 = fcntl */ "select", /* 93 = select */ - "#94 (unimplemented setdopt)", /* 94 = unimplemented setdopt */ + "#94", /* 94 = setdopt */ "fsync", /* 95 = fsync */ "setpriority", /* 96 = setpriority */ "socket", /* 97 = socket */ "connect", /* 98 = connect */ - "compat_43_accept", /* 99 = compat_43 accept */ + "old.accept", /* 99 = old accept */ "getpriority", /* 100 = getpriority */ - "compat_43_send", /* 101 = compat_43 send */ 
- "compat_43_recv", /* 102 = compat_43 recv */ + "old.send", /* 101 = old send */ + "old.recv", /* 102 = old recv */ "sigreturn", /* 103 = sigreturn */ "bind", /* 104 = bind */ "setsockopt", /* 105 = setsockopt */ "listen", /* 106 = listen */ - "#107 (obsolete vtimes)", /* 107 = obsolete vtimes */ - "compat_43_sigvec", /* 108 = compat_43 sigvec */ - "compat_43_sigblock", /* 109 = compat_43 sigblock */ - "compat_43_sigsetmask", /* 110 = compat_43 sigsetmask */ + "obs_vtimes", /* 107 = obsolete vtimes */ + "old.sigvec", /* 108 = old sigvec */ + "old.sigblock", /* 109 = old sigblock */ + "old.sigsetmask", /* 110 = old sigsetmask */ "sigsuspend", /* 111 = sigsuspend */ - "compat_43_sigstack", /* 112 = compat_43 sigstack */ - "compat_43_recvmsg", /* 113 = compat_43 recvmsg */ - "compat_43_sendmsg", /* 114 = compat_43 sendmsg */ -#ifdef TRACE - "vtrace", /* 115 = vtrace */ -#else - "#115 (obsolete vtrace)", /* 115 = obsolete vtrace */ -#endif + "old.sigstack", /* 112 = old sigstack */ + "old.recvmsg", /* 113 = old recvmsg */ + "old.sendmsg", /* 114 = old sendmsg */ + "obs_vtrace", /* 115 = obsolete vtrace */ "gettimeofday", /* 116 = gettimeofday */ "getrusage", /* 117 = getrusage */ "getsockopt", /* 118 = getsockopt */ -#ifdef vax - "resuba", /* 119 = resuba */ -#else - "#119 (unimplemented resuba)", /* 119 = unimplemented resuba */ -#endif + "#119", /* 119 = resuba */ "readv", /* 120 = readv */ "writev", /* 121 = writev */ "settimeofday", /* 122 = settimeofday */ "fchown", /* 123 = fchown */ "fchmod", /* 124 = fchmod */ - "compat_43_recvfrom", /* 125 = compat_43 recvfrom */ - "compat_43_setreuid", /* 126 = compat_43 setreuid */ - "compat_43_setregid", /* 127 = compat_43 setregid */ + "old.recvfrom", /* 125 = old recvfrom */ + "setreuid", /* 126 = setreuid */ + "setregid", /* 127 = setregid */ "rename", /* 128 = rename */ - "compat_43_truncate", /* 129 = compat_43 truncate */ - "compat_43_ftruncate", /* 130 = compat_43 ftruncate */ + "old.truncate", /* 129 = old truncate 
*/ + "old.ftruncate", /* 130 = old ftruncate */ "flock", /* 131 = flock */ "mkfifo", /* 132 = mkfifo */ "sendto", /* 133 = sendto */ @@ -157,60 +145,56 @@ char *syscallnames[] = { "mkdir", /* 136 = mkdir */ "rmdir", /* 137 = rmdir */ "utimes", /* 138 = utimes */ - "#139 (obsolete 4.2 sigreturn)", /* 139 = obsolete 4.2 sigreturn */ + "obs_4.2", /* 139 = obsolete 4.2 sigreturn */ "adjtime", /* 140 = adjtime */ - "compat_43_getpeername", /* 141 = compat_43 getpeername */ - "compat_43_gethostid", /* 142 = compat_43 gethostid */ - "compat_43_sethostid", /* 143 = compat_43 sethostid */ - "compat_43_getrlimit", /* 144 = compat_43 getrlimit */ - "compat_43_setrlimit", /* 145 = compat_43 setrlimit */ - "compat_43_killpg", /* 146 = compat_43 killpg */ + "old.getpeername", /* 141 = old getpeername */ + "old.gethostid", /* 142 = old gethostid */ + "old.sethostid", /* 143 = old sethostid */ + "old.getrlimit", /* 144 = old getrlimit */ + "old.setrlimit", /* 145 = old setrlimit */ + "old.killpg", /* 146 = old killpg */ "setsid", /* 147 = setsid */ "quotactl", /* 148 = quotactl */ - "compat_43_quota", /* 149 = compat_43 quota */ - "compat_43_getsockname", /* 150 = compat_43 getsockname */ - "#151 (unimplemented)", /* 151 = unimplemented */ - "#152 (unimplemented)", /* 152 = unimplemented */ - "#153 (unimplemented)", /* 153 = unimplemented */ - "#154 (unimplemented)", /* 154 = unimplemented */ + "old.quota", /* 149 = old quota */ + "old.getsockname", /* 150 = old getsockname */ + "#151", /* 151 = sem_lock */ + "#152", /* 152 = sem_wakeup */ + "#153", /* 153 = asyncdaemon */ + "#154", /* 154 = nosys */ #ifdef NFS "nfssvc", /* 155 = nfssvc */ #else - "#155 (unimplemented nfssvc)", /* 155 = unimplemented nfssvc */ + "#155", /* 155 = nosys */ #endif - "compat_43_getdirentries", /* 156 = compat_43 getdirentries */ + "old.getdirentries", /* 156 = old getdirentries */ "statfs", /* 157 = statfs */ "fstatfs", /* 158 = fstatfs */ - "#159 (unimplemented)", /* 159 = unimplemented */ - "#160 
(unimplemented)", /* 160 = unimplemented */ -#ifdef NFS + "#159", /* 159 = nosys */ + "#160", /* 160 = nosys */ +#if defined(NFS) && !defined (NFS_NOSERVER) "getfh", /* 161 = getfh */ #else - "#161 (unimplemented getfh)", /* 161 = unimplemented getfh */ -#endif - "#162 (unimplemented getdomainname)", /* 162 = unimplemented getdomainname */ - "#163 (unimplemented setdomainname)", /* 163 = unimplemented setdomainname */ - "#164 (unimplemented)", /* 164 = unimplemented */ - "#165 (unimplemented)", /* 165 = unimplemented */ - "#166 (unimplemented)", /* 166 = unimplemented */ - "#167 (unimplemented)", /* 167 = unimplemented */ - "#168 (unimplemented)", /* 168 = unimplemented */ - "#169 (unimplemented semsys)", /* 169 = unimplemented semsys */ - "#170 (unimplemented msgsys)", /* 170 = unimplemented msgsys */ -#if defined(SYSVSHM) && !defined(alpha) - "compat_43_shmsys", /* 171 = compat_43 shmsys */ -#else - "#171 (unimplemented shmsys)", /* 171 = unimplemented shmsys */ + "#161", /* 161 = nosys */ #endif - "#172 (unimplemented)", /* 172 = unimplemented */ - "#173 (unimplemented)", /* 173 = unimplemented */ - "#174 (unimplemented)", /* 174 = unimplemented */ - "#175 (unimplemented)", /* 175 = unimplemented */ - "#176 (unimplemented)", /* 176 = unimplemented */ - "#177 (unimplemented)", /* 177 = unimplemented */ - "#178 (unimplemented)", /* 178 = unimplemented */ - "#179 (unimplemented)", /* 179 = unimplemented */ - "#180 (unimplemented)", /* 180 = unimplemented */ + "getdomainname", /* 162 = getdomainname */ + "setdomainname", /* 163 = setdomainname */ + "uname", /* 164 = uname */ + "sysarch", /* 165 = sysarch */ + "rtprio", /* 166 = rtprio */ + "#167", /* 167 = nosys */ + "#168", /* 168 = nosys */ + "semsys", /* 169 = semsys */ + "msgsys", /* 170 = msgsys */ + "shmsys", /* 171 = shmsys */ + "#172", /* 172 = nosys */ + "#173", /* 173 = nosys */ + "#174", /* 174 = nosys */ + "#175", /* 175 = nosys */ + "ntp_adjtime", /* 176 = ntp_adjtime */ + "#177", /* 177 = sfork */ + 
"#178", /* 178 = getdescriptor */ + "#179", /* 179 = setdescriptor */ + "#180", /* 180 = nosys */ "setgid", /* 181 = setgid */ "setegid", /* 182 = setegid */ "seteuid", /* 183 = seteuid */ @@ -220,17 +204,17 @@ char *syscallnames[] = { "lfs_segclean", /* 186 = lfs_segclean */ "lfs_segwait", /* 187 = lfs_segwait */ #else - "#184 (unimplemented lfs_bmapv)", /* 184 = unimplemented lfs_bmapv */ - "#185 (unimplemented lfs_markv)", /* 185 = unimplemented lfs_markv */ - "#186 (unimplemented lfs_segclean)", /* 186 = unimplemented lfs_segclean */ - "#187 (unimplemented lfs_segwait)", /* 187 = unimplemented lfs_segwait */ + "#184", /* 184 = nosys */ + "#185", /* 185 = nosys */ + "#186", /* 186 = nosys */ + "#187", /* 187 = nosys */ #endif "stat", /* 188 = stat */ "fstat", /* 189 = fstat */ "lstat", /* 190 = lstat */ "pathconf", /* 191 = pathconf */ "fpathconf", /* 192 = fpathconf */ - "#193 (unimplemented)", /* 193 = unimplemented */ + "#193", /* 193 = nosys */ "getrlimit", /* 194 = getrlimit */ "setrlimit", /* 195 = setrlimit */ "getdirentries", /* 196 = getdirentries */ @@ -242,38 +226,51 @@ char *syscallnames[] = { "__sysctl", /* 202 = __sysctl */ "mlock", /* 203 = mlock */ "munlock", /* 204 = munlock */ - "undelete", /* 205 = undelete */ - "#206 (unimplemented)", /* 206 = unimplemented */ - "#207 (unimplemented)", /* 207 = unimplemented */ - "#208 (unimplemented)", /* 208 = unimplemented */ - "#209 (unimplemented)", /* 209 = unimplemented */ - "#210 (unimplemented)", /* 210 = unimplemented */ - "#211 (unimplemented)", /* 211 = unimplemented */ - "#212 (unimplemented)", /* 212 = unimplemented */ - "#213 (unimplemented)", /* 213 = unimplemented */ - "#214 (unimplemented)", /* 214 = unimplemented */ - "#215 (unimplemented)", /* 215 = unimplemented */ - "#216 (unimplemented)", /* 216 = unimplemented */ - "#217 (unimplemented)", /* 217 = unimplemented */ - "#218 (unimplemented)", /* 218 = unimplemented */ - "#219 (unimplemented)", /* 219 = unimplemented */ - "#220 
(unimplemented semctl)", /* 220 = unimplemented semctl */ - "#221 (unimplemented semget)", /* 221 = unimplemented semget */ - "#222 (unimplemented semop)", /* 222 = unimplemented semop */ - "#223 (unimplemented semconfig)", /* 223 = unimplemented semconfig */ - "#224 (unimplemented msgctl)", /* 224 = unimplemented msgctl */ - "#225 (unimplemented msgget)", /* 225 = unimplemented msgget */ - "#226 (unimplemented msgsnd)", /* 226 = unimplemented msgsnd */ - "#227 (unimplemented msgrcv)", /* 227 = unimplemented msgrcv */ -#if defined(SYSVSHM) && 0 + "utrace", /* 205 = utrace */ + "undelete", /* 206 = undelete */ + "#207", /* 207 = nosys */ + "#208", /* 208 = nosys */ + "#209", /* 209 = nosys */ + "lkmnosys", /* 210 = lkmnosys */ + "lkmnosys", /* 211 = lkmnosys */ + "lkmnosys", /* 212 = lkmnosys */ + "lkmnosys", /* 213 = lkmnosys */ + "lkmnosys", /* 214 = lkmnosys */ + "lkmnosys", /* 215 = lkmnosys */ + "lkmnosys", /* 216 = lkmnosys */ + "lkmnosys", /* 217 = lkmnosys */ + "lkmnosys", /* 218 = lkmnosys */ + "lkmnosys", /* 219 = lkmnosys */ + "__semctl", /* 220 = __semctl */ + "semget", /* 221 = semget */ + "semop", /* 222 = semop */ + "semconfig", /* 223 = semconfig */ + "msgctl", /* 224 = msgctl */ + "msgget", /* 225 = msgget */ + "msgsnd", /* 226 = msgsnd */ + "msgrcv", /* 227 = msgrcv */ "shmat", /* 228 = shmat */ "shmctl", /* 229 = shmctl */ "shmdt", /* 230 = shmdt */ "shmget", /* 231 = shmget */ -#else - "#228 (unimplemented shmat)", /* 228 = unimplemented shmat */ - "#229 (unimplemented shmctl)", /* 229 = unimplemented shmctl */ - "#230 (unimplemented shmdt)", /* 230 = unimplemented shmdt */ - "#231 (unimplemented shmget)", /* 231 = unimplemented shmget */ -#endif + "#232", /* 232 = nosys */ + "#233", /* 233 = nosys */ + "#234", /* 234 = nosys */ + "#235", /* 235 = nosys */ + "#236", /* 236 = nosys */ + "#237", /* 237 = nosys */ + "#238", /* 238 = nosys */ + "#239", /* 239 = nosys */ + "#240", /* 240 = nosys */ + "#241", /* 241 = nosys */ + "#242", /* 242 = nosys 
*/ + "#243", /* 243 = nosys */ + "#244", /* 244 = nosys */ + "#245", /* 245 = nosys */ + "#246", /* 246 = nosys */ + "#247", /* 247 = nosys */ + "#248", /* 248 = nosys */ + "#249", /* 249 = nosys */ + "minherit", /* 250 = minherit */ + "rfork", /* 251 = rfork */ }; diff --git a/sys/kern/syscalls.conf b/sys/kern/syscalls.conf deleted file mode 100644 index 71b82ce..0000000 --- a/sys/kern/syscalls.conf +++ /dev/null @@ -1,12 +0,0 @@ -# @(#)syscalls.conf 8.1 (Berkeley) 2/14/95 - -sysnames="syscalls.c" -sysnumhdr="../sys/syscall.h" -syssw="init_sysent.c" -sysarghdr="../sys/syscallargs.h" -compatopts="compat_43" -libcompatopts="" - -switchname="sysent" -namesname="syscallnames" -constprefix="SYS_" diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index b57cd73..b0921d4 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -1,38 +1,32 @@ - @(#)syscalls.master 8.6 (Berkeley) 3/30/95 -; System call name/number "master" file. -; (See syscalls.conf to see what it is processed into.) + $Id$ +; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94 ; -; Fields: number type [type-dependent ...] +; System call name/number master file. +; Processed to created init_sysent.c, syscalls.c and syscall.h. + +; Columns: number type nargs namespc name alt{name,tag,rtyp}/comments ; number system call number, must be in order -; type one of STD, OBSOL, UNIMPL, NODEF, NOARGS, or one of -; the compatibility options defined in syscalls.conf. 
-; +; type one of STD, OBSOL, UNIMPL, COMPAT +; namespc one of POSIX, BSD, NOHIDE +; name psuedo-prototype of syscall routine +; If one of the following alts is different, then all appear: +; altname name of system call if different +; alttag name of args struct tag if different from [o]`name'"_args" +; altrtyp return type if not int (bogus - syscalls always return int) +; for UNIMPL/OBSOL, name continues with comments + ; types: ; STD always included -; OBSOL obsolete, not included in system -; UNIMPL unimplemented, not included in system -; NODEF included, but don't define the syscall number -; NOARGS included, but don't define the syscall args structure -; -; The compat options are defined in the syscalls.conf file, and the -; compat option name is prefixed to the syscall name. Other than -; that, they're like NODEF (for 'compat' options), or STD (for -; 'libcompat' options). -; -; The type-dependent arguments are as follows: -; For STD, NODEF, NOARGS, and compat syscalls: -; { pseudo-proto } [alias] -; For other syscalls: -; [comment] -; +; COMPAT included on COMPAT #ifdef +; LIBCOMPAT included on COMPAT #ifdef, and placed in syscall.h +; OBSOL obsolete, not included in system, only specifies name +; UNIMPL not implemented, placeholder only + ; #ifdef's, etc. may be included, and are copied to the output files. -; #include's are copied to the syscall switch definition file only. #include <sys/param.h> -#include <sys/systm.h> -#include <sys/signal.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> ; Reserved/unimplemented system calls in the range 0-150 inclusive ; are reserved for use in future Berkeley releases. @@ -40,316 +34,359 @@ ; redistributions should be placed in the reserved range at the end ; of the current calls. 
-0 STD { int nosys(void); } syscall -1 STD { int exit(int rval); } -2 STD { int fork(void); } -3 STD { int read(int fd, char *buf, u_int nbyte); } -4 STD { int write(int fd, char *buf, u_int nbyte); } -5 STD { int open(char *path, int flags, int mode); } -6 STD { int close(int fd); } -7 STD { int wait4(int pid, int *status, int options, \ - struct rusage *rusage); } -8 COMPAT_43 { int creat(char *path, int mode); } -9 STD { int link(char *path, char *link); } -10 STD { int unlink(char *path); } -11 OBSOL execv -12 STD { int chdir(char *path); } -13 STD { int fchdir(int fd); } -14 STD { int mknod(char *path, int mode, int dev); } -15 STD { int chmod(char *path, int mode); } -16 STD { int chown(char *path, int uid, int gid); } -17 STD { int obreak(char *nsize); } break -18 STD { int getfsstat(struct statfs *buf, long bufsize, \ +0 STD NOHIDE { int nosys(void); } syscall nosys_args int +1 STD NOHIDE { void exit(int rval); } exit rexit_args void +2 STD POSIX { int fork(void); } +3 STD POSIX { int read(int fd, char *buf, u_int nbyte); } +4 STD POSIX { int write(int fd, char *buf, u_int nbyte); } +5 STD POSIX { int open(char *path, int flags, int mode); } +; XXX should be { int open(const char *path, int flags, ...); } +; but we're not ready for `const' or varargs. +; XXX man page says `mode_t mode'. 
+6 STD POSIX { int close(int fd); } +7 STD BSD { int wait4(int pid, int *status, int options, \ + struct rusage *rusage); } wait4 wait_args int +8 COMPAT BSD { int creat(char *path, int mode); } +9 STD POSIX { int link(char *path, char *link); } +10 STD POSIX { int unlink(char *path); } +11 OBSOL NOHIDE execv +12 STD POSIX { int chdir(char *path); } +13 STD BSD { int fchdir(int fd); } +14 STD POSIX { int mknod(char *path, int mode, int dev); } +15 STD POSIX { int chmod(char *path, int mode); } +16 STD POSIX { int chown(char *path, int uid, int gid); } +17 STD BSD { int obreak(char *nsize); } break obreak_args int +18 STD BSD { int getfsstat(struct statfs *buf, long bufsize, \ int flags); } -19 COMPAT_43 { long lseek(int fd, long offset, int whence); } -20 STD { pid_t getpid(void); } -21 STD { int mount(char *type, char *path, int flags, \ +19 COMPAT POSIX { long lseek(int fd, long offset, int whence); } +20 STD POSIX { pid_t getpid(void); } +21 STD BSD { int mount(char *type, char *path, int flags, \ caddr_t data); } -22 STD { int unmount(char *path, int flags); } -23 STD { int setuid(uid_t uid); } -24 STD { uid_t getuid(void); } -25 STD { uid_t geteuid(void); } -26 STD { int ptrace(int req, pid_t pid, caddr_t addr, \ +; XXX 4.4lite2 uses `char *type' but we're not ready for that. +; XXX `path' should have type `const char *' but we're not ready for that. 
+22 STD BSD { int unmount(char *path, int flags); } +23 STD POSIX { int setuid(uid_t uid); } +24 STD POSIX { uid_t getuid(void); } +25 STD POSIX { uid_t geteuid(void); } +26 STD BSD { int ptrace(int req, pid_t pid, caddr_t addr, \ int data); } -27 STD { int recvmsg(int s, struct msghdr *msg, int flags); } -28 STD { int sendmsg(int s, caddr_t msg, int flags); } -29 STD { int recvfrom(int s, caddr_t buf, size_t len, \ +27 STD BSD { int recvmsg(int s, struct msghdr *msg, int flags); } +28 STD BSD { int sendmsg(int s, caddr_t msg, int flags); } +29 STD BSD { int recvfrom(int s, caddr_t buf, size_t len, \ int flags, caddr_t from, int *fromlenaddr); } -30 STD { int accept(int s, caddr_t name, int *anamelen); } -31 STD { int getpeername(int fdes, caddr_t asa, int *alen); } -32 STD { int getsockname(int fdes, caddr_t asa, int *alen); } -33 STD { int access(char *path, int flags); } -34 STD { int chflags(char *path, int flags); } -35 STD { int fchflags(int fd, int flags); } -36 STD { int sync(void); } -37 STD { int kill(int pid, int signum); } -38 COMPAT_43 { int stat(char *path, struct ostat *ub); } -39 STD { pid_t getppid(void); } -40 COMPAT_43 { int lstat(char *path, struct ostat *ub); } -41 STD { int dup(u_int fd); } -42 STD { int pipe(void); } -43 STD { gid_t getegid(void); } -44 STD { int profil(caddr_t samples, u_int size, \ +30 STD BSD { int accept(int s, caddr_t name, int *anamelen); } +31 STD BSD { int getpeername(int fdes, caddr_t asa, int *alen); } +32 STD BSD { int getsockname(int fdes, caddr_t asa, int *alen); } +33 STD POSIX { int access(char *path, int flags); } +34 STD BSD { int chflags(char *path, int flags); } +35 STD BSD { int fchflags(int fd, int flags); } +36 STD BSD { int sync(void); } +37 STD POSIX { int kill(int pid, int signum); } +38 COMPAT POSIX { int stat(char *path, struct ostat *ub); } +39 STD POSIX { pid_t getppid(void); } +40 COMPAT POSIX { int lstat(char *path, struct ostat *ub); } +41 STD POSIX { int dup(u_int fd); } +42 STD POSIX { int 
pipe(void); } +43 STD POSIX { gid_t getegid(void); } +44 STD BSD { int profil(caddr_t samples, u_int size, \ u_int offset, u_int scale); } -#ifdef KTRACE -45 STD { int ktrace(char *fname, int ops, int facs, \ +45 STD BSD { int ktrace(char *fname, int ops, int facs, \ int pid); } -#else -45 UNIMPL ktrace -#endif -46 STD { int sigaction(int signum, struct sigaction *nsa, \ +46 STD POSIX { int sigaction(int signum, struct sigaction *nsa, \ struct sigaction *osa); } -47 STD { gid_t getgid(void); } -48 STD { int sigprocmask(int how, sigset_t mask); } -49 STD { int getlogin(char *namebuf, u_int namelen); } -50 STD { int setlogin(char *namebuf); } -51 STD { int acct(char *path); } -52 STD { int sigpending(void); } -53 STD { int sigaltstack(struct sigaltstack *nss, \ +47 STD POSIX { gid_t getgid(void); } +48 STD POSIX { int sigprocmask(int how, sigset_t mask); } +; XXX note nonstandard (bogus) calling convention - the libc stub passes +; us the mask, not a pointer to it, and we return the old mask as the +; (int) return value. 
+49 STD BSD { int getlogin(char *namebuf, u_int namelen); } +50 STD BSD { int setlogin(char *namebuf); } +51 STD BSD { int acct(char *path); } +52 STD POSIX { int sigpending(void); } +53 STD BSD { int sigaltstack(struct sigaltstack *nss, \ struct sigaltstack *oss); } -54 STD { int ioctl(int fd, u_long com, caddr_t data); } -55 STD { int reboot(int opt); } -56 STD { int revoke(char *path); } -57 STD { int symlink(char *path, char *link); } -58 STD { int readlink(char *path, char *buf, int count); } -59 STD { int execve(char *path, char **argp, char **envp); } -60 STD { int umask(int newmask); } -61 STD { int chroot(char *path); } -62 COMPAT_43 { int fstat(int fd, struct ostat *sb); } -63 COMPAT_43 { int getkerninfo(int op, char *where, int *size, \ - int arg); } -64 COMPAT_43 { int getpagesize(void); } -65 STD { int msync(caddr_t addr, int len); } -66 STD { int vfork(void); } -67 OBSOL vread -68 OBSOL vwrite -69 STD { int sbrk(int incr); } -70 STD { int sstk(int incr); } -71 COMPAT_43 { int mmap(caddr_t addr, int len, int prot, \ +54 STD POSIX { int ioctl(int fd, u_long com, caddr_t data); } +55 STD BSD { int reboot(int opt); } +56 STD POSIX { int revoke(char *path); } +57 STD POSIX { int symlink(char *path, char *link); } +58 STD POSIX { int readlink(char *path, char *buf, int count); } +59 STD POSIX { int execve(char *fname, char **argv, char **envv); } +60 STD POSIX { int umask(int newmask); } umask umask_args int +61 STD BSD { int chroot(char *path); } +62 COMPAT POSIX { int fstat(int fd, struct ostat *sb); } +63 COMPAT BSD { int getkerninfo(int op, char *where, int *size, \ + int arg); } getkerninfo getkerninfo_args int +64 COMPAT BSD { int getpagesize(void); } \ + getpagesize getpagesize_args int +65 STD BSD { int msync(caddr_t addr, size_t len, int flags); } +66 STD BSD { int vfork(void); } +67 OBSOL NOHIDE vread +68 OBSOL NOHIDE vwrite +69 STD BSD { int sbrk(int incr); } +70 STD BSD { int sstk(int incr); } +71 COMPAT BSD { int mmap(caddr_t addr, int len, int 
prot, \ int flags, int fd, long pos); } -72 STD { int ovadvise(int anom); } vadvise -73 STD { int munmap(caddr_t addr, int len); } -74 STD { int mprotect(caddr_t addr, int len, int prot); } -75 STD { int madvise(caddr_t addr, int len, int behav); } -76 OBSOL vhangup -77 OBSOL vlimit -78 STD { int mincore(caddr_t addr, int len, char *vec); } -79 STD { int getgroups(u_int gidsetsize, gid_t *gidset); } -80 STD { int setgroups(u_int gidsetsize, gid_t *gidset); } -81 STD { int getpgrp(void); } -82 STD { int setpgid(int pid, int pgid); } -83 STD { int setitimer(u_int which, struct itimerval *itv, \ +72 STD BSD { int ovadvise(int anom); } vadvise ovadvise_args int +73 STD BSD { int munmap(caddr_t addr, size_t len); } +74 STD BSD { int mprotect(caddr_t addr, size_t len, int prot); } +75 STD BSD { int madvise(caddr_t addr, size_t len, int behav); } +76 OBSOL NOHIDE vhangup +77 OBSOL NOHIDE vlimit +78 STD BSD { int mincore(caddr_t addr, size_t len, char *vec); } +79 STD POSIX { int getgroups(u_int gidsetsize, gid_t *gidset); } +80 STD POSIX { int setgroups(u_int gidsetsize, gid_t *gidset); } +81 STD POSIX { int getpgrp(void); } +82 STD POSIX { int setpgid(int pid, int pgid); } +83 STD BSD { int setitimer(u_int which, struct itimerval *itv, \ struct itimerval *oitv); } -84 COMPAT_43 { int wait(void); } -85 STD { int swapon(char *name); } -86 STD { int getitimer(u_int which, struct itimerval *itv); } -87 COMPAT_43 { int gethostname(char *hostname, u_int len); } -88 COMPAT_43 { int sethostname(char *hostname, u_int len); } -89 STD { int getdtablesize(void); } -90 STD { int dup2(u_int from, u_int to); } -91 UNIMPL getdopt -92 STD { int fcntl(int fd, int cmd, void *arg); } -93 STD { int select(u_int nd, fd_set *in, fd_set *ou, \ +84 COMPAT BSD { int wait(void); } +85 STD BSD { int swapon(char *name); } +86 STD BSD { int getitimer(u_int which, struct itimerval *itv); } +87 COMPAT BSD { int gethostname(char *hostname, u_int len); } \ + gethostname gethostname_args int +88 COMPAT 
BSD { int sethostname(char *hostname, u_int len); } \ + sethostname sethostname_args int +89 STD BSD { int getdtablesize(void); } +90 STD POSIX { int dup2(u_int from, u_int to); } +91 UNIMPL BSD getdopt +92 STD POSIX { int fcntl(int fd, int cmd, int arg); } +; XXX should be { int fcntl(int fd, int cmd, ...); } +; but we're not ready for varargs. +; XXX man page says `int arg' too. +93 STD BSD { int select(int nd, fd_set *in, fd_set *ou, \ fd_set *ex, struct timeval *tv); } -94 UNIMPL setdopt -95 STD { int fsync(int fd); } -96 STD { int setpriority(int which, int who, int prio); } -97 STD { int socket(int domain, int type, int protocol); } -98 STD { int connect(int s, caddr_t name, int namelen); } -99 COMPAT_43 { int accept(int s, caddr_t name, int *anamelen); } -100 STD { int getpriority(int which, int who); } -101 COMPAT_43 { int send(int s, caddr_t buf, int len, int flags); } -102 COMPAT_43 { int recv(int s, caddr_t buf, int len, int flags); } -103 STD { int sigreturn(struct sigcontext *sigcntxp); } -104 STD { int bind(int s, caddr_t name, int namelen); } -105 STD { int setsockopt(int s, int level, int name, \ +94 UNIMPL BSD setdopt +95 STD POSIX { int fsync(int fd); } +96 STD BSD { int setpriority(int which, int who, int prio); } +97 STD BSD { int socket(int domain, int type, int protocol); } +98 STD BSD { int connect(int s, caddr_t name, int namelen); } +99 CPT_NOA BSD { int accept(int s, caddr_t name, int *anamelen); } \ + accept accept_args int +100 STD BSD { int getpriority(int which, int who); } +101 COMPAT BSD { int send(int s, caddr_t buf, int len, int flags); } +102 COMPAT BSD { int recv(int s, caddr_t buf, int len, int flags); } +103 STD BSD { int sigreturn(struct sigcontext *sigcntxp); } +104 STD BSD { int bind(int s, caddr_t name, int namelen); } +105 STD BSD { int setsockopt(int s, int level, int name, \ caddr_t val, int valsize); } -106 STD { int listen(int s, int backlog); } -107 OBSOL vtimes -108 COMPAT_43 { int sigvec(int signum, struct sigvec 
*nsv, \ +106 STD BSD { int listen(int s, int backlog); } +107 OBSOL NOHIDE vtimes +108 COMPAT BSD { int sigvec(int signum, struct sigvec *nsv, \ struct sigvec *osv); } -109 COMPAT_43 { int sigblock(int mask); } -110 COMPAT_43 { int sigsetmask(int mask); } -111 STD { int sigsuspend(int mask); } -112 COMPAT_43 { int sigstack(struct sigstack *nss, \ +109 COMPAT BSD { int sigblock(int mask); } +110 COMPAT BSD { int sigsetmask(int mask); } +111 STD POSIX { int sigsuspend(sigset_t mask); } +; XXX note nonstandard (bogus) calling convention - the libc stub passes +; us the mask, not a pointer to it. +112 COMPAT BSD { int sigstack(struct sigstack *nss, \ struct sigstack *oss); } -113 COMPAT_43 { int recvmsg(int s, struct omsghdr *msg, int flags); } -114 COMPAT_43 { int sendmsg(int s, caddr_t msg, int flags); } -#ifdef TRACE -115 STD { int vtrace(int request, int value); } -#else -115 OBSOL vtrace -#endif -116 STD { int gettimeofday(struct timeval *tp, \ +113 COMPAT BSD { int recvmsg(int s, struct omsghdr *msg, int flags); } +114 COMPAT BSD { int sendmsg(int s, caddr_t msg, int flags); } +115 OBSOL NOHIDE vtrace +116 STD BSD { int gettimeofday(struct timeval *tp, \ struct timezone *tzp); } -117 STD { int getrusage(int who, struct rusage *rusage); } -118 STD { int getsockopt(int s, int level, int name, \ +117 STD BSD { int getrusage(int who, struct rusage *rusage); } +118 STD BSD { int getsockopt(int s, int level, int name, \ caddr_t val, int *avalsize); } -#ifdef vax -119 STD { int resuba(int value); } -#else -119 UNIMPL resuba -#endif -120 STD { int readv(int fd, struct iovec *iovp, u_int iovcnt); } -121 STD { int writev(int fd, struct iovec *iovp, \ +119 UNIMPL NOHIDE resuba (BSD/OS 2.x) +120 STD BSD { int readv(int fd, struct iovec *iovp, u_int iovcnt); } +121 STD BSD { int writev(int fd, struct iovec *iovp, \ u_int iovcnt); } -122 STD { int settimeofday(struct timeval *tv, \ +122 STD BSD { int settimeofday(struct timeval *tv, \ struct timezone *tzp); } -123 STD { int 
fchown(int fd, int uid, int gid); } -124 STD { int fchmod(int fd, int mode); } -125 COMPAT_43 { int recvfrom(int s, caddr_t buf, size_t len, \ - int flags, caddr_t from, int *fromlenaddr); } -126 COMPAT_43 { int setreuid(int ruid, int euid); } -127 COMPAT_43 { int setregid(int rgid, int egid); } -128 STD { int rename(char *from, char *to); } -129 COMPAT_43 { int truncate(char *path, long length); } -130 COMPAT_43 { int ftruncate(int fd, long length); } -131 STD { int flock(int fd, int how); } -132 STD { int mkfifo(char *path, int mode); } -133 STD { int sendto(int s, caddr_t buf, size_t len, \ +123 STD BSD { int fchown(int fd, int uid, int gid); } +124 STD BSD { int fchmod(int fd, int mode); } +125 CPT_NOA BSD { int recvfrom(int s, caddr_t buf, size_t len, \ + int flags, caddr_t from, int *fromlenaddr); } \ + recvfrom recvfrom_args int +126 STD BSD { int setreuid(int ruid, int euid); } +127 STD BSD { int setregid(int rgid, int egid); } +128 STD POSIX { int rename(char *from, char *to); } +129 COMPAT BSD { int truncate(char *path, long length); } +130 COMPAT BSD { int ftruncate(int fd, long length); } +131 STD BSD { int flock(int fd, int how); } +132 STD POSIX { int mkfifo(char *path, int mode); } +133 STD BSD { int sendto(int s, caddr_t buf, size_t len, \ int flags, caddr_t to, int tolen); } -134 STD { int shutdown(int s, int how); } -135 STD { int socketpair(int domain, int type, int protocol, \ +134 STD BSD { int shutdown(int s, int how); } +135 STD BSD { int socketpair(int domain, int type, int protocol, \ int *rsv); } -136 STD { int mkdir(char *path, int mode); } -137 STD { int rmdir(char *path); } -138 STD { int utimes(char *path, struct timeval *tptr); } -139 OBSOL 4.2 sigreturn -140 STD { int adjtime(struct timeval *delta, \ +136 STD POSIX { int mkdir(char *path, int mode); } +137 STD POSIX { int rmdir(char *path); } +138 STD BSD { int utimes(char *path, struct timeval *tptr); } +139 OBSOL NOHIDE 4.2 sigreturn +140 STD BSD { int adjtime(struct timeval 
*delta, \ struct timeval *olddelta); } -141 COMPAT_43 { int getpeername(int fdes, caddr_t asa, int *alen); } -142 COMPAT_43 { int32_t gethostid(void); } -143 COMPAT_43 { int sethostid(int32_t hostid); } -144 COMPAT_43 { int getrlimit(u_int which, struct ogetrlimit *rlp); } -145 COMPAT_43 { int setrlimit(u_int which, struct ogetrlimit *rlp); } -146 COMPAT_43 { int killpg(int pgid, int signum); } -147 STD { int setsid(void); } -148 STD { int quotactl(char *path, int cmd, int uid, \ +141 COMPAT BSD { int getpeername(int fdes, caddr_t asa, int *alen); } +142 COMPAT BSD { long gethostid(void); } +143 COMPAT BSD { int sethostid(long hostid); } +144 COMPAT BSD { int getrlimit(u_int which, struct ogetrlimit *rlp); } +145 COMPAT BSD { int setrlimit(u_int which, struct ogetrlimit *rlp); } +146 COMPAT BSD { int killpg(int pgid, int signum); } +147 STD POSIX { int setsid(void); } +148 STD BSD { int quotactl(char *path, int cmd, int uid, \ caddr_t arg); } -149 COMPAT_43 { int quota(void); } -150 COMPAT_43 { int getsockname(int fdec, caddr_t asa, int *alen); } +149 COMPAT BSD { int quota(void); } +150 CPT_NOA BSD { int getsockname(int fdec, caddr_t asa, int *alen); }\ + getsockname getsockname_args int ; Syscalls 151-180 inclusive are reserved for vendor-specific ; system calls. (This includes various calls added for compatibity ; with other Unix variants.) ; Some of these calls are now supported by BSD... 
-151 UNIMPL -152 UNIMPL -153 UNIMPL -154 UNIMPL +151 UNIMPL NOHIDE sem_lock (BSD/OS 2.x) +152 UNIMPL NOHIDE sem_wakeup (BSD/OS 2.x) +153 UNIMPL NOHIDE asyncdaemon (BSD/OS 2.x) +154 UNIMPL NOHIDE nosys #ifdef NFS -155 STD { int nfssvc(int flag, caddr_t argp); } +155 STD BSD { int nfssvc(int flag, caddr_t argp); } #else -155 UNIMPL nfssvc +155 UNIMPL BSD nosys #endif -156 COMPAT_43 { int getdirentries(int fd, char *buf, u_int count, \ +156 COMPAT BSD { int getdirentries(int fd, char *buf, u_int count, \ long *basep); } -157 STD { int statfs(char *path, struct statfs *buf); } -158 STD { int fstatfs(int fd, struct statfs *buf); } -159 UNIMPL -160 UNIMPL -#ifdef NFS -161 STD { int getfh(char *fname, fhandle_t *fhp); } -#else -161 UNIMPL getfh -#endif -162 UNIMPL getdomainname -163 UNIMPL setdomainname -164 UNIMPL -165 UNIMPL -166 UNIMPL -167 UNIMPL -168 UNIMPL -169 UNIMPL semsys -170 UNIMPL msgsys -; XXX more generally, never on machines where sizeof(void *) != sizeof(int) -#if defined(SYSVSHM) && !defined(alpha) -171 COMPAT_43 { int shmsys(int which, int a2, int a3, int a4); } +157 STD BSD { int statfs(char *path, struct statfs *buf); } +158 STD BSD { int fstatfs(int fd, struct statfs *buf); } +159 UNIMPL NOHIDE nosys +160 UNIMPL NOHIDE nosys +#if defined(NFS) && !defined (NFS_NOSERVER) +161 STD BSD { int getfh(char *fname, struct fhandle *fhp); } #else -171 UNIMPL shmsys +161 UNIMPL BSD nosys #endif -172 UNIMPL -173 UNIMPL -174 UNIMPL -175 UNIMPL -176 UNIMPL -177 UNIMPL -178 UNIMPL -179 UNIMPL -180 UNIMPL +162 STD BSD { int getdomainname(char *domainname, int len); } +163 STD BSD { int setdomainname(char *domainname, int len); } +164 STD BSD { int uname(struct utsname *name); } +165 STD BSD { int sysarch(int op, char *parms); } +166 STD BSD { int rtprio(int function, pid_t pid, \ + struct rtprio *rtp); } +167 UNIMPL NOHIDE nosys +168 UNIMPL NOHIDE nosys +169 STD BSD { int semsys(int which, int a2, int a3, int a4, \ + int a5); } +; XXX should be { int semsys(int which, 
...); } +170 STD BSD { int msgsys(int which, int a2, int a3, int a4, \ + int a5, int a6); } +; XXX should be { int msgsys(int which, ...); } +171 STD BSD { int shmsys(int which, int a2, int a3, int a4); } +; XXX should be { int shmsys(int which, ...); } +172 UNIMPL NOHIDE nosys +173 UNIMPL NOHIDE nosys +174 UNIMPL NOHIDE nosys +175 UNIMPL NOHIDE nosys +176 STD BSD { int ntp_adjtime(struct timex *tp); } +177 UNIMPL NOHIDE sfork (BSD/OS 2.x) +178 UNIMPL NOHIDE getdescriptor (BSD/OS 2.x) +179 UNIMPL NOHIDE setdescriptor (BSD/OS 2.x) +180 UNIMPL NOHIDE nosys -; Syscalls 180-209 are used by/reserved for BSD -181 STD { int setgid(gid_t gid); } -182 STD { int setegid(gid_t egid); } -183 STD { int seteuid(uid_t euid); } +; Syscalls 180-199 are used by/reserved for BSD +181 STD POSIX { int setgid(gid_t gid); } +182 STD BSD { int setegid(gid_t egid); } +183 STD BSD { int seteuid(uid_t euid); } #ifdef LFS -184 STD { int lfs_bmapv(fsid_t *fsidp, \ +184 STD BSD { int lfs_bmapv(struct fsid **fsidp, \ struct block_info *blkiov, int blkcnt); } -185 STD { int lfs_markv(fsid_t *fsidp, \ +185 STD BSD { int lfs_markv(struct fsid **fsidp, \ struct block_info *blkiov, int blkcnt); } -186 STD { int lfs_segclean(fsid_t *fsidp, u_long segment); } -187 STD { int lfs_segwait(fsid_t *fsidp, struct timeval *tv); } +186 STD BSD { int lfs_segclean(struct fsid **fsidp, \ + u_long segment); } +187 STD BSD { int lfs_segwait(struct fsid **fsidp, \ + struct timeval *tv); } #else -184 UNIMPL lfs_bmapv -185 UNIMPL lfs_markv -186 UNIMPL lfs_segclean -187 UNIMPL lfs_segwait +184 UNIMPL BSD nosys +185 UNIMPL BSD nosys +186 UNIMPL BSD nosys +187 UNIMPL BSD nosys #endif -188 STD { int stat(char *path, struct stat *ub); } -189 STD { int fstat(int fd, struct stat *sb); } -190 STD { int lstat(char *path, struct stat *ub); } -191 STD { int pathconf(char *path, int name); } -192 STD { int fpathconf(int fd, int name); } -193 UNIMPL -194 STD { int getrlimit(u_int which, struct rlimit *rlp); } -195 STD { int 
setrlimit(u_int which, struct rlimit *rlp); } -196 STD { int getdirentries(int fd, char *buf, u_int count, \ +188 STD POSIX { int stat(char *path, struct stat *ub); } +189 STD POSIX { int fstat(int fd, struct stat *sb); } +190 STD POSIX { int lstat(char *path, struct stat *ub); } +191 STD POSIX { int pathconf(char *path, int name); } +192 STD POSIX { int fpathconf(int fd, int name); } +193 UNIMPL NOHIDE nosys +194 STD BSD { int getrlimit(u_int which, \ + struct orlimit *rlp); } \ + getrlimit __getrlimit_args int +195 STD BSD { int setrlimit(u_int which, \ + struct orlimit *rlp); } \ + setrlimit __setrlimit_args int +196 STD BSD { int getdirentries(int fd, char *buf, u_int count, \ long *basep); } -197 STD { caddr_t mmap(caddr_t addr, size_t len, int prot, \ +197 STD BSD { caddr_t mmap(caddr_t addr, size_t len, int prot, \ int flags, int fd, long pad, off_t pos); } -198 STD { int nosys(void); } __syscall -199 STD { off_t lseek(int fd, int pad, off_t offset, \ +198 STD NOHIDE { int nosys(void); } __syscall __syscall_args int +199 STD POSIX { off_t lseek(int fd, int pad, off_t offset, \ int whence); } -200 STD { int truncate(char *path, int pad, off_t length); } -201 STD { int ftruncate(int fd, int pad, off_t length); } -202 STD { int __sysctl(int *name, u_int namelen, void *old, \ - size_t *oldlenp, void *new, size_t newlen); } -203 STD { int mlock(caddr_t addr, size_t len); } -204 STD { int munlock(caddr_t addr, size_t len); } -205 STD { int undelete(char *path); } -206 UNIMPL -207 UNIMPL -208 UNIMPL -209 UNIMPL -; Syscalls 210-219 are used by/reserved for vendor-specific system calls -210 UNIMPL -211 UNIMPL -212 UNIMPL -213 UNIMPL -214 UNIMPL -215 UNIMPL -216 UNIMPL -217 UNIMPL -218 UNIMPL -219 UNIMPL -; System calls 220-240 are reserved for use by BSD -220 UNIMPL semctl -221 UNIMPL semget -222 UNIMPL semop -223 UNIMPL semconfig -224 UNIMPL msgctl -225 UNIMPL msgget -226 UNIMPL msgsnd -227 UNIMPL msgrcv -#if defined(SYSVSHM) && 0 -228 STD { int shmat(int shmid, 
void *shmaddr, int shmflg); } -229 STD { int shmctl(int shmid, int cmd, \ +200 STD BSD { int truncate(char *path, int pad, off_t length); } +201 STD BSD { int ftruncate(int fd, int pad, off_t length); } +202 STD BSD { int __sysctl(int *name, u_int namelen, void *old, \ + size_t *oldlenp, void *new, size_t newlen); } \ + __sysctl sysctl_args int +; properly, __sysctl should be a NOHIDE, but making an exception +; here allows to avoid one in libc/sys/Makefile.inc. +203 STD BSD { int mlock(caddr_t addr, size_t len); } +204 STD BSD { int munlock(caddr_t addr, size_t len); } +205 STD BSD { int utrace(caddr_t addr, size_t len); } +206 STD BSD { int undelete(char *path); } +207 UNIMPL NOHIDE nosys +208 UNIMPL NOHIDE nosys +209 UNIMPL NOHIDE nosys + +; +; The following are reserved for loadable syscalls +; +210 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +211 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +212 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +213 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +214 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +215 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +216 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +217 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +218 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +219 NODEF NOHIDE lkmnosys lkmnosys nosys_args int + +; +; The following were introduced with NetBSD/4.4Lite-2 +; +220 STD BSD { int __semctl(int semid, int semnum, int cmd, \ + union semun *arg); } +221 STD BSD { int semget(key_t key, int nsems, int semflg); } +222 STD BSD { int semop(int semid, struct sembuf *sops, \ + u_int nsops); } +223 STD BSD { int semconfig(int flag); } +224 STD BSD { int msgctl(int msqid, int cmd, \ + struct msqid_ds *buf); } +225 STD BSD { int msgget(key_t key, int msgflg); } +226 STD BSD { int msgsnd(int msqid, void *msgp, size_t msgsz, \ + int msgflg); } +227 STD BSD { int msgrcv(int msqid, void *msgp, size_t msgsz, \ + long msgtyp, int msgflg); } +228 STD BSD { int shmat(int shmid, void 
*shmaddr, int shmflg); } +229 STD BSD { int shmctl(int shmid, int cmd, \ struct shmid_ds *buf); } -230 STD { int shmdt(void *shmaddr); } -231 STD { int shmget(key_t key, int size, int shmflg); } -#else -228 UNIMPL shmat -229 UNIMPL shmctl -230 UNIMPL shmdt -231 UNIMPL shmget -#endif +230 STD BSD { int shmdt(void *shmaddr); } +231 STD BSD { int shmget(key_t key, int size, int shmflg); } +; +232 UNIMPL NOHIDE nosys +233 UNIMPL NOHIDE nosys +234 UNIMPL NOHIDE nosys +235 UNIMPL NOHIDE nosys +236 UNIMPL NOHIDE nosys +237 UNIMPL NOHIDE nosys +238 UNIMPL NOHIDE nosys +239 UNIMPL NOHIDE nosys +240 UNIMPL NOHIDE nosys +241 UNIMPL NOHIDE nosys +242 UNIMPL NOHIDE nosys +243 UNIMPL NOHIDE nosys +244 UNIMPL NOHIDE nosys +245 UNIMPL NOHIDE nosys +246 UNIMPL NOHIDE nosys +247 UNIMPL NOHIDE nosys +248 UNIMPL NOHIDE nosys +249 UNIMPL NOHIDE nosys +; syscall numbers initially used in OpenBSD +250 STD BSD { int minherit(caddr_t addr, size_t len, int inherit); } +251 STD BSD { int rfork(int flags); } diff --git a/sys/kern/sysv_ipc.c b/sys/kern/sysv_ipc.c new file mode 100644 index 0000000..a1a1965 --- /dev/null +++ b/sys/kern/sysv_ipc.c @@ -0,0 +1,297 @@ +/* $Id$ */ +/* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */ + +/* + * Copyright (c) 1994 Herb Peyerl <hpeyerl@novatel.ca> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Herb Peyerl. + * 4. The name of Herb Peyerl may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/syslog.h> +#include <sys/sysproto.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/sem.h> + +#if defined(SYSVSEM) || defined(SYSVSHM) || defined(SYSVMSG) + +/* + * Check for ipc permission + */ + +int +ipcperm(cred, perm, mode) + struct ucred *cred; + struct ipc_perm *perm; + int mode; +{ + + if (cred->cr_uid == 0) + return (0); + + /* Check for user match. */ + if (cred->cr_uid != perm->cuid && cred->cr_uid != perm->uid) { + if (mode & IPC_M) + return (EPERM); + /* Check for group match. */ + mode >>= 3; + if (!groupmember(perm->gid, cred) && + !groupmember(perm->cgid, cred)) + /* Check for `other' match. */ + mode >>= 3; + } + + if (mode & IPC_M) + return (0); + return ((mode & perm->mode) == mode ? 
0 : EACCES); +} + +#endif /* defined(SYSVSEM) || defined(SYSVSHM) || defined(SYSVMSG) */ + + +#if !defined(SYSVSEM) || !defined(SYSVSHM) || !defined(SYSVMSG) + +static void sysv_nosys __P((struct proc *p, char *s)); + +static void +sysv_nosys(p, s) + struct proc *p; + char *s; +{ + log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n", + p->p_comm, p->p_pid, s); +} + +#if !defined(SYSVSEM) + +/* + * SYSVSEM stubs + */ + +int +semsys(p, uap, retval) + struct proc *p; + struct semsys_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +semconfig(p, uap, retval) + struct proc *p; + struct semconfig_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +__semctl(p, uap, retval) + struct proc *p; + register struct __semctl_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +semget(p, uap, retval) + struct proc *p; + register struct semget_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +semop(p, uap, retval) + struct proc *p; + register struct semop_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +/* called from kern_exit.c */ +void +semexit(p) + struct proc *p; +{ + return; +} + +#endif /* !defined(SYSVSEM) */ + + +#if !defined(SYSVMSG) + +/* + * SYSVMSG stubs + */ + +int +msgsys(p, uap, retval) + struct proc *p; + /* XXX actually varargs. 
*/ + struct msgsys_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +msgctl(p, uap, retval) + struct proc *p; + register struct msgctl_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +msgget(p, uap, retval) + struct proc *p; + register struct msgget_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +msgsnd(p, uap, retval) + struct proc *p; + register struct msgsnd_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +msgrcv(p, uap, retval) + struct proc *p; + register struct msgrcv_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +#endif /* !defined(SYSVMSG) */ + + +#if !defined(SYSVSHM) + +/* + * SYSVSHM stubs + */ + +int +shmdt(p, uap, retval) + struct proc *p; + struct shmdt_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +shmat(p, uap, retval) + struct proc *p; + struct shmat_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +shmctl(p, uap, retval) + struct proc *p; + struct shmctl_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +shmget(p, uap, retval) + struct proc *p; + struct shmget_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +shmsys(p, uap, retval) + struct proc *p; + /* XXX actually varargs. 
*/ + struct shmsys_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +/* called from kern_fork.c */ +void +shmfork(p1, p2) + struct proc *p1, *p2; +{ + return; +} + +/* called from kern_exit.c */ +void +shmexit(p) + struct proc *p; +{ + return; +} + +#endif /* !defined(SYSVSHM) */ + +#endif /* !defined(SYSVSEM) || !defined(SYSVSHM) || !defined(SYSVMSG) */ diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c new file mode 100644 index 0000000..d6e695f --- /dev/null +++ b/sys/kern/sysv_msg.c @@ -0,0 +1,1034 @@ +/* $Id$ */ + +/* + * Implementation of SVID messages + * + * Author: Daniel Boulet + * + * Copyright 1993 Daniel Boulet and RTMX Inc. + * + * This system call was implemented by Daniel Boulet under contract from RTMX. + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/msg.h> +#include <sys/sysent.h> + +static void msginit __P((void *)); +SYSINIT(sysv_msg, SI_SUB_SYSV_MSG, SI_ORDER_FIRST, msginit, NULL) + +#define MSG_DEBUG +#undef MSG_DEBUG_OK + +#ifndef _SYS_SYSPROTO_H_ +struct msgctl_args; +int msgctl __P((struct proc *p, struct msgctl_args *uap, int *retval)); +struct msgget_args; +int msgget __P((struct proc *p, struct msgget_args *uap, int *retval)); +struct msgsnd_args; +int msgsnd __P((struct proc *p, struct msgsnd_args *uap, int *retval)); +struct msgrcv_args; +int msgrcv __P((struct proc *p, struct msgrcv_args *uap, int *retval)); +#endif +static void msg_freehdr __P((struct msg *msghdr)); + +/* XXX casting to (sy_call_t *) is bogus, as usual. */ +static sy_call_t *msgcalls[] = { + (sy_call_t *)msgctl, (sy_call_t *)msgget, + (sy_call_t *)msgsnd, (sy_call_t *)msgrcv +}; + +static int nfree_msgmaps; /* # of free map entries */ +static short free_msgmaps; /* head of linked list of free map entries */ +static struct msg *free_msghdrs; /* list of free msg headers */ +char *msgpool; /* MSGMAX byte long msg buffer pool */ +struct msgmap *msgmaps; /* MSGSEG msgmap structures */ +struct msg *msghdrs; /* MSGTQL msg headers */ +struct msqid_ds *msqids; /* MSGMNI msqid_ds struct's */ + +void +msginit(dummy) + void *dummy; +{ + register int i; + + /* + * msginfo.msgssz should be a power of two for efficiency reasons. + * It is also pretty silly if msginfo.msgssz is less than 8 + * or greater than about 256 so ... 
+ */ + + i = 8; + while (i < 1024 && i != msginfo.msgssz) + i <<= 1; + if (i != msginfo.msgssz) { + printf("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, + msginfo.msgssz); + panic("msginfo.msgssz not a small power of 2"); + } + + if (msginfo.msgseg > 32767) { + printf("msginfo.msgseg=%d\n", msginfo.msgseg); + panic("msginfo.msgseg > 32767"); + } + + if (msgmaps == NULL) + panic("msgmaps is NULL"); + + for (i = 0; i < msginfo.msgseg; i++) { + if (i > 0) + msgmaps[i-1].next = i; + msgmaps[i].next = -1; /* implies entry is available */ + } + free_msgmaps = 0; + nfree_msgmaps = msginfo.msgseg; + + if (msghdrs == NULL) + panic("msghdrs is NULL"); + + for (i = 0; i < msginfo.msgtql; i++) { + msghdrs[i].msg_type = 0; + if (i > 0) + msghdrs[i-1].msg_next = &msghdrs[i]; + msghdrs[i].msg_next = NULL; + } + free_msghdrs = &msghdrs[0]; + + if (msqids == NULL) + panic("msqids is NULL"); + + for (i = 0; i < msginfo.msgmni; i++) { + msqids[i].msg_qbytes = 0; /* implies entry is available */ + msqids[i].msg_perm.seq = 0; /* reset to a known value */ + } +} + +/* + * Entry point for all MSG calls + */ +int +msgsys(p, uap, retval) + struct proc *p; + /* XXX actually varargs. 
*/ + struct msgsys_args /* { + u_int which; + int a2; + int a3; + int a4; + int a5; + int a6; + } */ *uap; + int *retval; +{ + + if (uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0])) + return (EINVAL); + return ((*msgcalls[uap->which])(p, &uap->a2, retval)); +} + +static void +msg_freehdr(msghdr) + struct msg *msghdr; +{ + while (msghdr->msg_ts > 0) { + short next; + if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) + panic("msghdr->msg_spot out of range"); + next = msgmaps[msghdr->msg_spot].next; + msgmaps[msghdr->msg_spot].next = free_msgmaps; + free_msgmaps = msghdr->msg_spot; + nfree_msgmaps++; + msghdr->msg_spot = next; + if (msghdr->msg_ts >= msginfo.msgssz) + msghdr->msg_ts -= msginfo.msgssz; + else + msghdr->msg_ts = 0; + } + if (msghdr->msg_spot != -1) + panic("msghdr->msg_spot != -1"); + msghdr->msg_next = free_msghdrs; + free_msghdrs = msghdr; +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgctl_args { + int msqid; + int cmd; + struct msqid_ds *buf; +}; +#endif + +int +msgctl(p, uap, retval) + struct proc *p; + register struct msgctl_args *uap; + int *retval; +{ + int msqid = uap->msqid; + int cmd = uap->cmd; + struct msqid_ds *user_msqptr = uap->buf; + struct ucred *cred = p->p_ucred; + int rval, eval; + struct msqid_ds msqbuf; + register struct msqid_ds *msqptr; + +#ifdef MSG_DEBUG_OK + printf("call to msgctl(%d, %d, 0x%x)\n", msqid, cmd, user_msqptr); +#endif + + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + return(EINVAL); + } + + msqptr = &msqids[msqid]; + + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such msqid\n"); +#endif + return(EINVAL); + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + return(EINVAL); + } + + eval = 0; + rval = 0; + + switch (cmd) { + + case IPC_RMID: + { + struct msg *msghdr; + if 
((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M))) + return(eval); + /* Free the message headers */ + msghdr = msqptr->msg_first; + while (msghdr != NULL) { + struct msg *msghdr_tmp; + + /* Free the segments of each message */ + msqptr->msg_cbytes -= msghdr->msg_ts; + msqptr->msg_qnum--; + msghdr_tmp = msghdr; + msghdr = msghdr->msg_next; + msg_freehdr(msghdr_tmp); + } + + if (msqptr->msg_cbytes != 0) + panic("msg_cbytes is screwed up"); + if (msqptr->msg_qnum != 0) + panic("msg_qnum is screwed up"); + + msqptr->msg_qbytes = 0; /* Mark it as free */ + + wakeup((caddr_t)msqptr); + } + + break; + + case IPC_SET: + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M))) + return(eval); + if ((eval = copyin(user_msqptr, &msqbuf, sizeof(msqbuf))) != 0) + return(eval); + if (msqbuf.msg_qbytes > msqptr->msg_qbytes && cred->cr_uid != 0) + return(EPERM); + if (msqbuf.msg_qbytes > msginfo.msgmnb) { +#ifdef MSG_DEBUG_OK + printf("can't increase msg_qbytes beyond %d (truncating)\n", + msginfo.msgmnb); +#endif + msqbuf.msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ + } + if (msqbuf.msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("can't reduce msg_qbytes to 0\n"); +#endif + return(EINVAL); /* non-standard errno! 
*/ + } + msqptr->msg_perm.uid = msqbuf.msg_perm.uid; /* change the owner */ + msqptr->msg_perm.gid = msqbuf.msg_perm.gid; /* change the owner */ + msqptr->msg_perm.mode = (msqptr->msg_perm.mode & ~0777) | + (msqbuf.msg_perm.mode & 0777); + msqptr->msg_qbytes = msqbuf.msg_qbytes; + msqptr->msg_ctime = time.tv_sec; + break; + + case IPC_STAT: + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_R))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have read access\n"); +#endif + return(eval); + } + eval = copyout((caddr_t)msqptr, user_msqptr, + sizeof(struct msqid_ds)); + break; + + default: +#ifdef MSG_DEBUG_OK + printf("invalid command %d\n", cmd); +#endif + return(EINVAL); + } + + if (eval == 0) + *retval = rval; + return(eval); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgget_args { + key_t key; + int msgflg; +}; +#endif + +int +msgget(p, uap, retval) + struct proc *p; + register struct msgget_args *uap; + int *retval; +{ + int msqid, eval; + int key = uap->key; + int msgflg = uap->msgflg; + struct ucred *cred = p->p_ucred; + register struct msqid_ds *msqptr = NULL; + +#ifdef MSG_DEBUG_OK + printf("msgget(0x%x, 0%o)\n", key, msgflg); +#endif + + if (key != IPC_PRIVATE) { + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes != 0 && + msqptr->msg_perm.key == key) + break; + } + if (msqid < msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("found public key\n"); +#endif + if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { +#ifdef MSG_DEBUG_OK + printf("not exclusive\n"); +#endif + return(EEXIST); + } + if ((eval = ipcperm(cred, &msqptr->msg_perm, msgflg & 0700 ))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have 0%o access\n", + msgflg & 0700); +#endif + return(eval); + } + goto found; + } + } + +#ifdef MSG_DEBUG_OK + printf("need to allocate the msqid_ds\n"); +#endif + if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + /* + * Look for an unallocated and 
unlocked msqid_ds. + * msqid_ds's can be locked by msgsnd or msgrcv while + * they are copying the message in/out. We can't + * re-use the entry until they release it. + */ + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0 && + (msqptr->msg_perm.mode & MSG_LOCKED) == 0) + break; + } + if (msqid == msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("no more msqid_ds's available\n"); +#endif + return(ENOSPC); + } +#ifdef MSG_DEBUG_OK + printf("msqid %d is available\n", msqid); +#endif + msqptr->msg_perm.key = key; + msqptr->msg_perm.cuid = cred->cr_uid; + msqptr->msg_perm.uid = cred->cr_uid; + msqptr->msg_perm.cgid = cred->cr_gid; + msqptr->msg_perm.gid = cred->cr_gid; + msqptr->msg_perm.mode = (msgflg & 0777); + /* Make sure that the returned msqid is unique */ + msqptr->msg_perm.seq++; + msqptr->msg_first = NULL; + msqptr->msg_last = NULL; + msqptr->msg_cbytes = 0; + msqptr->msg_qnum = 0; + msqptr->msg_qbytes = msginfo.msgmnb; + msqptr->msg_lspid = 0; + msqptr->msg_lrpid = 0; + msqptr->msg_stime = 0; + msqptr->msg_rtime = 0; + msqptr->msg_ctime = time.tv_sec; + } else { +#ifdef MSG_DEBUG_OK + printf("didn't find it and wasn't asked to create it\n"); +#endif + return(ENOENT); + } + +found: + /* Construct the unique msqid */ + *retval = IXSEQ_TO_IPCID(msqid, msqptr->msg_perm); + return(0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgsnd_args { + int msqid; + void *msgp; + size_t msgsz; + int msgflg; +}; +#endif + +int +msgsnd(p, uap, retval) + struct proc *p; + register struct msgsnd_args *uap; + int *retval; +{ + int msqid = uap->msqid; + void *user_msgp = uap->msgp; + size_t msgsz = uap->msgsz; + int msgflg = uap->msgflg; + int segs_needed, eval; + struct ucred *cred = p->p_ucred; + register struct msqid_ds *msqptr; + register struct msg *msghdr; + short next; + +#ifdef MSG_DEBUG_OK + printf("call to msgsnd(%d, 0x%x, %d, %d)\n", msqid, user_msgp, msgsz, + msgflg); +#endif + + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef 
MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + return(EINVAL); + } + + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such message queue id\n"); +#endif + return(EINVAL); + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + return(EINVAL); + } + + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_W))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have write access\n"); +#endif + return(eval); + } + + segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; +#ifdef MSG_DEBUG_OK + printf("msgsz=%d, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, + segs_needed); +#endif + for (;;) { + int need_more_resources = 0; + + /* + * check msgsz + * (inside this loop in case msg_qbytes changes while we sleep) + */ + + if (msgsz > msqptr->msg_qbytes) { +#ifdef MSG_DEBUG_OK + printf("msgsz > msqptr->msg_qbytes\n"); +#endif + return(EINVAL); + } + + if (msqptr->msg_perm.mode & MSG_LOCKED) { +#ifdef MSG_DEBUG_OK + printf("msqid is locked\n"); +#endif + need_more_resources = 1; + } + if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) { +#ifdef MSG_DEBUG_OK + printf("msgsz + msg_cbytes > msg_qbytes\n"); +#endif + need_more_resources = 1; + } + if (segs_needed > nfree_msgmaps) { +#ifdef MSG_DEBUG_OK + printf("segs_needed > nfree_msgmaps\n"); +#endif + need_more_resources = 1; + } + if (free_msghdrs == NULL) { +#ifdef MSG_DEBUG_OK + printf("no more msghdrs\n"); +#endif + need_more_resources = 1; + } + + if (need_more_resources) { + int we_own_it; + + if ((msgflg & IPC_NOWAIT) != 0) { +#ifdef MSG_DEBUG_OK + printf("need more resources but caller doesn't want to wait\n"); +#endif + return(EAGAIN); + } + + if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) { +#ifdef MSG_DEBUG_OK + printf("we don't own the msqid_ds\n"); +#endif + we_own_it = 0; + } else { + /* Force later arrivals to wait for our + request 
*/ +#ifdef MSG_DEBUG_OK + printf("we own the msqid_ds\n"); +#endif + msqptr->msg_perm.mode |= MSG_LOCKED; + we_own_it = 1; + } +#ifdef MSG_DEBUG_OK + printf("goodnight\n"); +#endif + eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, + "msgwait", 0); +#ifdef MSG_DEBUG_OK + printf("good morning, eval=%d\n", eval); +#endif + if (we_own_it) + msqptr->msg_perm.mode &= ~MSG_LOCKED; + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("msgsnd: interrupted system call\n"); +#endif + return(EINTR); + } + + /* + * Make sure that the msq queue still exists + */ + + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("msqid deleted\n"); +#endif + /* The SVID says to return EIDRM. */ +#ifdef EIDRM + return(EIDRM); +#else + /* Unfortunately, BSD doesn't define that code + yet! */ + return(EINVAL); +#endif + } + + } else { +#ifdef MSG_DEBUG_OK + printf("got all the resources that we need\n"); +#endif + break; + } + } + + /* + * We have the resources that we need. + * Make sure! + */ + + if (msqptr->msg_perm.mode & MSG_LOCKED) + panic("msg_perm.mode & MSG_LOCKED"); + if (segs_needed > nfree_msgmaps) + panic("segs_needed > nfree_msgmaps"); + if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) + panic("msgsz + msg_cbytes > msg_qbytes"); + if (free_msghdrs == NULL) + panic("no more msghdrs"); + + /* + * Re-lock the msqid_ds in case we page-fault when copying in the + * message + */ + + if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) + panic("msqid_ds is already locked"); + msqptr->msg_perm.mode |= MSG_LOCKED; + + /* + * Allocate a message header + */ + + msghdr = free_msghdrs; + free_msghdrs = msghdr->msg_next; + msghdr->msg_spot = -1; + msghdr->msg_ts = msgsz; + + /* + * Allocate space for the message + */ + + while (segs_needed > 0) { + if (nfree_msgmaps <= 0) + panic("not enough msgmaps"); + if (free_msgmaps == -1) + panic("nil free_msgmaps"); + next = free_msgmaps; + if (next <= -1) + panic("next too low #1"); + if (next >= msginfo.msgseg) + panic("next out of range #1"); 
+#ifdef MSG_DEBUG_OK + printf("allocating segment %d to message\n", next); +#endif + free_msgmaps = msgmaps[next].next; + nfree_msgmaps--; + msgmaps[next].next = msghdr->msg_spot; + msghdr->msg_spot = next; + segs_needed--; + } + + /* + * Copy in the message type + */ + + if ((eval = copyin(user_msgp, &msghdr->msg_type, + sizeof(msghdr->msg_type))) != 0) { +#ifdef MSG_DEBUG_OK + printf("error %d copying the message type\n", eval); +#endif + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); + return(eval); + } + user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + + /* + * Validate the message type + */ + + if (msghdr->msg_type < 1) { + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); +#ifdef MSG_DEBUG_OK + printf("mtype (%d) < 1\n", msghdr->msg_type); +#endif + return(EINVAL); + } + + /* + * Copy in the message body + */ + + next = msghdr->msg_spot; + while (msgsz > 0) { + size_t tlen; + if (msgsz > msginfo.msgssz) + tlen = msginfo.msgssz; + else + tlen = msgsz; + if (next <= -1) + panic("next too low #2"); + if (next >= msginfo.msgseg) + panic("next out of range #2"); + if ((eval = copyin(user_msgp, &msgpool[next * msginfo.msgssz], + tlen)) != 0) { +#ifdef MSG_DEBUG_OK + printf("error %d copying in message segment\n", eval); +#endif + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); + return(eval); + } + msgsz -= tlen; + user_msgp = (char *)user_msgp + tlen; + next = msgmaps[next].next; + } + if (next != -1) + panic("didn't use all the msg segments"); + + /* + * We've got the message. Unlock the msqid_ds. + */ + + msqptr->msg_perm.mode &= ~MSG_LOCKED; + + /* + * Make sure that the msqid_ds is still allocated. + */ + + if (msqptr->msg_qbytes == 0) { + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + /* The SVID says to return EIDRM. */ +#ifdef EIDRM + return(EIDRM); +#else + /* Unfortunately, BSD doesn't define that code yet! 
*/ + return(EINVAL); +#endif + } + + /* + * Put the message into the queue + */ + + if (msqptr->msg_first == NULL) { + msqptr->msg_first = msghdr; + msqptr->msg_last = msghdr; + } else { + msqptr->msg_last->msg_next = msghdr; + msqptr->msg_last = msghdr; + } + msqptr->msg_last->msg_next = NULL; + + msqptr->msg_cbytes += msghdr->msg_ts; + msqptr->msg_qnum++; + msqptr->msg_lspid = p->p_pid; + msqptr->msg_stime = time.tv_sec; + + wakeup((caddr_t)msqptr); + *retval = 0; + return(0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgrcv_args { + int msqid; + void *msgp; + size_t msgsz; + long msgtyp; + int msgflg; +}; +#endif + +int +msgrcv(p, uap, retval) + struct proc *p; + register struct msgrcv_args *uap; + int *retval; +{ + int msqid = uap->msqid; + void *user_msgp = uap->msgp; + size_t msgsz = uap->msgsz; + long msgtyp = uap->msgtyp; + int msgflg = uap->msgflg; + size_t len; + struct ucred *cred = p->p_ucred; + register struct msqid_ds *msqptr; + register struct msg *msghdr; + int eval; + short next; + +#ifdef MSG_DEBUG_OK + printf("call to msgrcv(%d, 0x%x, %d, %ld, %d)\n", msqid, user_msgp, + msgsz, msgtyp, msgflg); +#endif + + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + return(EINVAL); + } + + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such message queue id\n"); +#endif + return(EINVAL); + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + return(EINVAL); + } + + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_R))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have read access\n"); +#endif + return(eval); + } + + msghdr = NULL; + while (msghdr == NULL) { + if (msgtyp == 0) { + msghdr = msqptr->msg_first; + if (msghdr != NULL) { + if (msgsz < msghdr->msg_ts && + (msgflg & MSG_NOERROR) == 0) { +#ifdef 
MSG_DEBUG_OK + printf("first message on the queue is too big (want %d, got %d)\n", + msgsz, msghdr->msg_ts); +#endif + return(E2BIG); + } + if (msqptr->msg_first == msqptr->msg_last) { + msqptr->msg_first = NULL; + msqptr->msg_last = NULL; + } else { + msqptr->msg_first = msghdr->msg_next; + if (msqptr->msg_first == NULL) + panic("msg_first/last screwed up #1"); + } + } + } else { + struct msg *previous; + struct msg **prev; + + previous = NULL; + prev = &(msqptr->msg_first); + while ((msghdr = *prev) != NULL) { + /* + * Is this message's type an exact match or is + * this message's type less than or equal to + * the absolute value of a negative msgtyp? + * Note that the second half of this test can + * NEVER be true if msgtyp is positive since + * msg_type is always positive! + */ + + if (msgtyp == msghdr->msg_type || + msghdr->msg_type <= -msgtyp) { +#ifdef MSG_DEBUG_OK + printf("found message type %d, requested %d\n", + msghdr->msg_type, msgtyp); +#endif + if (msgsz < msghdr->msg_ts && + (msgflg & MSG_NOERROR) == 0) { +#ifdef MSG_DEBUG_OK + printf("requested message on the queue is too big (want %d, got %d)\n", + msgsz, msghdr->msg_ts); +#endif + return(E2BIG); + } + *prev = msghdr->msg_next; + if (msghdr == msqptr->msg_last) { + if (previous == NULL) { + if (prev != + &msqptr->msg_first) + panic("msg_first/last screwed up #2"); + msqptr->msg_first = + NULL; + msqptr->msg_last = + NULL; + } else { + if (prev == + &msqptr->msg_first) + panic("msg_first/last screwed up #3"); + msqptr->msg_last = + previous; + } + } + break; + } + previous = msghdr; + prev = &(msghdr->msg_next); + } + } + + /* + * We've either extracted the msghdr for the appropriate + * message or there isn't one. + * If there is one then bail out of this loop. + */ + + if (msghdr != NULL) + break; + + /* + * Hmph! No message found. Does the user want to wait? 
+ */ + + if ((msgflg & IPC_NOWAIT) != 0) { +#ifdef MSG_DEBUG_OK + printf("no appropriate message found (msgtyp=%d)\n", + msgtyp); +#endif + /* The SVID says to return ENOMSG. */ +#ifdef ENOMSG + return(ENOMSG); +#else + /* Unfortunately, BSD doesn't define that code yet! */ + return(EAGAIN); +#endif + } + + /* + * Wait for something to happen + */ + +#ifdef MSG_DEBUG_OK + printf("msgrcv: goodnight\n"); +#endif + eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, "msgwait", + 0); +#ifdef MSG_DEBUG_OK + printf("msgrcv: good morning (eval=%d)\n", eval); +#endif + + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("msgsnd: interrupted system call\n"); +#endif + return(EINTR); + } + + /* + * Make sure that the msq queue still exists + */ + + if (msqptr->msg_qbytes == 0 || + msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("msqid deleted\n"); +#endif + /* The SVID says to return EIDRM. */ +#ifdef EIDRM + return(EIDRM); +#else + /* Unfortunately, BSD doesn't define that code yet! */ + return(EINVAL); +#endif + } + } + + /* + * Return the message to the user. + * + * First, do the bookkeeping (before we risk being interrupted). + */ + + msqptr->msg_cbytes -= msghdr->msg_ts; + msqptr->msg_qnum--; + msqptr->msg_lrpid = p->p_pid; + msqptr->msg_rtime = time.tv_sec; + + /* + * Make msgsz the actual amount that we'll be returning. + * Note that this effectively truncates the message if it is too long + * (since msgsz is never increased). + */ + +#ifdef MSG_DEBUG_OK + printf("found a message, msgsz=%d, msg_ts=%d\n", msgsz, + msghdr->msg_ts); +#endif + if (msgsz > msghdr->msg_ts) + msgsz = msghdr->msg_ts; + + /* + * Return the type to the user. 
+ */ + + eval = copyout((caddr_t)&(msghdr->msg_type), user_msgp, + sizeof(msghdr->msg_type)); + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("error (%d) copying out message type\n", eval); +#endif + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + return(eval); + } + user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + + /* + * Return the segments to the user + */ + + next = msghdr->msg_spot; + for (len = 0; len < msgsz; len += msginfo.msgssz) { + size_t tlen; + + if (msgsz > msginfo.msgssz) + tlen = msginfo.msgssz; + else + tlen = msgsz; + if (next <= -1) + panic("next too low #3"); + if (next >= msginfo.msgseg) + panic("next out of range #3"); + eval = copyout((caddr_t)&msgpool[next * msginfo.msgssz], + user_msgp, tlen); + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("error (%d) copying out message segment\n", + eval); +#endif + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + return(eval); + } + user_msgp = (char *)user_msgp + tlen; + next = msgmaps[next].next; + } + + /* + * Done, return the actual number of bytes copied out. + */ + + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + *retval = msgsz; + return(0); +} diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c new file mode 100644 index 0000000..e66ddc6 --- /dev/null +++ b/sys/kern/sysv_sem.c @@ -0,0 +1,985 @@ +/* $Id$ */ + +/* + * Implementation of SVID semaphores + * + * Author: Daniel Boulet + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/sem.h> +#include <sys/sysent.h> + +static void seminit __P((void *)); +SYSINIT(sysv_sem, SI_SUB_SYSV_SEM, SI_ORDER_FIRST, seminit, NULL) + +#ifndef _SYS_SYSPROTO_H_ +struct __semctl_args; +int __semctl __P((struct proc *p, struct __semctl_args *uap, int *retval)); +struct semget_args; +int semget __P((struct proc *p, struct semget_args *uap, int *retval)); +struct semop_args; +int semop __P((struct proc *p, struct semop_args *uap, int *retval)); +struct semconfig_args; +int semconfig __P((struct proc *p, struct semconfig_args *uap, + int *retval)); +#endif + +static struct sem_undo *semu_alloc __P((struct proc *p)); +static int semundo_adjust __P((struct proc *p, struct sem_undo **supptr, + int semid, int semnum, int adjval)); +static void semundo_clear __P((int semid, int semnum)); + +/* XXX casting to (sy_call_t *) is bogus, as usual. */ +static sy_call_t *semcalls[] = { + (sy_call_t *)__semctl, (sy_call_t *)semget, + (sy_call_t *)semop, (sy_call_t *)semconfig +}; + +static int semtot = 0; +struct semid_ds *sema; /* semaphore id pool */ +struct sem *sem; /* semaphore pool */ +static struct sem_undo *semu_list; /* list of active undo structures */ +int *semu; /* undo structure pool */ + +static struct proc *semlock_holder = NULL; + +void +seminit(dummy) + void *dummy; +{ + register int i; + + if (sema == NULL) + panic("sema is NULL"); + if (semu == NULL) + panic("semu is NULL"); + + for (i = 0; i < seminfo.semmni; i++) { + sema[i].sem_base = 0; + sema[i].sem_perm.mode = 0; + } + for (i = 0; i < seminfo.semmnu; i++) { + register struct sem_undo *suptr = SEMU(i); + suptr->un_proc = NULL; + } + semu_list = NULL; +} + +/* + * Entry point for all SEM calls + */ +int +semsys(p, uap, retval) + struct proc *p; + /* XXX actually varargs. 
*/ + struct semsys_args /* { + u_int which; + int a2; + int a3; + int a4; + int a5; + } */ *uap; + int *retval; +{ + + while (semlock_holder != NULL && semlock_holder != p) + (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "semsys", 0); + + if (uap->which >= sizeof(semcalls)/sizeof(semcalls[0])) + return (EINVAL); + return ((*semcalls[uap->which])(p, &uap->a2, retval)); +} + +/* + * Lock or unlock the entire semaphore facility. + * + * This will probably eventually evolve into a general purpose semaphore + * facility status enquiry mechanism (I don't like the "read /dev/kmem" + * approach currently taken by ipcs and the amount of info that we want + * to be able to extract for ipcs is probably beyond what the capability + * of the getkerninfo facility. + * + * At the time that the current version of semconfig was written, ipcs is + * the only user of the semconfig facility. It uses it to ensure that the + * semaphore facility data structures remain static while it fishes around + * in /dev/kmem. + */ + +#ifndef _SYS_SYSPROTO_H_ +struct semconfig_args { + semconfig_ctl_t flag; +}; +#endif + +int +semconfig(p, uap, retval) + struct proc *p; + struct semconfig_args *uap; + int *retval; +{ + int eval = 0; + + switch (uap->flag) { + case SEM_CONFIG_FREEZE: + semlock_holder = p; + break; + + case SEM_CONFIG_THAW: + semlock_holder = NULL; + wakeup((caddr_t)&semlock_holder); + break; + + default: + printf("semconfig: unknown flag parameter value (%d) - ignored\n", + uap->flag); + eval = EINVAL; + break; + } + + *retval = 0; + return(eval); +} + +/* + * Allocate a new sem_undo structure for a process + * (returns ptr to structure or NULL if no more room) + */ + +static struct sem_undo * +semu_alloc(p) + struct proc *p; +{ + register int i; + register struct sem_undo *suptr; + register struct sem_undo **supptr; + int attempt; + + /* + * Try twice to allocate something. 
+ * (we'll purge any empty structures after the first pass so + * two passes are always enough) + */ + + for (attempt = 0; attempt < 2; attempt++) { + /* + * Look for a free structure. + * Fill it in and return it if we find one. + */ + + for (i = 0; i < seminfo.semmnu; i++) { + suptr = SEMU(i); + if (suptr->un_proc == NULL) { + suptr->un_next = semu_list; + semu_list = suptr; + suptr->un_cnt = 0; + suptr->un_proc = p; + return(suptr); + } + } + + /* + * We didn't find a free one, if this is the first attempt + * then try to free some structures. + */ + + if (attempt == 0) { + /* All the structures are in use - try to free some */ + int did_something = 0; + + supptr = &semu_list; + while ((suptr = *supptr) != NULL) { + if (suptr->un_cnt == 0) { + suptr->un_proc = NULL; + *supptr = suptr->un_next; + did_something = 1; + } else + supptr = &(suptr->un_next); + } + + /* If we didn't free anything then just give-up */ + if (!did_something) + return(NULL); + } else { + /* + * The second pass failed even though we freed + * something after the first pass! + * This is IMPOSSIBLE! + */ + panic("semu_alloc - second attempt failed"); + } + } + return (NULL); +} + +/* + * Adjust a particular entry for a particular proc + */ + +static int +semundo_adjust(p, supptr, semid, semnum, adjval) + register struct proc *p; + struct sem_undo **supptr; + int semid, semnum; + int adjval; +{ + register struct sem_undo *suptr; + register struct undo *sunptr; + int i; + + /* Look for and remember the sem_undo if the caller doesn't provide + it */ + + suptr = *supptr; + if (suptr == NULL) { + for (suptr = semu_list; suptr != NULL; + suptr = suptr->un_next) { + if (suptr->un_proc == p) { + *supptr = suptr; + break; + } + } + if (suptr == NULL) { + if (adjval == 0) + return(0); + suptr = semu_alloc(p); + if (suptr == NULL) + return(ENOSPC); + *supptr = suptr; + } + } + + /* + * Look for the requested entry and adjust it (delete if adjval becomes + * 0). 
+ */ + sunptr = &suptr->un_ent[0]; + for (i = 0; i < suptr->un_cnt; i++, sunptr++) { + if (sunptr->un_id != semid || sunptr->un_num != semnum) + continue; + if (adjval == 0) + sunptr->un_adjval = 0; + else + sunptr->un_adjval += adjval; + if (sunptr->un_adjval == 0) { + suptr->un_cnt--; + if (i < suptr->un_cnt) + suptr->un_ent[i] = + suptr->un_ent[suptr->un_cnt]; + } + return(0); + } + + /* Didn't find the right entry - create it */ + if (adjval == 0) + return(0); + if (suptr->un_cnt != SEMUME) { + sunptr = &suptr->un_ent[suptr->un_cnt]; + suptr->un_cnt++; + sunptr->un_adjval = adjval; + sunptr->un_id = semid; sunptr->un_num = semnum; + } else + return(EINVAL); + return(0); +} + +static void +semundo_clear(semid, semnum) + int semid, semnum; +{ + register struct sem_undo *suptr; + + for (suptr = semu_list; suptr != NULL; suptr = suptr->un_next) { + register struct undo *sunptr = &suptr->un_ent[0]; + register int i = 0; + + while (i < suptr->un_cnt) { + if (sunptr->un_id == semid) { + if (semnum == -1 || sunptr->un_num == semnum) { + suptr->un_cnt--; + if (i < suptr->un_cnt) { + suptr->un_ent[i] = + suptr->un_ent[suptr->un_cnt]; + continue; + } + } + if (semnum != -1) + break; + } + i++, sunptr++; + } + } +} + +/* + * Note that the user-mode half of this passes a union, not a pointer + */ +#ifndef _SYS_SYSPROTO_H_ +struct __semctl_args { + int semid; + int semnum; + int cmd; + union semun *arg; +}; +#endif + +int +__semctl(p, uap, retval) + struct proc *p; + register struct __semctl_args *uap; + int *retval; +{ + int semid = uap->semid; + int semnum = uap->semnum; + int cmd = uap->cmd; + union semun *arg = uap->arg; + union semun real_arg; + struct ucred *cred = p->p_ucred; + int i, rval, eval; + struct semid_ds sbuf; + register struct semid_ds *semaptr; + +#ifdef SEM_DEBUG + printf("call to semctl(%d, %d, %d, 0x%x)\n", semid, semnum, cmd, arg); +#endif + + semid = IPCID_TO_IX(semid); + if (semid < 0 || semid >= seminfo.semmsl) + return(EINVAL); + + semaptr = 
&sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || + semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) + return(EINVAL); + + eval = 0; + rval = 0; + + switch (cmd) { + case IPC_RMID: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M))) + return(eval); + semaptr->sem_perm.cuid = cred->cr_uid; + semaptr->sem_perm.uid = cred->cr_uid; + semtot -= semaptr->sem_nsems; + for (i = semaptr->sem_base - sem; i < semtot; i++) + sem[i] = sem[i + semaptr->sem_nsems]; + for (i = 0; i < seminfo.semmni; i++) { + if ((sema[i].sem_perm.mode & SEM_ALLOC) && + sema[i].sem_base > semaptr->sem_base) + sema[i].sem_base -= semaptr->sem_nsems; + } + semaptr->sem_perm.mode = 0; + semundo_clear(semid, -1); + wakeup((caddr_t)semaptr); + break; + + case IPC_SET: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + if ((eval = copyin(real_arg.buf, (caddr_t)&sbuf, + sizeof(sbuf))) != 0) + return(eval); + semaptr->sem_perm.uid = sbuf.sem_perm.uid; + semaptr->sem_perm.gid = sbuf.sem_perm.gid; + semaptr->sem_perm.mode = (semaptr->sem_perm.mode & ~0777) | + (sbuf.sem_perm.mode & 0777); + semaptr->sem_ctime = time.tv_sec; + break; + + case IPC_STAT: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + eval = copyout((caddr_t)semaptr, real_arg.buf, + sizeof(struct semid_ds)); + break; + + case GETNCNT: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].semncnt; + break; + + case GETPID: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].sempid; + break; + + case GETVAL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); 
+ if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].semval; + break; + + case GETALL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + for (i = 0; i < semaptr->sem_nsems; i++) { + eval = copyout((caddr_t)&semaptr->sem_base[i].semval, + &real_arg.array[i], sizeof(real_arg.array[0])); + if (eval != 0) + break; + } + break; + + case GETZCNT: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].semzcnt; + break; + + case SETVAL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + semaptr->sem_base[semnum].semval = real_arg.val; + semundo_clear(semid, semnum); + wakeup((caddr_t)semaptr); + break; + + case SETALL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + for (i = 0; i < semaptr->sem_nsems; i++) { + eval = copyin(&real_arg.array[i], + (caddr_t)&semaptr->sem_base[i].semval, + sizeof(real_arg.array[0])); + if (eval != 0) + break; + } + semundo_clear(semid, -1); + wakeup((caddr_t)semaptr); + break; + + default: + return(EINVAL); + } + + if (eval == 0) + *retval = rval; + return(eval); +} + +#ifndef _SYS_SYSPROTO_H_ +struct semget_args { + key_t key; + int nsems; + int semflg; +}; +#endif + +int +semget(p, uap, retval) + struct proc *p; + register struct semget_args *uap; + int *retval; +{ + int semid, eval; + int key = uap->key; + int nsems = uap->nsems; + int semflg = uap->semflg; + struct ucred *cred = p->p_ucred; + +#ifdef SEM_DEBUG + printf("semget(0x%x, %d, 0%o)\n", key, nsems, semflg); +#endif + + if (key != IPC_PRIVATE) 
{ + for (semid = 0; semid < seminfo.semmni; semid++) { + if ((sema[semid].sem_perm.mode & SEM_ALLOC) && + sema[semid].sem_perm.key == key) + break; + } + if (semid < seminfo.semmni) { +#ifdef SEM_DEBUG + printf("found public key\n"); +#endif + if ((eval = ipcperm(cred, &sema[semid].sem_perm, + semflg & 0700))) + return(eval); + if (nsems > 0 && sema[semid].sem_nsems < nsems) { +#ifdef SEM_DEBUG + printf("too small\n"); +#endif + return(EINVAL); + } + if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { +#ifdef SEM_DEBUG + printf("not exclusive\n"); +#endif + return(EEXIST); + } + goto found; + } + } + +#ifdef SEM_DEBUG + printf("need to allocate the semid_ds\n"); +#endif + if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { + if (nsems <= 0 || nsems > seminfo.semmsl) { +#ifdef SEM_DEBUG + printf("nsems out of range (0<%d<=%d)\n", nsems, + seminfo.semmsl); +#endif + return(EINVAL); + } + if (nsems > seminfo.semmns - semtot) { +#ifdef SEM_DEBUG + printf("not enough semaphores left (need %d, got %d)\n", + nsems, seminfo.semmns - semtot); +#endif + return(ENOSPC); + } + for (semid = 0; semid < seminfo.semmni; semid++) { + if ((sema[semid].sem_perm.mode & SEM_ALLOC) == 0) + break; + } + if (semid == seminfo.semmni) { +#ifdef SEM_DEBUG + printf("no more semid_ds's available\n"); +#endif + return(ENOSPC); + } +#ifdef SEM_DEBUG + printf("semid %d is available\n", semid); +#endif + sema[semid].sem_perm.key = key; + sema[semid].sem_perm.cuid = cred->cr_uid; + sema[semid].sem_perm.uid = cred->cr_uid; + sema[semid].sem_perm.cgid = cred->cr_gid; + sema[semid].sem_perm.gid = cred->cr_gid; + sema[semid].sem_perm.mode = (semflg & 0777) | SEM_ALLOC; + sema[semid].sem_perm.seq = + (sema[semid].sem_perm.seq + 1) & 0x7fff; + sema[semid].sem_nsems = nsems; + sema[semid].sem_otime = 0; + sema[semid].sem_ctime = time.tv_sec; + sema[semid].sem_base = &sem[semtot]; + semtot += nsems; + bzero(sema[semid].sem_base, + sizeof(sema[semid].sem_base[0])*nsems); +#ifdef SEM_DEBUG + printf("sembase = 
0x%x, next = 0x%x\n", sema[semid].sem_base, + &sem[semtot]); +#endif + } else { +#ifdef SEM_DEBUG + printf("didn't find it and wasn't asked to create it\n"); +#endif + return(ENOENT); + } + +found: + *retval = IXSEQ_TO_IPCID(semid, sema[semid].sem_perm); + return(0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct semop_args { + int semid; + struct sembuf *sops; + int nsops; +}; +#endif + +int +semop(p, uap, retval) + struct proc *p; + register struct semop_args *uap; + int *retval; +{ + int semid = uap->semid; + int nsops = uap->nsops; + struct sembuf sops[MAX_SOPS]; + register struct semid_ds *semaptr; + register struct sembuf *sopptr; + register struct sem *semptr; + struct sem_undo *suptr = NULL; + struct ucred *cred = p->p_ucred; + int i, j, eval; + int do_wakeup, do_undos; + +#ifdef SEM_DEBUG + printf("call to semop(%d, 0x%x, %d)\n", semid, sops, nsops); +#endif + + semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ + + if (semid < 0 || semid >= seminfo.semmsl) + return(EINVAL); + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) + return(EINVAL); + if (semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) + return(EINVAL); + + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) { +#ifdef SEM_DEBUG + printf("eval = %d from ipaccess\n", eval); +#endif + return(eval); + } + + if (nsops > MAX_SOPS) { +#ifdef SEM_DEBUG + printf("too many sops (max=%d, nsops=%d)\n", MAX_SOPS, nsops); +#endif + return(E2BIG); + } + + if ((eval = copyin(uap->sops, &sops, nsops * sizeof(sops[0]))) != 0) { +#ifdef SEM_DEBUG + printf("eval = %d from copyin(%08x, %08x, %d)\n", eval, + uap->sops, &sops, nsops * sizeof(sops[0])); +#endif + return(eval); + } + + /* + * Loop trying to satisfy the vector of requests. + * If we reach a point where we must wait, any requests already + * performed are rolled back and we go to sleep until some other + * process wakes us up. At this point, we start all over again. 
+ * + * This ensures that from the perspective of other tasks, a set + * of requests is atomic (never partially satisfied). + */ + do_undos = 0; + + for (;;) { + do_wakeup = 0; + + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + + if (sopptr->sem_num >= semaptr->sem_nsems) + return(EFBIG); + + semptr = &semaptr->sem_base[sopptr->sem_num]; + +#ifdef SEM_DEBUG + printf("semop: semaptr=%x, sem_base=%x, semptr=%x, sem[%d]=%d : op=%d, flag=%s\n", + semaptr, semaptr->sem_base, semptr, + sopptr->sem_num, semptr->semval, sopptr->sem_op, + (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait"); +#endif + + if (sopptr->sem_op < 0) { + if (semptr->semval + sopptr->sem_op < 0) { +#ifdef SEM_DEBUG + printf("semop: can't do it now\n"); +#endif + break; + } else { + semptr->semval += sopptr->sem_op; + if (semptr->semval == 0 && + semptr->semzcnt > 0) + do_wakeup = 1; + } + if (sopptr->sem_flg & SEM_UNDO) + do_undos = 1; + } else if (sopptr->sem_op == 0) { + if (semptr->semval > 0) { +#ifdef SEM_DEBUG + printf("semop: not zero now\n"); +#endif + break; + } + } else { + if (semptr->semncnt > 0) + do_wakeup = 1; + semptr->semval += sopptr->sem_op; + if (sopptr->sem_flg & SEM_UNDO) + do_undos = 1; + } + } + + /* + * Did we get through the entire vector? + */ + if (i >= nsops) + goto done; + + /* + * No ... rollback anything that we've already done + */ +#ifdef SEM_DEBUG + printf("semop: rollback 0 through %d\n", i-1); +#endif + for (j = 0; j < i; j++) + semaptr->sem_base[sops[j].sem_num].semval -= + sops[j].sem_op; + + /* + * If the request that we couldn't satisfy has the + * NOWAIT flag set then return with EAGAIN. 
+ */ + if (sopptr->sem_flg & IPC_NOWAIT) + return(EAGAIN); + + if (sopptr->sem_op == 0) + semptr->semzcnt++; + else + semptr->semncnt++; + +#ifdef SEM_DEBUG + printf("semop: good night!\n"); +#endif + eval = tsleep((caddr_t)semaptr, (PZERO - 4) | PCATCH, + "semwait", 0); +#ifdef SEM_DEBUG + printf("semop: good morning (eval=%d)!\n", eval); +#endif + + suptr = NULL; /* sem_undo may have been reallocated */ + + if (eval != 0) + return(EINTR); +#ifdef SEM_DEBUG + printf("semop: good morning!\n"); +#endif + + /* + * Make sure that the semaphore still exists + */ + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || + semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { + /* The man page says to return EIDRM. */ + /* Unfortunately, BSD doesn't define that code! */ +#ifdef EIDRM + return(EIDRM); +#else + return(EINVAL); +#endif + } + + /* + * The semaphore is still alive. Readjust the count of + * waiting processes. + */ + if (sopptr->sem_op == 0) + semptr->semzcnt--; + else + semptr->semncnt--; + } + +done: + /* + * Process any SEM_UNDO requests. + */ + if (do_undos) { + for (i = 0; i < nsops; i++) { + /* + * We only need to deal with SEM_UNDO's for non-zero + * op's. + */ + int adjval; + + if ((sops[i].sem_flg & SEM_UNDO) == 0) + continue; + adjval = sops[i].sem_op; + if (adjval == 0) + continue; + eval = semundo_adjust(p, &suptr, semid, + sops[i].sem_num, -adjval); + if (eval == 0) + continue; + + /* + * Oh-Oh! We ran out of either sem_undo's or undo's. + * Rollback the adjustments to this point and then + * rollback the semaphore ups and down so we can return + * with an error with all structures restored. We + * rollback the undo's in the exact reverse order that + * we applied them. This guarantees that we won't run + * out of space as we roll things back out. 
+ */ + for (j = i - 1; j >= 0; j--) { + if ((sops[j].sem_flg & SEM_UNDO) == 0) + continue; + adjval = sops[j].sem_op; + if (adjval == 0) + continue; + if (semundo_adjust(p, &suptr, semid, + sops[j].sem_num, adjval) != 0) + panic("semop - can't undo undos"); + } + + for (j = 0; j < nsops; j++) + semaptr->sem_base[sops[j].sem_num].semval -= + sops[j].sem_op; + +#ifdef SEM_DEBUG + printf("eval = %d from semundo_adjust\n", eval); +#endif + return(eval); + } /* loop through the sops */ + } /* if (do_undos) */ + + /* We're definitely done - set the sempid's */ + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + semptr = &semaptr->sem_base[sopptr->sem_num]; + semptr->sempid = p->p_pid; + } + + /* Do a wakeup if any semaphore was up'd. */ + if (do_wakeup) { +#ifdef SEM_DEBUG + printf("semop: doing wakeup\n"); +#ifdef SEM_WAKEUP + sem_wakeup((caddr_t)semaptr); +#else + wakeup((caddr_t)semaptr); +#endif + printf("semop: back from wakeup\n"); +#else + wakeup((caddr_t)semaptr); +#endif + } +#ifdef SEM_DEBUG + printf("semop: done\n"); +#endif + *retval = 0; + return(0); +} + +/* + * Go through the undo structures for this process and apply the adjustments to + * semaphores. + */ +void +semexit(p) + struct proc *p; +{ + register struct sem_undo *suptr; + register struct sem_undo **supptr; + int did_something; + + /* + * If somebody else is holding the global semaphore facility lock + * then sleep until it is released. + */ + while (semlock_holder != NULL && semlock_holder != p) { +#ifdef SEM_DEBUG + printf("semaphore facility locked - sleeping ...\n"); +#endif + (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "semext", 0); + } + + did_something = 0; + + /* + * Go through the chain of undo vectors looking for one + * associated with this process. 
+ */ + + for (supptr = &semu_list; (suptr = *supptr) != NULL; + supptr = &suptr->un_next) { + if (suptr->un_proc == p) + break; + } + + if (suptr == NULL) + goto unlock; + +#ifdef SEM_DEBUG + printf("proc @%08x has undo structure with %d entries\n", p, + suptr->un_cnt); +#endif + + /* + * If there are any active undo elements then process them. + */ + if (suptr->un_cnt > 0) { + int ix; + + for (ix = 0; ix < suptr->un_cnt; ix++) { + int semid = suptr->un_ent[ix].un_id; + int semnum = suptr->un_ent[ix].un_num; + int adjval = suptr->un_ent[ix].un_adjval; + struct semid_ds *semaptr; + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) + panic("semexit - semid not allocated"); + if (semnum >= semaptr->sem_nsems) + panic("semexit - semnum out of range"); + +#ifdef SEM_DEBUG + printf("semexit: %08x id=%d num=%d(adj=%d) ; sem=%d\n", + suptr->un_proc, suptr->un_ent[ix].un_id, + suptr->un_ent[ix].un_num, + suptr->un_ent[ix].un_adjval, + semaptr->sem_base[semnum].semval); +#endif + + if (adjval < 0) { + if (semaptr->sem_base[semnum].semval < -adjval) + semaptr->sem_base[semnum].semval = 0; + else + semaptr->sem_base[semnum].semval += + adjval; + } else + semaptr->sem_base[semnum].semval += adjval; + +#ifdef SEM_WAKEUP + sem_wakeup((caddr_t)semaptr); +#else + wakeup((caddr_t)semaptr); +#endif +#ifdef SEM_DEBUG + printf("semexit: back from wakeup\n"); +#endif + } + } + + /* + * Deallocate the undo vector. + */ +#ifdef SEM_DEBUG + printf("removing vector\n"); +#endif + suptr->un_proc = NULL; + *supptr = suptr->un_next; + +unlock: + /* + * If the exiting process is holding the global semaphore facility + * lock then release it. 
+ */ + if (semlock_holder == p) { + semlock_holder = NULL; + wakeup((caddr_t)&semlock_holder); + } +} diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c new file mode 100644 index 0000000..9e93923 --- /dev/null +++ b/sys/kern/sysv_shm.c @@ -0,0 +1,622 @@ +/* $Id$ */ +/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */ + +/* + * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Adam Glass and Charles + * Hannum. + * 4. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_sysvipc.h" +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/shm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/sysent.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_object.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/vm_pager.h> +#include <vm/vm_inherit.h> + +#ifndef _SYS_SYSPROTO_H_ +struct shmat_args; +extern int shmat __P((struct proc *p, struct shmat_args *uap, int *retval)); +struct shmctl_args; +extern int shmctl __P((struct proc *p, struct shmctl_args *uap, int *retval)); +struct shmdt_args; +extern int shmdt __P((struct proc *p, struct shmdt_args *uap, int *retval)); +struct shmget_args; +extern int shmget __P((struct proc *p, struct shmget_args *uap, int *retval)); +#endif + +static void shminit __P((void *)); +SYSINIT(sysv_shm, SI_SUB_SYSV_SHM, SI_ORDER_FIRST, shminit, NULL) + +struct oshmctl_args; +static int oshmctl __P((struct proc *p, struct oshmctl_args *uap, int *retval)); +static int shmget_allocate_segment __P((struct proc *p, struct shmget_args *uap, int mode, int *retval)); +static int shmget_existing __P((struct proc *p, struct shmget_args *uap, int mode, int segnum, int *retval)); + +/* XXX casting to (sy_call_t *) is 
bogus, as usual. */ +sy_call_t *shmcalls[] = { + (sy_call_t *)shmat, (sy_call_t *)oshmctl, + (sy_call_t *)shmdt, (sy_call_t *)shmget, + (sy_call_t *)shmctl +}; + +#define SHMSEG_FREE 0x0200 +#define SHMSEG_REMOVED 0x0400 +#define SHMSEG_ALLOCATED 0x0800 +#define SHMSEG_WANTED 0x1000 + +static int shm_last_free, shm_nused, shm_committed; +struct shmid_ds *shmsegs; + +struct shm_handle { + /* vm_offset_t kva; */ + vm_object_t shm_object; +}; + +struct shmmap_state { + vm_offset_t va; + int shmid; +}; + +static void shm_deallocate_segment __P((struct shmid_ds *)); +static int shm_find_segment_by_key __P((key_t)); +static struct shmid_ds *shm_find_segment_by_shmid __P((int)); +static int shm_delete_mapping __P((struct proc *, struct shmmap_state *)); + +static int +shm_find_segment_by_key(key) + key_t key; +{ + int i; + + for (i = 0; i < shminfo.shmmni; i++) + if ((shmsegs[i].shm_perm.mode & SHMSEG_ALLOCATED) && + shmsegs[i].shm_perm.key == key) + return i; + return -1; +} + +static struct shmid_ds * +shm_find_segment_by_shmid(shmid) + int shmid; +{ + int segnum; + struct shmid_ds *shmseg; + + segnum = IPCID_TO_IX(shmid); + if (segnum < 0 || segnum >= shminfo.shmmni) + return NULL; + shmseg = &shmsegs[segnum]; + if ((shmseg->shm_perm.mode & (SHMSEG_ALLOCATED | SHMSEG_REMOVED)) + != SHMSEG_ALLOCATED || + shmseg->shm_perm.seq != IPCID_TO_SEQ(shmid)) + return NULL; + return shmseg; +} + +static void +shm_deallocate_segment(shmseg) + struct shmid_ds *shmseg; +{ + struct shm_handle *shm_handle; + size_t size; + + shm_handle = shmseg->shm_internal; + vm_object_deallocate(shm_handle->shm_object); + free((caddr_t)shm_handle, M_SHM); + shmseg->shm_internal = NULL; + size = round_page(shmseg->shm_segsz); + shm_committed -= btoc(size); + shm_nused--; + shmseg->shm_perm.mode = SHMSEG_FREE; +} + +static int +shm_delete_mapping(p, shmmap_s) + struct proc *p; + struct shmmap_state *shmmap_s; +{ + struct shmid_ds *shmseg; + int segnum, result; + size_t size; + + segnum = 
IPCID_TO_IX(shmmap_s->shmid); + shmseg = &shmsegs[segnum]; + size = round_page(shmseg->shm_segsz); + result = vm_map_remove(&p->p_vmspace->vm_map, shmmap_s->va, shmmap_s->va + size); + if (result != KERN_SUCCESS) + return EINVAL; + shmmap_s->shmid = -1; + shmseg->shm_dtime = time.tv_sec; + if ((--shmseg->shm_nattch <= 0) && + (shmseg->shm_perm.mode & SHMSEG_REMOVED)) { + shm_deallocate_segment(shmseg); + shm_last_free = segnum; + } + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmdt_args { + void *shmaddr; +}; +#endif + +int +shmdt(p, uap, retval) + struct proc *p; + struct shmdt_args *uap; + int *retval; +{ + struct shmmap_state *shmmap_s; + int i; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + if (shmmap_s == NULL) + return EINVAL; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1 && + shmmap_s->va == (vm_offset_t)uap->shmaddr) + break; + if (i == shminfo.shmseg) + return EINVAL; + return shm_delete_mapping(p, shmmap_s); +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmat_args { + int shmid; + void *shmaddr; + int shmflg; +}; +#endif + +int +shmat(p, uap, retval) + struct proc *p; + struct shmat_args *uap; + int *retval; +{ + int error, i, flags; + struct ucred *cred = p->p_ucred; + struct shmid_ds *shmseg; + struct shmmap_state *shmmap_s = NULL; + struct shm_handle *shm_handle; + vm_offset_t attach_va; + vm_prot_t prot; + vm_size_t size; + int rv; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + if (shmmap_s == NULL) { + size = shminfo.shmseg * sizeof(struct shmmap_state); + shmmap_s = malloc(size, M_SHM, M_WAITOK); + for (i = 0; i < shminfo.shmseg; i++) + shmmap_s[i].shmid = -1; + p->p_vmspace->vm_shm = (caddr_t)shmmap_s; + } + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) + return EINVAL; + error = ipcperm(cred, &shmseg->shm_perm, + (uap->shmflg & SHM_RDONLY) ? 
IPC_R : IPC_R|IPC_W); + if (error) + return error; + for (i = 0; i < shminfo.shmseg; i++) { + if (shmmap_s->shmid == -1) + break; + shmmap_s++; + } + if (i >= shminfo.shmseg) + return EMFILE; + size = round_page(shmseg->shm_segsz); + prot = VM_PROT_READ; + if ((uap->shmflg & SHM_RDONLY) == 0) + prot |= VM_PROT_WRITE; + flags = MAP_ANON | MAP_SHARED; + if (uap->shmaddr) { + flags |= MAP_FIXED; + if (uap->shmflg & SHM_RND) + attach_va = (vm_offset_t)uap->shmaddr & ~(SHMLBA-1); + else if (((vm_offset_t)uap->shmaddr & (SHMLBA-1)) == 0) + attach_va = (vm_offset_t)uap->shmaddr; + else + return EINVAL; + } else { + /* This is just a hint to vm_map_find() about where to put it. */ + attach_va = round_page(p->p_vmspace->vm_taddr + MAXTSIZ + MAXDSIZ); + } + + shm_handle = shmseg->shm_internal; + vm_object_reference(shm_handle->shm_object); + rv = vm_map_find(&p->p_vmspace->vm_map, shm_handle->shm_object, + 0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0); + if (rv != KERN_SUCCESS) { + return ENOMEM; + } + vm_map_inherit(&p->p_vmspace->vm_map, + attach_va, attach_va + size, VM_INHERIT_SHARE); + + shmmap_s->va = attach_va; + shmmap_s->shmid = uap->shmid; + shmseg->shm_lpid = p->p_pid; + shmseg->shm_atime = time.tv_sec; + shmseg->shm_nattch++; + *retval = attach_va; + return 0; +} + +struct oshmid_ds { + struct ipc_perm shm_perm; /* operation perms */ + int shm_segsz; /* size of segment (bytes) */ + ushort shm_cpid; /* pid, creator */ + ushort shm_lpid; /* pid, last operation */ + short shm_nattch; /* no. 
of current attaches */ + time_t shm_atime; /* last attach time */ + time_t shm_dtime; /* last detach time */ + time_t shm_ctime; /* last change time */ + void *shm_handle; /* internal handle for shm segment */ +}; + +struct oshmctl_args { + int shmid; + int cmd; + struct oshmid_ds *ubuf; +}; + +static int +oshmctl(p, uap, retval) + struct proc *p; + struct oshmctl_args *uap; + int *retval; +{ +#ifdef COMPAT_43 + int error; + struct ucred *cred = p->p_ucred; + struct shmid_ds *shmseg; + struct oshmid_ds outbuf; + + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) + return EINVAL; + switch (uap->cmd) { + case IPC_STAT: + error = ipcperm(cred, &shmseg->shm_perm, IPC_R); + if (error) + return error; + outbuf.shm_perm = shmseg->shm_perm; + outbuf.shm_segsz = shmseg->shm_segsz; + outbuf.shm_cpid = shmseg->shm_cpid; + outbuf.shm_lpid = shmseg->shm_lpid; + outbuf.shm_nattch = shmseg->shm_nattch; + outbuf.shm_atime = shmseg->shm_atime; + outbuf.shm_dtime = shmseg->shm_dtime; + outbuf.shm_ctime = shmseg->shm_ctime; + outbuf.shm_handle = shmseg->shm_internal; + error = copyout((caddr_t)&outbuf, uap->ubuf, sizeof(outbuf)); + if (error) + return error; + break; + default: + /* XXX casting to (sy_call_t *) is bogus, as usual. 
*/ + return ((sy_call_t *)shmctl)(p, uap, retval); + } + return 0; +#else + return EINVAL; +#endif +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmctl_args { + int shmid; + int cmd; + struct shmid_ds *buf; +}; +#endif + +int +shmctl(p, uap, retval) + struct proc *p; + struct shmctl_args *uap; + int *retval; +{ + int error; + struct ucred *cred = p->p_ucred; + struct shmid_ds inbuf; + struct shmid_ds *shmseg; + + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) + return EINVAL; + switch (uap->cmd) { + case IPC_STAT: + error = ipcperm(cred, &shmseg->shm_perm, IPC_R); + if (error) + return error; + error = copyout((caddr_t)shmseg, uap->buf, sizeof(inbuf)); + if (error) + return error; + break; + case IPC_SET: + error = ipcperm(cred, &shmseg->shm_perm, IPC_M); + if (error) + return error; + error = copyin(uap->buf, (caddr_t)&inbuf, sizeof(inbuf)); + if (error) + return error; + shmseg->shm_perm.uid = inbuf.shm_perm.uid; + shmseg->shm_perm.gid = inbuf.shm_perm.gid; + shmseg->shm_perm.mode = + (shmseg->shm_perm.mode & ~ACCESSPERMS) | + (inbuf.shm_perm.mode & ACCESSPERMS); + shmseg->shm_ctime = time.tv_sec; + break; + case IPC_RMID: + error = ipcperm(cred, &shmseg->shm_perm, IPC_M); + if (error) + return error; + shmseg->shm_perm.key = IPC_PRIVATE; + shmseg->shm_perm.mode |= SHMSEG_REMOVED; + if (shmseg->shm_nattch <= 0) { + shm_deallocate_segment(shmseg); + shm_last_free = IPCID_TO_IX(uap->shmid); + } + break; +#if 0 + case SHM_LOCK: + case SHM_UNLOCK: +#endif + default: + return EINVAL; + } + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmget_args { + key_t key; + size_t size; + int shmflg; +}; +#endif + +static int +shmget_existing(p, uap, mode, segnum, retval) + struct proc *p; + struct shmget_args *uap; + int mode; + int segnum; + int *retval; +{ + struct shmid_ds *shmseg; + struct ucred *cred = p->p_ucred; + int error; + + shmseg = &shmsegs[segnum]; + if (shmseg->shm_perm.mode & SHMSEG_REMOVED) { + /* + * This segment is in the process of being 
allocated. Wait + * until it's done, and look the key up again (in case the + * allocation failed or it was freed). + */ + shmseg->shm_perm.mode |= SHMSEG_WANTED; + error = tsleep((caddr_t)shmseg, PLOCK | PCATCH, "shmget", 0); + if (error) + return error; + return EAGAIN; + } + error = ipcperm(cred, &shmseg->shm_perm, mode); + if (error) + return error; + if (uap->size && uap->size > shmseg->shm_segsz) + return EINVAL; + if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) + return EEXIST; + *retval = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); + return 0; +} + +static int +shmget_allocate_segment(p, uap, mode, retval) + struct proc *p; + struct shmget_args *uap; + int mode; + int *retval; +{ + int i, segnum, shmid, size; + struct ucred *cred = p->p_ucred; + struct shmid_ds *shmseg; + struct shm_handle *shm_handle; + + if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax) + return EINVAL; + if (shm_nused >= shminfo.shmmni) /* any shmids left? */ + return ENOSPC; + size = round_page(uap->size); + if (shm_committed + btoc(size) > shminfo.shmall) + return ENOMEM; + if (shm_last_free < 0) { + for (i = 0; i < shminfo.shmmni; i++) + if (shmsegs[i].shm_perm.mode & SHMSEG_FREE) + break; + if (i == shminfo.shmmni) + panic("shmseg free count inconsistent"); + segnum = i; + } else { + segnum = shm_last_free; + shm_last_free = -1; + } + shmseg = &shmsegs[segnum]; + /* + * In case we sleep in malloc(), mark the segment present but deleted + * so that noone else tries to create the same key. + */ + shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED; + shmseg->shm_perm.key = uap->key; + shmseg->shm_perm.seq = (shmseg->shm_perm.seq + 1) & 0x7fff; + shm_handle = (struct shm_handle *) + malloc(sizeof(struct shm_handle), M_SHM, M_WAITOK); + shmid = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); + + /* + * We make sure that we have allocated a pager before we need + * to. 
+ */ + shm_handle->shm_object = + vm_pager_allocate(OBJT_SWAP, 0, OFF_TO_IDX(size), + VM_PROT_DEFAULT, 0); + shmseg->shm_internal = shm_handle; + shmseg->shm_perm.cuid = shmseg->shm_perm.uid = cred->cr_uid; + shmseg->shm_perm.cgid = shmseg->shm_perm.gid = cred->cr_gid; + shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) | + (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; + shmseg->shm_segsz = uap->size; + shmseg->shm_cpid = p->p_pid; + shmseg->shm_lpid = shmseg->shm_nattch = 0; + shmseg->shm_atime = shmseg->shm_dtime = 0; + shmseg->shm_ctime = time.tv_sec; + shm_committed += btoc(size); + shm_nused++; + if (shmseg->shm_perm.mode & SHMSEG_WANTED) { + /* + * Somebody else wanted this key while we were asleep. Wake + * them up now. + */ + shmseg->shm_perm.mode &= ~SHMSEG_WANTED; + wakeup((caddr_t)shmseg); + } + *retval = shmid; + return 0; +} + +int +shmget(p, uap, retval) + struct proc *p; + struct shmget_args *uap; + int *retval; +{ + int segnum, mode, error; + + mode = uap->shmflg & ACCESSPERMS; + if (uap->key != IPC_PRIVATE) { + again: + segnum = shm_find_segment_by_key(uap->key); + if (segnum >= 0) { + error = shmget_existing(p, uap, mode, segnum, retval); + if (error == EAGAIN) + goto again; + return error; + } + if ((uap->shmflg & IPC_CREAT) == 0) + return ENOENT; + } + return shmget_allocate_segment(p, uap, mode, retval); +} + +int +shmsys(p, uap, retval) + struct proc *p; + /* XXX actually varargs. 
*/ + struct shmsys_args /* { + u_int which; + int a2; + int a3; + int a4; + } */ *uap; + int *retval; +{ + + if (uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0])) + return EINVAL; + return ((*shmcalls[uap->which])(p, &uap->a2, retval)); +} + +void +shmfork(p1, p2) + struct proc *p1, *p2; +{ + struct shmmap_state *shmmap_s; + size_t size; + int i; + + size = shminfo.shmseg * sizeof(struct shmmap_state); + shmmap_s = malloc(size, M_SHM, M_WAITOK); + bcopy((caddr_t)p1->p_vmspace->vm_shm, (caddr_t)shmmap_s, size); + p2->p_vmspace->vm_shm = (caddr_t)shmmap_s; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shmsegs[IPCID_TO_IX(shmmap_s->shmid)].shm_nattch++; +} + +void +shmexit(p) + struct proc *p; +{ + struct shmmap_state *shmmap_s; + int i; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shm_delete_mapping(p, shmmap_s); + free((caddr_t)p->p_vmspace->vm_shm, M_SHM); + p->p_vmspace->vm_shm = NULL; +} + +void +shminit(dummy) + void *dummy; +{ + int i; + for (i = 0; i < shminfo.shmmni; i++) { + shmsegs[i].shm_perm.mode = SHMSEG_FREE; + shmsegs[i].shm_perm.seq = 0; + } + shm_last_free = 0; + shm_nused = 0; + shm_committed = 0; +} diff --git a/sys/kern/tty.c b/sys/kern/tty.c index 5d698b1..f6e14f9 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -35,39 +35,82 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)tty.c 8.13 (Berkeley) 1/9/95 + * @(#)tty.c 8.8 (Berkeley) 1/21/94 + * $Id: tty.c,v 1.93 1997/03/23 03:36:26 bde Exp $ */ +/*- + * TODO: + * o Fix races for sending the start char in ttyflush(). + * o Handle inter-byte timeout for "MIN > 0, TIME > 0" in ttyselect(). + * With luck, there will be MIN chars before select() returns(). + * o Handle CLOCAL consistently for ptys. Perhaps disallow setting it. + * o Don't allow input in TS_ZOMBIE case. It would be visible through + * FIONREAD. 
+ * o Do the new sio locking stuff here and use it to avoid special + * case for EXTPROC? + * o Lock PENDIN too? + * o Move EXTPROC and/or PENDIN to t_state? + * o Wrap most of ttioctl in spltty/splx. + * o Implement TIOCNOTTY or remove it from <sys/ioctl.h>. + * o Send STOP if IXOFF is toggled off while TS_TBLOCK is set. + * o Don't allow certain termios flags to affect disciplines other + * than TTYDISC. Cancel their effects before switch disciplines + * and ignore them if they are set while we are in another + * discipline. + * o Handle c_ispeed = 0 to c_ispeed = c_ospeed conversion here instead + * of in drivers and fix drivers that write to tp->t_termios. + * o Check for TS_CARR_ON being set while everything is closed and not + * waiting for carrier. TS_CARR_ON isn't cleared if nothing is open, + * so it would live until the next open even if carrier drops. + * o Restore TS_WOPEN since it is useful in pstat. It must be cleared + * only when _all_ openers leave open(). + */ + +#include "snp.h" +#include "opt_uconsole.h" + #include <sys/param.h> #include <sys/systm.h> -#include <sys/ioctl.h> +#include <sys/filio.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif #include <sys/proc.h> #define TTYDEFCHARS #include <sys/tty.h> #undef TTYDEFCHARS -#include <sys/file.h> +#include <sys/fcntl.h> #include <sys/conf.h> #include <sys/dkstat.h> #include <sys/uio.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/syslog.h> +#include <sys/signalvar.h> +#include <sys/resourcevar.h> +#include <sys/malloc.h> +#if NSNP > 0 +#include <sys/snoop.h> +#endif #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> static int proc_compare __P((struct proc *p1, struct proc *p2)); -static int ttnread __P((struct tty *)); -static void ttyblock __P((struct tty *tp)); -static void ttyecho __P((int, struct tty *tp)); -static void ttyrubo __P((struct tty *, 
int)); - -/* Symbolic sleep message strings. */ -char ttclos[] = "ttycls"; -char ttopen[] = "ttyopn"; -char ttybg[] = "ttybg"; -char ttybuf[] = "ttybuf"; -char ttyin[] = "ttyin"; -char ttyout[] = "ttyout"; +static int ttnread __P((struct tty *tp)); +static void ttyecho __P((int c, struct tty *tp)); +static int ttyoutput __P((int c, register struct tty *tp)); +static void ttypend __P((struct tty *tp)); +static void ttyretype __P((struct tty *tp)); +static void ttyrub __P((int c, struct tty *tp)); +static void ttyrubo __P((struct tty *tp, int cnt)); +static void ttyunblock __P((struct tty *tp)); +static int ttywflush __P((struct tty *tp)); /* * Table with character classes and parity. The 8th bit indicates parity, @@ -95,7 +138,7 @@ char ttyout[] = "ttyout"; #define TB TAB #define VT VTAB -char const char_type[] = { +static u_char const char_type[] = { E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */ O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */ O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */ @@ -148,6 +191,17 @@ char const char_type[] = { #define ISSET(t, f) ((t) & (f)) /* + * Input control starts when we would not be able to fit the maximum + * contents of the ping-pong buffers and finishes when we would be able + * to fit that much plus 1/8 more. + */ +#define I_HIGH_WATER (TTYHOG - 2 * 256) /* XXX */ +#define I_LOW_WATER ((TTYHOG - 2 * 256) * 7 / 8) /* XXX */ + +#undef MAX_INPUT /* XXX wrong in <sys/syslimits.h> */ +#define MAX_INPUT TTYHOG + +/* * Initial open of tty, or (re)entry to standard tty line discipline. */ int @@ -161,9 +215,20 @@ ttyopen(device, tp) tp->t_dev = device; if (!ISSET(tp->t_state, TS_ISOPEN)) { SET(tp->t_state, TS_ISOPEN); + if (ISSET(tp->t_cflag, CLOCAL)) + SET(tp->t_state, TS_CONNECTED); bzero(&tp->t_winsize, sizeof(tp->t_winsize)); } - CLR(tp->t_state, TS_WOPEN); + + /* + * Initialize or restore a cblock allocation policy suitable for + * the standard line discipline. 
+ */ + clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512); + clist_alloc_cblocks(&tp->t_outq, TTMAXHIWAT + OBUFSIZ + 100, + TTMAXHIWAT + OBUFSIZ + 100); + clist_alloc_cblocks(&tp->t_rawq, TTYHOG, TTYHOG); + splx(s); return (0); } @@ -172,22 +237,36 @@ ttyopen(device, tp) * Handle close() on a tty line: flush and set to initial state, * bumping generation number so that pending read/write calls * can detect recycling of the tty. + * XXX our caller should have done `spltty(); l_close(); ttyclose();' + * and l_close() should have flushed, but we repeat the spltty() and + * the flush in case there are buggy callers. */ int ttyclose(tp) register struct tty *tp; { - extern struct tty *constty; /* Temporary virtual console. */ + int s; + s = spltty(); if (constty == tp) constty = NULL; ttyflush(tp, FREAD | FWRITE); + clist_free_cblocks(&tp->t_canq); + clist_free_cblocks(&tp->t_outq); + clist_free_cblocks(&tp->t_rawq); + +#if NSNP > 0 + if (ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpdown((struct snoop *)tp->t_sc); +#endif tp->t_gen++; + tp->t_line = TTYDISC; tp->t_pgrp = NULL; tp->t_session = NULL; tp->t_state = 0; + splx(s); return (0); } @@ -197,10 +276,10 @@ ttyclose(tp) } /* Is 'c' a line delimiter ("break" character)? */ -#define TTBREAKC(c) \ - ((c) == '\n' || ((c) == cc[VEOF] || \ - (c) == cc[VEOL] || (c) == cc[VEOL2]) && (c) != _POSIX_VDISABLE) - +#define TTBREAKC(c, lflag) \ + ((c) == '\n' || (((c) == cc[VEOF] || \ + (c) == cc[VEOL] || ((c) == cc[VEOL2] && lflag & IEXTEN)) && \ + (c) != _POSIX_VDISABLE)) /* * Process input of a single character received on a tty. 
@@ -210,8 +289,8 @@ ttyinput(c, tp) register int c; register struct tty *tp; { - register int iflag, lflag; - register u_char *cc; + register tcflag_t iflag, lflag; + register cc_t *cc; int i, err; /* @@ -232,26 +311,44 @@ ttyinput(c, tp) } ++tk_nin; + /* + * Block further input iff: + * current input > threshold AND input is available to user program + * AND input flow control is enabled and not yet invoked. + * The 3 is slop for PARMRK. + */ + iflag = tp->t_iflag; + if (tp->t_rawq.c_cc + tp->t_canq.c_cc > I_HIGH_WATER - 3 && + (!ISSET(lflag, ICANON) || tp->t_canq.c_cc != 0) && + (ISSET(tp->t_cflag, CRTS_IFLOW) || ISSET(iflag, IXOFF)) && + !ISSET(tp->t_state, TS_TBLOCK)) + ttyblock(tp); + /* Handle exceptional conditions (break, parity, framing). */ cc = tp->t_cc; - iflag = tp->t_iflag; - if (err = (ISSET(c, TTY_ERRORMASK))) { + err = (ISSET(c, TTY_ERRORMASK)); + if (err) { CLR(c, TTY_ERRORMASK); - if (ISSET(err, TTY_FE) && !c) { /* Break. */ + if (ISSET(err, TTY_BI)) { if (ISSET(iflag, IGNBRK)) + return (0); + if (ISSET(iflag, BRKINT)) { + ttyflush(tp, FREAD | FWRITE); + pgsignal(tp->t_pgrp, SIGINT, 1); goto endcase; - else if (ISSET(iflag, BRKINT) && - ISSET(lflag, ISIG) && - (cc[VINTR] != _POSIX_VDISABLE)) - c = cc[VINTR]; - else if (ISSET(iflag, PARMRK)) + } + if (ISSET(iflag, PARMRK)) goto parmrk; - } else if (ISSET(err, TTY_PE) && - ISSET(iflag, INPCK) || ISSET(err, TTY_FE)) { + } else if ((ISSET(err, TTY_PE) && ISSET(iflag, INPCK)) + || ISSET(err, TTY_FE)) { if (ISSET(iflag, IGNPAR)) - goto endcase; + return (0); else if (ISSET(iflag, PARMRK)) { -parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); +parmrk: + if (tp->t_rawq.c_cc + tp->t_canq.c_cc > + MAX_INPUT - 3) + goto input_overflow; + (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); (void)putc(0 | TTY_QUOTE, &tp->t_rawq); (void)putc(c | TTY_QUOTE, &tp->t_rawq); goto endcase; @@ -259,11 +356,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); c = 0; } } - /* - * In tandem mode, check high water mark. 
- */ - if (ISSET(iflag, IXOFF)) - ttyblock(tp); + if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP)) CLR(c, 0x80); if (!ISSET(lflag, EXTPROC)) { @@ -341,7 +434,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); #ifdef sun4c /* XXX */ (*tp->t_stop)(tp, 0); #else - (*cdevsw[major(tp->t_dev)].d_stop)(tp, + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0); #endif return (0); @@ -361,7 +454,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); */ if (c == '\r') { if (ISSET(iflag, IGNCR)) - goto endcase; + return (0); else if (ISSET(iflag, ICRNL)) c = '\n'; } else if (c == '\n' && ISSET(iflag, INLCR)) @@ -403,8 +496,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); /* * word erase (^W) */ - if (CCEQ(cc[VWERASE], c)) { - int alt = ISSET(lflag, ALTWERASE); + if (CCEQ(cc[VWERASE], c) && ISSET(lflag, IEXTEN)) { int ctype; /* @@ -436,21 +528,21 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); if (c == -1) goto endcase; } while (c != ' ' && c != '\t' && - (alt == 0 || ISALPHA(c) == ctype)); + (!ISSET(lflag, ALTWERASE) || ISALPHA(c) == ctype)); (void)putc(c, &tp->t_rawq); goto endcase; } /* * reprint line (^R) */ - if (CCEQ(cc[VREPRINT], c)) { + if (CCEQ(cc[VREPRINT], c) && ISSET(lflag, IEXTEN)) { ttyretype(tp); goto endcase; } /* * ^T - kernel info and generate SIGINFO */ - if (CCEQ(cc[VSTATUS], c)) { + if (CCEQ(cc[VSTATUS], c) && ISSET(lflag, IEXTEN)) { if (ISSET(lflag, ISIG)) pgsignal(tp->t_pgrp, SIGINFO, 1); if (!ISSET(lflag, NOKERNINFO)) @@ -461,14 +553,19 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); /* * Check for input buffer overflow */ - if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= TTYHOG) { + if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= MAX_INPUT) { +input_overflow: if (ISSET(iflag, IMAXBEL)) { if (tp->t_outq.c_cc < tp->t_hiwat) (void)ttyoutput(CTRL('g'), tp); - } else - ttyflush(tp, FREAD | FWRITE); + } goto endcase; } + + if ( c == 0377 && ISSET(iflag, PARMRK) && !ISSET(iflag, ISTRIP) + && ISSET(iflag, IGNBRK|IGNPAR) != (IGNBRK|IGNPAR)) + 
(void)putc(0377 | TTY_QUOTE, &tp->t_rawq); + /* * Put data char in q for user and * wakeup on seeing a line delimiter. @@ -479,7 +576,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); ttyecho(c, tp); goto endcase; } - if (TTBREAKC(c)) { + if (TTBREAKC(c, lflag)) { tp->t_rocount = 0; catq(&tp->t_rawq, &tp->t_canq); ttwakeup(tp); @@ -498,7 +595,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); /* * Place the cursor over the '^' of the ^D. */ - i = min(2, tp->t_column - i); + i = imin(2, tp->t_column - i); while (i > 0) { (void)ttyoutput('\b', tp); i--; @@ -525,13 +622,13 @@ startoutput: * Returns < 0 if succeeds, otherwise returns char to resend. * Must be recursive. */ -int +static int ttyoutput(c, tp) register int c; register struct tty *tp; { - register long oflag; - register int notout, col, s; + register tcflag_t oflag; + register int col, s; oflag = tp->t_oflag; if (!ISSET(oflag, OPOST)) { @@ -553,18 +650,15 @@ ttyoutput(c, tp) if (c == '\t' && ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) { c = 8 - (tp->t_column & 7); - if (ISSET(tp->t_lflag, FLUSHO)) { - notout = 0; - } else { + if (!ISSET(tp->t_lflag, FLUSHO)) { s = spltty(); /* Don't interrupt tabs. */ - notout = b_to_q(" ", c, &tp->t_outq); - c -= notout; + c -= b_to_q(" ", c, &tp->t_outq); tk_nout += c; tp->t_outcc += c; splx(s); } tp->t_column += c; - return (notout ? '\t' : -1); + return (c ? -1 : '\t'); } if (c == CEOT && ISSET(oflag, ONOEOT)) return (-1); @@ -616,12 +710,9 @@ ttyoutput(c, tp) int ttioctl(tp, cmd, data, flag) register struct tty *tp; - u_long cmd; + int cmd, flag; void *data; - int flag; { - extern struct tty *constty; /* Temporary virtual console. 
*/ - extern int nlinesw; register struct proc *p; int s, error; @@ -637,6 +728,7 @@ ttioctl(tp, cmd, data, flag) #ifdef notdef case TIOCSPGRP: #endif + case TIOCSTAT: case TIOCSTI: case TIOCSWINSZ: #if defined(COMPAT_43) || defined(COMPAT_SUNOS) @@ -649,13 +741,16 @@ ttioctl(tp, cmd, data, flag) case TIOCSETP: case TIOCSLTC: #endif - while (isbackground(curproc, tp) && - p->p_pgrp->pg_jobc && (p->p_flag & P_PPWAIT) == 0 && + while (isbackground(p, tp) && + (p->p_flag & P_PPWAIT) == 0 && (p->p_sigignore & sigmask(SIGTTOU)) == 0 && (p->p_sigmask & sigmask(SIGTTOU)) == 0) { + if (p->p_pgrp->pg_jobc == 0) + return (EIO); pgsignal(p->p_pgrp, SIGTTOU, 1); - if (error = ttysleep(tp, - &lbolt, TTOPRI | PCATCH, ttybg, 0)) + error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, "ttybg1", + 0); + if (error) return (error); } break; @@ -673,7 +768,9 @@ ttioctl(tp, cmd, data, flag) case FIONBIO: /* set/clear non-blocking i/o */ break; /* XXX: delete. */ case FIONREAD: /* get # bytes to read */ + s = spltty(); *(int *)data = ttnread(tp); + splx(s); break; case TIOCEXCL: /* set exclusive use of tty */ s = spltty(); @@ -693,8 +790,7 @@ ttioctl(tp, cmd, data, flag) case TIOCCONS: /* become virtual console */ if (*(int *)data) { if (constty && constty != tp && - ISSET(constty->t_state, TS_CARR_ON | TS_ISOPEN) == - (TS_CARR_ON | TS_ISOPEN)) + ISSET(constty->t_state, TS_CONNECTED)) return (EBUSY); #ifndef UCONSOLE if (error = suser(p->p_ucred, &p->p_acflag)) @@ -705,7 +801,8 @@ ttioctl(tp, cmd, data, flag) constty = NULL; break; case TIOCDRAIN: /* wait till output drained */ - if (error = ttywait(tp)) + error = ttywait(tp); + if (error) return (error); break; case TIOCGETA: { /* get termios struct */ @@ -745,9 +842,12 @@ ttioctl(tp, cmd, data, flag) case TIOCSETAF: { /* drn out, fls in, set */ register struct termios *t = (struct termios *)data; + if (t->c_ispeed < 0 || t->c_ospeed < 0) + return (EINVAL); s = spltty(); if (cmd == TIOCSETAW || cmd == TIOCSETAF) { - if (error = ttywait(tp)) { + 
error = ttywait(tp); + if (error) { splx(s); return (error); } @@ -761,35 +861,56 @@ ttioctl(tp, cmd, data, flag) if (tp->t_param && (error = (*tp->t_param)(tp, t))) { splx(s); return (error); - } else { - if (!ISSET(tp->t_state, TS_CARR_ON) && - ISSET(tp->t_cflag, CLOCAL) && - !ISSET(t->c_cflag, CLOCAL)) { - CLR(tp->t_state, TS_ISOPEN); - SET(tp->t_state, TS_WOPEN); - ttwakeup(tp); - } - tp->t_cflag = t->c_cflag; - tp->t_ispeed = t->c_ispeed; - tp->t_ospeed = t->c_ospeed; } + if (ISSET(t->c_cflag, CLOCAL) && + !ISSET(tp->t_cflag, CLOCAL)) { + /* + * XXX disconnections would be too hard to + * get rid of without this kludge. The only + * way to get rid of controlling terminals + * is to exit from the session leader. + */ + CLR(tp->t_state, TS_ZOMBIE); + + wakeup(TSA_CARR_ON(tp)); + ttwakeup(tp); + ttwwakeup(tp); + } + if ((ISSET(tp->t_state, TS_CARR_ON) || + ISSET(t->c_cflag, CLOCAL)) && + !ISSET(tp->t_state, TS_ZOMBIE)) + SET(tp->t_state, TS_CONNECTED); + else + CLR(tp->t_state, TS_CONNECTED); + tp->t_cflag = t->c_cflag; + tp->t_ispeed = t->c_ispeed; + tp->t_ospeed = t->c_ospeed; ttsetwater(tp); } - if (cmd != TIOCSETAF) { - if (ISSET(t->c_lflag, ICANON) != - ISSET(tp->t_lflag, ICANON)) - if (ISSET(t->c_lflag, ICANON)) { - SET(tp->t_lflag, PENDIN); - ttwakeup(tp); - } else { - struct clist tq; - + if (ISSET(t->c_lflag, ICANON) != ISSET(tp->t_lflag, ICANON) && + cmd != TIOCSETAF) { + if (ISSET(t->c_lflag, ICANON)) + SET(tp->t_lflag, PENDIN); + else { + /* + * XXX we really shouldn't allow toggling + * ICANON while we're in a non-termios line + * discipline. Now we have to worry about + * panicing for a null queue. + */ + if (tp->t_canq.c_cbreserved > 0 && + tp->t_rawq.c_cbreserved > 0) { catq(&tp->t_rawq, &tp->t_canq); - tq = tp->t_rawq; - tp->t_rawq = tp->t_canq; - tp->t_canq = tq; - CLR(tp->t_lflag, PENDIN); + /* + * XXX the queue limits may be + * different, so the old queue + * swapping method no longer works. 
+ */ + catq(&tp->t_canq, &tp->t_rawq); } + CLR(tp->t_lflag, PENDIN); + } + ttwakeup(tp); } tp->t_iflag = t->c_iflag; tp->t_oflag = t->c_oflag; @@ -801,6 +922,9 @@ ttioctl(tp, cmd, data, flag) else CLR(t->c_lflag, EXTPROC); tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN); + if (t->c_cc[VMIN] != tp->t_cc[VMIN] || + t->c_cc[VTIME] != tp->t_cc[VTIME]) + ttwakeup(tp); bcopy(t->c_cc, tp->t_cc, sizeof(t->c_cc)); splx(s); break; @@ -840,7 +964,9 @@ ttioctl(tp, cmd, data, flag) return (EPERM); if (p->p_ucred->cr_uid && !isctty(p, tp)) return (EACCES); + s = spltty(); (*linesw[tp->t_line].l_rint)(*(u_char *)data, tp); + splx(s); break; case TIOCSTOP: /* stop output, like ^S */ s = spltty(); @@ -849,7 +975,7 @@ ttioctl(tp, cmd, data, flag) #ifdef sun4c /* XXX */ (*tp->t_stop)(tp, 0); #else - (*cdevsw[major(tp->t_dev)].d_stop)(tp, 0); + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0); #endif } splx(s); @@ -857,8 +983,8 @@ ttioctl(tp, cmd, data, flag) case TIOCSCTTY: /* become controlling tty */ /* Session ctty vnode pointer set in vnode layer. 
*/ if (!SESS_LEADER(p) || - (p->p_session->s_ttyvp || tp->t_session) && - (tp->t_session != p->p_session)) + ((p->p_session->s_ttyvp || tp->t_session) && + (tp->t_session != p->p_session))) return (EPERM); tp->t_session = p->p_session; tp->t_pgrp = p->p_pgrp; @@ -875,6 +1001,11 @@ ttioctl(tp, cmd, data, flag) tp->t_pgrp = pgrp; break; } + case TIOCSTAT: /* simulate control-T */ + s = spltty(); + ttyinfo(tp); + splx(s); + break; case TIOCSWINSZ: /* set window size */ if (bcmp((caddr_t)&tp->t_winsize, data, sizeof (struct winsize))) { @@ -882,6 +1013,17 @@ ttioctl(tp, cmd, data, flag) pgsignal(tp->t_pgrp, SIGWINCH, 1); } break; + case TIOCSDRAINWAIT: + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + tp->t_timeout = *(int *)data * hz; + wakeup(TSA_OCOMPLETE(tp)); + wakeup(TSA_OLOWAT(tp)); + break; + case TIOCGDRAINWAIT: + *(int *)data = tp->t_timeout / hz; + break; default: #if defined(COMPAT_43) || defined(COMPAT_SUNOS) return (ttcompat(tp, cmd, data, flag)); @@ -893,27 +1035,27 @@ ttioctl(tp, cmd, data, flag) } int -ttselect(device, rw, p) - dev_t device; +ttyselect(tp, rw, p) + struct tty *tp; int rw; struct proc *p; { - register struct tty *tp; - int nread, s; + int s; - tp = &cdevsw[major(device)].d_ttys[minor(device)]; + if (tp == NULL) + return (ENXIO); s = spltty(); switch (rw) { case FREAD: - nread = ttnread(tp); - if (nread > 0 || !ISSET(tp->t_cflag, CLOCAL) && - !ISSET(tp->t_state, TS_CARR_ON)) + if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE)) goto win; selrecord(p, &tp->t_rsel); break; case FWRITE: - if (tp->t_outq.c_cc <= tp->t_lowat) { + if ((tp->t_outq.c_cc <= tp->t_lowat && + ISSET(tp->t_state, TS_CONNECTED)) + || ISSET(tp->t_state, TS_ZOMBIE)) { win: splx(s); return (1); } @@ -924,6 +1066,22 @@ win: splx(s); return (0); } +/* + * This is a wrapper for compatibility with the select vector used by + * cdevsw. It relies on a proper xxxdevtotty routine. 
+ */ +int +ttselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + return ttyselect((*cdevsw[major(dev)]->d_devtotty)(dev), rw, p); +} + +/* + * Must be called at spltty(). + */ static int ttnread(tp) struct tty *tp; @@ -933,8 +1091,11 @@ ttnread(tp) if (ISSET(tp->t_lflag, PENDIN)) ttypend(tp); nread = tp->t_canq.c_cc; - if (!ISSET(tp->t_lflag, ICANON)) + if (!ISSET(tp->t_lflag, ICANON)) { nread += tp->t_rawq.c_cc; + if (nread < tp->t_cc[VMIN] && tp->t_cc[VTIME] == 0) + nread = 0; + } return (nread); } @@ -950,14 +1111,24 @@ ttywait(tp) error = 0; s = spltty(); while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && - (ISSET(tp->t_state, TS_CARR_ON) || ISSET(tp->t_cflag, CLOCAL)) - && tp->t_oproc) { + ISSET(tp->t_state, TS_CONNECTED) && tp->t_oproc) { (*tp->t_oproc)(tp); - SET(tp->t_state, TS_ASLEEP); - if (error = ttysleep(tp, - &tp->t_outq, TTOPRI | PCATCH, ttyout, 0)) + if ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && + ISSET(tp->t_state, TS_CONNECTED)) { + SET(tp->t_state, TS_SO_OCOMPLETE); + error = ttysleep(tp, TSA_OCOMPLETE(tp), + TTOPRI | PCATCH, "ttywai", + tp->t_timeout); + if (error) { + if (error == EWOULDBLOCK) + error = EIO; + break; + } + } else break; } + if (!error && (tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY))) + error = EIO; splx(s); return (error); } @@ -965,7 +1136,7 @@ ttywait(tp) /* * Flush if successfully wait. 
*/ -int +static int ttywflush(tp) struct tty *tp; { @@ -987,24 +1158,66 @@ ttyflush(tp, rw) register int s; s = spltty(); +#if 0 +again: +#endif + if (rw & FWRITE) { + FLUSHQ(&tp->t_outq); + CLR(tp->t_state, TS_TTSTOP); + } +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, rw); +#else + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, rw); +#endif if (rw & FREAD) { FLUSHQ(&tp->t_canq); FLUSHQ(&tp->t_rawq); + CLR(tp->t_lflag, PENDIN); tp->t_rocount = 0; tp->t_rocol = 0; CLR(tp->t_state, TS_LOCAL); ttwakeup(tp); + if (ISSET(tp->t_state, TS_TBLOCK)) { + if (rw & FWRITE) + FLUSHQ(&tp->t_outq); + ttyunblock(tp); + + /* + * Don't let leave any state that might clobber the + * next line discipline (although we should do more + * to send the START char). Not clearing the state + * may have caused the "putc to a clist with no + * reserved cblocks" panic/printf. + */ + CLR(tp->t_state, TS_TBLOCK); + +#if 0 /* forget it, sleeping isn't always safe and we don't know when it is */ + if (ISSET(tp->t_iflag, IXOFF)) { + /* + * XXX wait a bit in the hope that the stop + * character (if any) will go out. Waiting + * isn't good since it allows races. This + * will be fixed when the stop character is + * put in a special queue. Don't bother with + * the checks in ttywait() since the timeout + * will save us. + */ + SET(tp->t_state, TS_SO_OCOMPLETE); + ttysleep(tp, TSA_OCOMPLETE(tp), TTOPRI, + "ttyfls", hz / 10); + /* + * Don't try sending the stop character again. + */ + CLR(tp->t_state, TS_TBLOCK); + goto again; + } +#endif + } } if (rw & FWRITE) { - CLR(tp->t_state, TS_TTSTOP); -#ifdef sun4c /* XXX */ - (*tp->t_stop)(tp, rw); -#else - (*cdevsw[major(tp->t_dev)].d_stop)(tp, rw); -#endif FLUSHQ(&tp->t_outq); - wakeup((caddr_t)&tp->t_outq); - selwakeup(&tp->t_wsel); + ttwwakeup(tp); } splx(s); } @@ -1013,42 +1226,63 @@ ttyflush(tp, rw) * Copy in the default termios characters. */ void +termioschars(t) + struct termios *t; +{ + + bcopy(ttydefchars, t->c_cc, sizeof t->c_cc); +} + +/* + * Old interface. 
+ */ +void ttychars(tp) struct tty *tp; { - bcopy(ttydefchars, tp->t_cc, sizeof(ttydefchars)); + termioschars(&tp->t_termios); } /* - * Send stop character on input overflow. + * Handle input high water. Send stop character for the IXOFF case. Turn + * on our input flow control bit and propagate the changes to the driver. + * XXX the stop character should be put in a special high priority queue. */ -static void +void ttyblock(tp) - register struct tty *tp; + struct tty *tp; { - register int total; - total = tp->t_rawq.c_cc + tp->t_canq.c_cc; - if (tp->t_rawq.c_cc > TTYHOG) { - ttyflush(tp, FREAD | FWRITE); - CLR(tp->t_state, TS_TBLOCK); - } - /* - * Block further input iff: current input > threshold - * AND input is available to user program. - */ - if (total >= TTYHOG / 2 && - !ISSET(tp->t_state, TS_TBLOCK) && - !ISSET(tp->t_lflag, ICANON) || tp->t_canq.c_cc > 0 && - tp->t_cc[VSTOP] != _POSIX_VDISABLE) { - if (putc(tp->t_cc[VSTOP], &tp->t_outq) == 0) { - SET(tp->t_state, TS_TBLOCK); - ttstart(tp); - } - } + SET(tp->t_state, TS_TBLOCK); + if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTOP] != _POSIX_VDISABLE && + putc(tp->t_cc[VSTOP], &tp->t_outq) != 0) + CLR(tp->t_state, TS_TBLOCK); /* try again later */ + ttstart(tp); +} + +/* + * Handle input low water. Send start character for the IXOFF case. Turn + * off our input flow control bit and propagate the changes to the driver. + * XXX the start character should be put in a special high priority queue. + */ +static void +ttyunblock(tp) + struct tty *tp; +{ + + CLR(tp->t_state, TS_TBLOCK); + if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTART] != _POSIX_VDISABLE && + putc(tp->t_cc[VSTART], &tp->t_outq) != 0) + SET(tp->t_state, TS_TBLOCK); /* try again later */ + ttstart(tp); } +#ifdef notyet +/* Not used by any current (i386) drivers. */ +/* + * Restart after an inter-char delay. 
+ */ void ttrstrt(tp_arg) void *tp_arg; @@ -1068,6 +1302,7 @@ ttrstrt(tp_arg) splx(s); } +#endif int ttstart(tp) @@ -1088,10 +1323,8 @@ ttylclose(tp, flag) int flag; { - if (flag & IO_NDELAY) + if (flag & FNONBLOCK || ttywflush(tp)) ttyflush(tp, FREAD | FWRITE); - else - ttywflush(tp); return (0); } @@ -1106,19 +1339,23 @@ ttymodem(tp, flag) int flag; { - if (!ISSET(tp->t_state, TS_WOPEN) && ISSET(tp->t_cflag, MDMBUF)) { + if (ISSET(tp->t_state, TS_CARR_ON) && ISSET(tp->t_cflag, MDMBUF)) { /* * MDMBUF: do flow control according to carrier flag + * XXX TS_CAR_OFLOW doesn't do anything yet. TS_TTSTOP + * works if IXON and IXANY are clear. */ if (flag) { + CLR(tp->t_state, TS_CAR_OFLOW); CLR(tp->t_state, TS_TTSTOP); ttstart(tp); - } else if (!ISSET(tp->t_state, TS_TTSTOP)) { + } else if (!ISSET(tp->t_state, TS_CAR_OFLOW)) { + SET(tp->t_state, TS_CAR_OFLOW); SET(tp->t_state, TS_TTSTOP); #ifdef sun4c /* XXX */ (*tp->t_stop)(tp, 0); #else - (*cdevsw[major(tp->t_dev)].d_stop)(tp, 0); + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0); #endif } } else if (flag == 0) { @@ -1128,6 +1365,8 @@ ttymodem(tp, flag) CLR(tp->t_state, TS_CARR_ON); if (ISSET(tp->t_state, TS_ISOPEN) && !ISSET(tp->t_cflag, CLOCAL)) { + SET(tp->t_state, TS_ZOMBIE); + CLR(tp->t_state, TS_CONNECTED); if (tp->t_session && tp->t_session->s_leader) psignal(tp->t_session->s_leader, SIGHUP); ttyflush(tp, FREAD | FWRITE); @@ -1138,30 +1377,11 @@ ttymodem(tp, flag) * Carrier now on. */ SET(tp->t_state, TS_CARR_ON); + if (!ISSET(tp->t_state, TS_ZOMBIE)) + SET(tp->t_state, TS_CONNECTED); + wakeup(TSA_CARR_ON(tp)); ttwakeup(tp); - } - return (1); -} - -/* - * Default modem control routine (for other line disciplines). - * Return argument flag, to turn off device on carrier drop. 
- */ -int -nullmodem(tp, flag) - register struct tty *tp; - int flag; -{ - - if (flag) - SET(tp->t_state, TS_CARR_ON); - else { - CLR(tp->t_state, TS_CARR_ON); - if (!ISSET(tp->t_cflag, CLOCAL)) { - if (tp->t_session && tp->t_session->s_leader) - psignal(tp->t_session->s_leader, SIGHUP); - return (0); - } + ttwwakeup(tp); } return (1); } @@ -1170,18 +1390,25 @@ nullmodem(tp, flag) * Reinput pending characters after state switch * call at spltty(). */ -void +static void ttypend(tp) register struct tty *tp; { struct clist tq; - register c; + register int c; CLR(tp->t_lflag, PENDIN); SET(tp->t_state, TS_TYPEN); + /* + * XXX this assumes too much about clist internals. It may even + * fail if the cblock slush pool is empty. We can't allocate more + * cblocks here because we are called from an interrupt handler + * and clist_alloc_cblocks() can wait. + */ tq = tp->t_rawq; - tp->t_rawq.c_cc = 0; - tp->t_rawq.c_cf = tp->t_rawq.c_cl = 0; + bzero(&tp->t_rawq, sizeof tp->t_rawq); + tp->t_rawq.c_cbmax = tq.c_cbmax; + tp->t_rawq.c_cbreserved = tq.c_cbreserved; while ((c = getc(&tq)) >= 0) ttyinput(c, tp); CLR(tp->t_state, TS_TYPEN); @@ -1198,34 +1425,47 @@ ttread(tp, uio, flag) { register struct clist *qp; register int c; - register long lflag; - register u_char *cc = tp->t_cc; + register tcflag_t lflag; + register cc_t *cc = tp->t_cc; register struct proc *p = curproc; int s, first, error = 0; + int has_stime = 0, last_cc = 0; + long slp = 0; /* XXX this should be renamed `timo'. */ -loop: lflag = tp->t_lflag; +loop: s = spltty(); + lflag = tp->t_lflag; /* * take pending input first */ - if (ISSET(lflag, PENDIN)) + if (ISSET(lflag, PENDIN)) { ttypend(tp); - splx(s); + splx(s); /* reduce latency */ + s = spltty(); + lflag = tp->t_lflag; /* XXX ttypend() clobbers it */ + } /* * Hang process if it's in the background. 
*/ if (isbackground(p, tp)) { + splx(s); if ((p->p_sigignore & sigmask(SIGTTIN)) || (p->p_sigmask & sigmask(SIGTTIN)) || p->p_flag & P_PPWAIT || p->p_pgrp->pg_jobc == 0) return (EIO); pgsignal(p->p_pgrp, SIGTTIN, 1); - if (error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, ttybg, 0)) + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg2", 0); + if (error) return (error); goto loop; } + if (ISSET(tp->t_state, TS_ZOMBIE)) { + splx(s); + return (0); /* EOF */ + } + /* * If canonical, use the canonical queue, * else use the raw queue. @@ -1234,47 +1474,171 @@ loop: lflag = tp->t_lflag; */ qp = ISSET(lflag, ICANON) ? &tp->t_canq : &tp->t_rawq; - /* - * If there is no input, sleep on rawq - * awaiting hardware receipt and notification. - * If we have data, we don't need to check for carrier. - */ - s = spltty(); - if (qp->c_cc <= 0) { - int carrier; - - carrier = ISSET(tp->t_state, TS_CARR_ON) || - ISSET(tp->t_cflag, CLOCAL); - if (!carrier && ISSET(tp->t_state, TS_ISOPEN)) { + if (flag & IO_NDELAY) { + if (qp->c_cc > 0) + goto read; + if (!ISSET(lflag, ICANON) && cc[VMIN] == 0) { splx(s); - return (0); /* EOF */ + return (0); } - if (flag & IO_NDELAY) { + splx(s); + return (EWOULDBLOCK); + } + if (!ISSET(lflag, ICANON)) { + int m = cc[VMIN]; + long t = cc[VTIME]; + struct timeval stime, timecopy; + int x; + + /* + * Check each of the four combinations. + * (m > 0 && t == 0) is the normal read case. + * It should be fairly efficient, so we check that and its + * companion case (m == 0 && t == 0) first. + * For the other two cases, we compute the target sleep time + * into slp. + */ + if (t == 0) { + if (qp->c_cc < m) + goto sleep; + if (qp->c_cc > 0) + goto read; + + /* m, t and qp->c_cc are all 0. 0 is enough input. 
*/ splx(s); - return (EWOULDBLOCK); + return (0); + } + t *= 100000; /* time in us */ +#define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 1000000 + \ + ((t1).tv_usec - (t2).tv_usec)) + if (m > 0) { + if (qp->c_cc <= 0) + goto sleep; + if (qp->c_cc >= m) + goto read; + gettime(&timecopy); + if (!has_stime) { + /* first character, start timer */ + has_stime = 1; + stime = timecopy; + slp = t; + } else if (qp->c_cc > last_cc) { + /* got a character, restart timer */ + stime = timecopy; + slp = t; + } else { + /* nothing, check expiration */ + slp = t - diff(timecopy, stime); + if (slp <= 0) + goto read; + } + last_cc = qp->c_cc; + } else { /* m == 0 */ + if (qp->c_cc > 0) + goto read; + gettime(&timecopy); + if (!has_stime) { + has_stime = 1; + stime = timecopy; + slp = t; + } else { + slp = t - diff(timecopy, stime); + if (slp <= 0) { + /* Timed out, but 0 is enough input. */ + splx(s); + return (0); + } + } } - error = ttysleep(tp, &tp->t_rawq, TTIPRI | PCATCH, - carrier ? ttyin : ttopen, 0); +#undef diff + /* + * Rounding down may make us wake up just short + * of the target, so we round up. + * The formula is ceiling(slp * hz/1000000). + * 32-bit arithmetic is enough for hz < 169. + * XXX see hzto() for how to avoid overflow if hz + * is large (divide by `tick' and/or arrange to + * use hzto() if hz is large). + */ + slp = (long) (((u_long)slp * hz) + 999999) / 1000000; + goto sleep; + } + if (qp->c_cc <= 0) { +sleep: + /* + * There is no input, or not enough input and we can block. + */ + error = ttysleep(tp, TSA_HUP_OR_INPUT(tp), TTIPRI | PCATCH, + ISSET(tp->t_state, TS_CONNECTED) ? + "ttyin" : "ttyhup", (int)slp); splx(s); - if (error) + if (error == EWOULDBLOCK) + error = 0; + else if (error) return (error); + /* + * XXX what happens if another process eats some input + * while we are asleep (not just here)? It would be + * safest to detect changes and reset our state variables + * (has_stime and last_cc). 
+ */ + slp = 0; goto loop; } +read: splx(s); - /* * Input present, check for input mapping and processing. */ first = 1; - while ((c = getc(qp)) >= 0) { + if (ISSET(lflag, ICANON | ISIG)) + goto slowcase; + for (;;) { + char ibuf[IBUFSIZ]; + int icc; + + icc = imin(uio->uio_resid, IBUFSIZ); + icc = q_to_b(qp, ibuf, icc); + if (icc <= 0) { + if (first) + goto loop; + break; + } + error = uiomove(ibuf, icc, uio); + /* + * XXX if there was an error then we should ungetc() the + * unmoved chars and reduce icc here. + */ +#if NSNP > 0 + if (ISSET(tp->t_lflag, ECHO) && + ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpin((struct snoop *)tp->t_sc, ibuf, icc); +#endif + if (error) + break; + if (uio->uio_resid == 0) + break; + first = 0; + } + goto out; +slowcase: + for (;;) { + c = getc(qp); + if (c < 0) { + if (first) + goto loop; + break; + } /* * delayed suspend (^Y) */ - if (CCEQ(cc[VDSUSP], c) && ISSET(lflag, ISIG)) { + if (CCEQ(cc[VDSUSP], c) && + ISSET(lflag, IEXTEN | ISIG) == (IEXTEN | ISIG)) { pgsignal(tp->t_pgrp, SIGTSTP, 1); if (first) { - if (error = ttysleep(tp, - &lbolt, TTIPRI | PCATCH, ttybg, 0)) + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, + "ttybg3", 0); + if (error) break; goto loop; } @@ -1290,30 +1654,39 @@ loop: lflag = tp->t_lflag; */ error = ureadc(c, uio); if (error) + /* XXX should ungetc(c, qp). */ break; +#if NSNP > 0 + /* + * Only snoop directly on input in echo mode. Non-echoed + * input will be snooped later iff the application echoes it. + */ + if (ISSET(tp->t_lflag, ECHO) && + ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpinc((struct snoop *)tp->t_sc, (char)c); +#endif if (uio->uio_resid == 0) break; /* * In canonical mode check for a "break character" * marking the end of a "line of input". 
*/ - if (ISSET(lflag, ICANON) && TTBREAKC(c)) + if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag)) break; first = 0; } + +out: /* - * Look to unblock output now that (presumably) + * Look to unblock input now that (presumably) * the input queue has gone down. */ s = spltty(); - if (ISSET(tp->t_state, TS_TBLOCK) && tp->t_rawq.c_cc < TTYHOG/5) { - if (cc[VSTART] != _POSIX_VDISABLE && - putc(cc[VSTART], &tp->t_outq) == 0) { - CLR(tp->t_state, TS_TBLOCK); - ttstart(tp); - } - } + if (ISSET(tp->t_state, TS_TBLOCK) && + tp->t_rawq.c_cc + tp->t_canq.c_cc <= I_LOW_WATER) + ttyunblock(tp); splx(s); + return (error); } @@ -1334,17 +1707,17 @@ ttycheckoutq(tp, wait) hiwat = tp->t_hiwat; s = spltty(); oldsig = wait ? curproc->p_siglist : 0; - if (tp->t_outq.c_cc > hiwat + 200) + if (tp->t_outq.c_cc > hiwat + OBUFSIZ + 100) while (tp->t_outq.c_cc > hiwat) { ttstart(tp); + if (tp->t_outq.c_cc <= hiwat) + break; if (wait == 0 || curproc->p_siglist != oldsig) { splx(s); return (0); } - timeout((void (*)__P((void *)))wakeup, - (void *)&tp->t_outq, hz); - SET(tp->t_state, TS_ASLEEP); - sleep((caddr_t)&tp->t_outq, PZERO - 1); + SET(tp->t_state, TS_SO_OLOWAT); + tsleep(TSA_OLOWAT(tp), PZERO - 1, "ttoutq", hz); } splx(s); return (1); @@ -1359,7 +1732,7 @@ ttwrite(tp, uio, flag) register struct uio *uio; int flag; { - register char *cp; + register char *cp = NULL; register int cc, ce; register struct proc *p; int i, hiwat, cnt, error, s; @@ -1371,24 +1744,24 @@ ttwrite(tp, uio, flag) cc = 0; loop: s = spltty(); - if (!ISSET(tp->t_state, TS_CARR_ON) && - !ISSET(tp->t_cflag, CLOCAL)) { - if (ISSET(tp->t_state, TS_ISOPEN)) { - splx(s); - return (EIO); - } else if (flag & IO_NDELAY) { + if (ISSET(tp->t_state, TS_ZOMBIE)) { + splx(s); + if (uio->uio_resid == cnt) + error = EIO; + goto out; + } + if (!ISSET(tp->t_state, TS_CONNECTED)) { + if (flag & IO_NDELAY) { splx(s); error = EWOULDBLOCK; goto out; - } else { - /* Sleep awaiting carrier. 
*/ - error = ttysleep(tp, - &tp->t_rawq, TTIPRI | PCATCH,ttopen, 0); - splx(s); - if (error) - goto out; - goto loop; } + error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, + "ttydcd", 0); + splx(s); + if (error) + goto out; + goto loop; } splx(s); /* @@ -1398,10 +1771,14 @@ loop: if (isbackground(p, tp) && ISSET(tp->t_lflag, TOSTOP) && (p->p_flag & P_PPWAIT) == 0 && (p->p_sigignore & sigmask(SIGTTOU)) == 0 && - (p->p_sigmask & sigmask(SIGTTOU)) == 0 && - p->p_pgrp->pg_jobc) { + (p->p_sigmask & sigmask(SIGTTOU)) == 0) { + if (p->p_pgrp->pg_jobc == 0) { + error = EIO; + goto out; + } pgsignal(p->p_pgrp, SIGTTOU, 1); - if (error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, ttybg, 0)) + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg4", 0); + if (error) goto out; goto loop; } @@ -1422,13 +1799,17 @@ loop: * leftover from last time. */ if (cc == 0) { - cc = min(uio->uio_resid, OBUFSIZ); + cc = imin(uio->uio_resid, OBUFSIZ); cp = obuf; error = uiomove(cp, cc, uio); if (error) { cc = 0; break; } +#if NSNP > 0 + if (ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpin((struct snoop *)tp->t_sc, cp, cc); +#endif } /* * If nothing fancy need be done, grab those characters we @@ -1444,7 +1825,7 @@ loop: ce = cc; else { ce = cc - scanc((u_int)cc, (u_char *)cp, - (u_char *)char_type, CCLASSMASK); + char_type, CCLASSMASK); /* * If ce is zero, then we're processing * a special character through ttyoutput. @@ -1454,9 +1835,15 @@ loop: if (ttyoutput(*cp, tp) >= 0) { /* No Clists, wait a bit. */ ttstart(tp); - if (error = ttysleep(tp, &lbolt, - TTOPRI | PCATCH, ttybuf, 0)) - break; + if (flag & IO_NDELAY) { + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, &lbolt, + TTOPRI|PCATCH, + "ttybf1", 0); + if (error) + goto out; goto loop; } cp++; @@ -1484,9 +1871,14 @@ loop: if (i > 0) { /* No Clists, wait a bit. 
*/ ttstart(tp); - if (error = ttysleep(tp, - &lbolt, TTOPRI | PCATCH, ttybuf, 0)) - break; + if (flag & IO_NDELAY) { + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, + "ttybf2", 0); + if (error) + goto out; goto loop; } if (ISSET(tp->t_lflag, FLUSHO) || @@ -1520,9 +1912,12 @@ ovhiwat: uio->uio_resid += cc; return (uio->uio_resid == cnt ? EWOULDBLOCK : 0); } - SET(tp->t_state, TS_ASLEEP); - error = ttysleep(tp, &tp->t_outq, TTOPRI | PCATCH, ttyout, 0); + SET(tp->t_state, TS_SO_OLOWAT); + error = ttysleep(tp, TSA_OLOWAT(tp), TTOPRI | PCATCH, "ttywri", + tp->t_timeout); splx(s); + if (error == EWOULDBLOCK) + error = EIO; if (error) goto out; goto loop; @@ -1532,7 +1927,7 @@ ovhiwat: * Rubout one character from the rawq of tp * as cleanly as possible. */ -void +static void ttyrub(c, tp) register int c; register struct tty *tp; @@ -1635,7 +2030,7 @@ ttyrubo(tp, cnt) * Reprint the rawq line. Note, it is assumed that c_cc has already * been checked. */ -void +static void ttyretype(tp) register struct tty *tp; { @@ -1679,11 +2074,11 @@ ttyecho(c, tp) if (!ISSET(tp->t_state, TS_CNTTB)) CLR(tp->t_lflag, FLUSHO); if ((!ISSET(tp->t_lflag, ECHO) && - (!ISSET(tp->t_lflag, ECHONL) || c == '\n')) || + (c != '\n' || !ISSET(tp->t_lflag, ECHONL))) || ISSET(tp->t_lflag, EXTPROC)) return; if (ISSET(tp->t_lflag, ECHOCTL) && - (ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n' || + ((ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n') || ISSET(c, TTY_CHARMASK) == 0177)) { (void)ttyoutput('^', tp); CLR(c, ~TTY_CHARMASK); @@ -1703,10 +2098,33 @@ ttwakeup(tp) register struct tty *tp; { - selwakeup(&tp->t_rsel); + if (tp->t_rsel.si_pid != 0) + selwakeup(&tp->t_rsel); if (ISSET(tp->t_state, TS_ASYNC)) pgsignal(tp->t_pgrp, SIGIO, 1); - wakeup((caddr_t)&tp->t_rawq); + wakeup(TSA_HUP_OR_INPUT(tp)); +} + +/* + * Wake up any writers on a tty. 
+ */ +void +ttwwakeup(tp) + register struct tty *tp; +{ + + if (tp->t_wsel.si_pid != 0 && tp->t_outq.c_cc <= tp->t_lowat) + selwakeup(&tp->t_wsel); + if (ISSET(tp->t_state, TS_BUSY | TS_SO_OCOMPLETE) == + TS_SO_OCOMPLETE && tp->t_outq.c_cc == 0) { + CLR(tp->t_state, TS_SO_OCOMPLETE); + wakeup(TSA_OCOMPLETE(tp)); + } + if (ISSET(tp->t_state, TS_SO_OLOWAT) && + tp->t_outq.c_cc <= tp->t_lowat) { + CLR(tp->t_state, TS_SO_OLOWAT); + wakeup(TSA_OLOWAT(tp)); + } } /* @@ -1786,15 +2204,15 @@ ttyinfo(tp) /* Print user time. */ ttyprintf(tp, "%d.%02du ", - utime.tv_sec, (utime.tv_usec + 5000) / 10000); + utime.tv_sec, utime.tv_usec / 10000); /* Print system time. */ ttyprintf(tp, "%d.%02ds ", - stime.tv_sec, (stime.tv_usec + 5000) / 10000); + stime.tv_sec, stime.tv_usec / 10000); -#define pgtok(a) (((a) * NBPG) / 1024) +#define pgtok(a) (((a) * PAGE_SIZE) / 1024) /* Print percentage cpu, resident set size. */ - tmp = pick->p_pctcpu * 10000 + FSCALE / 2 >> FSHIFT; + tmp = (pick->p_pctcpu * 10000 + FSCALE / 2) >> FSHIFT; ttyprintf(tp, "%d%% %dk\n", tmp / 100, pick->p_stat == SIDL || pick->p_stat == SZOMB ? 0 : @@ -1891,8 +2309,7 @@ tputchar(c, tp) register int s; s = spltty(); - if (ISSET(tp->t_state, - TS_CARR_ON | TS_ISOPEN) != (TS_CARR_ON | TS_ISOPEN)) { + if (!ISSET(tp->t_state, TS_CONNECTED)) { splx(s); return (-1); } @@ -1906,7 +2323,7 @@ tputchar(c, tp) /* * Sleep on chan, returning ERESTART if tty changed while we napped and - * returning any errors (e.g. EINTR/ETIMEDOUT) reported by tsleep. If + * returning any errors (e.g. EINTR/EWOULDBLOCK) reported by tsleep. If * the tty is revoked, restarting a pending call will redo validation done * at the start of the call. */ @@ -1918,10 +2335,44 @@ ttysleep(tp, chan, pri, wmesg, timo) char *wmesg; { int error; - short gen; + int gen; gen = tp->t_gen; - if (error = tsleep(chan, pri, wmesg, timo)) + error = tsleep(chan, pri, wmesg, timo); + if (error) return (error); return (tp->t_gen == gen ? 
0 : ERESTART); } + +#ifdef notyet +/* + * XXX this is usable not useful or used. Most tty drivers have + * ifdefs for using ttymalloc() but assume a different interface. + */ +/* + * Allocate a tty struct. Clists in the struct will be allocated by + * ttyopen(). + */ +struct tty * +ttymalloc() +{ + struct tty *tp; + + tp = malloc(sizeof *tp, M_TTYS, M_WAITOK); + bzero(tp, sizeof *tp); + return (tp); +} +#endif + +#if 0 /* XXX not yet usable: session leader holds a ref (see kern_exit.c). */ +/* + * Free a tty struct. Clists in the struct should have been freed by + * ttyclose(). + */ +void +ttyfree(tp) + struct tty *tp; +{ + free(tp, M_TTYS); +} +#endif /* 0 */ diff --git a/sys/kern/tty_compat.c b/sys/kern/tty_compat.c index ce95853..ed58c6a 100644 --- a/sys/kern/tty_compat.c +++ b/sys/kern/tty_compat.c @@ -30,28 +30,39 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)tty_compat.c 8.2 (Berkeley) 1/9/95 + * @(#)tty_compat.c 8.1 (Berkeley) 6/10/93 + * $Id: tty_compat.c,v 1.21 1997/02/22 09:39:24 peter Exp $ */ -/* +/* * mapping routines for old line discipline (yuck) */ #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #include <sys/param.h> #include <sys/systm.h> -#include <sys/ioctl.h> +#include <sys/ioctl_compat.h> #include <sys/proc.h> #include <sys/tty.h> #include <sys/termios.h> #include <sys/file.h> #include <sys/conf.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/syslog.h> -int ttydebug = 0; +static int ttcompatgetflags __P((struct tty *tp)); +static void ttcompatsetflags __P((struct tty *tp, struct termios *t)); +static void ttcompatsetlflags __P((struct tty *tp, struct termios *t)); +static int ttcompatspeedtab __P((int speed, struct speedtab *table)); + +static int ttydebug = 0; +SYSCTL_INT(_debug, OID_AUTO, ttydebug, CTLFLAG_RW, &ttydebug, 0, ""); static struct speedtab compatspeeds[] = { +#define MAX_SPEED 17 + { 115200, 17 }, + { 57600, 16 }, { 38400, 15 }, { 19200, 14 }, { 9600, 13 }, 
@@ -70,78 +81,61 @@ static struct speedtab compatspeeds[] = { { 0, 0 }, { -1, -1 }, }; -static int compatspcodes[16] = { +static int compatspcodes[] = { 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, - 1800, 2400, 4800, 9600, 19200, 38400, + 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200, }; -/*ARGSUSED*/ -ttcompat(tp, com, data, flag) +static int +ttcompatspeedtab(speed, table) + int speed; + register struct speedtab *table; +{ + if (speed == 0) + return (0); /* hangup */ + for ( ; table->sp_speed > 0; table++) + if (table->sp_speed <= speed) /* nearest one, rounded down */ + return (table->sp_code); + return (1); /* 50, min and not hangup */ +} + +int +ttsetcompat(tp, com, data, term) register struct tty *tp; - u_long com; + int *com; caddr_t data; - int flag; + struct termios *term; { - - switch (com) { - case TIOCGETP: { - register struct sgttyb *sg = (struct sgttyb *)data; - register u_char *cc = tp->t_cc; - register speed; - - speed = ttspeedtab(tp->t_ospeed, compatspeeds); - sg->sg_ospeed = (speed == -1) ? 15 : speed; - if (tp->t_ispeed == 0) - sg->sg_ispeed = sg->sg_ospeed; - else { - speed = ttspeedtab(tp->t_ispeed, compatspeeds); - sg->sg_ispeed = (speed == -1) ? 
15 : speed; - } - sg->sg_erase = cc[VERASE]; - sg->sg_kill = cc[VKILL]; - sg->sg_flags = ttcompatgetflags(tp); - break; - } - + switch (*com) { case TIOCSETP: case TIOCSETN: { register struct sgttyb *sg = (struct sgttyb *)data; - struct termios term; int speed; - term = tp->t_termios; - if ((speed = sg->sg_ispeed) > 15 || speed < 0) - term.c_ispeed = speed; + if ((speed = sg->sg_ispeed) > MAX_SPEED || speed < 0) + return(EINVAL); + else if (speed != ttcompatspeedtab(tp->t_ispeed, compatspeeds)) + term->c_ispeed = compatspcodes[speed]; else - term.c_ispeed = compatspcodes[speed]; - if ((speed = sg->sg_ospeed) > 15 || speed < 0) - term.c_ospeed = speed; + term->c_ispeed = tp->t_ispeed; + if ((speed = sg->sg_ospeed) > MAX_SPEED || speed < 0) + return(EINVAL); + else if (speed != ttcompatspeedtab(tp->t_ospeed, compatspeeds)) + term->c_ospeed = compatspcodes[speed]; else - term.c_ospeed = compatspcodes[speed]; - term.c_cc[VERASE] = sg->sg_erase; - term.c_cc[VKILL] = sg->sg_kill; - tp->t_flags = tp->t_flags&0xffff0000 | sg->sg_flags&0xffff; - ttcompatsetflags(tp, &term); - return (ttioctl(tp, com == TIOCSETP ? TIOCSETAF : TIOCSETA, - &term, flag)); - } - - case TIOCGETC: { - struct tchars *tc = (struct tchars *)data; - register u_char *cc = tp->t_cc; - - tc->t_intrc = cc[VINTR]; - tc->t_quitc = cc[VQUIT]; - tc->t_startc = cc[VSTART]; - tc->t_stopc = cc[VSTOP]; - tc->t_eofc = cc[VEOF]; - tc->t_brkc = cc[VEOL]; + term->c_ospeed = tp->t_ospeed; + term->c_cc[VERASE] = sg->sg_erase; + term->c_cc[VKILL] = sg->sg_kill; + tp->t_flags = (tp->t_flags&0xffff0000) | (sg->sg_flags&0xffff); + ttcompatsetflags(tp, term); + *com = (*com == TIOCSETP) ? 
TIOCSETAF : TIOCSETA; break; } case TIOCSETC: { struct tchars *tc = (struct tchars *)data; - register u_char *cc = tp->t_cc; + register cc_t *cc; + cc = term->c_cc; cc[VINTR] = tc->t_intrc; cc[VQUIT] = tc->t_quitc; cc[VSTART] = tc->t_startc; @@ -150,23 +144,96 @@ ttcompat(tp, com, data, flag) cc[VEOL] = tc->t_brkc; if (tc->t_brkc == -1) cc[VEOL2] = _POSIX_VDISABLE; + *com = TIOCSETA; break; } case TIOCSLTC: { struct ltchars *ltc = (struct ltchars *)data; - register u_char *cc = tp->t_cc; + register cc_t *cc; + cc = term->c_cc; cc[VSUSP] = ltc->t_suspc; cc[VDSUSP] = ltc->t_dsuspc; cc[VREPRINT] = ltc->t_rprntc; cc[VDISCARD] = ltc->t_flushc; cc[VWERASE] = ltc->t_werasc; cc[VLNEXT] = ltc->t_lnextc; + *com = TIOCSETA; + break; + } + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: + if (*com == TIOCLSET) + tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16; + else { + tp->t_flags = + (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff); + if (*com == TIOCLBIS) + tp->t_flags |= *(int *)data<<16; + else + tp->t_flags &= ~(*(int *)data<<16); + } + ttcompatsetlflags(tp, term); + *com = TIOCSETA; + break; + } + return 0; +} + +/*ARGSUSED*/ +int +ttcompat(tp, com, data, flag) + register struct tty *tp; + int com; + caddr_t data; + int flag; +{ + switch (com) { + case TIOCSETP: + case TIOCSETN: + case TIOCSETC: + case TIOCSLTC: + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: { + struct termios term; + int error; + + term = tp->t_termios; + if ((error = ttsetcompat(tp, &com, data, &term)) != 0) + return error; + return ttioctl(tp, com, &term, flag); + } + case TIOCGETP: { + register struct sgttyb *sg = (struct sgttyb *)data; + register cc_t *cc = tp->t_cc; + + sg->sg_ospeed = ttcompatspeedtab(tp->t_ospeed, compatspeeds); + if (tp->t_ispeed == 0) + sg->sg_ispeed = sg->sg_ospeed; + else + sg->sg_ispeed = ttcompatspeedtab(tp->t_ispeed, compatspeeds); + sg->sg_erase = cc[VERASE]; + sg->sg_kill = cc[VKILL]; + sg->sg_flags = tp->t_flags = ttcompatgetflags(tp); + break; + } + 
case TIOCGETC: { + struct tchars *tc = (struct tchars *)data; + register cc_t *cc = tp->t_cc; + + tc->t_intrc = cc[VINTR]; + tc->t_quitc = cc[VQUIT]; + tc->t_startc = cc[VSTART]; + tc->t_stopc = cc[VSTOP]; + tc->t_eofc = cc[VEOF]; + tc->t_brkc = cc[VEOL]; break; } case TIOCGLTC: { struct ltchars *ltc = (struct ltchars *)data; - register u_char *cc = tp->t_cc; + register cc_t *cc = tp->t_cc; ltc->t_suspc = cc[VSUSP]; ltc->t_dsuspc = cc[VDSUSP]; @@ -176,27 +243,11 @@ ttcompat(tp, com, data, flag) ltc->t_lnextc = cc[VLNEXT]; break; } - case TIOCLBIS: - case TIOCLBIC: - case TIOCLSET: { - struct termios term; - - term = tp->t_termios; - if (com == TIOCLSET) - tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16; - else { - tp->t_flags = - (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff); - if (com == TIOCLBIS) - tp->t_flags |= *(int *)data<<16; - else - tp->t_flags &= ~(*(int *)data<<16); - } - ttcompatsetlflags(tp, &term); - return (ttioctl(tp, TIOCSETA, &term, flag)); - } case TIOCLGET: - *(int *)data = ttcompatgetflags(tp)>>16; + tp->t_flags = + (ttcompatgetflags(tp) & 0xffff0000UL) + | (tp->t_flags & 0xffff); + *(int *)data = tp->t_flags>>16; if (ttydebug) printf("CLGET: returning %x\n", *(int *)data); break; @@ -208,7 +259,7 @@ ttcompat(tp, com, data, flag) case OTIOCSETD: { int ldisczero = 0; - return (ttioctl(tp, TIOCSETD, + return (ttioctl(tp, TIOCSETD, *(int *)data == 2 ? 
(caddr_t)&ldisczero : data, flag)); } @@ -222,20 +273,26 @@ ttcompat(tp, com, data, flag) return (0); } +static int ttcompatgetflags(tp) register struct tty *tp; { - register long iflag = tp->t_iflag; - register long lflag = tp->t_lflag; - register long oflag = tp->t_oflag; - register long cflag = tp->t_cflag; + register tcflag_t iflag = tp->t_iflag; + register tcflag_t lflag = tp->t_lflag; + register tcflag_t oflag = tp->t_oflag; + register tcflag_t cflag = tp->t_cflag; register flags = 0; if (iflag&IXOFF) flags |= TANDEM; if (iflag&ICRNL || oflag&ONLCR) flags |= CRMOD; - if (cflag&PARENB) { + if ((cflag&CSIZE) == CS8) { + flags |= PASS8; + if (iflag&ISTRIP) + flags |= ANYP; + } + else if (cflag&PARENB) { if (iflag&INPCK) { if (cflag&PARODD) flags |= ODDP; @@ -243,20 +300,18 @@ ttcompatgetflags(tp) flags |= EVENP; } else flags |= EVENP | ODDP; - } else { - if ((tp->t_flags&LITOUT) && !(oflag&OPOST)) - flags |= LITOUT; - if (tp->t_flags&PASS8) - flags |= PASS8; } - - if ((lflag&ICANON) == 0) { + + if ((lflag&ICANON) == 0) { /* fudge */ - if (iflag&IXON || lflag&ISIG || lflag&IEXTEN || cflag&PARENB) + if (iflag&(INPCK|ISTRIP|IXON) || lflag&(IEXTEN|ISIG) + || cflag&(CSIZE|PARENB) != CS8) flags |= CBREAK; else flags |= RAW; } + if (!(flags&RAW) && !(oflag&OPOST) && cflag&(CSIZE|PARENB) == CS8) + flags |= LITOUT; if (cflag&MDMBUF) flags |= MDMBUF; if ((cflag&HUPCL) == 0) @@ -274,28 +329,28 @@ ttcompatgetflags(tp) if ((iflag&IXANY) == 0) flags |= DECCTQ; flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH); -if (ttydebug) - printf("getflags: %x\n", flags); + if (ttydebug) + printf("getflags: %x\n", flags); return (flags); } +static void ttcompatsetflags(tp, t) register struct tty *tp; register struct termios *t; { register flags = tp->t_flags; - register long iflag = t->c_iflag; - register long oflag = t->c_oflag; - register long lflag = t->c_lflag; - register long cflag = t->c_cflag; + register tcflag_t iflag = t->c_iflag; + register tcflag_t oflag = t->c_oflag; + 
register tcflag_t lflag = t->c_lflag; + register tcflag_t cflag = t->c_cflag; if (flags & RAW) { - iflag &= IXOFF; - oflag &= ~OPOST; + iflag = IGNBRK; lflag &= ~(ECHOCTL|ISIG|ICANON|IEXTEN); } else { + iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR); iflag |= BRKINT|IXON|IMAXBEL; - oflag |= OPOST; lflag |= ISIG|IEXTEN|ECHOCTL; /* XXX was echoctl on ? */ if (flags & XTABS) oflag |= OXTABS; @@ -317,49 +372,59 @@ ttcompatsetflags(tp, t) lflag |= ECHO; else lflag &= ~ECHO; - + + cflag &= ~(CSIZE|PARENB); if (flags&(RAW|LITOUT|PASS8)) { - cflag &= ~(CSIZE|PARENB); cflag |= CS8; - if ((flags&(RAW|PASS8)) == 0) + if (!(flags&(RAW|PASS8)) + || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP)) iflag |= ISTRIP; else iflag &= ~ISTRIP; + if (flags&(RAW|LITOUT)) + oflag &= ~OPOST; + else + oflag |= OPOST; } else { - cflag &= ~CSIZE; cflag |= CS7|PARENB; iflag |= ISTRIP; + oflag |= OPOST; } + /* XXX don't set INPCK if RAW or PASS8? */ if ((flags&(EVENP|ODDP)) == EVENP) { iflag |= INPCK; cflag &= ~PARODD; } else if ((flags&(EVENP|ODDP)) == ODDP) { iflag |= INPCK; cflag |= PARODD; - } else + } else iflag &= ~INPCK; - if (flags&LITOUT) - oflag &= ~OPOST; /* move earlier ? 
*/ if (flags&TANDEM) iflag |= IXOFF; else iflag &= ~IXOFF; + if ((flags&DECCTQ) == 0) + iflag |= IXANY; + else + iflag &= ~IXANY; t->c_iflag = iflag; t->c_oflag = oflag; t->c_lflag = lflag; t->c_cflag = cflag; } +static void ttcompatsetlflags(tp, t) register struct tty *tp; register struct termios *t; { register flags = tp->t_flags; - register long iflag = t->c_iflag; - register long oflag = t->c_oflag; - register long lflag = t->c_lflag; - register long cflag = t->c_cflag; + register tcflag_t iflag = t->c_iflag; + register tcflag_t oflag = t->c_oflag; + register tcflag_t lflag = t->c_lflag; + register tcflag_t cflag = t->c_cflag; + iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR); if (flags&CRTERA) lflag |= ECHOE; else @@ -376,6 +441,10 @@ ttcompatsetlflags(tp, t) lflag |= ECHOCTL; else lflag &= ~ECHOCTL; + if (flags&TANDEM) + iflag |= IXOFF; + else + iflag &= ~IXOFF; if ((flags&DECCTQ) == 0) iflag |= IXANY; else @@ -390,17 +459,30 @@ ttcompatsetlflags(tp, t) cflag |= HUPCL; lflag &= ~(TOSTOP|FLUSHO|PENDIN|NOFLSH); lflag |= flags&(TOSTOP|FLUSHO|PENDIN|NOFLSH); - if (flags&(LITOUT|PASS8)) { - iflag &= ~ISTRIP; - cflag &= ~(CSIZE|PARENB); + + /* + * The next if-else statement is copied from above so don't bother + * checking it separately. We could avoid fiddlling with the + * character size if the mode is already RAW or if neither the + * LITOUT bit or the PASS8 bit is being changed, but the delta of + * the change is not available here and skipping the RAW case would + * make the code different from above. 
+ */ + cflag &= ~(CSIZE|PARENB); + if (flags&(RAW|LITOUT|PASS8)) { cflag |= CS8; - if (flags&LITOUT) - oflag &= ~OPOST; - if ((flags&(PASS8|RAW)) == 0) + if (!(flags&(RAW|PASS8)) + || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP)) iflag |= ISTRIP; - } else if ((flags&RAW) == 0) { - cflag &= ~CSIZE; + else + iflag &= ~ISTRIP; + if (flags&(RAW|LITOUT)) + oflag &= ~OPOST; + else + oflag |= OPOST; + } else { cflag |= CS7|PARENB; + iflag |= ISTRIP; oflag |= OPOST; } t->c_iflag = iflag; diff --git a/sys/kern/tty_conf.c b/sys/kern/tty_conf.c index 1453675..2e765c8 100644 --- a/sys/kern/tty_conf.c +++ b/sys/kern/tty_conf.c @@ -35,92 +35,174 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)tty_conf.c 8.5 (Berkeley) 1/9/95 + * @(#)tty_conf.c 8.4 (Berkeley) 1/21/94 + * $Id$ */ #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> -#include <sys/ioctl.h> -#include <sys/proc.h> #include <sys/tty.h> #include <sys/conf.h> -#define ttynodisc ((int (*) __P((dev_t, struct tty *)))enodev) -#define ttyerrclose ((int (*) __P((struct tty *, int flags)))enodev) -#define ttyerrio ((int (*) __P((struct tty *, struct uio *, int)))enodev) -#define ttyerrinput ((int (*) __P((int c, struct tty *)))enodev) -#define ttyerrstart ((int (*) __P((struct tty *)))enodev) - -int nullioctl __P((struct tty *tp, u_long cmd, caddr_t data, - int flag, struct proc *p)); - -#include "tb.h" -#if NTB > 0 -int tbopen __P((dev_t dev, struct tty *tp)); -int tbclose __P((struct tty *tp, int flags)); -int tbread __P((struct tty *, struct uio *, int flags)); -int tbioctl __P((struct tty *tp, u_long cmd, caddr_t data, - int flag, struct proc *p)); -int tbinput __P((int c, struct tty *tp)); +#ifndef MAXLDISC +#define MAXLDISC 8 #endif -#include "sl.h" -#if NSL > 0 -int slopen __P((dev_t dev, struct tty *tp)); -int slclose __P((struct tty *tp, int flags)); -int sltioctl __P((struct tty *tp, u_long cmd, caddr_t data, - int flag, struct proc *p)); -int slinput 
__P((int c, struct tty *tp)); -int slstart __P((struct tty *tp)); +static l_open_t l_noopen; +static l_close_t l_noclose; +static l_ioctl_t l_nullioctl; +static l_rint_t l_norint; +static l_start_t l_nostart; + +/* + * XXX it probably doesn't matter what the entries other than the l_open + * entry are here. The l_nullioctl and ttymodem entries still look fishy. + * Reconsider the removal of nullmodem anyway. It was too much like + * ttymodem, but a completely null version might be useful. + */ +#define NODISC(n) \ + { l_noopen, l_noclose, l_noread, l_nowrite, \ + l_nullioctl, l_norint, l_nostart, ttymodem } + +struct linesw linesw[MAXLDISC] = +{ + /* 0- termios */ + { ttyopen, ttylclose, ttread, ttwrite, + l_nullioctl, ttyinput, ttstart, ttymodem }, + NODISC(1), /* 1- defunct */ + /* 2- NTTYDISC */ +#ifdef COMPAT_43 + { ttyopen, ttylclose, ttread, ttwrite, + l_nullioctl, ttyinput, ttstart, ttymodem }, +#else + NODISC(2), #endif + NODISC(3), /* TABLDISC */ + NODISC(4), /* SLIPDISC */ + NODISC(5), /* PPPDISC */ + NODISC(6), /* loadable */ + NODISC(7), /* loadable */ +}; +int nlinesw = sizeof (linesw) / sizeof (linesw[0]); + +static struct linesw nodisc = NODISC(0); -struct linesw linesw[] = +#define LOADABLE_LDISC 6 +/* + * ldisc_register: Register a line discipline. + * + * discipline: Index for discipline to load, or LDISC_LOAD for us to choose. + * linesw_p: Pointer to linesw_p. + * + * Returns: Index used or -1 on failure. 
+ */ +int +ldisc_register(discipline, linesw_p) + int discipline; + struct linesw *linesw_p; { - { ttyopen, ttylclose, ttread, ttwrite, nullioctl, - ttyinput, ttstart, ttymodem }, /* 0- termios */ + int slot = -1; - { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl, - ttyerrinput, ttyerrstart, nullmodem }, /* 1- defunct */ + if (discipline == LDISC_LOAD) { + int i; + for (i = LOADABLE_LDISC; i < MAXLDISC; i++) + if (bcmp(linesw + i, &nodisc, sizeof(nodisc)) == 0) { + slot = i; + } + } + else if (discipline >= 0 && discipline < MAXLDISC) { + slot = discipline; + } - { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl, - ttyerrinput, ttyerrstart, nullmodem }, /* 2- defunct */ + if (slot != -1 && linesw_p) + linesw[slot] = *linesw_p; -#if NTB > 0 - { tbopen, tbclose, tbread, enodev, tbioctl, - tbinput, ttstart, nullmodem }, /* 3- TABLDISC */ -#else - { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl, - ttyerrinput, ttyerrstart, nullmodem }, -#endif + return slot; +} -#if NSL > 0 - { slopen, slclose, ttyerrio, ttyerrio, sltioctl, - slinput, slstart, nullmodem }, /* 4- SLIPDISC */ -#else - { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl, - ttyerrinput, ttyerrstart, nullmodem }, -#endif -}; +/* + * ldisc_deregister: Deregister a line discipline obtained with + * ldisc_register. Can only deregister "loadable" ones now. + * + * discipline: Index for discipline to unload. 
+ */ +void +ldisc_deregister(discipline) + int discipline; +{ + if (discipline >= LOADABLE_LDISC && discipline < MAXLDISC) { + linesw[discipline] = nodisc; + } +} -int nlinesw = sizeof (linesw) / sizeof (linesw[0]); +static int +l_noopen(dev, tp) + dev_t dev; + struct tty *tp; +{ + + return (ENODEV); +} + +static int +l_noclose(tp, flag) + struct tty *tp; + int flag; +{ + + return (ENODEV); +} + +int +l_noread(tp, uio, flag) + struct tty *tp; + struct uio *uio; + int flag; +{ + + return (ENODEV); +} + +int +l_nowrite(tp, uio, flag) + struct tty *tp; + struct uio *uio; + int flag; +{ + + return (ENODEV); +} + +static int +l_norint(c, tp) + int c; + struct tty *tp; +{ + + return (ENODEV); +} + +static int +l_nostart(tp) + struct tty *tp; +{ + + return (ENODEV); +} /* * Do nothing specific version of line * discipline specific ioctl command. */ -/*ARGSUSED*/ -nullioctl(tp, cmd, data, flags, p) +static int +l_nullioctl(tp, cmd, data, flags, p) struct tty *tp; - u_long cmd; + int cmd; char *data; int flags; struct proc *p; { -#ifdef lint - tp = tp; data = data; flags = flags; p = p; -#endif return (-1); } diff --git a/sys/kern/tty_cons.c b/sys/kern/tty_cons.c new file mode 100644 index 0000000..1a56c85 --- /dev/null +++ b/sys/kern/tty_cons.c @@ -0,0 +1,353 @@ +/* + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)cons.c 7.2 (Berkeley) 5/9/91 + * $Id$ + */ + +#include <sys/param.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/reboot.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/tty.h> + +#include <machine/cpu.h> +#include <machine/cons.h> + +/* XXX this should be config(8)ed. 
*/ +#include "sc.h" +#include "vt.h" +#include "sio.h" +static struct consdev constab[] = { +#if NSC > 0 + { sccnprobe, sccninit, sccngetc, sccncheckc, sccnputc }, +#endif +#if NVT > 0 + { pccnprobe, pccninit, pccngetc, pccncheckc, pccnputc }, +#endif +#if NSIO > 0 + { siocnprobe, siocninit, siocngetc, siocncheckc, siocnputc }, +#endif + { 0 }, +}; + +static d_open_t cnopen; +static d_close_t cnclose; +static d_read_t cnread; +static d_write_t cnwrite; +static d_ioctl_t cnioctl; +static d_select_t cnselect; + +#define CDEV_MAJOR 0 +static struct cdevsw cn_cdevsw = + { cnopen, cnclose, cnread, cnwrite, /*0*/ + cnioctl, nullstop, nullreset, nodevtotty,/* console */ + cnselect, nommap, NULL, "console", NULL, -1 }; + +struct tty *constty = 0; /* virtual console output device */ + +static dev_t cn_dev_t; +SYSCTL_OPAQUE(_machdep, CPU_CONSDEV, consdev, CTLTYPE_OPAQUE|CTLFLAG_RD, + &cn_dev_t, sizeof cn_dev_t, "T,dev_t", ""); +static int cn_mute; +SYSCTL_INT(_kern, OID_AUTO, consmute, CTLFLAG_RW, &cn_mute, 0, ""); + +int cons_unavail = 0; /* XXX: + * physical console not available for + * input (i.e., it is in graphics mode) + */ + +static u_char cn_is_open; /* nonzero if logical console is open */ +static u_char cn_phys_is_open; /* nonzero if physical console is open */ +static d_close_t *cn_phys_close; /* physical device close function */ +static d_open_t *cn_phys_open; /* physical device open function */ +static struct consdev *cn_tab; /* physical console device info */ +static struct tty *cn_tp; /* physical console tty struct */ +#ifdef DEVFS +void *cn_devfs_token; /* represents the devfs entry */ +#endif /* DEVFS */ + +void +cninit() +{ + struct consdev *best_cp, *cp; + + /* + * Find the first console with the highest priority. 
+ */ + best_cp = NULL; + for (cp = constab; cp->cn_probe; cp++) { + (*cp->cn_probe)(cp); + if (cp->cn_pri > CN_DEAD && + (best_cp == NULL || cp->cn_pri > best_cp->cn_pri)) + best_cp = cp; + } + + /* + * Check if we should mute the console (for security reasons perhaps) + * It can be changes dynamically using sysctl kern.consmute + * once we are up and going. + * + */ + cn_mute = ((boothowto & (RB_MUTE + |RB_SINGLE + |RB_VERBOSE + |RB_ASKNAME + |RB_CONFIG)) == RB_MUTE); + + /* + * If no console, give up. + */ + if (best_cp == NULL) { + cn_tab = best_cp; + return; + } + + /* + * Initialize console, then attach to it. This ordering allows + * debugging using the previous console, if any. + * XXX if there was a previous console, then its driver should + * be informed when we forget about it. + */ + (*best_cp->cn_init)(best_cp); + cn_tab = best_cp; +} + +void +cninit_finish() +{ + struct cdevsw *cdp; + + if (cn_tab == NULL) + return; + + /* + * Hook the open and close functions. + */ + cdp = cdevsw[major(cn_tab->cn_dev)]; + cn_phys_close = cdp->d_close; + cdp->d_close = cnclose; + cn_phys_open = cdp->d_open; + cdp->d_open = cnopen; + cn_tp = (*cdp->d_devtotty)(cn_tab->cn_dev); + cn_dev_t = cn_tp->t_dev; +} + +static int +cnopen(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + dev_t cndev, physdev; + int retval; + + if (cn_tab == NULL) + return (0); + cndev = cn_tab->cn_dev; + physdev = (major(dev) == major(cndev) ? 
dev : cndev); + retval = (*cn_phys_open)(physdev, flag, mode, p); + if (retval == 0) { + if (dev == cndev) + cn_phys_is_open = 1; + else if (physdev == cndev) + cn_is_open = 1; + } + return (retval); +} + +static int +cnclose(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + dev_t cndev; + + if (cn_tab == NULL) + return (0); + cndev = cn_tab->cn_dev; + if (dev == cndev) { + /* the physical device is about to be closed */ + cn_phys_is_open = 0; + if (cn_is_open) { + if (cn_tp) { + /* perform a ttyhalfclose() */ + /* reset session and proc group */ + cn_tp->t_pgrp = NULL; + cn_tp->t_session = NULL; + } + return (0); + } + } else if (major(dev) != major(cndev)) { + /* the logical console is about to be closed */ + cn_is_open = 0; + if (cn_phys_is_open) + return (0); + dev = cndev; + } + return ((*cn_phys_close)(dev, flag, mode, p)); +} + +static int +cnread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + if ((cn_tab == NULL) || cn_mute) + return (0); + dev = cn_tab->cn_dev; + return ((*cdevsw[major(dev)]->d_read)(dev, uio, flag)); +} + +static int +cnwrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + if ((cn_tab == NULL) || cn_mute) + return (0); + if (constty) + dev = constty->t_dev; + else + dev = cn_tab->cn_dev; + return ((*cdevsw[major(dev)]->d_write)(dev, uio, flag)); +} + +static int +cnioctl(dev, cmd, data, flag, p) + dev_t dev; + int cmd; + caddr_t data; + int flag; + struct proc *p; +{ + int error; + + if ((cn_tab == NULL) || cn_mute) + return (0); + /* + * Superuser can always use this to wrest control of console + * output from the "virtual" console. 
+ */ + if (cmd == TIOCCONS && constty) { + error = suser(p->p_ucred, (u_short *) NULL); + if (error) + return (error); + constty = NULL; + return (0); + } + dev = cn_tab->cn_dev; + return ((*cdevsw[major(dev)]->d_ioctl)(dev, cmd, data, flag, p)); +} + +static int +cnselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + if ((cn_tab == NULL) || cn_mute) + return (1); + + dev = cn_tab->cn_dev; + + return ((*cdevsw[major(dev)]->d_select)(dev, rw, p)); +} + +int +cngetc() +{ + int c; + if ((cn_tab == NULL) || cn_mute) + return (-1); + c = (*cn_tab->cn_getc)(cn_tab->cn_dev); + if (c == '\r') c = '\n'; /* console input is always ICRNL */ + return (c); +} + +int +cncheckc() +{ + if ((cn_tab == NULL) || cn_mute) + return (-1); + return ((*cn_tab->cn_checkc)(cn_tab->cn_dev)); +} + +void +cnputc(c) + register int c; +{ + if ((cn_tab == NULL) || cn_mute) + return; + if (c) { + if (c == '\n') + (*cn_tab->cn_putc)(cn_tab->cn_dev, '\r'); + (*cn_tab->cn_putc)(cn_tab->cn_dev, c); + } +} + +static cn_devsw_installed = 0; + +static void +cn_drvinit(void *unused) +{ + dev_t dev; + + if( ! cn_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&cn_cdevsw,NULL); + cn_devsw_installed = 1; +#ifdef DEVFS + cn_devfs_token = devfs_add_devswf(&cn_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0600, + "console"); +#endif + } +} + +SYSINIT(cndev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,cn_drvinit,NULL) + + diff --git a/sys/kern/tty_pty.c b/sys/kern/tty_pty.c index 2c37984..ee0b653 100644 --- a/sys/kern/tty_pty.c +++ b/sys/kern/tty_pty.c @@ -31,6 +31,7 @@ * SUCH DAMAGE. 
* * @(#)tty_pty.c 8.4 (Berkeley) 2/20/95 + * $Id: tty_pty.c,v 1.42 1997/03/23 03:36:28 bde Exp $ */ /* @@ -41,14 +42,53 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/ioctl.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif #include <sys/proc.h> #include <sys/tty.h> #include <sys/conf.h> -#include <sys/file.h> +#include <sys/fcntl.h> #include <sys/uio.h> #include <sys/kernel.h> #include <sys/vnode.h> +#include <sys/signalvar.h> + +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +#ifdef notyet +static void ptyattach __P((int n)); +#endif +static void ptsstart __P((struct tty *tp)); +static void ptcwakeup __P((struct tty *tp, int flag)); + +static d_open_t ptsopen; +static d_close_t ptsclose; +static d_read_t ptsread; +static d_write_t ptswrite; +static d_ioctl_t ptyioctl; +static d_stop_t ptsstop; +static d_devtotty_t ptydevtotty; +static d_open_t ptcopen; +static d_close_t ptcclose; +static d_read_t ptcread; +static d_write_t ptcwrite; +static d_select_t ptcselect; + +#define CDEV_MAJOR_S 5 +#define CDEV_MAJOR_C 6 +static struct cdevsw pts_cdevsw = + { ptsopen, ptsclose, ptsread, ptswrite, /*5*/ + ptyioctl, ptsstop, nullreset, ptydevtotty,/* ttyp */ + ttselect, nommap, NULL, "pts", NULL, -1 }; + +static struct cdevsw ptc_cdevsw = + { ptcopen, ptcclose, ptcread, ptcwrite, /*6*/ + ptyioctl, nullstop, nullreset, ptydevtotty,/* ptyp */ + ptcselect, nommap, NULL, "ptc", NULL, -1 }; + #if NPTY == 1 #undef NPTY @@ -58,17 +98,17 @@ #define BUFSIZ 100 /* Chunk size iomoved to/from user */ /* - * pts == /dev/tty[pqrs]? - * ptc == /dev/pty[pqrs]? 
+ * pts == /dev/tty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv] + * ptc == /dev/pty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv] */ -struct tty pt_tty[NPTY]; /* XXX */ -struct pt_ioctl { +static struct tty pt_tty[NPTY]; /* XXX */ +static struct pt_ioctl { int pt_flags; struct selinfo pt_selr, pt_selw; u_char pt_send; u_char pt_ucntl; } pt_ioctl[NPTY]; /* XXX */ -int npty = NPTY; /* for pstat -t */ +static int npty = NPTY; /* for pstat -t */ #define PF_PKT 0x08 /* packet mode */ #define PF_STOPPED 0x10 /* user told stopped */ @@ -76,18 +116,16 @@ int npty = NPTY; /* for pstat -t */ #define PF_NOSTOP 0x40 #define PF_UCNTL 0x80 /* user control mode */ -void ptsstop __P((struct tty *, int)); - +#ifdef notyet /* * Establish n (or default if n is 1) ptys in the system. * * XXX cdevsw & pstat require the array `pty[]' to be an array */ -void +static void ptyattach(n) int n; { -#ifdef notyet char *mem; register u_long ntb; #define DEFAULT_NPTY 32 @@ -102,10 +140,11 @@ ptyattach(n) mem = (char *)ALIGN(mem + ntb); pt_ioctl = (struct pt_ioctl *)mem; npty = n; -#endif } +#endif /*ARGSUSED*/ +static int ptsopen(dev, flag, devtype, p) dev_t dev; int flag, devtype; @@ -118,7 +157,6 @@ ptsopen(dev, flag, devtype, p) return (ENXIO); tp = &pt_tty[minor(dev)]; if ((tp->t_state & TS_ISOPEN) == 0) { - tp->t_state |= TS_WOPEN; ttychars(tp); /* Set up default chars */ tp->t_iflag = TTYDEF_IFLAG; tp->t_oflag = TTYDEF_OFLAG; @@ -129,20 +167,22 @@ ptsopen(dev, flag, devtype, p) } else if (tp->t_state&TS_XCLUDE && p->p_ucred->cr_uid != 0) return (EBUSY); if (tp->t_oproc) /* Ctrlr still around. 
*/ - tp->t_state |= TS_CARR_ON; + (void)(*linesw[tp->t_line].l_modem)(tp, 1); while ((tp->t_state & TS_CARR_ON) == 0) { - tp->t_state |= TS_WOPEN; if (flag&FNONBLOCK) break; - if (error = ttysleep(tp, (caddr_t)&tp->t_rawq, TTIPRI | PCATCH, - ttopen, 0)) + error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, + "ptsopn", 0); + if (error) return (error); } error = (*linesw[tp->t_line].l_open)(dev, tp); - ptcwakeup(tp, FREAD|FWRITE); + if (error == 0) + ptcwakeup(tp, FREAD|FWRITE); return (error); } +static int ptsclose(dev, flag, mode, p) dev_t dev; int flag, mode; @@ -153,11 +193,12 @@ ptsclose(dev, flag, mode, p) tp = &pt_tty[minor(dev)]; err = (*linesw[tp->t_line].l_close)(tp, flag); - err |= ttyclose(tp); - ptcwakeup(tp, FREAD|FWRITE); + ptsstop(tp, FREAD|FWRITE); + (void) ttyclose(tp); return (err); } +static int ptsread(dev, uio, flag) dev_t dev; struct uio *uio; @@ -177,15 +218,17 @@ again: p->p_flag & P_PPWAIT) return (EIO); pgsignal(p->p_pgrp, SIGTTIN, 1); - if (error = ttysleep(tp, (caddr_t)&lbolt, - TTIPRI | PCATCH, ttybg, 0)) + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ptsbg", + 0); + if (error) return (error); } if (tp->t_canq.c_cc == 0) { if (flag & IO_NDELAY) return (EWOULDBLOCK); - if (error = ttysleep(tp, (caddr_t)&tp->t_canq, - TTIPRI | PCATCH, ttyin, 0)) + error = ttysleep(tp, TSA_PTS_READ(tp), TTIPRI | PCATCH, + "ptsin", 0); + if (error) return (error); goto again; } @@ -210,6 +253,7 @@ again: * Wakeups of controlling tty will happen * indirectly, when tty driver calls ptsstart. */ +static int ptswrite(dev, uio, flag) dev_t dev; struct uio *uio; @@ -227,7 +271,7 @@ ptswrite(dev, uio, flag) * Start output on pseudo-tty. * Wake up process selecting or sleeping for input from controlling tty. 
*/ -void +static void ptsstart(tp) struct tty *tp; { @@ -242,6 +286,7 @@ ptsstart(tp) ptcwakeup(tp, FREAD); } +static void ptcwakeup(tp, flag) struct tty *tp; int flag; @@ -250,23 +295,19 @@ ptcwakeup(tp, flag) if (flag & FREAD) { selwakeup(&pti->pt_selr); - wakeup((caddr_t)&tp->t_outq.c_cf); + wakeup(TSA_PTC_READ(tp)); } if (flag & FWRITE) { selwakeup(&pti->pt_selw); - wakeup((caddr_t)&tp->t_rawq.c_cf); + wakeup(TSA_PTC_WRITE(tp)); } } -/*ARGSUSED*/ -#ifdef __STDC__ -ptcopen(dev_t dev, int flag, int devtype, struct proc *p) -#else +static int ptcopen(dev, flag, devtype, p) dev_t dev; int flag, devtype; struct proc *p; -#endif { register struct tty *tp; struct pt_ioctl *pti; @@ -289,19 +330,37 @@ ptcopen(dev, flag, devtype, p) return (0); } -ptcclose(dev) +static int +ptcclose(dev, flags, fmt, p) dev_t dev; + int flags; + int fmt; + struct proc *p; { register struct tty *tp; tp = &pt_tty[minor(dev)]; (void)(*linesw[tp->t_line].l_modem)(tp, 0); - tp->t_state &= ~TS_CARR_ON; + + /* + * XXX MDMBUF makes no sense for ptys but would inhibit the above + * l_modem(). CLOCAL makes sense but isn't supported. Special + * l_modem()s that ignore carrier drop make no sense for ptys but + * may be in use because other parts of the line discipline make + * sense for ptys. Recover by doing everything that a normal + * ttymodem() would have done except for sending a SIGHUP. 
+ */ + if (tp->t_state & TS_ISOPEN) { + tp->t_state &= ~(TS_CARR_ON | TS_CONNECTED); + tp->t_state |= TS_ZOMBIE; + ttyflush(tp, FREAD | FWRITE); + } + tp->t_oproc = 0; /* mark closed */ - tp->t_session = 0; return (0); } +static int ptcread(dev, uio, flag) dev_t dev; struct uio *uio; @@ -327,7 +386,8 @@ ptcread(dev, uio, flag) if (pti->pt_send & TIOCPKT_IOCTL) { cc = min(uio->uio_resid, sizeof(tp->t_termios)); - uiomove(&tp->t_termios, cc, uio); + uiomove((caddr_t)&tp->t_termios, cc, + uio); } pti->pt_send = 0; return (0); @@ -342,12 +402,12 @@ ptcread(dev, uio, flag) if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) break; } - if ((tp->t_state&TS_CARR_ON) == 0) + if ((tp->t_state & TS_CONNECTED) == 0) return (0); /* EOF */ if (flag & IO_NDELAY) return (EWOULDBLOCK); - if (error = tsleep((caddr_t)&tp->t_outq.c_cf, TTIPRI | PCATCH, - ttyin, 0)) + error = tsleep(TSA_PTC_READ(tp), TTIPRI | PCATCH, "ptcin", 0); + if (error) return (error); } if (pti->pt_flags & (PF_PKT|PF_UCNTL)) @@ -358,17 +418,11 @@ ptcread(dev, uio, flag) break; error = uiomove(buf, cc, uio); } - if (tp->t_outq.c_cc <= tp->t_lowat) { - if (tp->t_state&TS_ASLEEP) { - tp->t_state &= ~TS_ASLEEP; - wakeup((caddr_t)&tp->t_outq); - } - selwakeup(&tp->t_wsel); - } + ttwwakeup(tp); return (error); } -void +static void ptsstop(tp, flush) register struct tty *tp; int flush; @@ -392,6 +446,7 @@ ptsstop(tp, flush) ptcwakeup(tp, flag); } +static int ptcselect(dev, rw, p) dev_t dev; int rw; @@ -401,7 +456,7 @@ ptcselect(dev, rw, p) struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; int s; - if ((tp->t_state&TS_CARR_ON) == 0) + if ((tp->t_state & TS_CONNECTED) == 0) return (1); switch (rw) { @@ -420,8 +475,8 @@ ptcselect(dev, rw, p) case 0: /* exceptional */ if ((tp->t_state&TS_ISOPEN) && - (pti->pt_flags&PF_PKT && pti->pt_send || - pti->pt_flags&PF_UCNTL && pti->pt_ucntl)) + ((pti->pt_flags&PF_PKT && pti->pt_send) || + (pti->pt_flags&PF_UCNTL && pti->pt_ucntl))) return (1); selrecord(p, &pti->pt_selr); break; @@ 
-446,13 +501,14 @@ ptcselect(dev, rw, p) return (0); } +static int ptcwrite(dev, uio, flag) dev_t dev; register struct uio *uio; int flag; { register struct tty *tp = &pt_tty[minor(dev)]; - register u_char *cp; + register u_char *cp = 0; register int cc = 0; u_char locbuf[BUFSIZ]; int cnt = 0; @@ -465,7 +521,8 @@ again: if (pti->pt_flags & PF_REMOTE) { if (tp->t_canq.c_cc) goto block; - while (uio->uio_resid > 0 && tp->t_canq.c_cc < TTYHOG - 1) { + while ((uio->uio_resid > 0 || cc > 0) && + tp->t_canq.c_cc < TTYHOG - 1) { if (cc == 0) { cc = min(uio->uio_resid, BUFSIZ); cc = min(cc, TTYHOG - 1 - tp->t_canq.c_cc); @@ -474,19 +531,34 @@ again: if (error) return (error); /* check again for safety */ - if ((tp->t_state&TS_ISOPEN) == 0) + if ((tp->t_state & TS_ISOPEN) == 0) { + /* adjust as usual */ + uio->uio_resid += cc; return (EIO); + } + } + if (cc > 0) { + cc = b_to_q((char *)cp, cc, &tp->t_canq); + /* + * XXX we don't guarantee that the canq size + * is >= TTYHOG, so the above b_to_q() may + * leave some bytes uncopied. However, space + * is guaranteed for the null terminator if + * we don't fail here since (TTYHOG - 1) is + * not a multiple of CBSIZE. 
+ */ + if (cc > 0) + break; } - if (cc) - (void) b_to_q((char *)cp, cc, &tp->t_canq); - cc = 0; } + /* adjust for data copied in but not written */ + uio->uio_resid += cc; (void) putc(0, &tp->t_canq); ttwakeup(tp); - wakeup((caddr_t)&tp->t_canq); + wakeup(TSA_PTS_READ(tp)); return (0); } - while (uio->uio_resid > 0) { + while (uio->uio_resid > 0 || cc > 0) { if (cc == 0) { cc = min(uio->uio_resid, BUFSIZ); cp = locbuf; @@ -494,13 +566,16 @@ again: if (error) return (error); /* check again for safety */ - if ((tp->t_state&TS_ISOPEN) == 0) + if ((tp->t_state & TS_ISOPEN) == 0) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; return (EIO); + } } while (cc > 0) { if ((tp->t_rawq.c_cc + tp->t_canq.c_cc) >= TTYHOG - 2 && (tp->t_canq.c_cc > 0 || !(tp->t_iflag&ICANON))) { - wakeup((caddr_t)&tp->t_rawq); + wakeup(TSA_HUP_OR_INPUT(tp)); goto block; } (*linesw[tp->t_line].l_rint)(*cp++, tp); @@ -513,10 +588,13 @@ again: block: /* * Come here to wait for slave to open, for space - * in outq, or space in rawq. + * in outq, or space in rawq, or an empty canq. 
*/ - if ((tp->t_state&TS_CARR_ON) == 0) + if ((tp->t_state & TS_CONNECTED) == 0) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; return (EIO); + } if (flag & IO_NDELAY) { /* adjust for data copied in but not written */ uio->uio_resid += cc; @@ -524,8 +602,8 @@ block: return (EWOULDBLOCK); return (0); } - if (error = tsleep((caddr_t)&tp->t_rawq.c_cf, TTOPRI | PCATCH, - ttyout, 0)) { + error = tsleep(TSA_PTC_WRITE(tp), TTOPRI | PCATCH, "ptcout", 0); + if (error) { /* adjust for data copied in but not written */ uio->uio_resid += cc; return (error); @@ -533,10 +611,21 @@ block: goto again; } +static struct tty * +ptydevtotty(dev) + dev_t dev; +{ + if (minor(dev) >= npty) + return (NULL); + + return &pt_tty[minor(dev)]; +} + /*ARGSUSED*/ +static int ptyioctl(dev, cmd, data, flag, p) dev_t dev; - u_long cmd; + int cmd; caddr_t data; int flag; struct proc *p; @@ -572,7 +661,7 @@ ptyioctl(dev, cmd, data, flag, p) } return(0); } else - if (cdevsw[major(dev)].d_open == ptcopen) + if (cdevsw[major(dev)]->d_open == ptcopen) switch (cmd) { case TIOCGPGRP: @@ -610,7 +699,7 @@ ptyioctl(dev, cmd, data, flag, p) return (0); #ifdef COMPAT_43 - case TIOCSETP: + case TIOCSETP: case TIOCSETN: #endif case TIOCSETD: @@ -670,7 +759,7 @@ ptyioctl(dev, cmd, data, flag, p) break; } } - stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s')) + stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s')) && CCEQ(cc[VSTART], CTRL('q')); if (pti->pt_flags & PF_NOSTOP) { if (stop) { @@ -689,3 +778,49 @@ ptyioctl(dev, cmd, data, flag, p) } return (error); } + +static ptc_devsw_installed = 0; +#ifdef DEVFS +#define MAXUNITS (8 * 32) +static void *devfs_token_pts[MAXUNITS]; +static void *devfs_token_ptc[MAXUNITS]; +static const char jnames[] = "pqrsPQRS"; +#endif + +static void +ptc_drvinit(void *unused) +{ +#ifdef DEVFS + int i,j,k; +#endif + dev_t dev; + + if( ! 
ptc_devsw_installed ) { + dev = makedev(CDEV_MAJOR_S, 0); + cdevsw_add(&dev, &pts_cdevsw, NULL); + dev = makedev(CDEV_MAJOR_C, 0); + cdevsw_add(&dev, &ptc_cdevsw, NULL); + ptc_devsw_installed = 1; +#ifdef DEVFS +/*XXX*/ +#if NPTY > MAXUNITS +#undef NPTY +#define NPTY MAXUNITS +#endif + for ( i = 0 ; i<NPTY ; i++ ) { + j = i / 32; + k = i % 32; + devfs_token_pts[i] = + devfs_add_devswf(&pts_cdevsw,i, + DV_CHR,0,0,0666, + "tty%c%n",jnames[j],k); + devfs_token_ptc[i] = + devfs_add_devswf(&ptc_cdevsw,i, + DV_CHR,0,0,0666, + "pty%c%n",jnames[j],k); + } +#endif + } +} + +SYSINIT(ptcdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR_C,ptc_drvinit,NULL) diff --git a/sys/kern/tty_snoop.c b/sys/kern/tty_snoop.c new file mode 100644 index 0000000..6e2bf5d --- /dev/null +++ b/sys/kern/tty_snoop.c @@ -0,0 +1,548 @@ +/* + * Copyright (c) 1995 Ugen J.S.Antsilevich + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * Snoop stuff. 
+ */ + +#include "snp.h" + +#if NSNP > 0 + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/filio.h> +#include <sys/ioctl_compat.h> /* Oooh..We need O/NTTYDISC */ +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/fcntl.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +#include <sys/snoop.h> + +static d_open_t snpopen; +static d_close_t snpclose; +static d_read_t snpread; +static d_write_t snpwrite; +static d_ioctl_t snpioctl; +static d_select_t snpselect; + +#define CDEV_MAJOR 53 +static struct cdevsw snp_cdevsw = + { snpopen, snpclose, snpread, snpwrite, /*53*/ + snpioctl, nostop, nullreset, nodevtotty,/* snoop */ + snpselect, nommap, NULL, "snp", NULL, -1 }; + + +#ifndef MIN +#define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +static struct snoop snoopsw[NSNP]; + +static struct tty *snpdevtotty __P((dev_t dev)); +static int snp_detach __P((struct snoop *snp)); + +static struct tty * +snpdevtotty (dev) + dev_t dev; +{ + struct cdevsw *cdp; + int maj; + + maj = major(dev); + if ((u_int)maj >= nchrdev) + return (NULL); + cdp = cdevsw[maj]; + if (cdp == NULL) + return (NULL); + return ((*cdp->d_devtotty)(dev)); +} + +#define SNP_INPUT_BUF 5 /* This is even too much,the maximal + * interactive mode write is 3 bytes + * length for function keys... 
+ */ + +static int +snpwrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + int unit = minor(dev), len, i, error; + struct snoop *snp = &snoopsw[unit]; + struct tty *tp; + char c[SNP_INPUT_BUF]; + + if (snp->snp_tty == NULL) + return (EIO); + + tp = snp->snp_tty; + + if ((tp->t_sc == snp) && (tp->t_state & TS_SNOOP) && + (tp->t_line == OTTYDISC || tp->t_line == NTTYDISC)) + goto tty_input; + + printf("Snoop: attempt to write to bad tty.\n"); + return (EIO); + +tty_input: + if (!(tp->t_state & TS_ISOPEN)) + return (EIO); + + while (uio->uio_resid > 0) { + len = MIN(uio->uio_resid,SNP_INPUT_BUF); + if ((error = uiomove(c, len, uio)) != 0) + return (error); + for (i=0;i<len;i++) { + if (ttyinput(c[i] , tp)) + return (EIO); + } + } + return 0; + +} + + +static int +snpread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + int unit = minor(dev), s; + struct snoop *snp = &snoopsw[unit]; + int len, n, nblen, error = 0; + caddr_t from; + char *nbuf; + +#ifdef DIAGNOSTIC + if ((snp->snp_len + snp->snp_base) > snp->snp_blen) + panic("snoop buffer error"); +#endif + + if (snp->snp_tty == NULL) + return (EIO); + + snp->snp_flags &= ~SNOOP_RWAIT; + + do { + if (snp->snp_len == 0) { + if (snp->snp_flags & SNOOP_NBIO) { + return EWOULDBLOCK; + } + snp->snp_flags |= SNOOP_RWAIT; + tsleep((caddr_t) snp, (PZERO + 1) | PCATCH, "snoopread", 0); + } + } while (snp->snp_len == 0); + + n = snp->snp_len; + + while (snp->snp_len > 0 && uio->uio_resid > 0 && error == 0) { + len = MIN(uio->uio_resid, snp->snp_len); + from = (caddr_t) (snp->snp_buf + snp->snp_base); + if (len == 0) + break; + + error = uiomove(from, len, uio); + snp->snp_base += len; + snp->snp_len -= len; + } + if ((snp->snp_flags & SNOOP_OFLOW) && (n < snp->snp_len)) { + snp->snp_flags &= ~SNOOP_OFLOW; + } + s = spltty(); + nblen = snp->snp_blen; + if (((nblen / 2) >= SNOOP_MINLEN) && (nblen / 2) >= snp->snp_len) { + while (((nblen / 2) >= snp->snp_len) && ((nblen / 2) >= SNOOP_MINLEN)) + 
nblen = nblen / 2; + if (nbuf = malloc(nblen, M_TTYS, M_NOWAIT)) { + bcopy(snp->snp_buf + snp->snp_base, nbuf, snp->snp_len); + free(snp->snp_buf, M_TTYS); + snp->snp_buf = nbuf; + snp->snp_blen = nblen; + snp->snp_base = 0; + } + } + splx(s); + + return error; +} + +int +snpinc(snp, c) + struct snoop *snp; + char c; +{ + char buf[1]; + + buf[0]=c; + return (snpin(snp,buf,1)); +} + + +int +snpin(snp, buf, n) + struct snoop *snp; + char *buf; + int n; +{ + int s_free, s_tail; + int s, len, nblen; + caddr_t from, to; + char *nbuf; + + + if (n == 0) + return 0; + +#ifdef DIAGNOSTIC + if (n < 0) + panic("bad snoop char count"); + + if (!(snp->snp_flags & SNOOP_OPEN)) { + printf("Snoop: data coming to closed device.\n"); + return 0; + } +#endif + if (snp->snp_flags & SNOOP_DOWN) { + printf("Snoop: more data to down interface.\n"); + return 0; + } + + if (snp->snp_flags & SNOOP_OFLOW) { + printf("Snoop: buffer overflow.\n"); + /* + * On overflow we just repeat the standart close + * procedure...yes , this is waste of space but.. Then next + * read from device will fail if one would recall he is + * snooping and retry... 
+ */ + + return (snpdown(snp)); + } + s_tail = snp->snp_blen - (snp->snp_len + snp->snp_base); + s_free = snp->snp_blen - snp->snp_len; + + + if (n > s_free) { + s = spltty(); + nblen = snp->snp_blen; + while ((n > s_free) && ((nblen * 2) <= SNOOP_MAXLEN)) { + nblen = snp->snp_blen * 2; + s_free = nblen - (snp->snp_len + snp->snp_base); + } + if ((n <= s_free) && (nbuf = malloc(nblen, M_TTYS, M_NOWAIT))) { + bcopy(snp->snp_buf + snp->snp_base, nbuf, snp->snp_len); + free(snp->snp_buf, M_TTYS); + snp->snp_buf = nbuf; + snp->snp_blen = nblen; + snp->snp_base = 0; + } else { + snp->snp_flags |= SNOOP_OFLOW; + if (snp->snp_flags & SNOOP_RWAIT) { + snp->snp_flags &= ~SNOOP_RWAIT; + wakeup((caddr_t) snp); + } + splx(s); + return 0; + } + splx(s); + } + if (n > s_tail) { + from = (caddr_t) (snp->snp_buf + snp->snp_base); + to = (caddr_t) (snp->snp_buf); + len = snp->snp_len; + bcopy(from, to, len); + snp->snp_base = 0; + } + to = (caddr_t) (snp->snp_buf + snp->snp_base + snp->snp_len); + bcopy(buf, to, n); + snp->snp_len += n; + + if (snp->snp_flags & SNOOP_RWAIT) { + snp->snp_flags &= ~SNOOP_RWAIT; + wakeup((caddr_t) snp); + } + selwakeup(&snp->snp_sel); + snp->snp_sel.si_pid = 0; + + return n; +} + +static int +snpopen(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + struct snoop *snp; + register int unit, error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + + if ((unit = minor(dev)) >= NSNP) + return (ENXIO); + + snp = &snoopsw[unit]; + + if (snp->snp_flags & SNOOP_OPEN) + return (ENXIO); + + /* + * We intentionally do not OR flags with SNOOP_OPEN,but set them so + * all previous settings (especially SNOOP_OFLOW) will be cleared. + */ + snp->snp_flags = SNOOP_OPEN; + + snp->snp_buf = malloc(SNOOP_MINLEN, M_TTYS, M_WAITOK); + snp->snp_blen = SNOOP_MINLEN; + snp->snp_base = 0; + snp->snp_len = 0; + + /* + * snp_tty == NULL is for inactive snoop devices. 
+ */ + snp->snp_tty = NULL; + snp->snp_target = -1; + return (0); +} + + +static int +snp_detach(snp) + struct snoop *snp; +{ + struct tty *tp; + + snp->snp_base = 0; + snp->snp_len = 0; + + /* + * If line disc. changed we do not touch this pointer,SLIP/PPP will + * change it anyway. + */ + + if (snp->snp_tty == NULL) + goto detach_notty; + + tp = snp->snp_tty; + + if (tp && (tp->t_sc == snp) && (tp->t_state & TS_SNOOP) && + (tp->t_line == OTTYDISC || tp->t_line == NTTYDISC)) { + tp->t_sc = NULL; + tp->t_state &= ~TS_SNOOP; + } else + printf("Snoop: bad attached tty data.\n"); + + snp->snp_tty = NULL; + snp->snp_target = -1; + +detach_notty: + selwakeup(&snp->snp_sel); + snp->snp_sel.si_pid = 0; + + return (0); +} + +static int +snpclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + register int unit = minor(dev); + struct snoop *snp = &snoopsw[unit]; + + snp->snp_blen = 0; + free(snp->snp_buf, M_TTYS); + snp->snp_flags &= ~SNOOP_OPEN; + + return (snp_detach(snp)); +} + +int +snpdown(snp) + struct snoop *snp; +{ + snp->snp_blen = SNOOP_MINLEN; + free(snp->snp_buf, M_TTYS); + snp->snp_buf = malloc(SNOOP_MINLEN, M_TTYS, M_WAITOK); + snp->snp_flags |= SNOOP_DOWN; + + return (snp_detach(snp)); +} + + +static int +snpioctl(dev, cmd, data, flags, p) + dev_t dev; + int cmd; + caddr_t data; + int flags; + struct proc *p; +{ + int unit = minor(dev), s; + dev_t tdev; + struct snoop *snp = &snoopsw[unit]; + struct tty *tp, *tpo; + + switch (cmd) { + case SNPSTTY: + tdev = *((dev_t *) data); + if (tdev == -1) + return (snpdown(snp)); + + tp = snpdevtotty(tdev); + if (!tp) + return (EINVAL); + + if ((tp->t_sc != (caddr_t) snp) && (tp->t_state & TS_SNOOP)) + return (EBUSY); + + if ((tp->t_line != OTTYDISC) && (tp->t_line != NTTYDISC)) + return (EBUSY); + + s = spltty(); + + if (snp->snp_target == -1) { + tpo = snp->snp_tty; + if (tpo) + tpo->t_state &= ~TS_SNOOP; + } + + tp->t_sc = (caddr_t) snp; + tp->t_state |= TS_SNOOP; + snp->snp_tty = tp; + 
snp->snp_target = tdev; + + /* + * Clean overflow and down flags - + * we'll have a chance to get them in the future :))) + */ + snp->snp_flags &= ~SNOOP_OFLOW; + snp->snp_flags &= ~SNOOP_DOWN; + splx(s); + break; + + case SNPGTTY: + /* + * We keep snp_target field specially to make + * SNPGTTY happy,else we can't know what is device + * major/minor for tty. + */ + *((dev_t *) data) = snp->snp_target; + break; + + case FIONBIO: + if (*(int *) data) + snp->snp_flags |= SNOOP_NBIO; + else + snp->snp_flags &= ~SNOOP_NBIO; + break; + + case FIOASYNC: + if (*(int *) data) + snp->snp_flags |= SNOOP_ASYNC; + else + snp->snp_flags &= ~SNOOP_ASYNC; + break; + + case FIONREAD: + s = spltty(); + if (snp->snp_tty != NULL) + *(int *) data = snp->snp_len; + else + if (snp->snp_flags & SNOOP_DOWN) { + if (snp->snp_flags & SNOOP_OFLOW) + *(int *) data = SNP_OFLOW; + else + *(int *) data = SNP_TTYCLOSE; + } else { + *(int *) data = SNP_DETACH; + } + splx(s); + break; + + default: + return (ENOTTY); + } + return (0); +} + + +static int +snpselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + int unit = minor(dev); + struct snoop *snp = &snoopsw[unit]; + + if (rw != FREAD) + return 1; + + if (snp->snp_len > 0) + return 1; + + /* + * If snoop is down,we don't want to select() forever so we return 1. + * Caller should see if we down via FIONREAD ioctl().The last should + * return -1 to indicate down state. + */ + if (snp->snp_flags & SNOOP_DOWN) + return 1; + + selrecord(p, &snp->snp_sel); + return 0; +} + +#ifdef DEVFS +static void *snp_devfs_token[NSNP]; +#endif +static snp_devsw_installed = 0; + +static void +snp_drvinit(void *unused) +{ + dev_t dev; +#ifdef DEVFS + int i; +#endif + + if( ! 
snp_devsw_installed ) { + dev = makedev(CDEV_MAJOR, 0); + cdevsw_add(&dev,&snp_cdevsw, NULL); + snp_devsw_installed = 1; +#ifdef DEVFS + for ( i = 0 ; i < NSNP ; i++) { + snp_devfs_token[i] = + devfs_add_devswf(&snp_cdevsw, i, DV_CHR, 0, 0, + 0600, "snp%d", i); + } +#endif + } +} + +SYSINIT(snpdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,snp_drvinit,NULL) + + +#endif diff --git a/sys/kern/tty_subr.c b/sys/kern/tty_subr.c index fe8f000..d907b47 100644 --- a/sys/kern/tty_subr.c +++ b/sys/kern/tty_subr.c @@ -1,32 +1,21 @@ -/*- - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. - * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. +/* + * Copyright (c) 1994, David Greenman + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. + * notice unmodified, this list of conditions, and the following + * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. 
Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -35,125 +24,671 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * from: @(#)tty_subr.c 8.2 (Berkeley) 9/5/93 + * $Id: tty_subr.c,v 1.26 1997/03/05 16:45:01 bde Exp $ + */ + +/* + * clist support routines */ #include <sys/param.h> -#include <sys/ioctl.h> +#include <sys/kernel.h> +#include <sys/systm.h> #include <sys/tty.h> +#include <sys/clist.h> +#include <sys/malloc.h> -char cwaiting; -struct cblock *cfree, *cfreelist; -int cfreecount, nclist; +static void clist_init __P((void *)); +SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL) -void -clist_init() +static struct cblock *cfreelist = 0; +int cfreecount = 0; +static int cslushcount; +static int ctotcount; + +#ifndef INITIAL_CBLOCKS +#define INITIAL_CBLOCKS 50 +#endif + +static struct cblock *cblock_alloc __P((void)); +static void cblock_alloc_cblocks __P((int number)); +static void cblock_free __P((struct cblock *cblockp)); +static void cblock_free_cblocks __P((int number)); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(cbstat, cbstat) { + printf( + "tot = %d (active = 
%d, free = %d (reserved = %d, slush = %d))\n", + ctotcount * CBSIZE, ctotcount * CBSIZE - cfreecount, cfreecount, + cfreecount - cslushcount * CBSIZE, cslushcount * CBSIZE); +} +#endif /* DDB */ +/* + * Called from init_main.c + */ +/* ARGSUSED*/ +static void +clist_init(dummy) + void *dummy; +{ /* - * Body deleted. + * Allocate an initial base set of cblocks as a 'slush'. + * We allocate non-slush cblocks with each initial ttyopen() and + * deallocate them with each ttyclose(). + * We should adjust the slush allocation. This can't be done in + * the i/o routines because they are sometimes called from + * interrupt handlers when it may be unsafe to call malloc(). */ - return; + cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS); } -getc(a1) - struct clist *a1; +/* + * Remove a cblock from the cfreelist queue and return a pointer + * to it. + */ +static inline struct cblock * +cblock_alloc() { + struct cblock *cblockp; - /* - * Body deleted. - */ - return ((char)0); + cblockp = cfreelist; + if (cblockp == NULL) + panic("clist reservation botch"); + cfreelist = cblockp->c_next; + cblockp->c_next = NULL; + cfreecount -= CBSIZE; + return (cblockp); } -q_to_b(a1, a2, a3) - struct clist *a1; - char *a2; - int a3; +/* + * Add a cblock to the cfreelist queue. + */ +static inline void +cblock_free(cblockp) + struct cblock *cblockp; { + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) + bzero(cblockp->c_quote, sizeof cblockp->c_quote); + cblockp->c_next = cfreelist; + cfreelist = cblockp; + cfreecount += CBSIZE; +} - /* - * Body deleted. - */ - return (0); +/* + * Allocate some cblocks for the cfreelist queue. 
+ */ +static void +cblock_alloc_cblocks(number) + int number; +{ + int i; + struct cblock *cbp; + + for (i = 0; i < number; ++i) { + cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT); + if (cbp == NULL) { + printf( +"clist_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n"); + cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK); + } + /* + * Freed cblocks have zero quotes and garbage elsewhere. + * Set the may-have-quote bit to force zeroing the quotes. + */ + setbit(cbp->c_quote, CBQSIZE * NBBY - 1); + cblock_free(cbp); + } + ctotcount += number; } -ndqb(a1, a2) - struct clist *a1; - int a2; +/* + * Set the cblock allocation policy for a a clist. + * Must be called in process context at spltty(). + */ +void +clist_alloc_cblocks(clistp, ccmax, ccreserved) + struct clist *clistp; + int ccmax; + int ccreserved; { + int dcbr; /* - * Body deleted. + * Allow for wasted space at the head. */ - return (0); + if (ccmax != 0) + ccmax += CBSIZE - 1; + if (ccreserved != 0) + ccreserved += CBSIZE - 1; + + clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE; + dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved; + if (dcbr >= 0) + cblock_alloc_cblocks(dcbr); + else { + if (clistp->c_cbreserved + dcbr < clistp->c_cbcount) + dcbr = clistp->c_cbcount - clistp->c_cbreserved; + cblock_free_cblocks(-dcbr); + } + clistp->c_cbreserved += dcbr; } +/* + * Free some cblocks from the cfreelist queue back to the + * system malloc pool. + */ +static void +cblock_free_cblocks(number) + int number; +{ + int i; + + for (i = 0; i < number; ++i) + free(cblock_alloc(), M_TTYS); + ctotcount -= number; +} + +/* + * Free the cblocks reserved for a clist. + * Must be called at spltty(). + */ void -ndflush(a1, a2) - struct clist *a1; - int a2; +clist_free_cblocks(clistp) + struct clist *clistp; { + if (clistp->c_cbcount != 0) + panic("freeing active clist cblocks"); + cblock_free_cblocks(clistp->c_cbreserved); + clistp->c_cbmax = 0; + clistp->c_cbreserved = 0; +} - /* - * Body deleted. 
- */ - return; +/* + * Get a character from the head of a clist. + */ +int +getc(clistp) + struct clist *clistp; +{ + int chr = -1; + int s; + struct cblock *cblockp; + + s = spltty(); + + /* If there are characters in the list, get one */ + if (clistp->c_cc) { + cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + chr = (u_char)*clistp->c_cf; + + /* + * If this char is quoted, set the flag. + */ + if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * Advance to next character. + */ + clistp->c_cf++; + clistp->c_cc--; + /* + * If we have advanced the 'first' character pointer + * past the end of this cblock, advance to the next one. + * If there are no more characters, set the first and + * last pointers to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (chr); +} + +/* + * Copy 'amount' of chars, beginning at head of clist 'clistp' to + * destination linear buffer 'dest'. Return number of characters + * actually copied. + */ +int +q_to_b(clistp, dest, amount) + struct clist *clistp; + char *dest; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + char *dest_orig = dest; + int numc; + int s; + + s = spltty(); + + while (clistp && amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + bcopy(clistp->c_cf, dest, numc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + dest += numc; + /* + * If this cblock has been emptied, advance to the next + * one. 
If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (dest - dest_orig); } -putc(a1, a2) - char a1; - struct clist *a2; +/* + * Flush 'amount' of chars, beginning at head of clist 'clistp'. + */ +void +ndflush(clistp, amount) + struct clist *clistp; + int amount; { + struct cblock *cblockp; + struct cblock *cblockn; + int numc; + int s; + + s = spltty(); + + while (amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); +} + +/* + * Add a character to the end of a clist. Return -1 is no + * more clists, or 0 for success. 
+ */ +int +putc(chr, clistp) + int chr; + struct clist *clistp; +{ + struct cblock *cblockp; + int s; + + s = spltty(); + + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("putc to a clist with no reserved cblocks\n"); + return (-1); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + if (((long)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = (cblockp - 1); + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (-1); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + } /* - * Body deleted. + * If this character is quoted, set the quote bit, if not, clear it. */ + if (chr & TTY_QUOTE) { + setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + /* + * Use one of the spare quote bits to record that something + * may be quoted. + */ + setbit(cblockp->c_quote, CBQSIZE * NBBY - 1); + } else + clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + + *clistp->c_cl++ = chr; + clistp->c_cc++; + + splx(s); return (0); } -b_to_q(a1, a2, a3) - char *a1; - int a2; - struct clist *a3; +/* + * Copy data from linear buffer to clist chain. Return the + * number of characters not copied. + */ +int +b_to_q(src, amount, clistp) + char *src; + int amount; + struct clist *clistp; { + struct cblock *cblockp; + char *firstbyte, *lastbyte; + u_char startmask, endmask; + int startbit, endbit, num_between, numc; + int s; /* - * Body deleted. + * Avoid allocating an initial cblock and then not using it. + * c_cc == 0 must imply c_cbount == 0. 
*/ - return (0); + if (amount <= 0) + return (amount); + + s = spltty(); + + /* + * If there are no cblocks assigned to this clist yet, + * then get one. + */ + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("b_to_q to a clist with no reserved cblocks.\n"); + return (amount); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + } + + while (amount) { + /* + * Get another cblock if needed. + */ + if (((long)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = cblockp - 1; + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (amount); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + + /* + * Copy a chunk of the linear buffer up to the end + * of this cblock. + */ + numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl); + bcopy(src, clistp->c_cl, numc); + + /* + * Clear quote bits if they aren't known to be clear. + * The following could probably be made into a seperate + * "bitzero()" routine, but why bother? + */ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) { + startbit = clistp->c_cl - (char *)cblockp->c_info; + endbit = startbit + numc - 1; + + firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY); + lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY); + + /* + * Calculate mask of bits to preserve in first and + * last bytes. 
+ */ + startmask = NBBY - (startbit % NBBY); + startmask = 0xff >> startmask; + endmask = (endbit % NBBY); + endmask = 0xff << (endmask + 1); + + if (firstbyte != lastbyte) { + *firstbyte &= startmask; + *lastbyte &= endmask; + + num_between = lastbyte - firstbyte - 1; + if (num_between) + bzero(firstbyte + 1, num_between); + } else { + *firstbyte &= (startmask | endmask); + } + } + + /* + * ...and update pointer for the next chunk. + */ + src += numc; + clistp->c_cl += numc; + clistp->c_cc += numc; + amount -= numc; + /* + * If we go through the loop again, it's always + * for data in the next cblock, so by adding one (cblock), + * (which makes the pointer 1 beyond the end of this + * cblock) we prepare for the assignment of 'prev' + * above. + */ + cblockp += 1; + + } + + splx(s); + return (amount); } +/* + * Get the next character in the clist. Store it at dst. Don't + * advance any clist pointers, but return a pointer to the next + * character position. + */ char * -nextc(a1, a2, a3) - struct clist *a1; - char *a2; - int *a3; +nextc(clistp, cp, dst) + struct clist *clistp; + char *cp; + int *dst; { + struct cblock *cblockp; + ++cp; /* - * Body deleted. + * See if the next character is beyond the end of + * the clist. */ - return ((char *)0); + if (clistp->c_cc && (cp != clistp->c_cl)) { + /* + * If the next character is beyond the end of this + * cblock, advance to the next cblock. + */ + if (((long)cp & CROUND) == 0) + cp = ((struct cblock *)cp - 1)->c_next->c_info; + cblockp = (struct cblock *)((long)cp & ~CROUND); + + /* + * Get the character. Set the quote flag if this character + * is quoted. + */ + *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0); + + return (cp); + } + + return (NULL); } -unputc(a1) - struct clist *a1; +/* + * "Unput" a character from a clist. 
+ */ +int +unputc(clistp) + struct clist *clistp; { + struct cblock *cblockp = 0, *cbp = 0; + int s; + int chr = -1; + + + s = spltty(); + + if (clistp->c_cc) { + --clistp->c_cc; + --clistp->c_cl; + + chr = (u_char)*clistp->c_cl; + + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + + /* + * Set quote flag if this character was quoted. + */ + if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * If all of the characters have been unput in this + * cblock, then find the previous one and free this + * one. + */ + if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) { + cbp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + + while (cbp->c_next != cblockp) + cbp = cbp->c_next; + + /* + * When the previous cblock is at the end, the 'last' + * pointer always points (invalidly) one past. + */ + clistp->c_cl = (char *)(cbp+1); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + cbp->c_next = NULL; + } + } /* - * Body deleted. + * If there are no more characters on the list, then + * free the last cblock. */ - return ((char)0); + if ((clistp->c_cc == 0) && clistp->c_cl) { + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + clistp->c_cf = clistp->c_cl = NULL; + } + + splx(s); + return (chr); } +/* + * Move characters in source clist to destination clist, + * preserving quote bits. + */ void -catq(a1, a2) - struct clist *a1, *a2; +catq(src_clistp, dest_clistp) + struct clist *src_clistp, *dest_clistp; { + int chr, s; + + s = spltty(); + /* + * If the destination clist is empty (has no cblocks atttached), + * and there are no possible complications with the resource counters, + * then we simply assign the current clist to the destination. 
+ */ + if (!dest_clistp->c_cf + && src_clistp->c_cbcount <= src_clistp->c_cbmax + && src_clistp->c_cbcount <= dest_clistp->c_cbmax) { + dest_clistp->c_cf = src_clistp->c_cf; + dest_clistp->c_cl = src_clistp->c_cl; + src_clistp->c_cf = src_clistp->c_cl = NULL; + + dest_clistp->c_cc = src_clistp->c_cc; + src_clistp->c_cc = 0; + dest_clistp->c_cbcount = src_clistp->c_cbcount; + src_clistp->c_cbcount = 0; + + splx(s); + return; + } + + splx(s); /* - * Body deleted. + * XXX This should probably be optimized to more than one + * character at a time. */ - return; + while ((chr = getc(src_clistp)) != -1) + putc(chr, dest_clistp); } diff --git a/sys/kern/tty_tb.c b/sys/kern/tty_tb.c index 05a46ba..8f4c84c 100644 --- a/sys/kern/tty_tb.c +++ b/sys/kern/tty_tb.c @@ -30,7 +30,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)tty_tb.c 8.2 (Berkeley) 1/9/95 + * @(#)tty_tb.c 8.1 (Berkeley) 6/10/93 + * $Id$ */ #include "tb.h" @@ -310,9 +311,7 @@ poldecode(tc, cp, polpos) /*ARGSUSED*/ tbioctl(tp, cmd, data, flag) struct tty *tp; - u_long cmd; caddr_t data; - int flag; { register struct tb *tbp = (struct tb *)tp->T_LINEP; diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c index d9dd1b4..be164d5 100644 --- a/sys/kern/tty_tty.c +++ b/sys/kern/tty_tty.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 1982, 1986, 1991, 1993, 1995 + * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -30,7 +30,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)tty_tty.c 8.4 (Berkeley) 5/14/95 + * @(#)tty_tty.c 8.2 (Berkeley) 9/23/93 + * $Id: tty_tty.c,v 1.15 1997/03/23 03:36:30 bde Exp $ */ /* @@ -39,15 +40,33 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/conf.h> -#include <sys/ioctl.h> #include <sys/proc.h> #include <sys/tty.h> #include <sys/vnode.h> -#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +static d_open_t cttyopen; +static d_read_t cttyread; +static d_write_t cttywrite; +static d_ioctl_t cttyioctl; +static d_select_t cttyselect; + +#define CDEV_MAJOR 1 +/* Don't make static, fdesc_vnops uses this. */ +struct cdevsw ctty_cdevsw = + { cttyopen, nullclose, cttyread, cttywrite, /*1*/ + cttyioctl, nullstop, nullreset, nodevtotty,/* tty */ + cttyselect, nommap, NULL, "ctty", NULL, -1 }; + #define cttyvp(p) ((p)->p_flag & P_CONTROLT ? (p)->p_session->s_ttyvp : NULL) /*ARGSUSED*/ +static int cttyopen(dev, flag, mode, p) dev_t dev; int flag, mode; @@ -64,7 +83,7 @@ cttyopen(dev, flag, mode, p) * Since group is tty and mode is 620 on most terminal lines * and since sessions protect terminals from processes outside * your session, this check is probably no longer necessary. - * Since it inhibits setuid root programs that later switch + * Since it inhibits setuid root programs that later switch * to another user from accessing /dev/tty, we have decided * to delete this test. 
(mckusick 5/93) */ @@ -78,6 +97,7 @@ cttyopen(dev, flag, mode, p) } /*ARGSUSED*/ +static int cttyread(dev, uio, flag) dev_t dev; struct uio *uio; @@ -96,6 +116,7 @@ cttyread(dev, uio, flag) } /*ARGSUSED*/ +static int cttywrite(dev, uio, flag) dev_t dev; struct uio *uio; @@ -114,9 +135,10 @@ cttywrite(dev, uio, flag) } /*ARGSUSED*/ +static int cttyioctl(dev, cmd, addr, flag, p) dev_t dev; - u_long cmd; + int cmd; caddr_t addr; int flag; struct proc *p; @@ -125,6 +147,8 @@ cttyioctl(dev, cmd, addr, flag, p) if (ttyvp == NULL) return (EIO); + if (cmd == TIOCSCTTY) /* don't allow controlling tty to be set */ + return EINVAL; /* to controlling tty -- infinite recursion */ if (cmd == TIOCNOTTY) { if (!SESS_LEADER(p)) { p->p_flag &= ~P_CONTROLT; @@ -136,6 +160,7 @@ cttyioctl(dev, cmd, addr, flag, p) } /*ARGSUSED*/ +static int cttyselect(dev, flag, p) dev_t dev; int flag; @@ -147,3 +172,27 @@ cttyselect(dev, flag, p) return (1); /* try operation to get EOF/failure */ return (VOP_SELECT(ttyvp, flag, FREAD|FWRITE, NOCRED, p)); } + +static ctty_devsw_installed = 0; +#ifdef DEVFS +static void *ctty_devfs_token; +#endif + +static void +ctty_drvinit(void *unused) +{ + dev_t dev; + + if( ! ctty_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&ctty_cdevsw,NULL); + ctty_devsw_installed = 1; +#ifdef DEVFS + ctty_devfs_token = + devfs_add_devswf(&ctty_cdevsw, 0, DV_CHR, 0, 0, + 0666, "tty"); +#endif + } +} + +SYSINIT(cttydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,ctty_drvinit,NULL) diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c index 1c91f2a..a2c3477 100644 --- a/sys/kern/uipc_domain.c +++ b/sys/kern/uipc_domain.c @@ -30,7 +30,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)uipc_domain.c 8.3 (Berkeley) 2/14/95 + * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93 + * $Id$ */ #include <sys/param.h> @@ -38,69 +39,120 @@ #include <sys/protosw.h> #include <sys/domain.h> #include <sys/mbuf.h> -#include <sys/time.h> #include <sys/kernel.h> #include <sys/systm.h> -#include <sys/proc.h> -#include <vm/vm.h> -#include <sys/sysctl.h> -void pffasttimo __P((void *)); -void pfslowtimo __P((void *)); +/* + * System initialization + * + * Note: domain initialization wants to take place on a per domain basis + * as a result of traversing a linker set. Most likely, each domain + * want to call a registration function rather than being handled here + * in domaininit(). Probably this will look like: + * + * SYSINIT(unique, SI_SUB_PROTO_DOMAI, SI_ORDER_ANY, domain_add, xxx) + * + * Where 'xxx' is replaced by the address of a parameter struct to be + * passed to the doamin_add() function. + */ + +static int x_save_spl; /* used by kludge*/ +static void kludge_splimp __P((void *)); +static void kludge_splx __P((void *)); +static void domaininit __P((void *)); +SYSINIT(splimp, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, kludge_splimp, &x_save_spl) +SYSINIT(domain, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, domaininit, NULL) +SYSINIT(splx, SI_SUB_PROTO_END, SI_ORDER_FIRST, kludge_splx, &x_save_spl) + +static void pffasttimo __P((void *)); +static void pfslowtimo __P((void *)); + +struct domain *domains; #define ADDDOMAIN(x) { \ - extern struct domain __CONCAT(x,domain); \ __CONCAT(x,domain.dom_next) = domains; \ domains = &__CONCAT(x,domain); \ } -void -domaininit() +extern struct linker_set domain_set; + +/* ARGSUSED*/ +static void +domaininit(dummy) + void *dummy; { - register struct domain *dp; + register struct domain *dp, **dpp; register struct protosw *pr; -#undef unix -#ifndef lint - ADDDOMAIN(unix); - ADDDOMAIN(route); -#ifdef INET - ADDDOMAIN(inet); -#endif -#ifdef NS - ADDDOMAIN(ns); -#endif -#ifdef ISO - ADDDOMAIN(iso); -#endif -#ifdef CCITT - ADDDOMAIN(ccitt); 
-#endif -#include "imp.h" -#if NIMP > 0 - ADDDOMAIN(imp); -#endif + /* + * NB - local domain is always present. + */ + ADDDOMAIN(local); + + for (dpp = (struct domain **)domain_set.ls_items; *dpp; dpp++) { + (**dpp).dom_next = domains; + domains = *dpp; + } + +/* - not in our sources +#ifdef ISDN + ADDDOMAIN(isdn); #endif +*/ for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_init) (*dp->dom_init)(); - for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++){ +#ifdef PRU_OLDSTYLE + /* See comments in uipc_socket2.c. */ + if (pr->pr_usrreqs == 0 && pr->pr_ousrreq) + pr->pr_usrreqs = &pru_oldstyle; +#endif if (pr->pr_init) (*pr->pr_init)(); + } } -if (max_linkhdr < 16) /* XXX */ -max_linkhdr = 16; + if (max_linkhdr < 16) /* XXX */ + max_linkhdr = 16; max_hdr = max_linkhdr + max_protohdr; max_datalen = MHLEN - max_hdr; - timeout(pffasttimo, NULL, 1); - timeout(pfslowtimo, NULL, 1); + timeout(pffasttimo, (void *)0, 1); + timeout(pfslowtimo, (void *)0, 1); } + +/* + * The following two operations are kludge code. Most likely, they should + * be done as a "domainpreinit()" for the first function and then rolled + * in as the last act of "domaininit()" for the second. + * + * In point of fact, it is questionable why other initialization prior + * to this does not also take place at splimp by default. 
+ */ +static void +kludge_splimp(udata) + void *udata; +{ + int *savesplp = udata; + + *savesplp = splimp(); +} + +static void +kludge_splx(udata) + void *udata; +{ + int *savesplp = udata; + + splx( *savesplp); +} + + + struct protosw * -pffindtype(family, type) - int family, type; +pffindtype(int family, int type) { register struct domain *dp; register struct protosw *pr; @@ -117,8 +169,7 @@ found: } struct protosw * -pffindproto(family, protocol, type) - int family, protocol, type; +pffindproto(int family, int protocol, int type) { register struct domain *dp; register struct protosw *pr; @@ -142,44 +193,6 @@ found: return (maybe); } -int -net_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; -{ - register struct domain *dp; - register struct protosw *pr; - int family, protocol; - - /* - * All sysctl names at this level are nonterminal; - * next two components are protocol family and protocol number, - * then at least one addition component. 
- */ - if (namelen < 3) - return (EISDIR); /* overloaded */ - family = name[0]; - protocol = name[1]; - - if (family == 0) - return (0); - for (dp = domains; dp; dp = dp->dom_next) - if (dp->dom_family == family) - goto found; - return (ENOPROTOOPT); -found: - for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) - if (pr->pr_protocol == protocol && pr->pr_sysctl) - return ((*pr->pr_sysctl)(name + 2, namelen - 2, - oldp, oldlenp, newp, newlen)); - return (ENOPROTOOPT); -} - void pfctlinput(cmd, sa) int cmd; @@ -191,10 +204,10 @@ pfctlinput(cmd, sa) for (dp = domains; dp; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_ctlinput) - (*pr->pr_ctlinput)(cmd, sa, (caddr_t)0); + (*pr->pr_ctlinput)(cmd, sa, (void *)0); } -void +static void pfslowtimo(arg) void *arg; { @@ -205,10 +218,10 @@ pfslowtimo(arg) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_slowtimo) (*pr->pr_slowtimo)(); - timeout(pfslowtimo, NULL, hz/2); + timeout(pfslowtimo, (void *)0, hz/2); } -void +static void pffasttimo(arg) void *arg; { @@ -219,5 +232,5 @@ pffasttimo(arg) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_fasttimo) (*pr->pr_fasttimo)(); - timeout(pffasttimo, NULL, hz/5); + timeout(pffasttimo, (void *)0, hz/5); } diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c index 62abfd5..7032e44 100644 --- a/sys/kern/uipc_mbuf.c +++ b/sys/kern/uipc_mbuf.c @@ -30,35 +30,81 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95 + * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 + * $Id: uipc_mbuf.c,v 1.28 1997/02/18 20:43:05 wollman Exp $ */ #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/malloc.h> -#include <sys/map.h> #define MBTYPES #include <sys/mbuf.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/syslog.h> #include <sys/domain.h> #include <sys/protosw.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> -extern vm_map_t mb_map; -struct mbuf *mbutl; -char *mclrefcnt; +static void mbinit __P((void *)); +SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL) -void -mbinit() +struct mbuf *mbutl; +char *mclrefcnt; +struct mbstat mbstat; +struct mbuf *mmbfree; +union mcluster *mclfree; +int max_linkhdr; +int max_protohdr; +int max_hdr; +int max_datalen; + +SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW, + &max_linkhdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW, + &max_protohdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW, + &max_datalen, 0, ""); +SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, ""); + +static void m_reclaim __P((void)); + +/* "number of clusters of pages" */ +#define NCL_INIT 1 + +#define NMB_INIT 16 + +/* ARGSUSED*/ +static void +mbinit(dummy) + void *dummy; { int s; + mmbfree = NULL; mclfree = NULL; + mbstat.m_msize = MSIZE; + mbstat.m_mclbytes = MCLBYTES; + mbstat.m_minclsize = MINCLSIZE; + mbstat.m_mlen = MLEN; + mbstat.m_mhlen = MHLEN; + s = splimp(); - if (m_clalloc(max(4096/CLBYTES, 1), M_DONTWAIT) == 0) + if (m_mballoc(NMB_INIT, M_DONTWAIT) == 0) goto bad; +#if MCLBYTES <= PAGE_SIZE + if (m_clalloc(NCL_INIT, M_DONTWAIT) == 0) + goto bad; +#else + /* It's OK to call contigmalloc in this context. 
*/ + if (m_clalloc(16, 0) == 0) + goto bad; +#endif splx(s); return; bad: @@ -66,6 +112,80 @@ bad: } /* + * Allocate at least nmb mbufs and place on mbuf free list. + * Must be called at splimp. + */ +/* ARGSUSED */ +int +m_mballoc(nmb, nowait) + register int nmb; + int nowait; +{ + register caddr_t p; + register int i; + int nbytes; + + /* Once we run out of map space, it will be impossible to get + * any more (nothing is ever freed back to the map) (XXX which + * is dumb). (however you are not dead as m_reclaim might + * still be able to free a substantial amount of space). + */ + if (mb_map_full) + return (0); + + nbytes = round_page(nmb * MSIZE); + p = (caddr_t)kmem_malloc(mb_map, nbytes, M_NOWAIT); + if (p == 0 && !nowait) { + mbstat.m_wait++; + p = (caddr_t)kmem_malloc(mb_map, nbytes, M_WAITOK); + } + + /* + * Either the map is now full, or this is nowait and there + * are no pages left. + */ + if (p == NULL) + return (0); + + nmb = nbytes / MSIZE; + for (i = 0; i < nmb; i++) { + ((struct mbuf *)p)->m_next = mmbfree; + mmbfree = (struct mbuf *)p; + p += MSIZE; + } + mbstat.m_mbufs += nmb; + return (1); +} + +#if MCLBYTES > PAGE_SIZE +static int i_want_my_mcl; + +static void +kproc_mclalloc(void) +{ + int status; + + while (1) { + tsleep(&i_want_my_mcl, PVM, "mclalloc", 0); + + for (; i_want_my_mcl; i_want_my_mcl--) { + if (m_clalloc(1, 0) == 0) + printf("m_clalloc failed even in process context!\n"); + } + } +} + +static struct proc *mclallocproc; +static struct kproc_desc mclalloc_kp = { + "mclalloc", + kproc_mclalloc, + &mclallocproc +}; +SYSINIT_KT(mclallocproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, + &mclalloc_kp); +#endif + +/* * Allocate some number of mbuf clusters * and place on cluster free list. * Must be called at splimp. 
@@ -76,21 +196,45 @@ m_clalloc(ncl, nowait) register int ncl; int nowait; { - static int logged; register caddr_t p; register int i; int npg; - npg = ncl * CLSIZE; - p = (caddr_t)kmem_malloc(mb_map, ctob(npg), !nowait); + /* + * Once we run out of map space, it will be impossible + * to get any more (nothing is ever freed back to the + * map). + */ + if (mb_map_full) { + mbstat.m_drops++; + return (0); + } + +#if MCLBYTES > PAGE_SIZE + if (nowait) { + i_want_my_mcl += ncl; + wakeup(&i_want_my_mcl); + mbstat.m_wait++; + p = 0; + } else { + p = contigmalloc1(MCLBYTES * ncl, M_DEVBUF, M_WAITOK, 0ul, + ~0ul, PAGE_SIZE, 0, mb_map); + } +#else + npg = ncl; + p = (caddr_t)kmem_malloc(mb_map, ctob(npg), + nowait ? M_NOWAIT : M_WAITOK); + ncl = ncl * PAGE_SIZE / MCLBYTES; +#endif + /* + * Either the map is now full, or this is nowait and there + * are no pages left. + */ if (p == NULL) { - if (logged == 0) { - logged++; - log(LOG_ERR, "mb_map full\n"); - } + mbstat.m_drops++; return (0); } - ncl = ncl * CLBYTES / MCLBYTES; + for (i = 0; i < ncl; i++) { ((union mcluster *)p)->mcl_next = mclfree; mclfree = (union mcluster *)p; @@ -115,6 +259,10 @@ m_retry(i, t) #define m_retry(i, t) (struct mbuf *)0 MGET(m, i, t); #undef m_retry + if (m != NULL) + mbstat.m_wait++; + else + mbstat.m_drops++; return (m); } @@ -131,10 +279,14 @@ m_retryhdr(i, t) #define m_retryhdr(i, t) (struct mbuf *)0 MGETHDR(m, i, t); #undef m_retryhdr + if (m != NULL) + mbstat.m_wait++; + else + mbstat.m_drops++; return (m); } -void +static void m_reclaim() { register struct domain *dp; @@ -207,7 +359,8 @@ m_freem(m) return; do { MFREE(m, n); - } while (m = n); + m = n; + } while (m); } /* @@ -248,7 +401,7 @@ m_prepend(m, len, how) * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. 
*/ -int MCFail; +#define MCFail (mbstat.m_mcfail) struct mbuf * m_copym(m, off0, len, wait) @@ -296,7 +449,11 @@ m_copym(m, off0, len, wait) n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { n->m_data = m->m_data + off; - mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + if(!m->m_ext.ext_ref) + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + else + (*(m->m_ext.ext_ref))(m->m_ext.ext_buf, + m->m_ext.ext_size); n->m_ext = m->m_ext; n->m_flags |= M_EXT; } else @@ -318,6 +475,61 @@ nospace: } /* + * Copy an entire packet, including header (which must be present). + * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. + */ +struct mbuf * +m_copypacket(m, how) + struct mbuf *m; + int how; +{ + struct mbuf *top, *n, *o; + + MGET(n, how, m->m_type); + top = n; + if (!n) + goto nospace; + + M_COPY_PKTHDR(n, m); + n->m_len = m->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data; + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + } else { + bcopy(mtod(m, char *), mtod(n, char *), n->m_len); + } + + m = m->m_next; + while (m) { + MGET(o, how, m->m_type); + if (!o) + goto nospace; + + n->m_next = o; + n = n->m_next; + + n->m_len = m->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data; + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + } else { + bcopy(mtod(m, char *), mtod(n, char *), n->m_len); + } + + m = m->m_next; + } + return top; +nospace: + m_freem(top); + MCFail++; + return 0; +} + +/* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. */ @@ -447,8 +659,8 @@ m_adj(mp, req_len) } count -= m->m_len; } - while (m = m->m_next) - m->m_len = 0; + while (m->m_next) + (m = m->m_next) ->m_len = 0; } } @@ -460,7 +672,7 @@ m_adj(mp, req_len) * If there is room, it will add up to max_protohdr-len extra bytes to the * contiguous region in an attempt to avoid being called next time. 
*/ -int MPFail; +#define MPFail (mbstat.m_mpfail) struct mbuf * m_pullup(n, len) @@ -573,7 +785,11 @@ extpacket: if (m->m_flags & M_EXT) { n->m_flags |= M_EXT; n->m_ext = m->m_ext; - mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + if(!m->m_ext.ext_ref) + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + else + (*(m->m_ext.ext_ref))(m->m_ext.ext_buf, + m->m_ext.ext_size); m->m_ext.ext_size = 0; /* For Accounting XXXXXX danger */ n->m_data = m->m_data + len; } else { @@ -593,7 +809,7 @@ m_devget(buf, totlen, off0, ifp, copy) char *buf; int totlen, off0; struct ifnet *ifp; - void (*copy)(); + void (*copy) __P((char *from, caddr_t to, u_int len)); { register struct mbuf *m; struct mbuf *top = 0, **mp = ⊤ @@ -604,12 +820,8 @@ m_devget(buf, totlen, off0, ifp, copy) cp = buf; epkt = cp + totlen; if (off) { - /* - * If 'off' is non-zero, packet is trailer-encapsulated, - * so we have to skip the type and length fields. - */ - cp += off + 2 * sizeof(u_int16_t); - totlen -= 2 * sizeof(u_int16_t); + cp += off + 2 * sizeof(u_short); + totlen -= 2 * sizeof(u_short); } MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == 0) @@ -658,3 +870,56 @@ m_devget(buf, totlen, off0, ifp, copy) } return (top); } + +/* + * Copy data from a buffer back into the indicated mbuf chain, + * starting "off" bytes from the beginning, extending the mbuf + * chain if necessary. 
+ */ +void +m_copyback(m0, off, len, cp) + struct mbuf *m0; + register int off; + register int len; + caddr_t cp; +{ + register int mlen; + register struct mbuf *m = m0, *n; + int totlen = 0; + + if (m0 == 0) + return; + while (off > (mlen = m->m_len)) { + off -= mlen; + totlen += mlen; + if (m->m_next == 0) { + n = m_getclr(M_DONTWAIT, m->m_type); + if (n == 0) + goto out; + n->m_len = min(MLEN, len + off); + m->m_next = n; + } + m = m->m_next; + } + while (len > 0) { + mlen = min (m->m_len - off, len); + bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen); + cp += mlen; + len -= mlen; + mlen += off; + off = 0; + totlen += mlen; + if (len == 0) + break; + if (m->m_next == 0) { + n = m_get(M_DONTWAIT, m->m_type); + if (n == 0) + break; + n->m_len = min(MLEN, len); + m->m_next = n; + } + m = m->m_next; + } +out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) + m->m_pkthdr.len = totlen; +} diff --git a/sys/kern/uipc_proto.c b/sys/kern/uipc_proto.c index e89a84c..f652ce3 100644 --- a/sys/kern/uipc_proto.c +++ b/sys/kern/uipc_proto.c @@ -30,43 +30,47 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)uipc_proto.c 8.2 (Berkeley) 2/14/95 + * @(#)uipc_proto.c 8.1 (Berkeley) 6/10/93 + * $Id: uipc_proto.c,v 1.9 1997/02/22 09:39:27 peter Exp $ */ #include <sys/param.h> -#include <sys/socket.h> -#include <sys/protosw.h> +#include <sys/kernel.h> #include <sys/domain.h> #include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/un.h> + +#include <net/raw_cb.h> /* - * Definitions of protocols supported in the UNIX domain. + * Definitions of protocols supported in the LOCAL domain. 
*/ -int uipc_usrreq(), raw_usrreq(); -void raw_init(), raw_input(), raw_ctlinput(); -extern struct domain unixdomain; /* or at least forward */ - -struct protosw unixsw[] = { -{ SOCK_STREAM, &unixdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, +static struct protosw localsw[] = { +{ SOCK_STREAM, &localdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, 0, 0, 0, 0, uipc_usrreq, 0, 0, 0, 0, }, -{ SOCK_DGRAM, &unixdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS, +{ SOCK_DGRAM, &localdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS, 0, 0, 0, 0, uipc_usrreq, 0, 0, 0, 0, }, { 0, 0, 0, 0, - raw_input, 0, raw_ctlinput, 0, + 0, 0, raw_ctlinput, 0, raw_usrreq, raw_init, 0, 0, 0, } }; -int unp_externalize(), unp_dispose(); +struct domain localdomain = + { AF_LOCAL, "local", 0, unp_externalize, unp_dispose, + localsw, &localsw[sizeof(localsw)/sizeof(localsw[0])] }; -struct domain unixdomain = - { AF_UNIX, "unix", 0, unp_externalize, unp_dispose, - unixsw, &unixsw[sizeof(unixsw)/sizeof(unixsw[0])] }; +SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain"); +SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM"); +SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM"); diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c new file mode 100644 index 0000000..e19db0c --- /dev/null +++ b/sys/kern/uipc_sockbuf.c @@ -0,0 +1,1018 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + * $Id: uipc_socket2.c,v 1.21 1997/02/19 19:15:43 wollman Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/file.h> +#include <sys/buf.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> + +/* + * Primitive routines for operating on sockets and socket buffers + */ + +u_long sb_max = SB_MAX; /* XXX should be static */ + +static u_long sb_efficiency = 8; /* parameter for sbreserve() */ + +/* + * Procedures to manipulate state flags of socket + * and do appropriate wakeups. Normal sequence from the + * active (originating) side is that soisconnecting() is + * called during processing of connect() call, + * resulting in an eventual call to soisconnected() if/when the + * connection is established. When the connection is torn down + * soisdisconnecting() is called during processing of disconnect() call, + * and soisdisconnected() is called when the connection to the peer + * is totally severed. The semantics of these routines are such that + * connectionless protocols can call soisconnected() and soisdisconnected() + * only, bypassing the in-progress calls when setting up a ``connection'' + * takes no time. + * + * From the passive side, a socket is created with + * two queues of sockets: so_q0 for connections in progress + * and so_q for connections already made and awaiting user acceptance. + * As a protocol is preparing incoming connections, it creates a socket + * structure queued on so_q0 by calling sonewconn(). When the connection + * is established, soisconnected() is called, and transfers the + * socket structure to so_q, making it available to accept(). + * + * If a socket is closed with sockets on either + * so_q0 or so_q, these sockets are dropped. 
+ * + * If higher level protocols are implemented in + * the kernel, the wakeups done here will sometimes + * cause software-interrupt process scheduling. + */ + +void +soisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= SS_ISCONNECTING; +} + +void +soisconnected(so) + register struct socket *so; +{ + register struct socket *head = so->so_head; + + so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); + so->so_state |= SS_ISCONNECTED; + if (head && (so->so_state & SS_INCOMP)) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_state &= ~SS_INCOMP; + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + sorwakeup(head); + wakeup((caddr_t)&head->so_timeo); + } else { + wakeup((caddr_t)&so->so_timeo); + sorwakeup(so); + sowwakeup(so); + } +} + +void +soisdisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~SS_ISCONNECTING; + so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +void +soisdisconnected(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +/* + * Return a random connection that hasn't been serviced yet and + * is eligible for discard. There is a one in qlen chance that + * we will return a null, saying that there are no dropable + * requests. In this case, the protocol specific code should drop + * the new request. This insures fairness. + * + * This may be used in conjunction with protocol specific queue + * congestion routines. 
+ */ +struct socket * +sodropablereq(head) + register struct socket *head; +{ + register struct socket *so; + unsigned int i, j, qlen; + + static int rnd; + static long old_mono_secs; + static unsigned int cur_cnt, old_cnt; + + if ((i = (mono_time.tv_sec - old_mono_secs)) != 0) { + old_mono_secs = mono_time.tv_sec; + old_cnt = cur_cnt / i; + cur_cnt = 0; + } + + so = TAILQ_FIRST(&head->so_incomp); + if (!so) + return (so); + + qlen = head->so_incqlen; + if (++cur_cnt > qlen || old_cnt > qlen) { + rnd = (314159 * rnd + 66329) & 0xffff; + j = ((qlen + 1) * rnd) >> 16; + + while (j-- && so) + so = TAILQ_NEXT(so, so_list); + } + + return (so); +} + +/* + * When an attempt at a new connection is noted on a socket + * which accepts connections, sonewconn is called. If the + * connection is possible (subject to space constraints, etc.) + * then we allocate a new structure, propoerly linked into the + * data structure of the original socket, and return this. + * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. + * + * Currently, sonewconn() is defined as sonewconn1() in socketvar.h + * to catch calls that are missing the (new) second parameter. 
+ */ +struct socket * +sonewconn1(head, connstatus) + register struct socket *head; + int connstatus; +{ + register struct socket *so; + + if (head->so_qlen > 3 * head->so_qlimit / 2) + return ((struct socket *)0); + MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_DONTWAIT); + if (so == NULL) + return ((struct socket *)0); + bzero((caddr_t)so, sizeof(*so)); + so->so_head = head; + so->so_type = head->so_type; + so->so_options = head->so_options &~ SO_ACCEPTCONN; + so->so_linger = head->so_linger; + so->so_state = head->so_state | SS_NOFDREF; + so->so_proto = head->so_proto; + so->so_timeo = head->so_timeo; + so->so_pgid = head->so_pgid; + (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat); + + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0)) { + (void) free((caddr_t)so, M_SOCKET); + return ((struct socket *)0); + } + + if (connstatus) { + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + } else { + TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); + so->so_state |= SS_INCOMP; + head->so_incqlen++; + } + head->so_qlen++; + if (connstatus) { + sorwakeup(head); + wakeup((caddr_t)&head->so_timeo); + so->so_state |= connstatus; + } + return (so); +} + +/* + * Socantsendmore indicates that no more data will be sent on the + * socket; it would normally be applied to a socket when the user + * informs the system that no more data is to be sent, by the protocol + * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data + * will be received, and will normally be applied to the socket by a + * protocol when it detects that the peer will send no more data. + * Data queued for reading in the socket may yet be read. + */ + +void +socantsendmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTSENDMORE; + sowwakeup(so); +} + +void +socantrcvmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTRCVMORE; + sorwakeup(so); +} + +/* + * Wait for data to arrive at/drain from a socket buffer. 
+ */ +int +sbwait(sb) + struct sockbuf *sb; +{ + + sb->sb_flags |= SB_WAIT; + return (tsleep((caddr_t)&sb->sb_cc, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", + sb->sb_timeo)); +} + +/* + * Lock a sockbuf already known to be locked; + * return any error returned from sleep (EINTR). + */ +int +sb_lock(sb) + register struct sockbuf *sb; +{ + int error; + + while (sb->sb_flags & SB_LOCK) { + sb->sb_flags |= SB_WANT; + error = tsleep((caddr_t)&sb->sb_flags, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, + "sblock", 0); + if (error) + return (error); + } + sb->sb_flags |= SB_LOCK; + return (0); +} + +/* + * Wakeup processes waiting on a socket buffer. + * Do asynchronous notification via SIGIO + * if the socket has the SS_ASYNC flag set. + */ +void +sowakeup(so, sb) + register struct socket *so; + register struct sockbuf *sb; +{ + struct proc *p; + + selwakeup(&sb->sb_sel); + sb->sb_flags &= ~SB_SEL; + if (sb->sb_flags & SB_WAIT) { + sb->sb_flags &= ~SB_WAIT; + wakeup((caddr_t)&sb->sb_cc); + } + if (so->so_state & SS_ASYNC) { + if (so->so_pgid < 0) + gsignal(-so->so_pgid, SIGIO); + else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0) + psignal(p, SIGIO); + } +} + +/* + * Socket buffer (struct sockbuf) utility routines. + * + * Each socket contains two socket buffers: one for sending data and + * one for receiving data. Each buffer contains a queue of mbufs, + * information about the number of mbufs and amount of data in the + * queue, and other fields allowing select() statements and notification + * on data availability to be implemented. + * + * Data stored in a socket buffer is maintained as a list of records. + * Each record is a list of mbufs chained together with the m_next + * field. Records are chained together with the m_nextpkt field. The upper + * level routine soreceive() expects the following conventions to be + * observed when placing information in the receive buffer: + * + * 1. 
If the protocol requires each message be preceded by the sender's + * name, then a record containing that name must be present before + * any associated data (mbuf's must be of type MT_SONAME). + * 2. If the protocol supports the exchange of ``access rights'' (really + * just additional data associated with the message), and there are + * ``rights'' to be received, then a record containing this data + * should be present (mbuf's must be of type MT_RIGHTS). + * 3. If a name or rights record exists, then it must be followed by + * a data record, perhaps of zero length. + * + * Before using a new socket structure it is first necessary to reserve + * buffer space to the socket, by calling sbreserve(). This should commit + * some of the available buffer space in the system buffer pool for the + * socket (currently, it does nothing but enforce limits). The space + * should be released by calling sbrelease() when the socket is destroyed. + */ + +int +soreserve(so, sndcc, rcvcc) + register struct socket *so; + u_long sndcc, rcvcc; +{ + + if (sbreserve(&so->so_snd, sndcc) == 0) + goto bad; + if (sbreserve(&so->so_rcv, rcvcc) == 0) + goto bad2; + if (so->so_rcv.sb_lowat == 0) + so->so_rcv.sb_lowat = 1; + if (so->so_snd.sb_lowat == 0) + so->so_snd.sb_lowat = MCLBYTES; + if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) + so->so_snd.sb_lowat = so->so_snd.sb_hiwat; + return (0); +bad2: + sbrelease(&so->so_snd); +bad: + return (ENOBUFS); +} + +/* + * Allot mbufs to a sockbuf. + * Attempt to scale mbmax so that mbcnt doesn't become limiting + * if buffering efficiency is near the normal case. + */ +int +sbreserve(sb, cc) + struct sockbuf *sb; + u_long cc; +{ + if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES)) + return (0); + sb->sb_hiwat = cc; + sb->sb_mbmax = min(cc * sb_efficiency, sb_max); + if (sb->sb_lowat > sb->sb_hiwat) + sb->sb_lowat = sb->sb_hiwat; + return (1); +} + +/* + * Free mbufs held by a socket, and reserved mbuf space. 
+ */ +void +sbrelease(sb) + struct sockbuf *sb; +{ + + sbflush(sb); + sb->sb_hiwat = sb->sb_mbmax = 0; +} + +/* + * Routines to add and remove + * data from an mbuf queue. + * + * The routines sbappend() or sbappendrecord() are normally called to + * append new mbufs to a socket buffer, after checking that adequate + * space is available, comparing the function sbspace() with the amount + * of data to be added. sbappendrecord() differs from sbappend() in + * that data supplied is treated as the beginning of a new record. + * To place a sender's address, optional access rights, and data in a + * socket receive buffer, sbappendaddr() should be used. To place + * access rights and data in a socket receive buffer, sbappendrights() + * should be used. In either case, the new data begins a new record. + * Note that unlike sbappend() and sbappendrecord(), these routines check + * for the caller that there will be enough space to store the data. + * Each fails if there is not enough space, or if it cannot find mbufs + * to store additional information in. + * + * Reliable protocols may use the socket send buffer to hold data + * awaiting acknowledgement. Data is normally copied from a socket + * send buffer in a protocol with m_copy for output to a peer, + * and then removing the data from the socket buffer with sbdrop() + * or sbdroprecord() when the data is acknowledged by the peer. + */ + +/* + * Append mbuf chain m to the last record in the + * socket buffer sb. The additional space associated + * the mbuf chain is recorded in sb. Empty mbufs are + * discarded and mbufs are compacted where possible. + */ +void +sbappend(sb, m) + struct sockbuf *sb; + struct mbuf *m; +{ + register struct mbuf *n; + + if (m == 0) + return; + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + do { + if (n->m_flags & M_EOR) { + sbappendrecord(sb, m); /* XXXXXX!!!! 
*/ + return; + } + } while (n->m_next && (n = n->m_next)); + } + sbcompress(sb, m, n); +} + +#ifdef SOCKBUF_DEBUG +void +sbcheck(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + register int len = 0, mbcnt = 0; + + for (m = sb->sb_mb; m; m = m->m_next) { + len += m->m_len; + mbcnt += MSIZE; + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ + mbcnt += m->m_ext.ext_size; + if (m->m_nextpkt) + panic("sbcheck nextpkt"); + } + if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { + printf("cc %d != %d || mbcnt %d != %d\n", len, sb->sb_cc, + mbcnt, sb->sb_mbcnt); + panic("sbcheck"); + } +} +#endif + +/* + * As above, except the mbuf chain + * begins a new record. + */ +void +sbappendrecord(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + + if (m0 == 0) + return; + m = sb->sb_mb; + if (m) + while (m->m_nextpkt) + m = m->m_nextpkt; + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + if (m) + m->m_nextpkt = m0; + else + sb->sb_mb = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * As above except that OOB data + * is inserted at the beginning of the sockbuf, + * but after any other OOB data. + */ +void +sbinsertoob(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + register struct mbuf **mp; + + if (m0 == 0) + return; + for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { + m = *mp; + again: + switch (m->m_type) { + + case MT_OOBDATA: + continue; /* WANT next train */ + + case MT_CONTROL: + m = m->m_next; + if (m) + goto again; /* inspect THIS train further */ + } + break; + } + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. 
+ */ + sballoc(sb, m0); + m0->m_nextpkt = *mp; + *mp = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * Append address and data, and optionally, control (ancillary) data + * to the receive queue of a socket. If present, + * m0 must include a packet header with total length. + * Returns 0 if no space in sockbuf or insufficient mbufs. + */ +int +sbappendaddr(sb, asa, m0, control) + register struct sockbuf *sb; + struct sockaddr *asa; + struct mbuf *m0, *control; +{ + register struct mbuf *m, *n; + int space = asa->sa_len; + +if (m0 && (m0->m_flags & M_PKTHDR) == 0) +panic("sbappendaddr"); + if (m0) + space += m0->m_pkthdr.len; + for (n = control; n; n = n->m_next) { + space += n->m_len; + if (n->m_next == 0) /* keep pointer to last control buf */ + break; + } + if (space > sbspace(sb)) + return (0); + if (asa->sa_len > MLEN) + return (0); + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == 0) + return (0); + m->m_len = asa->sa_len; + bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len); + if (n) + n->m_next = m0; /* concatenate data to control */ + else + control = m0; + m->m_next = control; + for (n = m; n; n = n->m_next) + sballoc(sb, n); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = m; + } else + sb->sb_mb = m; + return (1); +} + +int +sbappendcontrol(sb, m0, control) + struct sockbuf *sb; + struct mbuf *control, *m0; +{ + register struct mbuf *m, *n; + int space = 0; + + if (control == 0) + panic("sbappendcontrol"); + for (m = control; ; m = m->m_next) { + space += m->m_len; + if (m->m_next == 0) + break; + } + n = m; /* save pointer to last control buffer */ + for (m = m0; m; m = m->m_next) + space += m->m_len; + if (space > sbspace(sb)) + return (0); + n->m_next = m0; /* concatenate data to control */ + for (m = control; m; m = m->m_next) + sballoc(sb, m); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = 
n->m_nextpkt; + n->m_nextpkt = control; + } else + sb->sb_mb = control; + return (1); +} + +/* + * Compress mbuf chain m into the socket + * buffer sb following mbuf n. If n + * is null, the buffer is presumed empty. + */ +void +sbcompress(sb, m, n) + register struct sockbuf *sb; + register struct mbuf *m, *n; +{ + register int eor = 0; + register struct mbuf *o; + + while (m) { + eor |= m->m_flags & M_EOR; + if (m->m_len == 0 && + (eor == 0 || + (((o = m->m_next) || (o = n)) && + o->m_type == m->m_type))) { + m = m_free(m); + continue; + } + if (n && (n->m_flags & (M_EXT | M_EOR)) == 0 && + (n->m_data + n->m_len + m->m_len) < &n->m_dat[MLEN] && + n->m_type == m->m_type) { + bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, + (unsigned)m->m_len); + n->m_len += m->m_len; + sb->sb_cc += m->m_len; + m = m_free(m); + continue; + } + if (n) + n->m_next = m; + else + sb->sb_mb = m; + sballoc(sb, m); + n = m; + m->m_flags &= ~M_EOR; + m = m->m_next; + n->m_next = 0; + } + if (eor) { + if (n) + n->m_flags |= eor; + else + printf("semi-panic: sbcompress\n"); + } +} + +/* + * Free all mbufs in a sockbuf. + * Check that all resources are reclaimed. + */ +void +sbflush(sb) + register struct sockbuf *sb; +{ + + if (sb->sb_flags & SB_LOCK) + panic("sbflush"); + while (sb->sb_mbcnt) + sbdrop(sb, (int)sb->sb_cc); + if (sb->sb_cc || sb->sb_mb) + panic("sbflush 2"); +} + +/* + * Drop data from (the front of) a sockbuf. + */ +void +sbdrop(sb, len) + register struct sockbuf *sb; + register int len; +{ + register struct mbuf *m, *mn; + struct mbuf *next; + + next = (m = sb->sb_mb) ? 
m->m_nextpkt : 0; + while (len > 0) { + if (m == 0) { + if (next == 0) + panic("sbdrop"); + m = next; + next = m->m_nextpkt; + continue; + } + if (m->m_len > len) { + m->m_len -= len; + m->m_data += len; + sb->sb_cc -= len; + break; + } + len -= m->m_len; + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + while (m && m->m_len == 0) { + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + if (m) { + sb->sb_mb = m; + m->m_nextpkt = next; + } else + sb->sb_mb = next; +} + +/* + * Drop a record off the front of a sockbuf + * and move the next record to the front. + */ +void +sbdroprecord(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m, *mn; + + m = sb->sb_mb; + if (m) { + sb->sb_mb = m->m_nextpkt; + do { + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } while (m); + } +} + +/* + * Create a "control" mbuf containing the specified data + * with the specified type for presentation on a socket buffer. + */ +struct mbuf * +sbcreatecontrol(p, size, type, level) + caddr_t p; + register int size; + int type, level; +{ + register struct cmsghdr *cp; + struct mbuf *m; + + if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + return ((struct mbuf *) NULL); + cp = mtod(m, struct cmsghdr *); + /* XXX check size? */ + (void)memcpy(CMSG_DATA(cp), p, size); + size += sizeof(*cp); + m->m_len = size; + cp->cmsg_len = size; + cp->cmsg_level = level; + cp->cmsg_type = type; + return (m); +} + +#ifdef PRU_OLDSTYLE +/* + * The following routines mediate between the old-style `pr_usrreq' + * protocol implementations and the new-style `struct pr_usrreqs' + * calling convention. 
+ */ + +/* syntactic sugar */ +#define nomb (struct mbuf *)0 + +static int +old_abort(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_ABORT, nomb, nomb, nomb); +} + +static int +old_accept(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_ACCEPT, nomb, nam, nomb); +} + +static int +old_attach(struct socket *so, int proto) +{ + return so->so_proto->pr_ousrreq(so, PRU_ATTACH, nomb, + (struct mbuf *)proto, /* XXX */ + nomb); +} + +static int +old_bind(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_BIND, nomb, nam, nomb); +} + +static int +old_connect(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_CONNECT, nomb, nam, nomb); +} + +static int +old_connect2(struct socket *so1, struct socket *so2) +{ + return so1->so_proto->pr_ousrreq(so1, PRU_CONNECT2, nomb, + (struct mbuf *)so2, nomb); +} + +static int +old_control(struct socket *so, int cmd, caddr_t data, struct ifnet *ifp) +{ + return so->so_proto->pr_ousrreq(so, PRU_CONTROL, (struct mbuf *)cmd, + (struct mbuf *)data, + (struct mbuf *)ifp); +} + +static int +old_detach(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_DETACH, nomb, nomb, nomb); +} + +static int +old_disconnect(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_DISCONNECT, nomb, nomb, nomb); +} + +static int +old_listen(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_LISTEN, nomb, nomb, nomb); +} + +static int +old_peeraddr(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_PEERADDR, nomb, nam, nomb); +} + +static int +old_rcvd(struct socket *so, int flags) +{ + return so->so_proto->pr_ousrreq(so, PRU_RCVD, nomb, + (struct mbuf *)flags, /* XXX */ + nomb); +} + +static int +old_rcvoob(struct socket *so, struct mbuf *m, int flags) +{ + return so->so_proto->pr_ousrreq(so, PRU_RCVOOB, m, + (struct mbuf *)flags, /* XXX */ + nomb); +} + +static int +old_send(struct 
socket *so, int flags, struct mbuf *m, struct mbuf *addr, + struct mbuf *control) +{ + int req; + + if (flags & PRUS_OOB) { + req = PRU_SENDOOB; + } else if(flags & PRUS_EOF) { + req = PRU_SEND_EOF; + } else { + req = PRU_SEND; + } + return so->so_proto->pr_ousrreq(so, req, m, addr, control); +} + +static int +old_sense(struct socket *so, struct stat *sb) +{ + return so->so_proto->pr_ousrreq(so, PRU_SENSE, (struct mbuf *)sb, + nomb, nomb); +} + +static int +old_shutdown(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_SHUTDOWN, nomb, nomb, nomb); +} + +static int +old_sockaddr(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_SOCKADDR, nomb, nam, nomb); +} + +struct pr_usrreqs pru_oldstyle = { + old_abort, old_accept, old_attach, old_bind, old_connect, + old_connect2, old_control, old_detach, old_disconnect, + old_listen, old_peeraddr, old_rcvd, old_rcvoob, old_send, + old_sense, old_shutdown, old_sockaddr +}; + +#endif /* PRU_OLDSTYLE */ + +/* + * Some routines that return EOPNOTSUPP for entry points that are not + * supported by a protocol. Fill in as needed. + */ +int +pru_accept_notsupp(struct socket *so, struct mbuf *nam) +{ + return EOPNOTSUPP; +} + +int +pru_connect2_notsupp(struct socket *so1, struct socket *so2) +{ + return EOPNOTSUPP; +} + +int +pru_listen_notsupp(struct socket *so) +{ + return EOPNOTSUPP; +} + +int +pru_rcvd_notsupp(struct socket *so, int flags) +{ + return EOPNOTSUPP; +} + +int +pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) +{ + return EOPNOTSUPP; +} + +/* + * This isn't really a ``null'' operation, but it's the default one + * and doesn't do anything destructive. + */ +int +pru_sense_null(struct socket *so, struct stat *sb) +{ + sb->st_blksize = so->so_snd.sb_hiwat; + return 0; +} + +/* + * Here is the definition of some of the basic objects in the kern.ipc + * branch of the MIB. 
+ */ +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); + +/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ +static int dummy; +SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); + +SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, &sb_max, 0, "") +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, + &sb_efficiency, 0, ""); diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index a9c5453..9f70207 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -30,13 +30,15 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95 + * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 + * $Id: uipc_socket.c,v 1.24 1997/02/24 20:30:56 wollman Exp $ */ #include <sys/param.h> +#include <sys/queue.h> #include <sys/systm.h> #include <sys/proc.h> -#include <sys/file.h> +#include <sys/fcntl.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/domain.h> @@ -45,6 +47,12 @@ #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> + +static int somaxconn = SOMAXCONN; +SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, + 0, ""); /* * Socket operation routines. 
@@ -55,13 +63,13 @@ */ /*ARGSUSED*/ int -socreate(dom, aso, type, proto) +socreate(dom, aso, type, proto, p) int dom; struct socket **aso; register int type; int proto; + struct proc *p; { - struct proc *p = curproc; /* XXX */ register struct protosw *prp; register struct socket *so; register int error; @@ -70,18 +78,19 @@ socreate(dom, aso, type, proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); - if (prp == 0 || prp->pr_usrreq == 0) + if (prp == 0 || prp->pr_usrreqs == 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT); bzero((caddr_t)so, sizeof(*so)); + TAILQ_INIT(&so->so_incomp); + TAILQ_INIT(&so->so_comp); so->so_type = type; if (p->p_ucred->cr_uid == 0) so->so_state = SS_PRIV; so->so_proto = prp; - error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0, - (struct mbuf *)(long)proto, (struct mbuf *)0); + error = (*prp->pr_usrreqs->pru_attach)(so, proto); if (error) { so->so_state |= SS_NOFDREF; sofree(so); @@ -99,9 +108,7 @@ sobind(so, nam) int s = splnet(); int error; - error = - (*so->so_proto->pr_usrreq)(so, PRU_BIND, - (struct mbuf *)0, nam, (struct mbuf *)0); + error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam); splx(s); return (error); } @@ -113,33 +120,40 @@ solisten(so, backlog) { int s = splnet(), error; - error = - (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); + error = (*so->so_proto->pr_usrreqs->pru_listen)(so); if (error) { splx(s); return (error); } - if (so->so_q == 0) + if (so->so_comp.tqh_first == NULL) so->so_options |= SO_ACCEPTCONN; - if (backlog < 0) - backlog = 0; - so->so_qlimit = min(backlog, SOMAXCONN); + if (backlog < 0 || backlog > somaxconn) + backlog = somaxconn; + so->so_qlimit = backlog; splx(s); return (0); } -int +void sofree(so) register struct socket *so; { + struct socket *head = so->so_head; if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) return; - 
if (so->so_head) { - if (!soqremque(so, 0) && !soqremque(so, 1)) - panic("sofree dq"); - so->so_head = 0; + if (head != NULL) { + if (so->so_state & SS_INCOMP) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + } else if (so->so_state & SS_COMP) { + TAILQ_REMOVE(&head->so_comp, so, so_list); + } else { + panic("sofree: not queued"); + } + head->so_qlen--; + so->so_state &= ~(SS_INCOMP|SS_COMP); + so->so_head = NULL; } sbrelease(&so->so_snd); sorflush(so); @@ -159,10 +173,16 @@ soclose(so) int error = 0; if (so->so_options & SO_ACCEPTCONN) { - while (so->so_q0) - (void) soabort(so->so_q0); - while (so->so_q) - (void) soabort(so->so_q); + struct socket *sp, *sonext; + + for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) { + sonext = sp->so_list.tqe_next; + (void) soabort(sp); + } + for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) { + sonext = sp->so_list.tqe_next; + (void) soabort(sp); + } } if (so->so_pcb == 0) goto discard; @@ -176,17 +196,17 @@ soclose(so) if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; - while (so->so_state & SS_ISCONNECTED) - if (error = tsleep((caddr_t)&so->so_timeo, - PSOCK | PCATCH, netcls, so->so_linger * hz)) + while (so->so_state & SS_ISCONNECTED) { + error = tsleep((caddr_t)&so->so_timeo, + PSOCK | PCATCH, "soclos", so->so_linger); + if (error) break; + } } } drop: if (so->so_pcb) { - int error2 = - (*so->so_proto->pr_usrreq)(so, PRU_DETACH, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); + int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); if (error == 0) error = error2; } @@ -207,9 +227,7 @@ soabort(so) struct socket *so; { - return ( - (*so->so_proto->pr_usrreq)(so, PRU_ABORT, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)); + return (*so->so_proto->pr_usrreqs->pru_abort)(so); } int @@ -223,8 +241,7 @@ soaccept(so, nam) if ((so->so_state & SS_NOFDREF) == 0) panic("soaccept: !NOFDREF"); so->so_state &= ~SS_NOFDREF; - error = 
(*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, - (struct mbuf *)0, nam, (struct mbuf *)0); + error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); splx(s); return (error); } @@ -251,8 +268,7 @@ soconnect(so, nam) (error = sodisconnect(so)))) error = EISCONN; else - error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, - (struct mbuf *)0, nam, (struct mbuf *)0); + error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam); splx(s); return (error); } @@ -265,8 +281,7 @@ soconnect2(so1, so2) int s = splnet(); int error; - error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, - (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0); + error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); splx(s); return (error); } @@ -286,8 +301,7 @@ sodisconnect(so) error = EALREADY; goto bad; } - error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); + error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); bad: splx(s); return (error); @@ -349,7 +363,8 @@ sosend(so, addr, uio, top, control, flags) #define snderr(errno) { error = errno; splx(s); goto release; } restart: - if (error = sblock(&so->so_snd, SBLOCKWAIT(flags))) + error = sblock(&so->so_snd, SBLOCKWAIT(flags)); + if (error) goto out; do { s = splnet(); @@ -358,17 +373,25 @@ restart: if (so->so_error) snderr(so->so_error); if ((so->so_state & SS_ISCONNECTED) == 0) { - if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + /* + * `sendto' and `sendmsg' is allowed on a connection- + * based socket if it supports implied connect. + * Return ENOTCONN if not connected and no address is + * supplied. + */ + if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && + (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) snderr(ENOTCONN); } else if (addr == 0) - snderr(EDESTADDRREQ); + snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? 
+ ENOTCONN : EDESTADDRREQ); } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; - if (atomic && resid > so->so_snd.sb_hiwat || + if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) snderr(EMSGSIZE); if (space < resid + clen && uio && @@ -403,25 +426,15 @@ restart: MGET(m, M_WAIT, MT_DATA); mlen = MLEN; } - if (resid >= MINCLSIZE && space >= MCLBYTES) { + if (resid >= MINCLSIZE) { MCLGET(m, M_WAIT); if ((m->m_flags & M_EXT) == 0) goto nopages; mlen = MCLBYTES; -#ifdef MAPPED_MBUFS - len = min(MCLBYTES, resid); -#else - if (atomic && top == 0) { - len = min(MCLBYTES - max_hdr, resid); - m->m_data += max_hdr; - } else - len = min(MCLBYTES, resid); -#endif - space -= MCLBYTES; + len = min(min(mlen, resid), space); } else { nopages: len = min(min(mlen, resid), space); - space -= len; /* * For datagram protocols, leave room * for protocol headers in first mbuf. @@ -429,6 +442,7 @@ nopages: if (atomic && top == 0 && len < mlen) MH_ALIGN(m, len); } + space -= len; error = uiomove(mtod(m, caddr_t), (int)len, uio); resid = uio->uio_resid; m->m_len = len; @@ -446,8 +460,17 @@ nopages: if (dontroute) so->so_options |= SO_DONTROUTE; s = splnet(); /* XXX */ - error = (*so->so_proto->pr_usrreq)(so, - (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND, + error = (*so->so_proto->pr_usrreqs->pru_send)(so, + (flags & MSG_OOB) ? PRUS_OOB : + /* + * If the user set MSG_EOF, the protocol + * understands this flag and nothing left to + * send then use PRU_SEND_EOF instead of PRU_SEND. + */ + ((flags & MSG_EOF) && + (so->so_proto->pr_flags & PR_IMPLOPCL) && + (resid <= 0)) ? 
+ PRUS_EOF : 0, top, addr, control); splx(s); if (dontroute) @@ -500,7 +523,7 @@ soreceive(so, paddr, uio, mp0, controlp, flagsp) register int flags, len, error, s, offset; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; - int moff, type; + int moff, type = 0; int orig_resid = uio->uio_resid; mp = mp0; @@ -514,8 +537,7 @@ soreceive(so, paddr, uio, mp0, controlp, flagsp) flags = 0; if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); - error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, - (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0); + error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { @@ -531,11 +553,11 @@ bad: if (mp) *mp = (struct mbuf *)0; if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) - (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0); + (*pr->pr_usrreqs->pru_rcvd)(so, 0); restart: - if (error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) + error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); + if (error) return (error); s = splnet(); @@ -545,17 +567,17 @@ restart: * (subject to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_WAITALL is set, and it is possible to do the entire - * receive operation at once if we block (resid <= hiwat), or - * 3. MSG_DONTWAIT is not set. + * receive operation at once if we block (resid <= hiwat). + * 3. MSG_DONTWAIT is not set * If MSG_WAITALL is set but resid is larger than the receive buffer, * we have to do the receive in sections, and thus risk returning * a short count if a timeout or signal occurs after we start. 
*/ - if (m == 0 || ((flags & MSG_DONTWAIT) == 0 && + if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio->uio_resid) && (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && - m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0) { + m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { #ifdef DIAGNOSTIC if (m == 0 && so->so_rcv.sb_cc) panic("receive 1"); @@ -687,6 +709,8 @@ dontblock: splx(s); error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); s = splnet(); + if (error) + goto release; } else uio->uio_resid -= len; if (len == m->m_len - moff) { @@ -753,7 +777,8 @@ dontblock: splx(s); return (0); } - if (m = so->so_rcv.sb_mb) + m = so->so_rcv.sb_mb; + if (m) nextrecord = m->m_nextpkt; } } @@ -767,9 +792,7 @@ dontblock: if (m == 0) so->so_rcv.sb_mb = nextrecord; if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) - (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)(long)flags, (struct mbuf *)0, - (struct mbuf *)0); + (*pr->pr_usrreqs->pru_rcvd)(so, flags); } if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { @@ -777,7 +800,7 @@ dontblock: splx(s); goto restart; } - + if (flagsp) *flagsp |= flags; release: @@ -797,8 +820,7 @@ soshutdown(so, how) if (how & FREAD) sorflush(so); if (how & FWRITE) - return ((*pr->pr_usrreq)(so, PRU_SHUTDOWN, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)); + return ((*pr->pr_usrreqs->pru_shutdown)(so)); return (0); } @@ -857,6 +879,7 @@ sosetopt(so, level, optname, m0) case SO_REUSEADDR: case SO_REUSEPORT: case SO_OOBINLINE: + case SO_TIMESTAMP: if (m == NULL || m->m_len < sizeof (int)) { error = EINVAL; goto bad; @@ -907,7 +930,7 @@ sosetopt(so, level, optname, m0) goto bad; } tv = mtod(m, struct timeval *); - if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) { + if (tv->tv_sec > SHRT_MAX / hz - hz) { error = EDOM; goto bad; } @@ -925,6 +948,11 @@ sosetopt(so, level, 
optname, m0) break; } + case SO_PRIVSTATE: + /* we don't care what the parameter is... */ + so->so_state &= ~SS_PRIV; + break; + default: error = ENOPROTOOPT; break; @@ -976,9 +1004,14 @@ sogetopt(so, level, optname, mp) case SO_REUSEPORT: case SO_BROADCAST: case SO_OOBINLINE: + case SO_TIMESTAMP: *mtod(m, int *) = so->so_options & optname; break; + case SO_PRIVSTATE: + *mtod(m, int *) = so->so_state & SS_PRIV; + break; + case SO_TYPE: *mtod(m, int *) = so->so_type; break; diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c index 865108a..e19db0c 100644 --- a/sys/kern/uipc_socket2.c +++ b/sys/kern/uipc_socket2.c @@ -30,30 +30,32 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95 + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + * $Id: uipc_socket2.c,v 1.21 1997/02/19 19:15:43 wollman Exp $ */ #include <sys/param.h> #include <sys/systm.h> +#include <sys/kernel.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/buf.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/protosw.h> +#include <sys/stat.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> /* * Primitive routines for operating on sockets and socket buffers */ -/* strings for sleep message: */ -char netio[] = "netio"; -char netcon[] = "netcon"; -char netcls[] = "netcls"; +u_long sb_max = SB_MAX; /* XXX should be static */ -u_long sb_max = SB_MAX; /* patchable */ +static u_long sb_efficiency = 8; /* parameter for sbreserve() */ /* * Procedures to manipulate state flags of socket @@ -76,7 +78,7 @@ u_long sb_max = SB_MAX; /* patchable */ * structure queued on so_q0 by calling sonewconn(). When the connection * is established, soisconnected() is called, and transfers the * socket structure to so_q, making it available to accept(). - * + * * If a socket is closed with sockets on either * so_q0 or so_q, these sockets are dropped. 
* @@ -102,8 +104,12 @@ soisconnected(so) so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; - if (head && soqremque(so, 0)) { - soqinsque(head, so, 1); + if (head && (so->so_state & SS_INCOMP)) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_state &= ~SS_INCOMP; + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; sorwakeup(head); wakeup((caddr_t)&head->so_timeo); } else { @@ -138,6 +144,49 @@ soisdisconnected(so) } /* + * Return a random connection that hasn't been serviced yet and + * is eligible for discard. There is a one in qlen chance that + * we will return a null, saying that there are no dropable + * requests. In this case, the protocol specific code should drop + * the new request. This insures fairness. + * + * This may be used in conjunction with protocol specific queue + * congestion routines. + */ +struct socket * +sodropablereq(head) + register struct socket *head; +{ + register struct socket *so; + unsigned int i, j, qlen; + + static int rnd; + static long old_mono_secs; + static unsigned int cur_cnt, old_cnt; + + if ((i = (mono_time.tv_sec - old_mono_secs)) != 0) { + old_mono_secs = mono_time.tv_sec; + old_cnt = cur_cnt / i; + cur_cnt = 0; + } + + so = TAILQ_FIRST(&head->so_incomp); + if (!so) + return (so); + + qlen = head->so_incqlen; + if (++cur_cnt > qlen || old_cnt > qlen) { + rnd = (314159 * rnd + 66329) & 0xffff; + j = ((qlen + 1) * rnd) >> 16; + + while (j-- && so) + so = TAILQ_NEXT(so, so_list); + } + + return (so); +} + +/* * When an attempt at a new connection is noted on a socket * which accepts connections, sonewconn is called. If the * connection is possible (subject to space constraints, etc.) @@ -154,14 +203,14 @@ sonewconn1(head, connstatus) int connstatus; { register struct socket *so; - int soqueue = connstatus ? 
1 : 0; - if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) + if (head->so_qlen > 3 * head->so_qlimit / 2) return ((struct socket *)0); MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_DONTWAIT); - if (so == NULL) + if (so == NULL) return ((struct socket *)0); bzero((caddr_t)so, sizeof(*so)); + so->so_head = head; so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; @@ -170,13 +219,21 @@ sonewconn1(head, connstatus) so->so_timeo = head->so_timeo; so->so_pgid = head->so_pgid; (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat); - soqinsque(head, so, soqueue); - if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)) { - (void) soqremque(so, soqueue); + + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0)) { (void) free((caddr_t)so, M_SOCKET); return ((struct socket *)0); } + + if (connstatus) { + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + } else { + TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); + so->so_state |= SS_INCOMP; + head->so_incqlen++; + } + head->so_qlen++; if (connstatus) { sorwakeup(head); wakeup((caddr_t)&head->so_timeo); @@ -185,57 +242,6 @@ sonewconn1(head, connstatus) return (so); } -void -soqinsque(head, so, q) - register struct socket *head, *so; - int q; -{ - - register struct socket **prev; - so->so_head = head; - if (q == 0) { - head->so_q0len++; - so->so_q0 = 0; - for (prev = &(head->so_q0); *prev; ) - prev = &((*prev)->so_q0); - } else { - head->so_qlen++; - so->so_q = 0; - for (prev = &(head->so_q); *prev; ) - prev = &((*prev)->so_q); - } - *prev = so; -} - -int -soqremque(so, q) - register struct socket *so; - int q; -{ - register struct socket *head, *prev, *next; - - head = so->so_head; - prev = head; - for (;;) { - next = q ? 
prev->so_q : prev->so_q0; - if (next == so) - break; - if (next == 0) - return (0); - prev = next; - } - if (q == 0) { - prev->so_q0 = next->so_q0; - head->so_q0len--; - } else { - prev->so_q = next->so_q; - head->so_qlen--; - } - next->so_q0 = next->so_q = 0; - next->so_head = 0; - return (1); -} - /* * Socantsendmore indicates that no more data will be sent on the * socket; it would normally be applied to a socket when the user @@ -274,11 +280,11 @@ sbwait(sb) sb->sb_flags |= SB_WAIT; return (tsleep((caddr_t)&sb->sb_cc, - (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, netio, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", sb->sb_timeo)); } -/* +/* * Lock a sockbuf already known to be locked; * return any error returned from sleep (EINTR). */ @@ -290,9 +296,10 @@ sb_lock(sb) while (sb->sb_flags & SB_LOCK) { sb->sb_flags |= SB_WANT; - if (error = tsleep((caddr_t)&sb->sb_flags, + error = tsleep((caddr_t)&sb->sb_flags, (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, - netio, 0)) + "sblock", 0); + if (error) return (error); } sb->sb_flags |= SB_LOCK; @@ -390,11 +397,10 @@ sbreserve(sb, cc) struct sockbuf *sb; u_long cc; { - - if (cc > sb_max * MCLBYTES / (MSIZE + MCLBYTES)) + if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES)) return (0); sb->sb_hiwat = cc; - sb->sb_mbmax = min(cc * 2, sb_max); + sb->sb_mbmax = min(cc * sb_efficiency, sb_max); if (sb->sb_lowat > sb->sb_hiwat) sb->sb_lowat = sb->sb_hiwat; return (1); @@ -452,7 +458,8 @@ sbappend(sb, m) if (m == 0) return; - if (n = sb->sb_mb) { + n = sb->sb_mb; + if (n) { while (n->m_nextpkt) n = n->m_nextpkt; do { @@ -476,7 +483,7 @@ sbcheck(sb) for (m = sb->sb_mb; m; m = m->m_next) { len += m->m_len; mbcnt += MSIZE; - if (m->m_flags & M_EXT) + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ mbcnt += m->m_ext.ext_size; if (m->m_nextpkt) panic("sbcheck nextpkt"); @@ -502,7 +509,8 @@ sbappendrecord(sb, m0) if (m0 == 0) return; - if (m = sb->sb_mb) + m = 
sb->sb_mb; + if (m) while (m->m_nextpkt) m = m->m_nextpkt; /* @@ -538,7 +546,8 @@ sbinsertoob(sb, m0) if (m0 == 0) return; - for (mp = &sb->sb_mb; m = *mp; mp = &((*mp)->m_nextpkt)) { + for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { + m = *mp; again: switch (m->m_type) { @@ -546,7 +555,8 @@ sbinsertoob(sb, m0) continue; /* WANT next train */ case MT_CONTROL: - if (m = m->m_next) + m = m->m_next; + if (m) goto again; /* inspect THIS train further */ } break; @@ -607,7 +617,8 @@ panic("sbappendaddr"); m->m_next = control; for (n = m; n; n = n->m_next) sballoc(sb, n); - if (n = sb->sb_mb) { + n = sb->sb_mb; + if (n) { while (n->m_nextpkt) n = n->m_nextpkt; n->m_nextpkt = m; @@ -619,7 +630,7 @@ panic("sbappendaddr"); int sbappendcontrol(sb, m0, control) struct sockbuf *sb; - struct mbuf *m0, *control; + struct mbuf *control, *m0; { register struct mbuf *m, *n; int space = 0; @@ -639,7 +650,8 @@ sbappendcontrol(sb, m0, control) n->m_next = m0; /* concatenate data to control */ for (m = control; m; m = m->m_next) sballoc(sb, m); - if (n = sb->sb_mb) { + n = sb->sb_mb; + if (n) { while (n->m_nextpkt) n = n->m_nextpkt; n->m_nextpkt = control; @@ -774,6 +786,233 @@ sbdroprecord(sb) do { sbfree(sb, m); MFREE(m, mn); - } while (m = mn); + m = mn; + } while (m); } } + +/* + * Create a "control" mbuf containing the specified data + * with the specified type for presentation on a socket buffer. + */ +struct mbuf * +sbcreatecontrol(p, size, type, level) + caddr_t p; + register int size; + int type, level; +{ + register struct cmsghdr *cp; + struct mbuf *m; + + if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + return ((struct mbuf *) NULL); + cp = mtod(m, struct cmsghdr *); + /* XXX check size? 
*/ + (void)memcpy(CMSG_DATA(cp), p, size); + size += sizeof(*cp); + m->m_len = size; + cp->cmsg_len = size; + cp->cmsg_level = level; + cp->cmsg_type = type; + return (m); +} + +#ifdef PRU_OLDSTYLE +/* + * The following routines mediate between the old-style `pr_usrreq' + * protocol implementations and the new-style `struct pr_usrreqs' + * calling convention. + */ + +/* syntactic sugar */ +#define nomb (struct mbuf *)0 + +static int +old_abort(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_ABORT, nomb, nomb, nomb); +} + +static int +old_accept(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_ACCEPT, nomb, nam, nomb); +} + +static int +old_attach(struct socket *so, int proto) +{ + return so->so_proto->pr_ousrreq(so, PRU_ATTACH, nomb, + (struct mbuf *)proto, /* XXX */ + nomb); +} + +static int +old_bind(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_BIND, nomb, nam, nomb); +} + +static int +old_connect(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_CONNECT, nomb, nam, nomb); +} + +static int +old_connect2(struct socket *so1, struct socket *so2) +{ + return so1->so_proto->pr_ousrreq(so1, PRU_CONNECT2, nomb, + (struct mbuf *)so2, nomb); +} + +static int +old_control(struct socket *so, int cmd, caddr_t data, struct ifnet *ifp) +{ + return so->so_proto->pr_ousrreq(so, PRU_CONTROL, (struct mbuf *)cmd, + (struct mbuf *)data, + (struct mbuf *)ifp); +} + +static int +old_detach(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_DETACH, nomb, nomb, nomb); +} + +static int +old_disconnect(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_DISCONNECT, nomb, nomb, nomb); +} + +static int +old_listen(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_LISTEN, nomb, nomb, nomb); +} + +static int +old_peeraddr(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_PEERADDR, nomb, nam, nomb); +} + 
+static int +old_rcvd(struct socket *so, int flags) +{ + return so->so_proto->pr_ousrreq(so, PRU_RCVD, nomb, + (struct mbuf *)flags, /* XXX */ + nomb); +} + +static int +old_rcvoob(struct socket *so, struct mbuf *m, int flags) +{ + return so->so_proto->pr_ousrreq(so, PRU_RCVOOB, m, + (struct mbuf *)flags, /* XXX */ + nomb); +} + +static int +old_send(struct socket *so, int flags, struct mbuf *m, struct mbuf *addr, + struct mbuf *control) +{ + int req; + + if (flags & PRUS_OOB) { + req = PRU_SENDOOB; + } else if(flags & PRUS_EOF) { + req = PRU_SEND_EOF; + } else { + req = PRU_SEND; + } + return so->so_proto->pr_ousrreq(so, req, m, addr, control); +} + +static int +old_sense(struct socket *so, struct stat *sb) +{ + return so->so_proto->pr_ousrreq(so, PRU_SENSE, (struct mbuf *)sb, + nomb, nomb); +} + +static int +old_shutdown(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_SHUTDOWN, nomb, nomb, nomb); +} + +static int +old_sockaddr(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_SOCKADDR, nomb, nam, nomb); +} + +struct pr_usrreqs pru_oldstyle = { + old_abort, old_accept, old_attach, old_bind, old_connect, + old_connect2, old_control, old_detach, old_disconnect, + old_listen, old_peeraddr, old_rcvd, old_rcvoob, old_send, + old_sense, old_shutdown, old_sockaddr +}; + +#endif /* PRU_OLDSTYLE */ + +/* + * Some routines that return EOPNOTSUPP for entry points that are not + * supported by a protocol. Fill in as needed. 
+ */ +int +pru_accept_notsupp(struct socket *so, struct mbuf *nam) +{ + return EOPNOTSUPP; +} + +int +pru_connect2_notsupp(struct socket *so1, struct socket *so2) +{ + return EOPNOTSUPP; +} + +int +pru_listen_notsupp(struct socket *so) +{ + return EOPNOTSUPP; +} + +int +pru_rcvd_notsupp(struct socket *so, int flags) +{ + return EOPNOTSUPP; +} + +int +pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) +{ + return EOPNOTSUPP; +} + +/* + * This isn't really a ``null'' operation, but it's the default one + * and doesn't do anything destructive. + */ +int +pru_sense_null(struct socket *so, struct stat *sb) +{ + sb->st_blksize = so->so_snd.sb_hiwat; + return 0; +} + +/* + * Here is the definition of some of the basic objects in the kern.ipc + * branch of the MIB. + */ +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); + +/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ +static int dummy; +SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); + +SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, &sb_max, 0, "") +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, + &sb_efficiency, 0, ""); diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 800434c..e3aca30 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -30,26 +30,43 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)uipc_syscalls.c 8.6 (Berkeley) 2/14/95 + * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 + * $Id: uipc_syscalls.c,v 1.22 1997/02/22 09:39:29 peter Exp $ */ +#include "opt_ktrace.h" + #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/filedesc.h> #include <sys/proc.h> +#include <sys/fcntl.h> #include <sys/file.h> #include <sys/buf.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/protosw.h> +#include <sys/stat.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/un.h> #ifdef KTRACE #include <sys/ktrace.h> #endif -#include <sys/mount.h> -#include <sys/syscallargs.h> +extern int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags, + int *retsize)); +extern int recvit __P((struct proc *p, int s, struct msghdr *mp, + caddr_t namelenp, int *retsize)); + +static int accept1 __P((struct proc *p, struct accept_args *uap, int *retval, + int compat)); +static int getsockname1 __P((struct proc *p, struct getsockname_args *uap, + int *retval, int compat)); +static int getpeername1 __P((struct proc *p, struct getpeername_args *uap, + int *retval, int compat)); /* * System call interface to the socket abstraction. 
@@ -64,24 +81,25 @@ int socket(p, uap, retval) struct proc *p; register struct socket_args /* { - syscallarg(int) domain; - syscallarg(int) type; - syscallarg(int) protocol; + int domain; + int type; + int protocol; } */ *uap; - register_t *retval; + int *retval; { struct filedesc *fdp = p->p_fd; struct socket *so; struct file *fp; int fd, error; - if (error = falloc(p, &fp, &fd)) + error = falloc(p, &fp, &fd); + if (error) return (error); fp->f_flag = FREAD|FWRITE; fp->f_type = DTYPE_SOCKET; fp->f_ops = &socketops; - if (error = socreate(SCARG(uap, domain), &so, SCARG(uap, type), - SCARG(uap, protocol))) { + error = socreate(uap->domain, &so, uap->type, uap->protocol, p); + if (error) { fdp->fd_ofiles[fd] = 0; ffree(fp); } else { @@ -96,20 +114,21 @@ int bind(p, uap, retval) struct proc *p; register struct bind_args /* { - syscallarg(int) s; - syscallarg(caddr_t) name; - syscallarg(int) namelen; + int s; + caddr_t name; + int namelen; } */ *uap; - register_t *retval; + int *retval; { struct file *fp; struct mbuf *nam; int error; - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); - if (error = sockargs(&nam, SCARG(uap, name), SCARG(uap, namelen), - MT_SONAME)) + error = sockargs(&nam, uap->name, uap->namelen, MT_SONAME); + if (error) return (error); error = sobind((struct socket *)fp->f_data, nam); m_freem(nam); @@ -121,159 +140,161 @@ int listen(p, uap, retval) struct proc *p; register struct listen_args /* { - syscallarg(int) s; - syscallarg(int) backlog; + int s; + int backlog; } */ *uap; - register_t *retval; + int *retval; { struct file *fp; int error; - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); - return (solisten((struct socket *)fp->f_data, SCARG(uap, backlog))); + return (solisten((struct socket *)fp->f_data, uap->backlog)); } -#ifdef COMPAT_OLDSOCK -int -accept(p, uap, retval) - struct proc *p; - struct 
accept_args /* { - syscallarg(int) s; - syscallarg(caddr_t) name; - syscallarg(int *) anamelen; - } */ *uap; - register_t *retval; -{ - - return (accept1(p, uap, retval, 0)); -} - -int -compat_43_accept(p, uap, retval) - struct proc *p; - struct accept_args /* { - syscallarg(int) s; - syscallarg(caddr_t) name; - syscallarg(int *) anamelen; - } */ *uap; - register_t *retval; -{ - - return (accept1(p, uap, retval, 1)); -} -#else /* COMPAT_OLDSOCK */ - -#define accept1 accept -#endif - -int -accept1(p, uap, retval, compat_43) +static int +accept1(p, uap, retval, compat) struct proc *p; register struct accept_args /* { - syscallarg(int) s; - syscallarg(caddr_t) name; - syscallarg(int *) anamelen; + int s; + caddr_t name; + int *anamelen; } */ *uap; - register_t *retval; - int compat_43; + int *retval; + int compat; { struct file *fp; struct mbuf *nam; - int namelen, error, s, tmpfd; - register struct socket *so; - - if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, anamelen), - (caddr_t)&namelen, sizeof (namelen)))) - return (error); - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + int namelen, error, s; + struct socket *head, *so; + short fflag; /* type must match fp->f_flag */ + + if (uap->name) { + error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen, + sizeof (namelen)); + if(error) + return (error); + } + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); s = splnet(); - so = (struct socket *)fp->f_data; - if ((so->so_options & SO_ACCEPTCONN) == 0) { + head = (struct socket *)fp->f_data; + if ((head->so_options & SO_ACCEPTCONN) == 0) { splx(s); return (EINVAL); } - if ((so->so_state & SS_NBIO) && so->so_qlen == 0) { + if ((head->so_state & SS_NBIO) && head->so_comp.tqh_first == NULL) { splx(s); return (EWOULDBLOCK); } - while (so->so_qlen == 0 && so->so_error == 0) { - if (so->so_state & SS_CANTRCVMORE) { - so->so_error = ECONNABORTED; + while (head->so_comp.tqh_first == NULL && head->so_error == 0) { + if (head->so_state & 
SS_CANTRCVMORE) { + head->so_error = ECONNABORTED; break; } - if (error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, - netcon, 0)) { + error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH, + "accept", 0); + if (error) { splx(s); return (error); } } - if (so->so_error) { - error = so->so_error; - so->so_error = 0; + if (head->so_error) { + error = head->so_error; + head->so_error = 0; splx(s); return (error); } - if (error = falloc(p, &fp, &tmpfd)) { + fflag = fp->f_flag; + error = falloc(p, &fp, retval); + if (error) { splx(s); return (error); } - *retval = tmpfd; - { struct socket *aso = so->so_q; - if (soqremque(aso, 1) == 0) - panic("accept"); - so = aso; - } + + so = head->so_comp.tqh_first; + if (so == NULL) + panic("accept: nothing queued"); + TAILQ_REMOVE(&head->so_comp, so, so_list); + so->so_state &= ~SS_COMP; + so->so_head = NULL; + head->so_qlen--; + fp->f_type = DTYPE_SOCKET; - fp->f_flag = FREAD|FWRITE; + fp->f_flag = fflag; fp->f_ops = &socketops; fp->f_data = (caddr_t)so; nam = m_get(M_WAIT, MT_SONAME); (void) soaccept(so, nam); - if (SCARG(uap, name)) { + if (uap->name) { #ifdef COMPAT_OLDSOCK - if (compat_43) + if (compat) mtod(nam, struct osockaddr *)->sa_family = mtod(nam, struct sockaddr *)->sa_family; #endif if (namelen > nam->m_len) namelen = nam->m_len; /* SHOULD COPY OUT A CHAIN HERE */ - if ((error = copyout(mtod(nam, caddr_t), - (caddr_t)SCARG(uap, name), (u_int)namelen)) == 0) + error = copyout(mtod(nam, caddr_t), (caddr_t)uap->name, + (u_int)namelen); + if (!error) error = copyout((caddr_t)&namelen, - (caddr_t)SCARG(uap, anamelen), - sizeof (*SCARG(uap, anamelen))); + (caddr_t)uap->anamelen, sizeof (*uap->anamelen)); } m_freem(nam); splx(s); return (error); } +int +accept(p, uap, retval) + struct proc *p; + struct accept_args *uap; + int *retval; +{ + + return (accept1(p, uap, retval, 0)); +} + +#ifdef COMPAT_OLDSOCK +int +oaccept(p, uap, retval) + struct proc *p; + struct accept_args *uap; + int *retval; +{ + + return (accept1(p, 
uap, retval, 1)); +} +#endif /* COMPAT_OLDSOCK */ + /* ARGSUSED */ int connect(p, uap, retval) struct proc *p; register struct connect_args /* { - syscallarg(int) s; - syscallarg(caddr_t) name; - syscallarg(int) namelen; + int s; + caddr_t name; + int namelen; } */ *uap; - register_t *retval; + int *retval; { struct file *fp; register struct socket *so; struct mbuf *nam; int error, s; - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); so = (struct socket *)fp->f_data; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) return (EALREADY); - if (error = sockargs(&nam, SCARG(uap, name), SCARG(uap, namelen), - MT_SONAME)) + error = sockargs(&nam, uap->name, uap->namelen, MT_SONAME); + if (error) return (error); error = soconnect(so, nam); if (error) @@ -283,10 +304,12 @@ connect(p, uap, retval) return (EINPROGRESS); } s = splnet(); - while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) - if (error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, - netcon, 0)) + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { + error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, + "connec", 0); + if (error) break; + } if (error == 0) { error = so->so_error; so->so_error = 0; @@ -304,51 +327,56 @@ int socketpair(p, uap, retval) struct proc *p; register struct socketpair_args /* { - syscallarg(int) domain; - syscallarg(int) type; - syscallarg(int) protocol; - syscallarg(int *) rsv; + int domain; + int type; + int protocol; + int *rsv; } */ *uap; - register_t *retval; + int retval[]; { register struct filedesc *fdp = p->p_fd; struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, sv[2]; - if (error = socreate(SCARG(uap, domain), &so1, SCARG(uap, type), - SCARG(uap, protocol))) + error = socreate(uap->domain, &so1, uap->type, uap->protocol, p); + if (error) return (error); - if (error = socreate(SCARG(uap, domain), &so2, SCARG(uap, type), - SCARG(uap, protocol))) + 
error = socreate(uap->domain, &so2, uap->type, uap->protocol, p); + if (error) goto free1; - if (error = falloc(p, &fp1, &fd)) + error = falloc(p, &fp1, &fd); + if (error) goto free2; sv[0] = fd; fp1->f_flag = FREAD|FWRITE; fp1->f_type = DTYPE_SOCKET; fp1->f_ops = &socketops; fp1->f_data = (caddr_t)so1; - if (error = falloc(p, &fp2, &fd)) + error = falloc(p, &fp2, &fd); + if (error) goto free3; fp2->f_flag = FREAD|FWRITE; fp2->f_type = DTYPE_SOCKET; fp2->f_ops = &socketops; fp2->f_data = (caddr_t)so2; sv[1] = fd; - if (error = soconnect2(so1, so2)) + error = soconnect2(so1, so2); + if (error) goto free4; - if (SCARG(uap, type) == SOCK_DGRAM) { + if (uap->type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ - if (error = soconnect2(so2, so1)) + error = soconnect2(so2, so1); + if (error) goto free4; } - error = copyout((caddr_t)sv, (caddr_t)SCARG(uap, rsv), - 2 * sizeof (int)); + error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int)); +#if 0 /* old pipe(2) syscall compatability, unused these days */ retval[0] = sv[0]; /* XXX ??? */ retval[1] = sv[1]; /* XXX ??? 
*/ +#endif return (error); free4: ffree(fp2); @@ -364,145 +392,11 @@ free1: } int -sendto(p, uap, retval) - struct proc *p; - register struct sendto_args /* { - syscallarg(int) s; - syscallarg(caddr_t) buf; - syscallarg(size_t) len; - syscallarg(int) flags; - syscallarg(caddr_t) to; - syscallarg(int) tolen; - } */ *uap; - register_t *retval; -{ - struct msghdr msg; - struct iovec aiov; - - msg.msg_name = SCARG(uap, to); - msg.msg_namelen = SCARG(uap, tolen); - msg.msg_iov = &aiov; - msg.msg_iovlen = 1; - msg.msg_control = 0; -#ifdef COMPAT_OLDSOCK - msg.msg_flags = 0; -#endif - aiov.iov_base = SCARG(uap, buf); - aiov.iov_len = SCARG(uap, len); - return (sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval)); -} - -#ifdef COMPAT_OLDSOCK -int -compat_43_send(p, uap, retval) - struct proc *p; - register struct compat_43_send_args /* { - syscallarg(int) s; - syscallarg(caddr_t) buf; - syscallarg(int) len; - syscallarg(int) flags; - } */ *uap; - register_t *retval; -{ - struct msghdr msg; - struct iovec aiov; - - msg.msg_name = 0; - msg.msg_namelen = 0; - msg.msg_iov = &aiov; - msg.msg_iovlen = 1; - aiov.iov_base = SCARG(uap, buf); - aiov.iov_len = SCARG(uap, len); - msg.msg_control = 0; - msg.msg_flags = 0; - return (sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval)); -} - -#define MSG_COMPAT 0x8000 -int -compat_43_sendmsg(p, uap, retval) - struct proc *p; - register struct compat_43_sendmsg_args /* { - syscallarg(int) s; - syscallarg(caddr_t) msg; - syscallarg(int) flags; - } */ *uap; - register_t *retval; -{ - struct msghdr msg; - struct iovec aiov[UIO_SMALLIOV], *iov; - int error; - - if (error = copyin(SCARG(uap, msg), (caddr_t)&msg, - sizeof (struct omsghdr))) - return (error); - if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { - if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) - return (EMSGSIZE); - MALLOC(iov, struct iovec *, - sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, - M_WAITOK); - } else - iov = aiov; - if (error = copyin((caddr_t)msg.msg_iov, 
(caddr_t)iov, - (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))) - goto done; - msg.msg_flags = MSG_COMPAT; - msg.msg_iov = iov; - error = sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval); -done: - if (iov != aiov) - FREE(iov, M_IOV); - return (error); -} -#endif - -int -sendmsg(p, uap, retval) - struct proc *p; - register struct sendmsg_args /* { - syscallarg(int) s; - syscallarg(caddr_t) msg; - syscallarg(int) flags; - } */ *uap; - register_t *retval; -{ - struct msghdr msg; - struct iovec aiov[UIO_SMALLIOV], *iov; - int error; - - if (error = copyin(SCARG(uap, msg), (caddr_t)&msg, sizeof (msg))) - return (error); - if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { - if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) - return (EMSGSIZE); - MALLOC(iov, struct iovec *, - sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, - M_WAITOK); - } else - iov = aiov; - if (msg.msg_iovlen && - (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, - (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) - goto done; - msg.msg_iov = iov; -#ifdef COMPAT_OLDSOCK - msg.msg_flags = 0; -#endif - error = sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval); -done: - if (iov != aiov) - FREE(iov, M_IOV); - return (error); -} - -int sendit(p, s, mp, flags, retsize) register struct proc *p; int s; register struct msghdr *mp; - int flags; - register_t *retsize; + int flags, *retsize; { struct file *fp; struct uio auio; @@ -513,8 +407,9 @@ sendit(p, s, mp, flags, retsize) #ifdef KTRACE struct iovec *ktriov = NULL; #endif - - if (error = getsock(p->p_fd, s, &fp)) + + error = getsock(p->p_fd, s, &fp); + if (error) return (error); auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; @@ -525,13 +420,12 @@ sendit(p, s, mp, flags, retsize) auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { - if (auio.uio_resid + iov->iov_len < auio.uio_resid) + if ((auio.uio_resid += iov->iov_len) < 0) return (EINVAL); - auio.uio_resid += iov->iov_len; } if 
(mp->msg_name) { - if (error = sockargs(&to, mp->msg_name, mp->msg_namelen, - MT_SONAME)) + error = sockargs(&to, mp->msg_name, mp->msg_namelen, MT_SONAME); + if (error) return (error); } else to = 0; @@ -544,8 +438,9 @@ sendit(p, s, mp, flags, retsize) error = EINVAL; goto bad; } - if (error = sockargs(&control, mp->msg_control, - mp->msg_controllen, MT_CONTROL)) + error = sockargs(&control, mp->msg_control, + mp->msg_controllen, MT_CONTROL); + if (error) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { @@ -574,8 +469,9 @@ sendit(p, s, mp, flags, retsize) } #endif len = auio.uio_resid; - if (error = sosend((struct socket *)fp->f_data, to, &auio, - (struct mbuf *)0, control, flags)) { + error = sosend((struct socket *)fp->f_data, to, &auio, + (struct mbuf *)0, control, flags); + if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; @@ -598,71 +494,46 @@ bad: return (error); } -#ifdef COMPAT_OLDSOCK int -compat_43_recvfrom(p, uap, retval) - struct proc *p; - struct recvfrom_args /* { - syscallarg(int) s; - syscallarg(caddr_t) buf; - syscallarg(size_t) len; - syscallarg(int) flags; - syscallarg(caddr_t) from; - syscallarg(int *) fromlenaddr; - } */ *uap; - register_t *retval; -{ - - SCARG(uap, flags) |= MSG_COMPAT; - return (recvfrom(p, uap, retval)); -} -#endif - -int -recvfrom(p, uap, retval) +sendto(p, uap, retval) struct proc *p; - register struct recvfrom_args /* { - syscallarg(int) s; - syscallarg(caddr_t) buf; - syscallarg(size_t) len; - syscallarg(int) flags; - syscallarg(caddr_t) from; - syscallarg(int *) fromlenaddr; + register struct sendto_args /* { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t to; + int tolen; } */ *uap; - register_t *retval; + int *retval; { struct msghdr msg; struct iovec aiov; - int error; - if (SCARG(uap, fromlenaddr)) { - if (error = copyin((caddr_t)SCARG(uap, fromlenaddr), - (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen))) - return 
(error); - } else - msg.msg_namelen = 0; - msg.msg_name = SCARG(uap, from); + msg.msg_name = uap->to; + msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; - aiov.iov_base = SCARG(uap, buf); - aiov.iov_len = SCARG(uap, len); msg.msg_control = 0; - msg.msg_flags = SCARG(uap, flags); - return (recvit(p, SCARG(uap, s), &msg, - (caddr_t)SCARG(uap, fromlenaddr), retval)); +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + return (sendit(p, uap->s, &msg, uap->flags, retval)); } #ifdef COMPAT_OLDSOCK int -compat_43_recv(p, uap, retval) +osend(p, uap, retval) struct proc *p; - register struct compat_43_recv_args /* { - syscallarg(int) s; - syscallarg(caddr_t) buf; - syscallarg(int) len; - syscallarg(int) flags; + register struct osend_args /* { + int s; + caddr_t buf; + int len; + int flags; } */ *uap; - register_t *retval; + int *retval; { struct msghdr msg; struct iovec aiov; @@ -671,34 +542,29 @@ compat_43_recv(p, uap, retval) msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; - aiov.iov_base = SCARG(uap, buf); - aiov.iov_len = SCARG(uap, len); + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; msg.msg_control = 0; - msg.msg_flags = SCARG(uap, flags); - return (recvit(p, SCARG(uap, s), &msg, (caddr_t)0, retval)); + msg.msg_flags = 0; + return (sendit(p, uap->s, &msg, uap->flags, retval)); } -/* - * Old recvmsg. This code takes advantage of the fact that the old msghdr - * overlays the new one, missing only the flags, and with the (old) access - * rights where the control fields are now. 
- */ int -compat_43_recvmsg(p, uap, retval) +osendmsg(p, uap, retval) struct proc *p; - register struct compat_43_recvmsg_args /* { - syscallarg(int) s; - syscallarg(struct omsghdr *) msg; - syscallarg(int) flags; + register struct osendmsg_args /* { + int s; + caddr_t msg; + int flags; } */ *uap; - register_t *retval; + int *retval; { struct msghdr msg; struct iovec aiov[UIO_SMALLIOV], *iov; int error; - if (error = copyin((caddr_t)SCARG(uap, msg), (caddr_t)&msg, - sizeof (struct omsghdr))) + error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr)); + if (error) return (error); if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) @@ -708,17 +574,13 @@ compat_43_recvmsg(p, uap, retval) M_WAITOK); } else iov = aiov; - msg.msg_flags = SCARG(uap, flags) | MSG_COMPAT; - if (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, - (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))) + error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) goto done; + msg.msg_flags = MSG_COMPAT; msg.msg_iov = iov; - error = recvit(p, SCARG(uap, s), &msg, - (caddr_t)&SCARG(uap, msg)->msg_namelen, retval); - - if (msg.msg_controllen && error == 0) - error = copyout((caddr_t)&msg.msg_controllen, - (caddr_t)&SCARG(uap, msg)->msg_accrightslen, sizeof (int)); + error = sendit(p, uap->s, &msg, uap->flags, retval); done: if (iov != aiov) FREE(iov, M_IOV); @@ -727,21 +589,21 @@ done: #endif int -recvmsg(p, uap, retval) +sendmsg(p, uap, retval) struct proc *p; - register struct recvmsg_args /* { - syscallarg(int) s; - syscallarg(struct msghdr *) msg; - syscallarg(int) flags; + register struct sendmsg_args /* { + int s; + caddr_t msg; + int flags; } */ *uap; - register_t *retval; + int *retval; { struct msghdr msg; - struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; - register int error; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; - if (error = copyin((caddr_t)SCARG(uap, msg), (caddr_t)&msg, 
- sizeof (msg))) + error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg)); + if (error) return (error); if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) @@ -751,21 +613,15 @@ recvmsg(p, uap, retval) M_WAITOK); } else iov = aiov; + if (msg.msg_iovlen && + (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) + goto done; + msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK - msg.msg_flags = SCARG(uap, flags) &~ MSG_COMPAT; -#else - msg.msg_flags = SCARG(uap, flags); + msg.msg_flags = 0; #endif - uiov = msg.msg_iov; - msg.msg_iov = iov; - if (error = copyin((caddr_t)uiov, (caddr_t)iov, - (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))) - goto done; - if ((error = recvit(p, SCARG(uap, s), &msg, (caddr_t)0, retval)) == 0) { - msg.msg_iov = uiov; - error = copyout((caddr_t)&msg, (caddr_t)SCARG(uap, msg), - sizeof(msg)); - } + error = sendit(p, uap->s, &msg, uap->flags, retval); done: if (iov != aiov) FREE(iov, M_IOV); @@ -778,19 +634,21 @@ recvit(p, s, mp, namelenp, retsize) int s; register struct msghdr *mp; caddr_t namelenp; - register_t *retsize; + int *retsize; { struct file *fp; struct uio auio; register struct iovec *iov; register int i; int len, error; - struct mbuf *from = 0, *control = 0; + struct mbuf *m, *from = 0, *control = 0; + caddr_t ctlbuf; #ifdef KTRACE struct iovec *ktriov = NULL; #endif - - if (error = getsock(p->p_fd, s, &fp)) + + error = getsock(p->p_fd, s, &fp); + if (error) return (error); auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; @@ -801,9 +659,8 @@ recvit(p, s, mp, namelenp, retsize) auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { - if (auio.uio_resid + iov->iov_len < auio.uio_resid) + if ((auio.uio_resid += iov->iov_len) < 0) return (EINVAL); - auio.uio_resid += iov->iov_len; } #ifdef KTRACE if (KTRPOINT(p, KTR_GENIO)) { @@ -814,9 +671,10 @@ recvit(p, s, mp, namelenp, retsize) } #endif len = 
auio.uio_resid; - if (error = soreceive((struct socket *)fp->f_data, &from, &auio, + error = soreceive((struct socket *)fp->f_data, &from, &auio, (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0, - &mp->msg_flags)) { + &mp->msg_flags); + if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; @@ -845,8 +703,9 @@ recvit(p, s, mp, namelenp, retsize) if (len > from->m_len) len = from->m_len; /* else if len < from->m_len ??? */ - if (error = copyout(mtod(from, caddr_t), - (caddr_t)mp->msg_name, (unsigned)len)) + error = copyout(mtod(from, caddr_t), + (caddr_t)mp->msg_name, (unsigned)len); + if (error) goto out; } mp->msg_namelen = len; @@ -882,17 +741,29 @@ recvit(p, s, mp, namelenp, retsize) } #endif len = mp->msg_controllen; - if (len <= 0 || control == 0) - len = 0; - else { - if (len >= control->m_len) - len = control->m_len; - else + m = control; + mp->msg_controllen = 0; + ctlbuf = (caddr_t) mp->msg_control; + + while (m && len > 0) { + unsigned int tocopy; + + if (len >= m->m_len) + tocopy = m->m_len; + else { mp->msg_flags |= MSG_CTRUNC; - error = copyout((caddr_t)mtod(control, caddr_t), - (caddr_t)mp->msg_control, (unsigned)len); + tocopy = len; + } + + if (error = copyout((caddr_t)mtod(m, caddr_t), + ctlbuf, tocopy)) + goto out; + + ctlbuf += tocopy; + len -= tocopy; + m = m->m_next; } - mp->msg_controllen = len; + mp->msg_controllen = ctlbuf - mp->msg_control; } out: if (from) @@ -902,22 +773,193 @@ out: return (error); } +int +recvfrom(p, uap, retval) + struct proc *p; + register struct recvfrom_args /* { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t from; + int *fromlenaddr; + } */ *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + if (uap->fromlenaddr) { + error = copyin((caddr_t)uap->fromlenaddr, + (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen)); + if (error) + return (error); + } else + msg.msg_namelen = 0; + msg.msg_name = 
uap->from; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = uap->flags; + return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr, retval)); +} + +#ifdef COMPAT_OLDSOCK +int +orecvfrom(p, uap, retval) + struct proc *p; + struct recvfrom_args *uap; + int *retval; +{ + + uap->flags |= MSG_COMPAT; + return (recvfrom(p, uap, retval)); +} +#endif + + +#ifdef COMPAT_OLDSOCK +int +orecv(p, uap, retval) + struct proc *p; + register struct orecv_args /* { + int s; + caddr_t buf; + int len; + int flags; + } */ *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = uap->flags; + return (recvit(p, uap->s, &msg, (caddr_t)0, retval)); +} + +/* + * Old recvmsg. This code takes advantage of the fact that the old msghdr + * overlays the new one, missing only the flags, and with the (old) access + * rights where the control fields are now. 
+ */ +int +orecvmsg(p, uap, retval) + struct proc *p; + register struct orecvmsg_args /* { + int s; + struct omsghdr *msg; + int flags; + } */ *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + error = copyin((caddr_t)uap->msg, (caddr_t)&msg, + sizeof (struct omsghdr)); + if (error) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + msg.msg_flags = uap->flags | MSG_COMPAT; + error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + msg.msg_iov = iov; + error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen, retval); + + if (msg.msg_controllen && error == 0) + error = copyout((caddr_t)&msg.msg_controllen, + (caddr_t)&uap->msg->msg_accrightslen, sizeof (int)); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} +#endif + +int +recvmsg(p, uap, retval) + struct proc *p; + register struct recvmsg_args /* { + int s; + struct msghdr *msg; + int flags; + } */ *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; + register int error; + + error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg)); + if (error) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = uap->flags &~ MSG_COMPAT; +#else + msg.msg_flags = uap->flags; +#endif + uiov = msg.msg_iov; + msg.msg_iov = iov; + error = copyin((caddr_t)uiov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + error = recvit(p, uap->s, &msg, (caddr_t)0, retval); 
+ if (!error) { + msg.msg_iov = uiov; + error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg)); + } +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} + /* ARGSUSED */ int shutdown(p, uap, retval) struct proc *p; register struct shutdown_args /* { - syscallarg(int) s; - syscallarg(int) how; + int s; + int how; } */ *uap; - register_t *retval; + int *retval; { struct file *fp; int error; - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); - return (soshutdown((struct socket *)fp->f_data, SCARG(uap, how))); + return (soshutdown((struct socket *)fp->f_data, uap->how)); } /* ARGSUSED */ @@ -925,35 +967,36 @@ int setsockopt(p, uap, retval) struct proc *p; register struct setsockopt_args /* { - syscallarg(int) s; - syscallarg(int) level; - syscallarg(int) name; - syscallarg(caddr_t) val; - syscallarg(int) valsize; + int s; + int level; + int name; + caddr_t val; + int valsize; } */ *uap; - register_t *retval; + int *retval; { struct file *fp; struct mbuf *m = NULL; int error; - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); - if (SCARG(uap, valsize) > MLEN) + if (uap->valsize > MLEN) return (EINVAL); - if (SCARG(uap, val)) { + if (uap->val) { m = m_get(M_WAIT, MT_SOOPTS); if (m == NULL) return (ENOBUFS); - if (error = copyin(SCARG(uap, val), mtod(m, caddr_t), - (u_int)SCARG(uap, valsize))) { + error = copyin(uap->val, mtod(m, caddr_t), (u_int)uap->valsize); + if (error) { (void) m_free(m); return (error); } - m->m_len = SCARG(uap, valsize); + m->m_len = uap->valsize; } - return (sosetopt((struct socket *)fp->f_data, SCARG(uap, level), - SCARG(uap, name), m)); + return (sosetopt((struct socket *)fp->f_data, uap->level, + uap->name, m)); } /* ARGSUSED */ @@ -961,73 +1004,88 @@ int getsockopt(p, uap, retval) struct proc *p; register struct getsockopt_args /* { - syscallarg(int) s; - syscallarg(int) level; - 
syscallarg(int) name; - syscallarg(caddr_t) val; - syscallarg(int *) avalsize; + int s; + int level; + int name; + caddr_t val; + int *avalsize; } */ *uap; - register_t *retval; + int *retval; { struct file *fp; - struct mbuf *m = NULL; - int valsize, error; + struct mbuf *m = NULL, *m0; + int op, i, valsize, error; - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); - if (SCARG(uap, val)) { - if (error = copyin((caddr_t)SCARG(uap, avalsize), - (caddr_t)&valsize, sizeof (valsize))) + if (uap->val) { + error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize, + sizeof (valsize)); + if (error) return (error); } else valsize = 0; - if ((error = sogetopt((struct socket *)fp->f_data, SCARG(uap, level), - SCARG(uap, name), &m)) == 0 && SCARG(uap, val) && valsize && - m != NULL) { - if (valsize > m->m_len) - valsize = m->m_len; - error = copyout(mtod(m, caddr_t), SCARG(uap, val), - (u_int)valsize); + if ((error = sogetopt((struct socket *)fp->f_data, uap->level, + uap->name, &m)) == 0 && uap->val && valsize && m != NULL) { + op = 0; + while (m && !error && op < valsize) { + i = min(m->m_len, (valsize - op)); + error = copyout(mtod(m, caddr_t), uap->val, (u_int)i); + op += i; + uap->val += i; + m0 = m; + MFREE(m0,m); + } + valsize = op; if (error == 0) error = copyout((caddr_t)&valsize, - (caddr_t)SCARG(uap, avalsize), sizeof (valsize)); + (caddr_t)uap->avalsize, sizeof (valsize)); } if (m != NULL) (void) m_free(m); return (error); } +#ifdef OLD_PIPE /* ARGSUSED */ int pipe(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct pipe_args /* { + int dummy; + } */ *uap; + int retval[]; { register struct filedesc *fdp = p->p_fd; struct file *rf, *wf; struct socket *rso, *wso; int fd, error; - if (error = socreate(AF_UNIX, &rso, SOCK_STREAM, 0)) + error = socreate(AF_UNIX, &rso, SOCK_STREAM, 0, p); + if (error) return (error); - if (error = socreate(AF_UNIX, &wso, SOCK_STREAM, 0)) + 
error = socreate(AF_UNIX, &wso, SOCK_STREAM, 0, p); + if (error) goto free1; - if (error = falloc(p, &rf, &fd)) + error = falloc(p, &rf, &fd); + if (error) goto free2; retval[0] = fd; - rf->f_flag = FREAD; + rf->f_flag = FREAD | FWRITE; rf->f_type = DTYPE_SOCKET; rf->f_ops = &socketops; rf->f_data = (caddr_t)rso; - if (error = falloc(p, &wf, &fd)) + error = falloc(p, &wf, &fd); + if (error) goto free3; - wf->f_flag = FWRITE; + wf->f_flag = FREAD | FWRITE; wf->f_type = DTYPE_SOCKET; wf->f_ops = &socketops; wf->f_data = (caddr_t)wso; retval[1] = fd; - if (error = unp_connect2(wso, rso)) + error = unp_connect2(wso, rso); + if (error) goto free4; return (0); free4: @@ -1042,170 +1100,153 @@ free1: (void)soclose(rso); return (error); } - +#endif /* * Get socket name. */ -#ifdef COMPAT_OLDSOCK -int -getsockname(p, uap, retval) - struct proc *p; - struct getsockname_args /* { - syscallarg(int) fdes; - syscallarg(caddr_t) asa; - syscallarg(int *) alen; - } */ *uap; - register_t *retval; -{ - - return (getsockname1(p, uap, retval, 0)); -} - -int -compat_43_getsockname(p, uap, retval) - struct proc *p; - struct getsockname_args /* { - syscallarg(int) fdes; - syscallarg(caddr_t) asa; - syscallarg(int *) alen; - } */ *uap; - register_t *retval; -{ - - return (getsockname1(p, uap, retval, 1)); -} -#else /* COMPAT_OLDSOCK */ - -#define getsockname1 getsockname -#endif - /* ARGSUSED */ -int -getsockname1(p, uap, retval, compat_43) +static int +getsockname1(p, uap, retval, compat) struct proc *p; register struct getsockname_args /* { - syscallarg(int) fdes; - syscallarg(caddr_t) asa; - syscallarg(int *) alen; + int fdes; + caddr_t asa; + int *alen; } */ *uap; - register_t *retval; - int compat_43; + int *retval; + int compat; { struct file *fp; register struct socket *so; struct mbuf *m; int len, error; - if (error = getsock(p->p_fd, SCARG(uap, fdes), &fp)) + error = getsock(p->p_fd, uap->fdes, &fp); + if (error) return (error); - if (error = copyin((caddr_t)SCARG(uap, alen), 
(caddr_t)&len, - sizeof (len))) + error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); + if (error) return (error); so = (struct socket *)fp->f_data; m = m_getclr(M_WAIT, MT_SONAME); if (m == NULL) return (ENOBUFS); - if (error = (*so->so_proto->pr_usrreq)(so, PRU_SOCKADDR, 0, m, 0)) + error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, m); + if (error) goto bad; if (len > m->m_len) len = m->m_len; #ifdef COMPAT_OLDSOCK - if (compat_43) + if (compat) mtod(m, struct osockaddr *)->sa_family = mtod(m, struct sockaddr *)->sa_family; #endif - error = copyout(mtod(m, caddr_t), (caddr_t)SCARG(uap, asa), (u_int)len); + error = copyout(mtod(m, caddr_t), (caddr_t)uap->asa, (u_int)len); if (error == 0) - error = copyout((caddr_t)&len, (caddr_t)SCARG(uap, alen), + error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); bad: m_freem(m); return (error); } -/* - * Get name of peer for connected socket. - */ -#ifdef COMPAT_OLDSOCK int -getpeername(p, uap, retval) +getsockname(p, uap, retval) struct proc *p; - struct getpeername_args /* { - syscallarg(int) fdes; - syscallarg(caddr_t) asa; - syscallarg(int *) alen; - } */ *uap; - register_t *retval; + struct getsockname_args *uap; + int *retval; { - return (getpeername1(p, uap, retval, 0)); + return (getsockname1(p, uap, retval, 0)); } +#ifdef COMPAT_OLDSOCK int -compat_43_getpeername(p, uap, retval) +ogetsockname(p, uap, retval) struct proc *p; - struct getpeername_args /* { - syscallarg(int) fdes; - syscallarg(caddr_t) asa; - syscallarg(int *) alen; - } */ *uap; - register_t *retval; + struct getsockname_args *uap; + int *retval; { - return (getpeername1(p, uap, retval, 1)); + return (getsockname1(p, uap, retval, 1)); } -#else /* COMPAT_OLDSOCK */ - -#define getpeername1 getpeername -#endif +#endif /* COMPAT_OLDSOCK */ +/* + * Get name of peer for connected socket. 
+ */ /* ARGSUSED */ -int -getpeername1(p, uap, retval, compat_43) +static int +getpeername1(p, uap, retval, compat) struct proc *p; register struct getpeername_args /* { - syscallarg(int) fdes; - syscallarg(caddr_t) asa; - syscallarg(int *) alen; + int fdes; + caddr_t asa; + int *alen; } */ *uap; - register_t *retval; - int compat_43; + int *retval; + int compat; { struct file *fp; register struct socket *so; struct mbuf *m; int len, error; - if (error = getsock(p->p_fd, SCARG(uap, fdes), &fp)) + error = getsock(p->p_fd, uap->fdes, &fp); + if (error) return (error); so = (struct socket *)fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) return (ENOTCONN); - if (error = - copyin((caddr_t)SCARG(uap, alen), (caddr_t)&len, sizeof (len))) + error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); + if (error) return (error); m = m_getclr(M_WAIT, MT_SONAME); if (m == NULL) return (ENOBUFS); - if (error = (*so->so_proto->pr_usrreq)(so, PRU_PEERADDR, 0, m, 0)) + error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, m); + if (error) goto bad; if (len > m->m_len) len = m->m_len; #ifdef COMPAT_OLDSOCK - if (compat_43) + if (compat) mtod(m, struct osockaddr *)->sa_family = mtod(m, struct sockaddr *)->sa_family; #endif - if (error = - copyout(mtod(m, caddr_t), (caddr_t)SCARG(uap, asa), (u_int)len)) + error = copyout(mtod(m, caddr_t), (caddr_t)uap->asa, (u_int)len); + if (error) goto bad; - error = copyout((caddr_t)&len, (caddr_t)SCARG(uap, alen), sizeof (len)); + error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); bad: m_freem(m); return (error); } int +getpeername(p, uap, retval) + struct proc *p; + struct getpeername_args *uap; + int *retval; +{ + + return (getpeername1(p, uap, retval, 0)); +} + +#ifdef COMPAT_OLDSOCK +int +ogetpeername(p, uap, retval) + struct proc *p; + struct ogetpeername_args *uap; + int *retval; +{ + + /* XXX uap should have type `getpeername_args *' to begin with. 
*/ + return (getpeername1(p, (struct getpeername_args *)uap, retval, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +int sockargs(mp, buf, buflen, type) struct mbuf **mp; caddr_t buf; @@ -1228,21 +1269,21 @@ sockargs(mp, buf, buflen, type) return (ENOBUFS); m->m_len = buflen; error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); - if (error) { + if (error) (void) m_free(m); - return (error); - } - *mp = m; - if (type == MT_SONAME) { - sa = mtod(m, struct sockaddr *); + else { + *mp = m; + if (type == MT_SONAME) { + sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN - if (sa->sa_family == 0 && sa->sa_len < AF_MAX) - sa->sa_family = sa->sa_len; + if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + sa->sa_family = sa->sa_len; #endif - sa->sa_len = buflen; + sa->sa_len = buflen; + } } - return (0); + return (error); } int diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index c6bcbfd..0a47414 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -30,24 +30,29 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95 + * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 + * $Id: uipc_usrreq.c,v 1.21 1997/03/21 16:12:32 wpaul Exp $ */ #include <sys/param.h> +#include <sys/queue.h> #include <sys/systm.h> -#include <sys/proc.h> -#include <sys/filedesc.h> +#include <sys/kernel.h> #include <sys/domain.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/mbuf.h> +#include <sys/namei.h> +#include <sys/proc.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> -#include <sys/unpcb.h> +#include <sys/stat.h> +#include <sys/sysctl.h> #include <sys/un.h> -#include <sys/namei.h> +#include <sys/unpcb.h> #include <sys/vnode.h> -#include <sys/file.h> -#include <sys/stat.h> -#include <sys/mbuf.h> /* * Unix communications domain. 
@@ -57,8 +62,22 @@ * rethink name space problems * need a proper out-of-band */ -struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX }; -ino_t unp_ino; /* prototype for fake inode numbers */ +static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; +static ino_t unp_ino; /* prototype for fake inode numbers */ + +static int unp_attach __P((struct socket *)); +static void unp_detach __P((struct unpcb *)); +static int unp_bind __P((struct unpcb *,struct mbuf *, struct proc *)); +static int unp_connect __P((struct socket *,struct mbuf *, struct proc *)); +static void unp_disconnect __P((struct unpcb *)); +static void unp_shutdown __P((struct unpcb *)); +static void unp_drop __P((struct unpcb *, int)); +static void unp_gc __P((void)); +static void unp_scan __P((struct mbuf *, void (*)(struct file *))); +static void unp_mark __P((struct file *)); +static void unp_discard __P((struct file *)); +static int unp_internalize __P((struct mbuf *, struct proc *)); + /*ARGSUSED*/ int @@ -170,6 +189,7 @@ uipc_usrreq(so, req, m, nam, control) break; case PRU_SEND: + case PRU_SEND_EOF: if (control && (error = unp_internalize(control, p))) break; switch (so->so_type) { @@ -210,6 +230,22 @@ uipc_usrreq(so, req, m, nam, control) case SOCK_STREAM: #define rcv (&so2->so_rcv) #define snd (&so->so_snd) + /* Connect if not connected yet. */ + /* + * Note: A better implementation would complain + * if not equal to the peer's address. + */ + if ((so->so_state & SS_ISCONNECTED) == 0) { + if (nam) { + error = unp_connect(so, nam, p); + if (error) + break; /* XXX */ + } else { + error = ENOTCONN; + break; + } + } + if (so->so_state & SS_CANTSENDMORE) { error = EPIPE; break; @@ -241,6 +277,14 @@ uipc_usrreq(so, req, m, nam, control) default: panic("uipc 4"); } + /* + * SEND_EOF is equivalent to a SEND followed by + * a SHUTDOWN. 
+ */ + if (req == PRU_SEND_EOF) { + socantsendmore(so); + unp_shutdown(unp); + } break; case PRU_ABORT: @@ -306,22 +350,34 @@ release: * and don't really want to reserve the sendspace. Their recvspace should * be large enough for at least one max-size datagram plus address. */ -#define PIPSIZ 4096 -u_long unpst_sendspace = PIPSIZ; -u_long unpst_recvspace = PIPSIZ; -u_long unpdg_sendspace = 2*1024; /* really max datagram size */ -u_long unpdg_recvspace = 4*1024; - -int unp_rights; /* file descriptors in flight */ - -int +#ifndef PIPSIZ +#define PIPSIZ 8192 +#endif +static u_long unpst_sendspace = PIPSIZ; +static u_long unpst_recvspace = PIPSIZ; +static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ +static u_long unpdg_recvspace = 4*1024; + +static int unp_rights; /* file descriptors in flight */ + +SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, + &unpst_sendspace, 0, ""); +SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, + &unpst_recvspace, 0, ""); +SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, + &unpdg_sendspace, 0, ""); +SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, + &unpdg_recvspace, 0, ""); +SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); + +static int unp_attach(so) struct socket *so; { register struct mbuf *m; register struct unpcb *unp; int error; - + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { switch (so->so_type) { @@ -348,11 +404,11 @@ unp_attach(so) return (0); } -void +static void unp_detach(unp) register struct unpcb *unp; { - + if (unp->unp_vnode) { unp->unp_vnode->v_socket = 0; vrele(unp->unp_vnode); @@ -364,8 +420,6 @@ unp_detach(unp) unp_drop(unp->unp_refs, ECONNRESET); soisdisconnected(unp->unp_socket); unp->unp_socket->so_pcb = 0; - m_freem(unp->unp_addr); - (void) m_free(dtom(unp)); if (unp_rights) { /* * Normally the receive buffer is flushed later, @@ -377,9 +431,11 @@ unp_detach(unp) sorflush(unp->unp_socket); unp_gc(); } 
+ m_freem(unp->unp_addr); + (void) m_free(dtom(unp)); } -int +static int unp_bind(unp, nam, p) struct unpcb *unp; struct mbuf *nam; @@ -401,7 +457,8 @@ unp_bind(unp, nam, p) } else *(mtod(nam, caddr_t) + nam->m_len) = 0; /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ - if (error = namei(&nd)) + error = namei(&nd); + if (error) return (error); vp = nd.ni_vp; if (vp != NULL) { @@ -427,7 +484,7 @@ unp_bind(unp, nam, p) return (0); } -int +static int unp_connect(so, nam, p) struct socket *so; struct mbuf *nam; @@ -446,14 +503,16 @@ unp_connect(so, nam, p) return (EMSGSIZE); } else *(mtod(nam, caddr_t) + nam->m_len) = 0; - if (error = namei(&nd)) + error = namei(&nd); + if (error) return (error); vp = nd.ni_vp; if (vp->v_type != VSOCK) { error = ENOTSOCK; goto bad; } - if (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) + error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p); + if (error) goto bad; so2 = vp->v_socket; if (so2 == 0) { @@ -515,7 +574,7 @@ unp_connect2(so, so2) return (0); } -void +static void unp_disconnect(unp) struct unpcb *unp; { @@ -562,7 +621,7 @@ unp_abort(unp) } #endif -void +static void unp_shutdown(unp) struct unpcb *unp; { @@ -573,7 +632,7 @@ unp_shutdown(unp) socantrcvmore(so); } -void +static void unp_drop(unp, errno) struct unpcb *unp; int errno; @@ -591,6 +650,7 @@ unp_drop(unp, errno) } #ifdef notdef +void unp_drain() { @@ -609,6 +669,9 @@ unp_externalize(rights) int newfds = (cm->cmsg_len - sizeof(*cm)) / sizeof (int); int f; + /* + * if the new FD's will not fit, then we free them all + */ if (!fdavail(p, newfds)) { for (i = 0; i < newfds; i++) { fp = *rp; @@ -617,6 +680,12 @@ unp_externalize(rights) } return (EMSGSIZE); } + /* + * now change each pointer to an fd in the global table to + * an integer that is the index to the local fd table entry + * that we set up to point to the global one we are transferring. + * XXX this assumes a pointer and int are the same size...! 
+ */ for (i = 0; i < newfds; i++) { if (fdalloc(p, 0, &f)) panic("unp_externalize"); @@ -629,7 +698,11 @@ unp_externalize(rights) return (0); } -int +#ifndef MIN +#define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +static int unp_internalize(control, p) struct mbuf *control; struct proc *p; @@ -639,12 +712,34 @@ unp_internalize(control, p) register struct file **rp; register struct file *fp; register int i, fd; + register struct cmsgcred *cmcred; int oldfds; - if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET || - cm->cmsg_len != control->m_len) + if ((cm->cmsg_type != SCM_RIGHTS && cm->cmsg_type != SCM_CREDS) || + cm->cmsg_level != SOL_SOCKET || cm->cmsg_len != control->m_len) return (EINVAL); + + /* + * Fill in credential information. + */ + if (cm->cmsg_type == SCM_CREDS) { + cmcred = (struct cmsgcred *)(cm + 1); + cmcred->cmcred_pid = p->p_pid; + cmcred->cmcred_uid = p->p_cred->p_ruid; + cmcred->cmcred_gid = p->p_cred->p_rgid; + cmcred->cmcred_euid = p->p_ucred->cr_uid; + cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups, + CMGROUP_MAX); + for (i = 0; i < cmcred->cmcred_ngroups; i++) + cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i]; + return(0); + } + oldfds = (cm->cmsg_len - sizeof (*cm)) / sizeof (int); + /* + * check that all the FDs passed in refer to legal OPEN files + * If not, reject the entire operation. + */ rp = (struct file **)(cm + 1); for (i = 0; i < oldfds; i++) { fd = *(int *)rp++; @@ -652,6 +747,11 @@ unp_internalize(control, p) fdp->fd_ofiles[fd] == NULL) return (EBADF); } + /* + * Now replace the integer FDs with pointers to + * the associated global file table entry.. + * XXX this assumes a pointer and an int are the same size! 
+ */ rp = (struct file **)(cm + 1); for (i = 0; i < oldfds; i++) { fp = fdp->fd_ofiles[*(int *)rp]; @@ -663,10 +763,9 @@ unp_internalize(control, p) return (0); } -int unp_defer, unp_gcing; -extern struct domain unixdomain; +static int unp_defer, unp_gcing; -void +static void unp_gc() { register struct file *fp, *nextfp; @@ -678,26 +777,56 @@ unp_gc() return; unp_gcing = 1; unp_defer = 0; + /* + * before going through all this, set all FDs to + * be NOT defered and NOT externally accessible + */ for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) fp->f_flag &= ~(FMARK|FDEFER); do { for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) { + /* + * If the file is not open, skip it + */ if (fp->f_count == 0) continue; + /* + * If we already marked it as 'defer' in a + * previous pass, then try process it this time + * and un-mark it + */ if (fp->f_flag & FDEFER) { fp->f_flag &= ~FDEFER; unp_defer--; } else { + /* + * if it's not defered, then check if it's + * already marked.. if so skip it + */ if (fp->f_flag & FMARK) continue; + /* + * If all references are from messages + * in transit, then skip it. it's not + * externally accessible. + */ if (fp->f_count == fp->f_msgcount) continue; + /* + * If it got this far then it must be + * externally accessible. + */ fp->f_flag |= FMARK; } + /* + * either it was defered, or it is externally + * accessible and not already marked so. + * Now check if it is possibly one of OUR sockets. + */ if (fp->f_type != DTYPE_SOCKET || (so = (struct socket *)fp->f_data) == 0) continue; - if (so->so_proto->pr_domain != &unixdomain || + if (so->so_proto->pr_domain != &localdomain || (so->so_proto->pr_flags&PR_RIGHTS) == 0) continue; #ifdef notdef @@ -716,6 +845,13 @@ unp_gc() goto restart; } #endif + /* + * So, Ok, it's one of our sockets and it IS externally + * accessible (or was defered). Now we look + * to see if we hold any file descriptors in it's + * message buffers. 
Follow those links and mark them + * as accessible too. + */ unp_scan(so->so_rcv.sb_mb, unp_mark); } } while (unp_defer); @@ -762,18 +898,30 @@ unp_gc() for (nunref = 0, fp = filehead.lh_first, fpp = extra_ref; fp != 0; fp = nextfp) { nextfp = fp->f_list.le_next; + /* + * If it's not open, skip it + */ if (fp->f_count == 0) continue; + /* + * If all refs are from msgs, and it's not marked accessible + * then it must be referenced from some unreachable cycle + * of (shut-down) FDs, so include it in our + * list of FDs to remove + */ if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) { *fpp++ = fp; nunref++; fp->f_count++; } } + /* + * for each FD on our hit list, do the following two things + */ for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) sorflush((struct socket *)(*fpp)->f_data); for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) - closef(*fpp, (struct proc *)NULL); + closef(*fpp, (struct proc *) NULL); free((caddr_t)extra_ref, M_FILE); unp_gcing = 0; } @@ -787,7 +935,7 @@ unp_dispose(m) unp_scan(m, unp_discard); } -void +static void unp_scan(m0, op) register struct mbuf *m0; void (*op) __P((struct file *)); @@ -817,7 +965,7 @@ unp_scan(m0, op) } } -void +static void unp_mark(fp) struct file *fp; { @@ -828,7 +976,7 @@ unp_mark(fp) fp->f_flag |= (FMARK|FDEFER); } -void +static void unp_discard(fp) struct file *fp; { diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index ec5c962..494a53d 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1,181 +1,377 @@ -/*- - * Copyright (c) 1982, 1986, 1989, 1993 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. - * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. +/* + * Copyright (c) 1994 John S. Dyson + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. This work was done expressly for inclusion into FreeBSD. Other use + * is allowed if this notation is included. + * 5. Modifications may be freely made to this file if the above conditions + * are met. * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * $Id$ + */ + +/* + * this file contains a new buffer I/O scheme implementing a coherent + * VM object and buffer cache scheme. Pains have been taken to make + * sure that the performance degradation associated with schemes such + * as this is not realized. * - * from: @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 + * Author: John S. Dyson + * Significant help during the development and debugging phases + * had been provided by David Greenman, also of the FreeBSD core team. 
*/ +#include "opt_bounce.h" + +#define VMIO #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/proc.h> -#include <sys/buf.h> #include <sys/vnode.h> +#include <sys/vmmeter.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <vm/vm_kern.h> +#include <vm/vm_pageout.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/lock.h> +#include <vm/vm_map.h> +#include <sys/buf.h> #include <sys/mount.h> -#include <sys/trace.h> #include <sys/malloc.h> #include <sys/resourcevar.h> +#include <sys/proc.h> + +#include <miscfs/specfs/specdev.h> + +static void vfs_update __P((void)); +static struct proc *updateproc; +static struct kproc_desc up_kp = { + "update", + vfs_update, + &updateproc +}; +SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) + +struct buf *buf; /* buffer header pool */ +struct swqueue bswlist; + +int count_lock_queue __P((void)); +static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, + vm_offset_t to); +static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, + vm_offset_t to); +static void vfs_clean_pages(struct buf * bp); +static void vfs_setdirty(struct buf *bp); +static void vfs_vmio_release(struct buf *bp); + +int needsbuffer; /* - * Definitions for the buffer hash lists. + * Internal update daemon, process 3 + * The variable vfs_update_wakeup allows for internal syncs. */ -#define BUFHASH(dvp, lbn) \ - (&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash]) -LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; -u_long bufhash; +int vfs_update_wakeup; + /* - * Insq/Remq for the buffer hash lists. + * buffers base kva */ -#define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash) -#define bremhash(bp) LIST_REMOVE(bp, b_hash) /* - * Definitions for the buffer free lists. 
+ * bogus page -- for I/O to/from partially complete buffers + * this is a temporary solution to the problem, but it is not + * really that bad. it would be better to split the buffer + * for input in the case of buffers partially already in memory, + * but the code is intricate enough already. */ -#define BQUEUES 4 /* number of free buffer queues */ +vm_page_t bogus_page; +static vm_offset_t bogus_offset; -#define BQ_LOCKED 0 /* super-blocks &c */ -#define BQ_LRU 1 /* lru, useful buffers */ -#define BQ_AGE 2 /* rubbish */ -#define BQ_EMPTY 3 /* buffer headers with no memory */ +static int bufspace, maxbufspace, vmiospace, maxvmiobufspace, + bufmallocspace, maxbufmallocspace; -TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES]; -int needbuffer; +static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash; +static struct bqueues bufqueues[BUFFER_QUEUES]; -/* - * Insq/Remq for the buffer free lists. - */ -#define binsheadfree(bp, dp) TAILQ_INSERT_HEAD(dp, bp, b_freelist) -#define binstailfree(bp, dp) TAILQ_INSERT_TAIL(dp, bp, b_freelist) +extern int vm_swap_size; -void -bremfree(bp) - struct buf *bp; -{ - struct bqueues *dp = NULL; - - /* - * We only calculate the head of the freelist when removing - * the last element of the list as that is the only time that - * it is needed (e.g. to reset the tail pointer). - * - * NB: This makes an assumption about how tailq's are implemented. - */ - if (bp->b_freelist.tqe_next == NULL) { - for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) - if (dp->tqh_last == &bp->b_freelist.tqe_next) - break; - if (dp == &bufqueues[BQUEUES]) - panic("bremfree: lost tail"); - } - TAILQ_REMOVE(dp, bp, b_freelist); -} +#define BUF_MAXUSE 16 /* - * Initialize buffers and hash links for buffers. + * Initialize buffer headers and related structures. 
*/ void bufinit() { - register struct buf *bp; - struct bqueues *dp; - register int i; - int base, residual; - - for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) - TAILQ_INIT(dp); - bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash); - base = bufpages / nbuf; - residual = bufpages % nbuf; + struct buf *bp; + int i; + + TAILQ_INIT(&bswlist); + LIST_INIT(&invalhash); + + /* first, make a null hash table */ + for (i = 0; i < BUFHSZ; i++) + LIST_INIT(&bufhashtbl[i]); + + /* next, make a null set of free lists */ + for (i = 0; i < BUFFER_QUEUES; i++) + TAILQ_INIT(&bufqueues[i]); + + /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; - bzero((char *)bp, sizeof *bp); + bzero(bp, sizeof *bp); + bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; + bp->b_qindex = QUEUE_EMPTY; bp->b_vnbufs.le_next = NOLIST; - bp->b_data = buffers + i * MAXBSIZE; - if (i < residual) - bp->b_bufsize = (base + 1) * CLBYTES; - else - bp->b_bufsize = base * CLBYTES; - bp->b_flags = B_INVAL; - dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY]; - binsheadfree(bp, dp); - binshash(bp, &invalhash); + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); } +/* + * maxbufspace is currently calculated to support all filesystem blocks + * to be 8K. If you happen to use a 16K filesystem, the size of the buffer + * cache is still the same as it would be for 8K filesystems. This + * keeps the size of the buffer cache "in check" for big block filesystems. + */ + maxbufspace = (nbuf + 8) * DFLTBSIZE; +/* + * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed + */ + maxvmiobufspace = 2 * maxbufspace / 3; +/* + * Limit the amount of malloc memory since it is wired permanently into + * the kernel space. 
Even though this is accounted for in the buffer + * allocation, we don't want the malloced region to grow uncontrolled. + * The malloc scheme improves memory utilization significantly on average + * (small) directories. + */ + maxbufmallocspace = maxbufspace / 20; + + bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); + bogus_page = vm_page_alloc(kernel_object, + ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), + VM_ALLOC_NORMAL); + } -bread(a1, a2, a3, a4, a5) - struct vnode *a1; - daddr_t a2; - int a3; - struct ucred *a4; - struct buf **a5; +/* + * Free the kva allocation for a buffer + * Must be called only at splbio or higher, + * as this is the only locking for buffer_map. + */ +static void +bfreekva(struct buf * bp) { + if (bp->b_kvasize == 0) + return; + + vm_map_delete(buffer_map, + (vm_offset_t) bp->b_kvabase, + (vm_offset_t) bp->b_kvabase + bp->b_kvasize); + + bp->b_kvasize = 0; - /* - * Body deleted. - */ - return (EIO); } -breadn(a1, a2, a3, a4, a5, a6, a7, a8) - struct vnode *a1; - daddr_t a2; int a3; - daddr_t a4[]; int a5[]; - int a6; - struct ucred *a7; - struct buf **a8; +/* + * remove the buffer from the appropriate free list + */ +void +bremfree(struct buf * bp) { + int s = splbio(); - /* - * Body deleted. - */ - return (EIO); + if (bp->b_qindex != QUEUE_NONE) { + TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); + bp->b_qindex = QUEUE_NONE; + } else { + panic("bremfree: removing a buffer when not on a queue"); + } + splx(s); } -bwrite(a1) - struct buf *a1; +/* + * Get a buffer with the specified data. Look in the cache first. 
+ */ +int +bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, + struct buf ** bpp) { + struct buf *bp; + + bp = getblk(vp, blkno, size, 0, 0); + *bpp = bp; + + /* if not found in cache, do some I/O */ + if ((bp->b_flags & B_CACHE) == 0) { + if (curproc != NULL) + curproc->p_stats->p_ru.ru_inblock++; + bp->b_flags |= B_READ; + bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + if (bp->b_rcred == NOCRED) { + if (cred != NOCRED) + crhold(cred); + bp->b_rcred = cred; + } + vfs_busy_pages(bp, 0); + VOP_STRATEGY(bp); + return (biowait(bp)); + } + return (0); +} + +/* + * Operates like bread, but also starts asynchronous I/O on + * read-ahead blocks. + */ +int +breadn(struct vnode * vp, daddr_t blkno, int size, + daddr_t * rablkno, int *rabsize, + int cnt, struct ucred * cred, struct buf ** bpp) +{ + struct buf *bp, *rabp; + int i; + int rv = 0, readwait = 0; + + *bpp = bp = getblk(vp, blkno, size, 0, 0); + + /* if not found in cache, do some I/O */ + if ((bp->b_flags & B_CACHE) == 0) { + if (curproc != NULL) + curproc->p_stats->p_ru.ru_inblock++; + bp->b_flags |= B_READ; + bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + if (bp->b_rcred == NOCRED) { + if (cred != NOCRED) + crhold(cred); + bp->b_rcred = cred; + } + vfs_busy_pages(bp, 0); + VOP_STRATEGY(bp); + ++readwait; + } + for (i = 0; i < cnt; i++, rablkno++, rabsize++) { + if (inmem(vp, *rablkno)) + continue; + rabp = getblk(vp, *rablkno, *rabsize, 0, 0); + + if ((rabp->b_flags & B_CACHE) == 0) { + if (curproc != NULL) + curproc->p_stats->p_ru.ru_inblock++; + rabp->b_flags |= B_READ | B_ASYNC; + rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + if (rabp->b_rcred == NOCRED) { + if (cred != NOCRED) + crhold(cred); + rabp->b_rcred = cred; + } + vfs_busy_pages(rabp, 0); + VOP_STRATEGY(rabp); + } else { + brelse(rabp); + } + } + + if (readwait) { + rv = biowait(bp); + } + return (rv); +} + +/* + * Write, release buffer on completion. (Done by iodone + * if async.) 
+ */ +int +bwrite(struct buf * bp) +{ + int oldflags = bp->b_flags; + + if (bp->b_flags & B_INVAL) { + brelse(bp); + return (0); + } + if (!(bp->b_flags & B_BUSY)) + panic("bwrite: buffer is not busy???"); + + bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); + bp->b_flags |= B_WRITEINPROG; + + if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) { + reassignbuf(bp, bp->b_vp); + } + + bp->b_vp->v_numoutput++; + vfs_busy_pages(bp, 1); + if (curproc != NULL) + curproc->p_stats->p_ru.ru_oublock++; + VOP_STRATEGY(bp); /* - * Body deleted. + * Handle ordered writes here. + * If the write was originally flagged as ordered, + * then we check to see if it was converted to async. + * If it was converted to async, and is done now, then + * we release the buffer. Otherwise we clear the + * ordered flag because it is not needed anymore. + * + * Note that biodone has been modified so that it does + * not release ordered buffers. This allows us to have + * a chance to determine whether or not the driver + * has set the async flag in the strategy routine. Otherwise + * if biodone was not modified, then the buffer may have been + * reused before we have had a chance to check the flag. */ - return (EIO); + + if ((oldflags & B_ORDERED) == B_ORDERED) { + int s; + s = splbio(); + if (bp->b_flags & B_ASYNC) { + if ((bp->b_flags & B_DONE)) { + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) + brelse(bp); + else + bqrelse(bp); + } + splx(s); + return (0); + } else { + bp->b_flags &= ~B_ORDERED; + } + splx(s); + } + + if ((oldflags & B_ASYNC) == 0) { + int rtval = biowait(bp); + + if (oldflags & B_DELWRI) { + reassignbuf(bp, bp->b_vp); + } + brelse(bp); + return (rtval); + } + return (0); } int @@ -185,155 +381,1566 @@ vn_bwrite(ap) return (bwrite(ap->a_bp)); } -bdwrite(a1) - struct buf *a1; +/* + * Delayed write. (Buffer is marked dirty). 
+ */ +void +bdwrite(struct buf * bp) { + if ((bp->b_flags & B_BUSY) == 0) { + panic("bdwrite: buffer is not busy"); + } + if (bp->b_flags & B_INVAL) { + brelse(bp); + return; + } + if (bp->b_flags & B_TAPE) { + bawrite(bp); + return; + } + bp->b_flags &= ~(B_READ|B_RELBUF); + if ((bp->b_flags & B_DELWRI) == 0) { + bp->b_flags |= B_DONE | B_DELWRI; + reassignbuf(bp, bp->b_vp); + } + /* - * Body deleted. + * This bmap keeps the system from needing to do the bmap later, + * perhaps when the system is attempting to do a sync. Since it + * is likely that the indirect block -- or whatever other datastructure + * that the filesystem needs is still in memory now, it is a good + * thing to do this. Note also, that if the pageout daemon is + * requesting a sync -- there might not be enough memory to do + * the bmap then... So, this is important to do. */ - return; -} + if( bp->b_lblkno == bp->b_blkno) { + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); + } -bawrite(a1) - struct buf *a1; -{ + /* + * Set the *dirty* buffer range based upon the VM system dirty pages. + */ + vfs_setdirty(bp); /* - * Body deleted. + * We need to do this here to satisfy the vnode_pager and the + * pageout daemon, so that it thinks that the pages have been + * "cleaned". Note that since the pages are in a delayed write + * buffer -- the VFS layer "will" see that the pages get written + * out on the next sync, or perhaps the cluster will be completed. */ + vfs_clean_pages(bp); + bqrelse(bp); return; } -brelse(a1) - struct buf *a1; +/* + * Asynchronous write. + * Start output on a buffer, but do not wait for it to complete. + * The buffer is released when the output completes. + */ +void +bawrite(struct buf * bp) +{ + bp->b_flags |= B_ASYNC; + (void) VOP_BWRITE(bp); +} + +/* + * Ordered write. + * Start output on a buffer, but only wait for it to complete if the + * output device cannot guarantee ordering in some other way. 
Devices + * that can perform asynchronous ordered writes will set the B_ASYNC + * flag in their strategy routine. + * The buffer is released when the output completes. + */ +int +bowrite(struct buf * bp) { + bp->b_flags |= B_ORDERED; + return (VOP_BWRITE(bp)); +} + +/* + * Release a buffer. + */ +void +brelse(struct buf * bp) +{ + int s; + + if (bp->b_flags & B_CLUSTER) { + relpbuf(bp); + return; + } + /* anyone need a "free" block? */ + s = splbio(); + + /* anyone need this block? */ + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~(B_WANTED | B_AGE); + wakeup(bp); + } + + if (bp->b_flags & B_LOCKED) + bp->b_flags &= ~B_ERROR; + + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) || + (bp->b_bufsize <= 0)) { + bp->b_flags |= B_INVAL; + bp->b_flags &= ~(B_DELWRI | B_CACHE); + if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) { + if (bp->b_bufsize) + allocbuf(bp, 0); + brelvp(bp); + } + } /* - * Body deleted. + * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer + * constituted, so the B_INVAL flag is used to *invalidate* the buffer, + * but the VM object is kept around. The B_NOCACHE flag is used to + * invalidate the pages in the VM object. 
*/ - return; + if (bp->b_flags & B_VMIO) { + vm_ooffset_t foff; + vm_object_t obj; + int i, resid; + vm_page_t m; + struct vnode *vp; + int iototal = bp->b_bufsize; + + vp = bp->b_vp; + if (!vp) + panic("brelse: missing vp"); + + if (bp->b_npages) { + vm_pindex_t poff; + obj = (vm_object_t) vp->v_object; + if (vp->v_type == VBLK) + foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT; + else + foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; + poff = OFF_TO_IDX(foff); + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + if (m == bogus_page) { + m = vm_page_lookup(obj, poff + i); + if (!m) { + panic("brelse: page missing\n"); + } + bp->b_pages[i] = m; + pmap_qenter(trunc_page(bp->b_data), + bp->b_pages, bp->b_npages); + } + resid = IDX_TO_OFF(m->pindex+1) - foff; + if (resid > iototal) + resid = iototal; + if (resid > 0) { + /* + * Don't invalidate the page if the local machine has already + * modified it. This is the lesser of two evils, and should + * be fixed. 
+ */ + if (bp->b_flags & (B_NOCACHE | B_ERROR)) { + vm_page_test_dirty(m); + if (m->dirty == 0) { + vm_page_set_invalid(m, (vm_offset_t) foff, resid); + if (m->valid == 0) + vm_page_protect(m, VM_PROT_NONE); + } + } + if (resid >= PAGE_SIZE) { + if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { + bp->b_flags |= B_INVAL; + } + } else { + if (!vm_page_is_valid(m, + (((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) { + bp->b_flags |= B_INVAL; + } + } + } + foff += resid; + iototal -= resid; + } + } + if (bp->b_flags & (B_INVAL | B_RELBUF)) + vfs_vmio_release(bp); + } + if (bp->b_qindex != QUEUE_NONE) + panic("brelse: free buffer onto another queue???"); + + /* enqueue */ + /* buffers with no memory */ + if (bp->b_bufsize == 0) { + bp->b_qindex = QUEUE_EMPTY; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + bp->b_dev = NODEV; + /* + * Get rid of the kva allocation *now* + */ + bfreekva(bp); + if (needsbuffer) { + wakeup(&needsbuffer); + needsbuffer=0; + } + /* buffers with junk contents */ + } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { + bp->b_qindex = QUEUE_AGE; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + bp->b_dev = NODEV; + if (needsbuffer) { + wakeup(&needsbuffer); + needsbuffer=0; + } + /* buffers that are locked */ + } else if (bp->b_flags & B_LOCKED) { + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + /* buffers with stale but valid contents */ + } else if (bp->b_flags & B_AGE) { + bp->b_qindex = QUEUE_AGE; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist); + if (needsbuffer) { + wakeup(&needsbuffer); + needsbuffer=0; + } + /* buffers with valid and quite potentially reuseable contents */ + } else { + bp->b_qindex = QUEUE_LRU; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + if (needsbuffer) 
{ + wakeup(&needsbuffer); + needsbuffer=0; + } + } + + /* unlock */ + bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY | + B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + splx(s); } +/* + * Release a buffer. + */ +void +bqrelse(struct buf * bp) +{ + int s; + + s = splbio(); + + + /* anyone need this block? */ + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~(B_WANTED | B_AGE); + wakeup(bp); + } + + if (bp->b_qindex != QUEUE_NONE) + panic("bqrelse: free buffer onto another queue???"); + + if (bp->b_flags & B_LOCKED) { + bp->b_flags &= ~B_ERROR; + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + /* buffers with stale but valid contents */ + } else { + bp->b_qindex = QUEUE_LRU; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + if (needsbuffer) { + wakeup(&needsbuffer); + needsbuffer=0; + } + } + + /* unlock */ + bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY | + B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + splx(s); +} + +static void +vfs_vmio_release(bp) + struct buf *bp; +{ + int i; + vm_page_t m; + + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + bp->b_pages[i] = NULL; + vm_page_unwire(m); + /* + * We don't mess with busy pages, it is + * the responsibility of the process that + * busied the pages to deal with them. + */ + if ((m->flags & PG_BUSY) || (m->busy != 0)) + continue; + + if (m->wire_count == 0) { + + if (m->flags & PG_WANTED) { + m->flags &= ~PG_WANTED; + wakeup(m); + } + + /* + * If this is an async free -- we cannot place + * pages onto the cache queue, so our policy for + * such buffers is to avoid the cache queue, and + * only modify the active queue or free queue. + */ + if ((bp->b_flags & B_ASYNC) == 0) { + + /* + * In the case of sync buffer frees, we can do pretty much + * anything to any of the memory queues. Specifically, + * the cache queue is free to be modified. 
+ */ + if (m->valid) { + if(m->dirty == 0) + vm_page_test_dirty(m); + /* + * this keeps pressure off of the process memory + */ + if ((vm_swap_size == 0) || + (cnt.v_free_count < cnt.v_free_min)) { + if ((m->dirty == 0) && + (m->hold_count == 0)) + vm_page_cache(m); + else + vm_page_deactivate(m); + } + } else if (m->hold_count == 0) { + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + } + } else { + /* + * If async, then at least we clear the + * act_count. + */ + m->act_count = 0; + } + } + } + bufspace -= bp->b_bufsize; + vmiospace -= bp->b_bufsize; + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + bp->b_npages = 0; + bp->b_bufsize = 0; + bp->b_flags &= ~B_VMIO; + if (bp->b_vp) + brelvp(bp); +} + +/* + * Check to see if a block is currently memory resident. + */ struct buf * -incore(a1, a2) - struct vnode *a1; - daddr_t a2; +gbincore(struct vnode * vp, daddr_t blkno) { + struct buf *bp; + struct bufhashhdr *bh; + bh = BUFHASH(vp, blkno); + bp = bh->lh_first; + + /* Search hash chain */ + while (bp != NULL) { + /* hit */ + if (bp->b_vp == vp && bp->b_lblkno == blkno && + (bp->b_flags & B_INVAL) == 0) { + break; + } + bp = bp->b_hash.le_next; + } + return (bp); +} + +/* + * this routine implements clustered async writes for + * clearing out B_DELWRI buffers... This is much better + * than the old way of writing only one buffer at a time. + */ +int +vfs_bio_awrite(struct buf * bp) +{ + int i; + daddr_t lblkno = bp->b_lblkno; + struct vnode *vp = bp->b_vp; + int s; + int ncl; + struct buf *bpa; + int nwritten; + + s = splbio(); /* - * Body deleted. 
+ * right now we support clustered writing only to regular files */ - return (0); + if ((vp->v_type == VREG) && + (vp->v_mount != 0) && /* Only on nodes that have the size info */ + (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { + int size; + int maxcl; + + size = vp->v_mount->mnt_stat.f_iosize; + maxcl = MAXPHYS / size; + + for (i = 1; i < maxcl; i++) { + if ((bpa = gbincore(vp, lblkno + i)) && + ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) == + (B_DELWRI | B_CLUSTEROK)) && + (bpa->b_bufsize == size)) { + if ((bpa->b_blkno == bpa->b_lblkno) || + (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT))) + break; + } else { + break; + } + } + ncl = i; + /* + * this is a possible cluster write + */ + if (ncl != 1) { + nwritten = cluster_wbuild(vp, size, lblkno, ncl); + splx(s); + return nwritten; + } + } + bremfree(bp); + splx(s); + /* + * default (old) behavior, writing out only one block + */ + bp->b_flags |= B_BUSY | B_ASYNC; + nwritten = bp->b_bufsize; + (void) VOP_BWRITE(bp); + return nwritten; } -struct buf * -getblk(a1, a2, a3, a4, a5) - struct vnode *a1; - daddr_t a2; - int a3, a4, a5; + +/* + * Find a buffer header which is available for use. + */ +static struct buf * +getnewbuf(int slpflag, int slptimeo, int size, int maxsize) { + struct buf *bp; + int nbyteswritten = 0; + vm_offset_t addr; + +start: + if (bufspace >= maxbufspace) + goto trytofreespace; + + /* can we constitute a new buffer? */ + if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) { + if (bp->b_qindex != QUEUE_EMPTY) + panic("getnewbuf: inconsistent EMPTY queue, qindex=%d", + bp->b_qindex); + bp->b_flags |= B_BUSY; + bremfree(bp); + goto fillbuf; + } +trytofreespace: + /* + * We keep the file I/O from hogging metadata I/O + * This is desirable because file data is cached in the + * VM/Buffer cache even if a buffer is freed. 
+ */ + if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) { + if (bp->b_qindex != QUEUE_AGE) + panic("getnewbuf: inconsistent AGE queue, qindex=%d", + bp->b_qindex); + } else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) { + if (bp->b_qindex != QUEUE_LRU) + panic("getnewbuf: inconsistent LRU queue, qindex=%d", + bp->b_qindex); + } + if (!bp) { + /* wait for a free buffer of any kind */ + needsbuffer = 1; + tsleep(&needsbuffer, + (PRIBIO + 1) | slpflag, "newbuf", slptimeo); + return (0); + } + +#if defined(DIAGNOSTIC) + if (bp->b_flags & B_BUSY) { + panic("getnewbuf: busy buffer on free list\n"); + } +#endif /* - * Body deleted. + * We are fairly aggressive about freeing VMIO buffers, but since + * the buffering is intact without buffer headers, there is not + * much loss. We gain by maintaining non-VMIOed metadata in buffers. */ - return ((struct buf *)0); + if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) { + if ((bp->b_flags & B_VMIO) == 0 || + (vmiospace < maxvmiobufspace)) { + --bp->b_usecount; + TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); + if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) { + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + goto start; + } + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + } + } + + /* if we are a delayed write, convert to an async write */ + if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { + nbyteswritten += vfs_bio_awrite(bp); + if (!slpflag && !slptimeo) { + return (0); + } + goto start; + } + + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~B_WANTED; + wakeup(bp); + } + bremfree(bp); + bp->b_flags |= B_BUSY; + + if (bp->b_flags & B_VMIO) { + bp->b_flags &= ~B_ASYNC; + vfs_vmio_release(bp); + } + + if (bp->b_vp) + brelvp(bp); + +fillbuf: + /* we are not free, nor do we contain interesting data */ + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + + LIST_REMOVE(bp, 
b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + if (bp->b_bufsize) { + allocbuf(bp, 0); + } + bp->b_flags = B_BUSY; + bp->b_dev = NODEV; + bp->b_vp = NULL; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_iodone = 0; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_npages = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_validoff = bp->b_validend = 0; + bp->b_usecount = 4; + + maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK; + + /* + * we assume that buffer_map is not at address 0 + */ + addr = 0; + if (maxsize != bp->b_kvasize) { + bfreekva(bp); + + /* + * See if we have buffer kva space + */ + if (vm_map_findspace(buffer_map, + vm_map_min(buffer_map), maxsize, &addr)) { + bp->b_flags |= B_INVAL; + brelse(bp); + goto trytofreespace; + } + } + + /* + * See if we are below are allocated minimum + */ + if (bufspace >= (maxbufspace + nbyteswritten)) { + bp->b_flags |= B_INVAL; + brelse(bp); + goto trytofreespace; + } + + /* + * create a map entry for the buffer -- in essence + * reserving the kva space. + */ + if (addr) { + vm_map_insert(buffer_map, NULL, 0, + addr, addr + maxsize, + VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); + + bp->b_kvabase = (caddr_t) addr; + bp->b_kvasize = maxsize; + } + bp->b_data = bp->b_kvabase; + + return (bp); } +/* + * Check to see if a block is currently memory resident. + */ struct buf * -geteblk(a1) - int a1; +incore(struct vnode * vp, daddr_t blkno) { + struct buf *bp; - /* - * Body deleted. - */ - return ((struct buf *)0); + int s = splbio(); + bp = gbincore(vp, blkno); + splx(s); + return (bp); } -allocbuf(a1, a2) - struct buf *a1; - int a2; +/* + * Returns true if no I/O is needed to access the + * associated VM object. This is like incore except + * it also hunts around in the VM system for the data. 
+ */ + +int +inmem(struct vnode * vp, daddr_t blkno) { + vm_object_t obj; + vm_offset_t toff, tinc; + vm_page_t m; + vm_ooffset_t off; + + if (incore(vp, blkno)) + return 1; + if (vp->v_mount == NULL) + return 0; + if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0) + return 0; + + obj = vp->v_object; + tinc = PAGE_SIZE; + if (tinc > vp->v_mount->mnt_stat.f_iosize) + tinc = vp->v_mount->mnt_stat.f_iosize; + off = blkno * vp->v_mount->mnt_stat.f_iosize; + + for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { + + m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); + if (!m) + return 0; + if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0) + return 0; + } + return 1; +} +/* + * now we set the dirty range for the buffer -- + * for NFS -- if the file is mapped and pages have + * been written to, let it know. We want the + * entire range of the buffer to be marked dirty if + * any of the pages have been written to for consistancy + * with the b_validoff, b_validend set in the nfs write + * code, and used by the nfs read code. + */ +static void +vfs_setdirty(struct buf *bp) { + int i; + vm_object_t object; + vm_offset_t boffset, offset; /* - * Body deleted. + * We qualify the scan for modified pages on whether the + * object has been flushed yet. The OBJ_WRITEABLE flag + * is not cleared simply by protecting pages off. */ - return (0); + if ((bp->b_flags & B_VMIO) && + ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) { + /* + * test the pages to see if they have been modified directly + * by users through the VM system. 
+ */ + for (i = 0; i < bp->b_npages; i++) + vm_page_test_dirty(bp->b_pages[i]); + + /* + * scan forwards for the first page modified + */ + for (i = 0; i < bp->b_npages; i++) { + if (bp->b_pages[i]->dirty) { + break; + } + } + boffset = (i << PAGE_SHIFT); + if (boffset < bp->b_dirtyoff) { + bp->b_dirtyoff = boffset; + } + + /* + * scan backwards for the last page modified + */ + for (i = bp->b_npages - 1; i >= 0; --i) { + if (bp->b_pages[i]->dirty) { + break; + } + } + boffset = (i + 1); + offset = boffset + bp->b_pages[0]->pindex; + if (offset >= object->size) + boffset = object->size - bp->b_pages[0]->pindex; + if (bp->b_dirtyend < (boffset << PAGE_SHIFT)) + bp->b_dirtyend = (boffset << PAGE_SHIFT); + } } +/* + * Get a block given a specified block and offset into a file/device. + */ struct buf * -getnewbuf(a1, a2) - int a1, a2; +getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { + struct buf *bp; + int s; + struct bufhashhdr *bh; + int maxsize; - /* - * Body deleted. - */ - return ((struct buf *)0); + if (vp->v_mount) { + maxsize = vp->v_mount->mnt_stat.f_iosize; + /* + * This happens on mount points. + */ + if (maxsize < size) + maxsize = size; + } else { + maxsize = size; + } + + if (size > MAXBSIZE) + panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); + + s = splbio(); +loop: + if ((bp = gbincore(vp, blkno))) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + if (bp->b_usecount < BUF_MAXUSE) + ++bp->b_usecount; + if (!tsleep(bp, + (PRIBIO + 1) | slpflag, "getblk", slptimeo)) + goto loop; + + splx(s); + return (struct buf *) NULL; + } + bp->b_flags |= B_BUSY | B_CACHE; + bremfree(bp); + + /* + * check for size inconsistancies (note that they shouldn't happen + * but do when filesystems don't handle the size changes correctly.) + * We are conservative on metadata and don't just extend the buffer + * but write and re-constitute it. 
+ */ + + if (bp->b_bcount != size) { + if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) { + allocbuf(bp, size); + } else { + bp->b_flags |= B_NOCACHE; + VOP_BWRITE(bp); + goto loop; + } + } + + if (bp->b_usecount < BUF_MAXUSE) + ++bp->b_usecount; + splx(s); + return (bp); + } else { + vm_object_t obj; + + if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == 0) { + if (slpflag || slptimeo) { + splx(s); + return NULL; + } + goto loop; + } + + /* + * This code is used to make sure that a buffer is not + * created while the getnewbuf routine is blocked. + * Normally the vnode is locked so this isn't a problem. + * VBLK type I/O requests, however, don't lock the vnode. + */ + if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) { + bp->b_flags |= B_INVAL; + brelse(bp); + goto loop; + } + + /* + * Insert the buffer into the hash, so that it can + * be found by incore. + */ + bp->b_blkno = bp->b_lblkno = blkno; + bgetvp(vp, bp); + LIST_REMOVE(bp, b_hash); + bh = BUFHASH(vp, blkno); + LIST_INSERT_HEAD(bh, bp, b_hash); + + if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) { + bp->b_flags |= (B_VMIO | B_CACHE); +#if defined(VFS_BIO_DEBUG) + if (vp->v_type != VREG && vp->v_type != VBLK) + printf("getblk: vmioing file type %d???\n", vp->v_type); +#endif + } else { + bp->b_flags &= ~B_VMIO; + } + splx(s); + + allocbuf(bp, size); +#ifdef PC98 + /* + * 1024byte/sector support + */ +#define B_XXX2 0x8000000 + if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2; +#endif + return (bp); + } } -biowait(a1) - struct buf *a1; +/* + * Get an empty, disassociated buffer of given size. + */ +struct buf * +geteblk(int size) { + struct buf *bp; + int s; - /* - * Body deleted. 
- */ - return (EIO); + s = splbio(); + while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0); + splx(s); + allocbuf(bp, size); + bp->b_flags |= B_INVAL; + return (bp); } + +/* + * This code constitutes the buffer memory from either anonymous system + * memory (in the case of non-VMIO operations) or from an associated + * VM object (in the case of VMIO operations). + * + * Note that this code is tricky, and has many complications to resolve + * deadlock or inconsistant data situations. Tread lightly!!! + * + * Modify the length of a buffer's underlying buffer storage without + * destroying information (unless, of course the buffer is shrinking). + */ +int +allocbuf(struct buf * bp, int size) +{ + + int s; + int newbsize, mbsize; + int i; + + if (!(bp->b_flags & B_BUSY)) + panic("allocbuf: buffer not busy"); + + if (bp->b_kvasize < size) + panic("allocbuf: buffer too small"); + + if ((bp->b_flags & B_VMIO) == 0) { + caddr_t origbuf; + int origbufsize; + /* + * Just get anonymous memory from the kernel + */ + mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); +#if !defined(NO_B_MALLOC) + if (bp->b_flags & B_MALLOC) + newbsize = mbsize; + else +#endif + newbsize = round_page(size); + + if (newbsize < bp->b_bufsize) { +#if !defined(NO_B_MALLOC) + /* + * malloced buffers are not shrunk + */ + if (bp->b_flags & B_MALLOC) { + if (newbsize) { + bp->b_bcount = size; + } else { + free(bp->b_data, M_BIOBUF); + bufspace -= bp->b_bufsize; + bufmallocspace -= bp->b_bufsize; + bp->b_data = bp->b_kvabase; + bp->b_bufsize = 0; + bp->b_bcount = 0; + bp->b_flags &= ~B_MALLOC; + } + return 1; + } +#endif + vm_hold_free_pages( + bp, + (vm_offset_t) bp->b_data + newbsize, + (vm_offset_t) bp->b_data + bp->b_bufsize); + } else if (newbsize > bp->b_bufsize) { +#if !defined(NO_B_MALLOC) + /* + * We only use malloced memory on the first allocation. + * and revert to page-allocated memory when the buffer grows. 
+ */ + if ( (bufmallocspace < maxbufmallocspace) && + (bp->b_bufsize == 0) && + (mbsize <= PAGE_SIZE/2)) { + + bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); + bp->b_bufsize = mbsize; + bp->b_bcount = size; + bp->b_flags |= B_MALLOC; + bufspace += mbsize; + bufmallocspace += mbsize; + return 1; + } +#endif + origbuf = NULL; + origbufsize = 0; +#if !defined(NO_B_MALLOC) + /* + * If the buffer is growing on it's other-than-first allocation, + * then we revert to the page-allocation scheme. + */ + if (bp->b_flags & B_MALLOC) { + origbuf = bp->b_data; + origbufsize = bp->b_bufsize; + bp->b_data = bp->b_kvabase; + bufspace -= bp->b_bufsize; + bufmallocspace -= bp->b_bufsize; + bp->b_bufsize = 0; + bp->b_flags &= ~B_MALLOC; + newbsize = round_page(newbsize); + } +#endif + vm_hold_load_pages( + bp, + (vm_offset_t) bp->b_data + bp->b_bufsize, + (vm_offset_t) bp->b_data + newbsize); +#if !defined(NO_B_MALLOC) + if (origbuf) { + bcopy(origbuf, bp->b_data, origbufsize); + free(origbuf, M_BIOBUF); + } +#endif + } + } else { + vm_page_t m; + int desiredpages; + + newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + desiredpages = (round_page(newbsize) >> PAGE_SHIFT); + +#if !defined(NO_B_MALLOC) + if (bp->b_flags & B_MALLOC) + panic("allocbuf: VMIO buffer can't be malloced"); +#endif + + if (newbsize < bp->b_bufsize) { + if (desiredpages < bp->b_npages) { + for (i = desiredpages; i < bp->b_npages; i++) { + /* + * the page is not freed here -- it + * is the responsibility of vnode_pager_setsize + */ + m = bp->b_pages[i]; +#if defined(DIAGNOSTIC) + if (m == bogus_page) + panic("allocbuf: bogus page found"); +#endif + s = splvm(); + while ((m->flags & PG_BUSY) || (m->busy != 0)) { + m->flags |= PG_WANTED; + tsleep(m, PVM, "biodep", 0); + } + splx(s); + + bp->b_pages[i] = NULL; + vm_page_unwire(m); + } + pmap_qremove((vm_offset_t) trunc_page(bp->b_data) + + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); + bp->b_npages = desiredpages; + } + } else if (newbsize > 
bp->b_bufsize) { + vm_object_t obj; + vm_offset_t tinc, toff; + vm_ooffset_t off; + vm_pindex_t objoff; + int pageindex, curbpnpages; + struct vnode *vp; + int bsize; + + vp = bp->b_vp; + + if (vp->v_type == VBLK) + bsize = DEV_BSIZE; + else + bsize = vp->v_mount->mnt_stat.f_iosize; + + if (bp->b_npages < desiredpages) { + obj = vp->v_object; + tinc = PAGE_SIZE; + if (tinc > bsize) + tinc = bsize; + off = (vm_ooffset_t) bp->b_lblkno * bsize; + curbpnpages = bp->b_npages; + doretry: + bp->b_flags |= B_CACHE; + for (toff = 0; toff < newbsize; toff += tinc) { + int bytesinpage; + + pageindex = toff >> PAGE_SHIFT; + objoff = OFF_TO_IDX(off + toff); + if (pageindex < curbpnpages) { + + m = bp->b_pages[pageindex]; +#ifdef VFS_BIO_DIAG + if (m->pindex != objoff) + panic("allocbuf: page changed offset??!!!?"); +#endif + bytesinpage = tinc; + if (tinc > (newbsize - toff)) + bytesinpage = newbsize - toff; + if ((bp->b_flags & B_CACHE) && + !vm_page_is_valid(m, + (vm_offset_t) ((toff + off) & PAGE_MASK), + bytesinpage)) { + bp->b_flags &= ~B_CACHE; + } + continue; + } + m = vm_page_lookup(obj, objoff); + if (!m) { + m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL); + if (!m) { + VM_WAIT; + goto doretry; + } + /* + * Normally it is unwise to clear PG_BUSY without + * PAGE_WAKEUP -- but it is okay here, as there is + * no chance for blocking between here and vm_page_alloc + */ + m->flags &= ~PG_BUSY; + vm_page_wire(m); + bp->b_flags &= ~B_CACHE; + } else if (m->flags & PG_BUSY) { + s = splvm(); + if (m->flags & PG_BUSY) { + m->flags |= PG_WANTED; + tsleep(m, PVM, "pgtblk", 0); + } + splx(s); + goto doretry; + } else { + if ((curproc != pageproc) && + ((m->queue - m->pc) == PQ_CACHE) && + ((cnt.v_free_count + cnt.v_cache_count) < + (cnt.v_free_min + cnt.v_cache_min))) { + pagedaemon_wakeup(); + } + bytesinpage = tinc; + if (tinc > (newbsize - toff)) + bytesinpage = newbsize - toff; + if ((bp->b_flags & B_CACHE) && + !vm_page_is_valid(m, + (vm_offset_t) ((toff + off) & PAGE_MASK), 
+ bytesinpage)) { + bp->b_flags &= ~B_CACHE; + } + vm_page_wire(m); + } + bp->b_pages[pageindex] = m; + curbpnpages = pageindex + 1; + } + bp->b_data = (caddr_t) trunc_page(bp->b_data); + bp->b_npages = curbpnpages; + pmap_qenter((vm_offset_t) bp->b_data, + bp->b_pages, bp->b_npages); + ((vm_offset_t) bp->b_data) |= off & PAGE_MASK; + } + } + } + if (bp->b_flags & B_VMIO) + vmiospace += bp->b_bufsize; + bufspace += (newbsize - bp->b_bufsize); + bp->b_bufsize = newbsize; + bp->b_bcount = size; + return 1; +} + +/* + * Wait for buffer I/O completion, returning error status. + */ +int +biowait(register struct buf * bp) +{ + int s; + + s = splbio(); + while ((bp->b_flags & B_DONE) == 0) + tsleep(bp, PRIBIO, "biowait", 0); + splx(s); + if (bp->b_flags & B_EINTR) { + bp->b_flags &= ~B_EINTR; + return (EINTR); + } + if (bp->b_flags & B_ERROR) { + return (bp->b_error ? bp->b_error : EIO); + } else { + return (0); + } +} + +/* + * Finish I/O on a buffer, calling an optional function. + * This is usually called from interrupt level, so process blocking + * is not *a good idea*. 
+ */ void -biodone(a1) - struct buf *a1; +biodone(register struct buf * bp) { + int s; + + s = splbio(); + if (!(bp->b_flags & B_BUSY)) + panic("biodone: buffer not busy"); + + if (bp->b_flags & B_DONE) { + splx(s); + printf("biodone: buffer already done\n"); + return; + } + bp->b_flags |= B_DONE; + + if ((bp->b_flags & B_READ) == 0) { + vwakeup(bp); + } +#ifdef BOUNCE_BUFFERS + if (bp->b_flags & B_BOUNCE) + vm_bounce_free(bp); +#endif + + /* call optional completion function if requested */ + if (bp->b_flags & B_CALL) { + bp->b_flags &= ~B_CALL; + (*bp->b_iodone) (bp); + splx(s); + return; + } + if (bp->b_flags & B_VMIO) { + int i, resid; + vm_ooffset_t foff; + vm_page_t m; + vm_object_t obj; + int iosize; + struct vnode *vp = bp->b_vp; + + if (vp->v_type == VBLK) + foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; + else + foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; + obj = vp->v_object; + if (!obj) { + panic("biodone: no object"); + } +#if defined(VFS_BIO_DEBUG) + if (obj->paging_in_progress < bp->b_npages) { + printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", + obj->paging_in_progress, bp->b_npages); + } +#endif + iosize = bp->b_bufsize; + for (i = 0; i < bp->b_npages; i++) { + int bogusflag = 0; + m = bp->b_pages[i]; + if (m == bogus_page) { + bogusflag = 1; + m = vm_page_lookup(obj, OFF_TO_IDX(foff)); + if (!m) { +#if defined(VFS_BIO_DEBUG) + printf("biodone: page disappeared\n"); +#endif + --obj->paging_in_progress; + continue; + } + bp->b_pages[i] = m; + pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); + } +#if defined(VFS_BIO_DEBUG) + if (OFF_TO_IDX(foff) != m->pindex) { + printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex); + } +#endif + resid = IDX_TO_OFF(m->pindex + 1) - foff; + if (resid > iosize) + resid = iosize; + /* + * In the write case, the valid and clean bits are + * already changed correctly, so we only need to do this + * here in the read case. 
+ */ + if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { + vm_page_set_validclean(m, + (vm_offset_t) (foff & PAGE_MASK), resid); + } + /* + * when debugging new filesystems or buffer I/O methods, this + * is the most common error that pops up. if you see this, you + * have not set the page busy flag correctly!!! + */ + if (m->busy == 0) { + printf("biodone: page busy < 0, " + "pindex: %d, foff: 0x(%x,%x), " + "resid: %d, index: %d\n", + (int) m->pindex, (int)(foff >> 32), + (int) foff & 0xffffffff, resid, i); + if (vp->v_type != VBLK) + printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n", + bp->b_vp->v_mount->mnt_stat.f_iosize, + (int) bp->b_lblkno, + bp->b_flags, bp->b_npages); + else + printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n", + (int) bp->b_lblkno, + bp->b_flags, bp->b_npages); + printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", + m->valid, m->dirty, m->wire_count); + panic("biodone: page busy < 0\n"); + } + --m->busy; + if ((m->busy == 0) && (m->flags & PG_WANTED)) { + m->flags &= ~PG_WANTED; + wakeup(m); + } + --obj->paging_in_progress; + foff += resid; + iosize -= resid; + } + if (obj && obj->paging_in_progress == 0 && + (obj->flags & OBJ_PIPWNT)) { + obj->flags &= ~OBJ_PIPWNT; + wakeup(obj); + } + } /* - * Body deleted. + * For asynchronous completions, release the buffer now. The brelse + * checks for B_WANTED and will do the wakeup there if necessary - so + * no need to do a wakeup here in the async case. */ - return; + + if (bp->b_flags & B_ASYNC) { + if ((bp->b_flags & B_ORDERED) == 0) { + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) + brelse(bp); + else + bqrelse(bp); + } + } else { + bp->b_flags &= ~B_WANTED; + wakeup(bp); + } + splx(s); } int count_lock_queue() { + int count; + struct buf *bp; - /* - * Body deleted. 
- */ - return (0); + count = 0; + for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]); + bp != NULL; + bp = TAILQ_NEXT(bp, b_freelist)) + count++; + return (count); +} + +int vfs_update_interval = 30; + +static void +vfs_update() +{ + while (1) { + tsleep(&vfs_update_wakeup, PUSER, "update", + hz * vfs_update_interval); + vfs_update_wakeup = 0; + sync(curproc, NULL, NULL); + } } -#ifdef DIAGNOSTIC +static int +sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS +{ + int error = sysctl_handle_int(oidp, + oidp->oid_arg1, oidp->oid_arg2, req); + if (!error) + wakeup(&vfs_update_wakeup); + return error; +} + +SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW, + &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", ""); + + /* - * Print out statistics on the current allocation of the buffer pool. - * Can be enabled to print out on every ``sync'' by setting "syncprt" - * in vfs_syscalls.c using sysctl. + * This routine is called in lieu of iodone in the case of + * incomplete I/O. This keeps the busy status for pages + * consistant. 
*/ void -vfs_bufstats() +vfs_unbusy_pages(struct buf * bp) { - int s, i, j, count; - register struct buf *bp; - register struct bqueues *dp; - int counts[MAXBSIZE/CLBYTES+1]; - static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" }; - - for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) { - count = 0; - for (j = 0; j <= MAXBSIZE/CLBYTES; j++) - counts[j] = 0; - s = splbio(); - for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) { - counts[bp->b_bufsize/CLBYTES]++; - count++; + int i; + + if (bp->b_flags & B_VMIO) { + struct vnode *vp = bp->b_vp; + vm_object_t obj = vp->v_object; + vm_ooffset_t foff; + + foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; + + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + + if (m == bogus_page) { + m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i); + if (!m) { + panic("vfs_unbusy_pages: page missing\n"); + } + bp->b_pages[i] = m; + pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); + } + --obj->paging_in_progress; + --m->busy; + if ((m->busy == 0) && (m->flags & PG_WANTED)) { + m->flags &= ~PG_WANTED; + wakeup(m); + } + } + if (obj->paging_in_progress == 0 && + (obj->flags & OBJ_PIPWNT)) { + obj->flags &= ~OBJ_PIPWNT; + wakeup(obj); + } + } +} + +/* + * This routine is called before a device strategy routine. + * It is used to tell the VM system that paging I/O is in + * progress, and treat the pages associated with the buffer + * almost as being PG_BUSY. Also the object paging_in_progress + * flag is handled to make sure that the object doesn't become + * inconsistant. 
+ */ +void +vfs_busy_pages(struct buf * bp, int clear_modify) +{ + int i; + + if (bp->b_flags & B_VMIO) { + vm_object_t obj = bp->b_vp->v_object; + vm_ooffset_t foff; + int iocount = bp->b_bufsize; + + if (bp->b_vp->v_type == VBLK) + foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; + else + foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; + vfs_setdirty(bp); + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + int resid = IDX_TO_OFF(m->pindex + 1) - foff; + + if (resid > iocount) + resid = iocount; + if ((bp->b_flags & B_CLUSTER) == 0) { + obj->paging_in_progress++; + m->busy++; + } + vm_page_protect(m, VM_PROT_NONE); + if (clear_modify) { + vm_page_set_validclean(m, + (vm_offset_t) (foff & PAGE_MASK), resid); + } else if (bp->b_bcount >= PAGE_SIZE) { + if (m->valid && (bp->b_flags & B_CACHE) == 0) { + bp->b_pages[i] = bogus_page; + pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); + } + } + foff += resid; + iocount -= resid; + } + } +} + +/* + * Tell the VM system that the pages associated with this buffer + * are clean. This is used for delayed writes where the data is + * going to go to disk eventually without additional VM intevention. 
+ */ +void +vfs_clean_pages(struct buf * bp) +{ + int i; + + if (bp->b_flags & B_VMIO) { + vm_ooffset_t foff; + int iocount = bp->b_bufsize; + + if (bp->b_vp->v_type == VBLK) + foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; + else + foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; + + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + int resid = IDX_TO_OFF(m->pindex + 1) - foff; + + if (resid > iocount) + resid = iocount; + if (resid > 0) { + vm_page_set_validclean(m, + ((vm_offset_t) foff & PAGE_MASK), resid); + } + foff += resid; + iocount -= resid; + } + } +} + +void +vfs_bio_clrbuf(struct buf *bp) { + int i; + if( bp->b_flags & B_VMIO) { + if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) { + int mask; + mask = 0; + for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE) + mask |= (1 << (i/DEV_BSIZE)); + if( bp->b_pages[0]->valid != mask) { + bzero(bp->b_data, bp->b_bufsize); + } + bp->b_pages[0]->valid = mask; + bp->b_resid = 0; + return; + } + for(i=0;i<bp->b_npages;i++) { + if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL) + continue; + if( bp->b_pages[i]->valid == 0) { + if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { + bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE); + } + } else { + int j; + for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) { + if( (bp->b_pages[i]->valid & (1<<j)) == 0) + bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE); + } + } + /* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */ + } + bp->b_resid = 0; + } else { + clrbuf(bp); + } +} + +/* + * vm_hold_load_pages and vm_hold_unload pages get pages into + * a buffers address space. The pages are anonymous and are + * not associated with a file object. 
+ */ +void +vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index; + + to = round_page(to); + from = round_page(from); + index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { + +tryagain: + + p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), + VM_ALLOC_NORMAL); + if (!p) { + VM_WAIT; + goto tryagain; + } + vm_page_wire(p); + pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); + bp->b_pages[index] = p; + PAGE_WAKEUP(p); + } + bp->b_npages = to >> PAGE_SHIFT; +} + +void +vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index; + + from = round_page(from); + to = round_page(to); + index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { + p = bp->b_pages[index]; + if (p && (index < bp->b_npages)) { + if (p->busy) { + printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", + bp->b_blkno, bp->b_lblkno); + } + bp->b_pages[index] = NULL; + pmap_kremove(pg); + vm_page_unwire(p); + vm_page_free(p); } - splx(s); - printf("%s: total-%d", bname[i], count); - for (j = 0; j <= MAXBSIZE/CLBYTES; j++) - if (counts[j] != 0) - printf(", %d-%d", j * CLBYTES, counts[j]); - printf("\n"); } + bp->b_npages = from >> PAGE_SHIFT; } -#endif /* DIAGNOSTIC */ diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index c20966b..ef0f222 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -33,13 +33,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * from: vfs_cache.c,v 1.11 1995/03/12 02:01:20 phk Exp $ - * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 + * $Id: vfs_cache.c,v 1.23 1997/02/22 09:39:31 peter Exp $ */ #include <sys/param.h> #include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/time.h> #include <sys/mount.h> #include <sys/vnode.h> @@ -47,6 +48,8 @@ #include <sys/errno.h> #include <sys/malloc.h> +#define MAXVNODEUSE 32 + /* * Name caching works as follows: * @@ -72,14 +75,24 @@ * Structures associated with name cacheing. */ #define NCHHASH(dvp, cnp) \ - (&nchashtbl[((dvp)->v_id + (cnp)->cn_hash) & nchash]) -LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ -u_long nchash; /* size of hash table - 1 */ -long numcache; /* number of cache entries allocated */ -TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */ + (&nchashtbl[((dvp)->v_id + (cnp)->cn_hash) % nchash]) +static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ +static u_long nchash; /* size of hash table */ +static u_long numcache; /* number of cache entries allocated */ +static TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */ struct nchstats nchstats; /* cache effectiveness statistics */ -int doingcache = 1; /* 1 => enable the cache */ +static int doingcache = 1; /* 1 => enable the cache */ +SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, ""); + +#ifdef NCH_STATISTICS +u_long nchnbr; +#define NCHNBR(ncp) (ncp)->nc_nbr = ++nchnbr; +#define NCHHIT(ncp) (ncp)->nc_hits++ +#else +#define NCHNBR(ncp) +#define NCHHIT(ncp) +#endif /* * Delete an entry from its hash list and move it to the front @@ -100,13 +113,14 @@ int doingcache = 1; /* 1 => enable the cache */ if (ncp->nc_lru.tqe_next != 0) { \ TAILQ_REMOVE(&nclruhead, ncp, nc_lru); \ TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); \ + NCHNBR(ncp); \ } \ } /* - * Lookup an entry in the cache + * Lookup an entry in the cache * - * We don't do this if the segment name is long, simply so the cache + * We don't do this if the 
segment name is long, simply so the cache * can avoid holding long names (which would either waste space, or * add greatly to the complexity). * @@ -160,18 +174,22 @@ cache_lookup(dvp, vpp, cnp) return (0); } + NCHHIT(ncp); + /* We don't want to have an entry, so dump it */ if ((cnp->cn_flags & MAKEENTRY) == 0) { nchstats.ncs_badhits++; PURGE(ncp); return (0); - } + } /* We found a "positive" match, return the vnode */ if (ncp->nc_vp) { nchstats.ncs_goodhits++; TOUCH(ncp); *vpp = ncp->nc_vp; + if ((*vpp)->v_usage < MAXVNODEUSE) + (*vpp)->v_usage++; return (-1); } @@ -207,10 +225,10 @@ cache_enter(dvp, vp, cnp) if (!doingcache) return; -#ifdef DIAGNOSTIC - if (cnp->cn_namelen > NCHNAMLEN) - panic("cache_enter: name too long"); -#endif + if (cnp->cn_namelen > NCHNAMLEN) { + printf("cache_enter: name too long"); + return; + } /* * We allocate a new entry if we are less than the maximum @@ -244,9 +262,11 @@ cache_enter(dvp, vp, cnp) * otherwise unused. */ ncp->nc_vp = vp; - if (vp) + if (vp) { ncp->nc_vpid = vp->v_id; - else + if (vp->v_usage < MAXVNODEUSE) + ++vp->v_usage; + } else ncp->nc_vpid = cnp->cn_flags & ISWHITEOUT; ncp->nc_dvp = dvp; ncp->nc_dvpid = dvp->v_id; @@ -265,14 +285,14 @@ nchinit() { TAILQ_INIT(&nclruhead); - nchashtbl = hashinit(desiredvnodes, M_CACHE, &nchash); + nchashtbl = phashinit(desiredvnodes, M_CACHE, &nchash); } /* - * Invalidate a all entries to particular vnode. - * - * We actually just increment the v_id, that will do it. The entries will - * be purged by lookup as they get found. If the v_id wraps around, we + * Invalidate all entries to particular vnode. + * + * We actually just increment the v_id, that will do it. The stale entries + * will be purged by lookup as they get found. If the v_id wraps around, we * need to ditch the entire cache, to avoid confusion. No valid vnode will * ever have (v_id == 0). 
*/ @@ -282,11 +302,12 @@ cache_purge(vp) { struct namecache *ncp; struct nchashhead *ncpp; + static u_long nextvnodeid; vp->v_id = ++nextvnodeid; if (nextvnodeid != 0) return; - for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { + for (ncpp = &nchashtbl[nchash - 1]; ncpp >= nchashtbl; ncpp--) { while (ncp = ncpp->lh_first) PURGE(ncp); } @@ -297,7 +318,7 @@ cache_purge(vp) * Flush all entries referencing a particular filesystem. * * Since we need to check it anyway, we will flush all the invalid - * entriess at the same time. + * entries at the same time. */ void cache_purgevfs(mp) @@ -307,7 +328,7 @@ cache_purgevfs(mp) struct namecache *ncp, *nnp; /* Scan hash tables for applicable entries */ - for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { + for (ncpp = &nchashtbl[nchash - 1]; ncpp >= nchashtbl; ncpp--) { for (ncp = ncpp->lh_first; ncp != 0; ncp = nnp) { nnp = ncp->nc_hash.le_next; if (ncp->nc_dvpid != ncp->nc_dvp->v_id || diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index e01d24f..b00da1f 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -1,6 +1,8 @@ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. + * Modifications/enhancements: + * Copyright (c) 1995 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -30,233 +32,281 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95 + * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 + * $Id: vfs_cluster.c,v 1.42 1997/02/22 09:39:31 peter Exp $ */ #include <sys/param.h> +#include <sys/systm.h> #include <sys/proc.h> #include <sys/buf.h> #include <sys/vnode.h> #include <sys/mount.h> -#include <sys/trace.h> #include <sys/malloc.h> #include <sys/resourcevar.h> -#include <libkern/libkern.h> +#include <sys/vmmeter.h> +#include <miscfs/specfs/specdev.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> + +#if defined(CLUSTERDEBUG) +#include <sys/sysctl.h> +#include <sys/kernel.h> +static int rcluster= 0; +SYSCTL_INT(_debug, 14, rcluster, CTLFLAG_RW, &rcluster, 0, ""); +#endif -/* - * Local declarations - */ -struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t, - daddr_t, long, int)); -struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *, - daddr_t, daddr_t, long, int, long)); -void cluster_wbuild __P((struct vnode *, struct buf *, long, - daddr_t, int, daddr_t)); -struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *)); +#ifdef notyet_block_reallocation_enabled +#ifdef DEBUG +#include <sys/sysctl.h> +#include <sys/kernel.h> -#ifdef DIAGNOSTIC -/* - * Set to 1 if reads of block zero should cause readahead to be done. - * Set to 0 treats a read of block zero as a non-sequential read. - * - * Setting to one assumes that most reads of block zero of files are due to - * sequential passes over the files (e.g. cat, sum) where additional blocks - * will soon be needed. Setting to zero assumes that the majority are - * surgical strikes to get particular info (e.g. size, file) where readahead - * blocks will not be used and, in fact, push out other potentially useful - * blocks from the cache. The former seems intuitive, but some quick tests - * showed that the latter performed better from a system-wide point of view. 
- */ -int doclusterraz = 0; -#define ISSEQREAD(vp, blk) \ - (((blk) != 0 || doclusterraz) && \ - ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) +static int doreallocblks = 0; +SYSCTL_INT(_debug, 13, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, ""); #else -#define ISSEQREAD(vp, blk) \ - ((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) +#define doreallocblks 0 #endif +#endif /* notyet_block_reallocation_enabled */ + +#ifdef notyet_block_reallocation_enabled +static struct cluster_save * + cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp)); +#endif +static struct buf * + cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn, + daddr_t blkno, long size, int run, struct buf *fbp)); + +extern vm_page_t bogus_page; /* - * This replaces bread. If this is a bread at the beginning of a file and - * lastr is 0, we assume this is the first read and we'll read up to two - * blocks if they are sequential. After that, we'll do regular read ahead - * in clustered chunks. - * - * There are 4 or 5 cases depending on how you count: - * Desired block is in the cache: - * 1 Not sequential access (0 I/Os). - * 2 Access is sequential, do read-ahead (1 ASYNC). - * Desired block is not in cache: - * 3 Not sequential access (1 SYNC). - * 4 Sequential access, next block is contiguous (1 SYNC). - * 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC) - * - * There are potentially two buffers that require I/O. - * bp is the block requested. - * rbp is the read-ahead block. - * If either is NULL, then you don't have to do the I/O. + * Maximum number of blocks for read-ahead. */ -cluster_read(vp, filesize, lblkno, size, cred, bpp) +#define MAXRA 32 + +/* + * This replaces bread. 
+ */ +int +cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) struct vnode *vp; u_quad_t filesize; daddr_t lblkno; long size; struct ucred *cred; + long totread; + int seqcount; struct buf **bpp; { - struct buf *bp, *rbp; - daddr_t blkno, ioblkno; - long flags; - int error, num_ra, alreadyincore; - -#ifdef DIAGNOSTIC - if (size == 0) - panic("cluster_read: size = 0"); -#endif + struct buf *bp, *rbp, *reqbp; + daddr_t blkno, rablkno, origblkno; + int error, num_ra; + int i; + int maxra, racluster; + long origtotread; error = 0; - flags = B_READ; - *bpp = bp = getblk(vp, lblkno, size, 0, 0); - if (bp->b_flags & B_CACHE) { - /* - * Desired block is in cache; do any readahead ASYNC. - * Case 1, 2. - */ - trace(TR_BREADHIT, pack(vp, size), lblkno); - flags |= B_ASYNC; - ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1); - alreadyincore = incore(vp, ioblkno) != NULL; - bp = NULL; - } else { - /* Block wasn't in cache, case 3, 4, 5. */ - trace(TR_BREADMISS, pack(vp, size), lblkno); - bp->b_flags |= B_READ; - ioblkno = lblkno; - alreadyincore = 0; - curproc->p_stats->p_ru.ru_inblock++; /* XXX */ - } + /* - * XXX - * Replace 1 with a window size based on some permutation of - * maxcontig and rot_delay. This will let you figure out how - * many blocks you should read-ahead (case 2, 4, 5). - * - * If the access isn't sequential, reset the window to 1. - * Note that a read to the same block is considered sequential. - * This catches the case where the file is being read sequentially, - * but at smaller than the filesystem block size. + * Try to limit the amount of read-ahead by a few + * ad-hoc parameters. This needs work!!! */ - rbp = NULL; - if (!ISSEQREAD(vp, lblkno)) { - vp->v_ralen = 0; - vp->v_maxra = lblkno; - } else if ((ioblkno + 1) * size <= filesize && !alreadyincore && - !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) && - blkno != -1) { - /* - * Reading sequentially, and the next block is not in the - * cache. 
We are going to try reading ahead. - */ - if (num_ra) { - /* - * If our desired readahead block had been read - * in a previous readahead but is no longer in - * core, then we may be reading ahead too far - * or are not using our readahead very rapidly. - * In this case we scale back the window. - */ - if (!alreadyincore && ioblkno <= vp->v_maxra) - vp->v_ralen = max(vp->v_ralen >> 1, 1); - /* - * There are more sequential blocks than our current - * window allows, scale up. Ideally we want to get - * in sync with the filesystem maxcontig value. - */ - else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr) - vp->v_ralen = vp->v_ralen ? - min(num_ra, vp->v_ralen << 1) : 1; + racluster = MAXPHYS/size; + maxra = 2 * racluster + (totread / size); + if (maxra > MAXRA) + maxra = MAXRA; + if (maxra > nbuf/8) + maxra = nbuf/8; - if (num_ra > vp->v_ralen) - num_ra = vp->v_ralen; - } + /* + * get the requested block + */ + *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0); + origblkno = lblkno; + origtotread = totread; - if (num_ra) /* case 2, 4 */ - rbp = cluster_rbuild(vp, filesize, - bp, ioblkno, blkno, size, num_ra, flags); - else if (ioblkno == lblkno) { - bp->b_blkno = blkno; - /* Case 5: check how many blocks to read ahead */ - ++ioblkno; - if ((ioblkno + 1) * size > filesize || - incore(vp, ioblkno) || (error = VOP_BMAP(vp, - ioblkno, NULL, &blkno, &num_ra)) || blkno == -1) - goto skip_readahead; + /* + * if it is in the cache, then check to see if the reads have been + * sequential. If they have, then try some read-ahead, otherwise + * back-off on prospective read-aheads. + */ + if (bp->b_flags & B_CACHE) { + if (!seqcount) { + return 0; + } else if ((bp->b_flags & B_RAM) == 0) { + return 0; + } else { + int s; + struct buf *tbp; + bp->b_flags &= ~B_RAM; /* - * Adjust readahead as above. - * Don't check alreadyincore, we know it is 0 from - * the previous conditional. 
+ * We do the spl here so that there is no window + * between the incore and the b_usecount increment + * below. We opt to keep the spl out of the loop + * for efficiency. */ - if (num_ra) { - if (ioblkno <= vp->v_maxra) - vp->v_ralen = max(vp->v_ralen >> 1, 1); - else if (num_ra > vp->v_ralen && - lblkno != vp->v_lastr) - vp->v_ralen = vp->v_ralen ? - min(num_ra,vp->v_ralen<<1) : 1; - if (num_ra > vp->v_ralen) - num_ra = vp->v_ralen; + s = splbio(); + for(i=1;i<maxra;i++) { + + if (!(tbp = incore(vp, lblkno+i))) { + break; + } + + /* + * Set another read-ahead mark so we know to check + * again. + */ + if (((i % racluster) == (racluster - 1)) || + (i == (maxra - 1))) + tbp->b_flags |= B_RAM; + +#if 0 + if (tbp->b_usecount == 0) { + /* + * Make sure that the soon-to-be used readaheads + * are still there. The getblk/bqrelse pair will + * boost the priority of the buffer. + */ + tbp = getblk(vp, lblkno+i, size, 0, 0); + bqrelse(tbp); + } +#endif } - flags |= B_ASYNC; - if (num_ra) - rbp = cluster_rbuild(vp, filesize, - NULL, ioblkno, blkno, size, num_ra, flags); - else { - rbp = getblk(vp, ioblkno, size, 0, 0); - rbp->b_flags |= flags; - rbp->b_blkno = blkno; + splx(s); + if (i >= maxra) { + return 0; } + lblkno += i; + } + reqbp = bp = NULL; + } else { + u_quad_t firstread; + firstread = (u_quad_t) lblkno * size; + if (firstread + totread > filesize) + totread = filesize - firstread; + if (totread > size) { + int nblks = 0; + int ncontigafter; + while (totread > 0) { + nblks++; + totread -= size; + } + if (nblks == 1) + goto single_block_read; + if (nblks > racluster) + nblks = racluster; + + error = VOP_BMAP(vp, lblkno, NULL, + &blkno, &ncontigafter, NULL); + if (error) + goto single_block_read; + if (blkno == -1) + goto single_block_read; + if (ncontigafter == 0) + goto single_block_read; + if (ncontigafter + 1 < nblks) + nblks = ncontigafter + 1; + + bp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, nblks, bp); + lblkno += nblks; } else { - /* case 2; 
read ahead single block */ - rbp = getblk(vp, ioblkno, size, 0, 0); - rbp->b_flags |= flags; - rbp->b_blkno = blkno; +single_block_read: + /* + * if it isn't in the cache, then get a chunk from + * disk if sequential, otherwise just get the block. + */ + bp->b_flags |= B_READ | B_RAM; + lblkno += 1; } + } - if (rbp == bp) /* case 4 */ - rbp = NULL; - else if (rbp) { /* case 2, 5 */ - trace(TR_BREADMISSRA, - pack(vp, (num_ra + 1) * size), ioblkno); - curproc->p_stats->p_ru.ru_inblock++; /* XXX */ + /* + * if we have been doing sequential I/O, then do some read-ahead + */ + rbp = NULL; + /* if (seqcount && (lblkno < (origblkno + maxra))) { */ + if (seqcount && (lblkno < (origblkno + seqcount))) { + /* + * we now build the read-ahead buffer if it is desirable. + */ + if (((u_quad_t)(lblkno + 1) * size) <= filesize && + !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) && + blkno != -1) { + int nblksread; + int ntoread = num_ra + 1; + nblksread = (origtotread + size - 1) / size; + if (seqcount < nblksread) + seqcount = nblksread; + if (seqcount < ntoread) + ntoread = seqcount; + if (num_ra) { + rbp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, ntoread, NULL); + } else { + rbp = getblk(vp, lblkno, size, 0, 0); + rbp->b_flags |= B_READ | B_ASYNC | B_RAM; + rbp->b_blkno = blkno; + } } } - /* XXX Kirk, do we need to make sure the bp has creds? 
*/ -skip_readahead: - if (bp) - if (bp->b_flags & (B_DONE | B_DELWRI)) + /* + * handle the synchronous read + */ + if (bp) { + if (bp->b_flags & (B_DONE | B_DELWRI)) { panic("cluster_read: DONE bp"); - else + } else { +#if defined(CLUSTERDEBUG) + if (rcluster) + printf("S(%d,%d,%d) ", + bp->b_lblkno, bp->b_bcount, seqcount); +#endif + if ((bp->b_flags & B_CLUSTER) == 0) + vfs_busy_pages(bp, 0); error = VOP_STRATEGY(bp); - - if (rbp) - if (error || rbp->b_flags & (B_DONE | B_DELWRI)) { + curproc->p_stats->p_ru.ru_inblock++; + } + } + /* + * and if we have read-aheads, do them too + */ + if (rbp) { + if (error) { rbp->b_flags &= ~(B_ASYNC | B_READ); brelse(rbp); - } else - (void) VOP_STRATEGY(rbp); + } else if (rbp->b_flags & B_CACHE) { + rbp->b_flags &= ~(B_ASYNC | B_READ); + bqrelse(rbp); + } else { +#if defined(CLUSTERDEBUG) + if (rcluster) { + if (bp) + printf("A+(%d,%d,%d,%d) ", + rbp->b_lblkno, rbp->b_bcount, + rbp->b_lblkno - origblkno, + seqcount); + else + printf("A(%d,%d,%d,%d) ", + rbp->b_lblkno, rbp->b_bcount, + rbp->b_lblkno - origblkno, + seqcount); + } +#endif - /* - * Recalculate our maximum readahead - */ - if (rbp == NULL) - rbp = bp; - if (rbp) - vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1; - - if (bp) - return(biowait(bp)); - return(error); + if ((rbp->b_flags & B_CLUSTER) == 0) + vfs_busy_pages(rbp, 0); + (void) VOP_STRATEGY(rbp); + curproc->p_stats->p_ru.ru_inblock++; + } + } + if (reqbp) + return (biowait(reqbp)); + else + return (error); } /* @@ -264,145 +314,139 @@ skip_readahead: * read ahead. We will read as many blocks as possible sequentially * and then parcel them up into logical blocks in the buffer hash table. 
*/ -struct buf * -cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags) +static struct buf * +cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) struct vnode *vp; u_quad_t filesize; - struct buf *bp; daddr_t lbn; daddr_t blkno; long size; int run; - long flags; + struct buf *fbp; { - struct cluster_save *b_save; - struct buf *tbp; + struct buf *bp, *tbp; daddr_t bn; - int i, inc; + int i, inc, j; #ifdef DIAGNOSTIC if (size != vp->v_mount->mnt_stat.f_iosize) panic("cluster_rbuild: size %d != filesize %d\n", - size, vp->v_mount->mnt_stat.f_iosize); + size, vp->v_mount->mnt_stat.f_iosize); #endif - if (size * (lbn + run + 1) > filesize) + /* + * avoid a division + */ + while ((u_quad_t) size * (lbn + run) > filesize) { --run; - if (run == 0) { - if (!bp) { - bp = getblk(vp, lbn, size, 0, 0); - bp->b_blkno = blkno; - bp->b_flags |= flags; - } - return(bp); } - bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1); - if (bp->b_flags & (B_DONE | B_DELWRI)) - return (bp); + if (fbp) { + tbp = fbp; + tbp->b_flags |= B_READ; + } else { + tbp = getblk(vp, lbn, size, 0, 0); + if (tbp->b_flags & B_CACHE) + return tbp; + tbp->b_flags |= B_ASYNC | B_READ | B_RAM; + } + + tbp->b_blkno = blkno; + if( (tbp->b_flags & B_MALLOC) || + ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) + return tbp; - b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save), - M_SEGMENT, M_WAITOK); - b_save->bs_bufsize = b_save->bs_bcount = size; - b_save->bs_nchildren = 0; - b_save->bs_children = (struct buf **)(b_save + 1); - b_save->bs_saveaddr = bp->b_saveaddr; - bp->b_saveaddr = (caddr_t) b_save; + bp = trypbuf(); + if (bp == 0) + return tbp; + + (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK; + bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO; + bp->b_iodone = cluster_callback; + bp->b_blkno = blkno; + bp->b_lblkno = lbn; + pbgetvp(vp, bp); + + TAILQ_INIT(&bp->b_cluster.cluster_head); + + bp->b_bcount = 0; + 
bp->b_bufsize = 0; + bp->b_npages = 0; inc = btodb(size); - for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) { - /* - * A component of the cluster is already in core, - * terminate the cluster early. - */ - if (incore(vp, lbn + i)) - break; - tbp = getblk(vp, lbn + i, 0, 0, 0); - /* - * getblk may return some memory in the buffer if there were - * no empty buffers to shed it to. If there is currently - * memory in the buffer, we move it down size bytes to make - * room for the valid pages that cluster_callback will insert. - * We do this now so we don't have to do it at interrupt time - * in the callback routine. - */ - if (tbp->b_bufsize != 0) { - caddr_t bdata = (char *)tbp->b_data; + for (bn = blkno, i = 0; i < run; ++i, bn += inc) { + if (i != 0) { + if ((bp->b_npages * PAGE_SIZE) + + round_page(size) > MAXPHYS) + break; - /* - * No room in the buffer to add another page, - * terminate the cluster early. - */ - if (tbp->b_bufsize + size > MAXBSIZE) { -#ifdef DIAGNOSTIC - if (tbp->b_bufsize != MAXBSIZE) - panic("cluster_rbuild: too much memory"); -#endif - brelse(tbp); + if (incore(vp, lbn + i)) break; + + tbp = getblk(vp, lbn + i, size, 0, 0); + + if ((tbp->b_flags & B_CACHE) || + (tbp->b_flags & B_VMIO) == 0) { + bqrelse(tbp); + break; + } + + for (j=0;j<tbp->b_npages;j++) { + if (tbp->b_pages[j]->valid) { + break; + } } - if (tbp->b_bufsize > size) { + + if (j != tbp->b_npages) { /* - * XXX if the source and destination regions - * overlap we have to copy backward to avoid - * clobbering any valid pages (i.e. pagemove - * implementations typically can't handle - * overlap). 
+ * force buffer to be re-constituted later */ - bdata += tbp->b_bufsize; - while (bdata > (char *)tbp->b_data) { - bdata -= CLBYTES; - pagemove(bdata, bdata + size, CLBYTES); - } - } else - pagemove(bdata, bdata + size, tbp->b_bufsize); + tbp->b_flags |= B_RELBUF; + brelse(tbp); + break; + } + + if ((fbp && (i == 1)) || (i == (run - 1))) + tbp->b_flags |= B_RAM; + tbp->b_flags |= B_READ | B_ASYNC; + if (tbp->b_blkno == tbp->b_lblkno) { + tbp->b_blkno = bn; + } else if (tbp->b_blkno != bn) { + brelse(tbp); + break; + } } - tbp->b_blkno = bn; - tbp->b_flags |= flags | B_READ | B_ASYNC; - ++b_save->bs_nchildren; - b_save->bs_children[i - 1] = tbp; - } - /* - * The cluster may have been terminated early, adjust the cluster - * buffer size accordingly. If no cluster could be formed, - * deallocate the cluster save info. - */ - if (i <= run) { - if (i == 1) { - bp->b_saveaddr = b_save->bs_saveaddr; - bp->b_flags &= ~B_CALL; - bp->b_iodone = NULL; - free(b_save, M_SEGMENT); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + for (j = 0; j < tbp->b_npages; j += 1) { + vm_page_t m; + m = tbp->b_pages[j]; + ++m->busy; + ++m->object->paging_in_progress; + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages-1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) + tbp->b_pages[j] = bogus_page; } - allocbuf(bp, size * i); + bp->b_bcount += tbp->b_bcount; + bp->b_bufsize += tbp->b_bufsize; } - return(bp); -} -/* - * Either get a new buffer or grow the existing one. 
- */ -struct buf * -cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run) - struct vnode *vp; - struct buf *bp; - long flags; - daddr_t blkno; - daddr_t lblkno; - long size; - int run; -{ - if (!bp) { - bp = getblk(vp, lblkno, size, 0, 0); - if (bp->b_flags & (B_DONE | B_DELWRI)) { - bp->b_blkno = blkno; - return(bp); - } + for(j=0;j<bp->b_npages;j++) { + if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == + VM_PAGE_BITS_ALL) + bp->b_pages[j] = bogus_page; } - allocbuf(bp, run * size); - bp->b_blkno = blkno; - bp->b_iodone = cluster_callback; - bp->b_flags |= flags | B_CALL; - return(bp); + if (bp->b_bufsize > bp->b_kvasize) + panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + bp->b_kvasize = bp->b_bufsize; + + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + return (bp); } /* @@ -415,10 +459,7 @@ void cluster_callback(bp) struct buf *bp; { - struct cluster_save *b_save; - struct buf **bpp, *tbp; - long bsize; - caddr_t cp; + struct buf *nbp, *tbp; int error = 0; /* @@ -427,46 +468,21 @@ cluster_callback(bp) if (bp->b_flags & B_ERROR) error = bp->b_error; - b_save = (struct cluster_save *)(bp->b_saveaddr); - bp->b_saveaddr = b_save->bs_saveaddr; - - bsize = b_save->bs_bufsize; - cp = (char *)bp->b_data + bsize; + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. 
*/ - for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) { - tbp = *bpp; - pagemove(cp, tbp->b_data, bsize); - tbp->b_bufsize += bsize; - tbp->b_bcount = bsize; + for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); + tbp; tbp = nbp) { + nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); if (error) { tbp->b_flags |= B_ERROR; tbp->b_error = error; } biodone(tbp); - bp->b_bufsize -= bsize; - cp += bsize; - } - /* - * If there was excess memory in the cluster buffer, - * slide it up adjacent to the remaining valid data. - */ - if (bp->b_bufsize != bsize) { - if (bp->b_bufsize < bsize) - panic("cluster_callback: too little memory"); - pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize); - } - bp->b_bcount = bsize; - bp->b_iodone = NULL; - free(b_save, M_SEGMENT); - if (bp->b_flags & B_ASYNC) - brelse(bp); - else { - bp->b_flags &= ~B_WANTED; - wakeup((caddr_t)bp); } + relpbuf(bp); } /* @@ -481,38 +497,53 @@ cluster_callback(bp) */ void cluster_write(bp, filesize) - struct buf *bp; + struct buf *bp; u_quad_t filesize; { - struct vnode *vp; - daddr_t lbn; - int maxclen, cursize; + struct vnode *vp; + daddr_t lbn; + int maxclen, cursize; + int lblocksize; + int async; - vp = bp->b_vp; - lbn = bp->b_lblkno; + vp = bp->b_vp; + async = vp->v_mount->mnt_flag & MNT_ASYNC; + lblocksize = vp->v_mount->mnt_stat.f_iosize; + lbn = bp->b_lblkno; /* Initialize vnode to beginning of file. */ if (lbn == 0) vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; - if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || - (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) { - maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1; + if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || + (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { + maxclen = MAXPHYS / lblocksize - 1; if (vp->v_clen != 0) { /* * Next block is not sequential. 
* * If we are not writing at end of file, the process - * seeked to another point in the file since its - * last write, or we have reached our maximum - * cluster size, then push the previous cluster. - * Otherwise try reallocating to make it sequential. + * seeked to another point in the file since its last + * write, or we have reached our maximum cluster size, + * then push the previous cluster. Otherwise try + * reallocating to make it sequential. */ cursize = vp->v_lastw - vp->v_cstart + 1; - if ((lbn + 1) * bp->b_bcount != filesize || +#ifndef notyet_block_reallocation_enabled + if (((u_quad_t)(lbn + 1) * lblocksize) != filesize || + lbn != vp->v_lastw + 1 || + vp->v_clen <= cursize) { + if (!async) + cluster_wbuild(vp, lblocksize, + vp->v_cstart, cursize); + } +#else + if (!doreallocblks || + (lbn + 1) * lblocksize != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { - cluster_wbuild(vp, NULL, bp->b_bcount, - vp->v_cstart, cursize, lbn); + if (!async) + cluster_wbuild(vp, lblocksize, + vp->v_cstart, cursize); } else { struct buf **bpp, **endbp; struct cluster_save *buflist; @@ -528,8 +559,8 @@ cluster_write(bp, filesize) bpp < endbp; bpp++) brelse(*bpp); free(buflist, M_SEGMENT); - cluster_wbuild(vp, NULL, bp->b_bcount, - vp->v_cstart, cursize, lbn); + cluster_wbuild(vp, lblocksize, + vp->v_cstart, cursize); } else { /* * Succeeded, keep building cluster. @@ -543,14 +574,16 @@ cluster_write(bp, filesize) return; } } +#endif /* notyet_block_reallocation_enabled */ } /* - * Consider beginning a cluster. - * If at end of file, make cluster as large as possible, - * otherwise find size of existing cluster. + * Consider beginning a cluster. If at end of file, make + * cluster as large as possible, otherwise find size of + * existing cluster. 
*/ - if ((lbn + 1) * bp->b_bcount != filesize && - (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) || + if (((u_quad_t) (lbn + 1) * lblocksize) != filesize && + (bp->b_blkno == bp->b_lblkno) && + (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || bp->b_blkno == -1)) { bawrite(bp); vp->v_clen = 0; @@ -559,26 +592,25 @@ cluster_write(bp, filesize) vp->v_lastw = lbn; return; } - vp->v_clen = maxclen; - if (maxclen == 0) { /* I/O not contiguous */ + vp->v_clen = maxclen; + if (!async && maxclen == 0) { /* I/O not contiguous */ vp->v_cstart = lbn + 1; - bawrite(bp); - } else { /* Wait for rest of cluster */ + bawrite(bp); + } else { /* Wait for rest of cluster */ vp->v_cstart = lbn; - bdwrite(bp); + bdwrite(bp); } } else if (lbn == vp->v_cstart + vp->v_clen) { /* * At end of cluster, write it out. */ - cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart, - vp->v_clen + 1, lbn); + bdwrite(bp); + cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); vp->v_clen = 0; vp->v_cstart = lbn + 1; } else /* - * In the middle of a cluster, so just delay the - * I/O for now. + * In the middle of a cluster, so just delay the I/O for now. */ bdwrite(bp); vp->v_lastw = lbn; @@ -592,165 +624,168 @@ cluster_write(bp, filesize) * performed. Check to see that it doesn't fall in the middle of * the current block (if last_bp == NULL). 
*/ -void -cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn) +int +cluster_wbuild(vp, size, start_lbn, len) struct vnode *vp; - struct buf *last_bp; long size; daddr_t start_lbn; int len; - daddr_t lbn; { - struct cluster_save *b_save; struct buf *bp, *tbp; - caddr_t cp; - int i, s; - -#ifdef DIAGNOSTIC - if (size != vp->v_mount->mnt_stat.f_iosize) - panic("cluster_wbuild: size %d != filesize %d\n", - size, vp->v_mount->mnt_stat.f_iosize); -#endif -redo: - while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) { - ++start_lbn; - --len; - } - - /* Get more memory for current buffer */ - if (len <= 1) { - if (last_bp) { - bawrite(last_bp); - } else if (len) { - bp = getblk(vp, start_lbn, size, 0, 0); - bawrite(bp); + int i, j, s; + int totalwritten = 0; + int dbsize = btodb(size); + while (len > 0) { + s = splbio(); + if ( ((tbp = gbincore(vp, start_lbn)) == NULL) || + ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) { + ++start_lbn; + --len; + splx(s); + continue; } - return; - } - - bp = getblk(vp, start_lbn, size, 0, 0); - if (!(bp->b_flags & B_DELWRI)) { - ++start_lbn; - --len; - brelse(bp); - goto redo; - } + bremfree(tbp); + tbp->b_flags |= B_BUSY; + tbp->b_flags &= ~B_DONE; + splx(s); /* - * Extra memory in the buffer, punt on this buffer. - * XXX we could handle this in most cases, but we would have to - * push the extra memory down to after our max possible cluster - * size and then potentially pull it back up if the cluster was - * terminated prematurely--too much hassle. + * Extra memory in the buffer, punt on this buffer. XXX we could + * handle this in most cases, but we would have to push the extra + * memory down to after our max possible cluster size and then + * potentially pull it back up if the cluster was terminated + * prematurely--too much hassle. 
*/ - if (bp->b_bcount != bp->b_bufsize) { - ++start_lbn; - --len; - bawrite(bp); - goto redo; - } + if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) || + (tbp->b_bcount != tbp->b_bufsize) || + (tbp->b_bcount != size) || + len == 1) { + totalwritten += tbp->b_bufsize; + bawrite(tbp); + ++start_lbn; + --len; + continue; + } - --len; - b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save), - M_SEGMENT, M_WAITOK); - b_save->bs_bcount = bp->b_bcount; - b_save->bs_bufsize = bp->b_bufsize; - b_save->bs_nchildren = 0; - b_save->bs_children = (struct buf **)(b_save + 1); - b_save->bs_saveaddr = bp->b_saveaddr; - bp->b_saveaddr = (caddr_t) b_save; - - bp->b_flags |= B_CALL; - bp->b_iodone = cluster_callback; - cp = (char *)bp->b_data + size; - for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) { - /* - * Block is not in core or the non-sequential block - * ending our cluster was part of the cluster (in which - * case we don't want to write it twice). - */ - if (!incore(vp, start_lbn) || - last_bp == NULL && start_lbn == lbn) - break; + bp = trypbuf(); + if (bp == NULL) { + totalwritten += tbp->b_bufsize; + bawrite(tbp); + ++start_lbn; + --len; + continue; + } - /* - * Get the desired block buffer (unless it is the final - * sequential block whose buffer was passed in explictly - * as last_bp). 
- */ - if (last_bp == NULL || start_lbn != lbn) { - tbp = getblk(vp, start_lbn, size, 0, 0); - if (!(tbp->b_flags & B_DELWRI)) { - brelse(tbp); - break; - } - } else - tbp = last_bp; - - ++b_save->bs_nchildren; - - /* Move memory from children to parent */ - if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) { - printf("Clustered Block: %d addr %x bufsize: %d\n", - bp->b_lblkno, bp->b_blkno, bp->b_bufsize); - printf("Child Block: %d addr: %x\n", tbp->b_lblkno, - tbp->b_blkno); - panic("Clustered write to wrong blocks"); + TAILQ_INIT(&bp->b_cluster.cluster_head); + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + if (tbp->b_wcred != NOCRED) { + bp->b_wcred = tbp->b_wcred; + crhold(bp->b_wcred); } - pagemove(tbp->b_data, cp, size); - bp->b_bcount += size; - bp->b_bufsize += size; + bp->b_blkno = tbp->b_blkno; + bp->b_lblkno = tbp->b_lblkno; + (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK; + bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | (tbp->b_flags & (B_VMIO|B_NEEDCOMMIT)); + bp->b_iodone = cluster_callback; + pbgetvp(vp, bp); + + for (i = 0; i < len; ++i, ++start_lbn) { + if (i != 0) { + s = splbio(); + if ((tbp = gbincore(vp, start_lbn)) == NULL) { + splx(s); + break; + } - tbp->b_bufsize -= size; - tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); - tbp->b_flags |= (B_ASYNC | B_AGE); - s = splbio(); - reassignbuf(tbp, tbp->b_vp); /* put on clean list */ - ++tbp->b_vp->v_numoutput; - splx(s); - b_save->bs_children[i] = tbp; + if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|B_BUSY|B_DELWRI|B_NEEDCOMMIT)) != (B_DELWRI|B_CLUSTEROK|(bp->b_flags & (B_VMIO|B_NEEDCOMMIT)))) { + splx(s); + break; + } - cp += size; - } + if (tbp->b_wcred != bp->b_wcred) { + splx(s); + break; + } - if (i == 0) { - /* None to cluster */ - bp->b_saveaddr = b_save->bs_saveaddr; - bp->b_flags &= ~B_CALL; - bp->b_iodone = NULL; - free(b_save, M_SEGMENT); - } - bawrite(bp); - if (i < len) { - len -= i + 1; - start_lbn += 1; - goto redo; + if 
((tbp->b_bcount != size) || + ((bp->b_blkno + dbsize * i) != tbp->b_blkno) || + ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) { + splx(s); + break; + } + bremfree(tbp); + tbp->b_flags |= B_BUSY; + tbp->b_flags &= ~B_DONE; + splx(s); + } + if (tbp->b_flags & B_VMIO) { + for (j = 0; j < tbp->b_npages; j += 1) { + vm_page_t m; + m = tbp->b_pages[j]; + ++m->busy; + ++m->object->paging_in_progress; + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages - 1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + } + } + bp->b_bcount += size; + bp->b_bufsize += size; + + tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); + tbp->b_flags |= B_ASYNC; + s = splbio(); + reassignbuf(tbp, tbp->b_vp); /* put on clean list */ + ++tbp->b_vp->v_numoutput; + splx(s); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + } + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *) bp->b_pages, bp->b_npages); + if (bp->b_bufsize > bp->b_kvasize) + panic("cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + bp->b_kvasize = bp->b_bufsize; + totalwritten += bp->b_bufsize; + bp->b_dirtyoff = 0; + bp->b_dirtyend = bp->b_bufsize; + bawrite(bp); + + len -= i; } + return totalwritten; } +#ifdef notyet_block_reallocation_enabled /* * Collect together all the buffers in a cluster. * Plus add one additional buffer. 
*/ -struct cluster_save * +static struct cluster_save * cluster_collectbufs(vp, last_bp) struct vnode *vp; struct buf *last_bp; { struct cluster_save *buflist; - daddr_t lbn; + daddr_t lbn; int i, len; len = vp->v_lastw - vp->v_cstart + 1; buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), M_SEGMENT, M_WAITOK); buflist->bs_nchildren = 0; - buflist->bs_children = (struct buf **)(buflist + 1); + buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) - (void)bread(vp, lbn, last_bp->b_bcount, NOCRED, - &buflist->bs_children[i]); + (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, + &buflist->bs_children[i]); buflist->bs_children[i] = last_bp; buflist->bs_nchildren = i + 1; return (buflist); } +#endif /* notyet_block_reallocation_enabled */ diff --git a/sys/kern/vfs_conf.c b/sys/kern/vfs_conf.c index 9b57797..779a1c4 100644 --- a/sys/kern/vfs_conf.c +++ b/sys/kern/vfs_conf.c @@ -1,6 +1,7 @@ /* * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. + * Copyright (c) 1995 Artisoft, Inc. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -30,219 +31,123 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)vfs_conf.c 8.11 (Berkeley) 5/10/95 + * @(#)vfs_conf.c 8.8 (Berkeley) 3/31/94 + * $Id$ */ -#include <sys/param.h> -#include <sys/mount.h> -#include <sys/vnode.h> - /* - * These define the root filesystem, device, and root filesystem type. + * PURPOSE: This file abstracts the root mounting interface from + * the per file system semantics for handling mounts, + * the overall intent of which is to move the BSD + * internals dependence out of the FS code, both to + * make the FS code more portable and to free up some + * of the BSD internals so that they may more easily + * be changed. 
+ * + * NOTE1: Code is single entry/single exit to aid debugging + * and conversion for kernel multithreading. + * + * NOTE2: Code notes lock state in headers on entry and exit + * as an aid to conversion for kernel multithreading + * on SMP reentrancy */ -struct mount *rootfs; -struct vnode *rootvnode; -int (*mountroot)() = NULL; +#include <sys/param.h> /* dev_t (types.h)*/ +#include <sys/systm.h> /* rootvp*/ +#include <sys/proc.h> /* curproc*/ +#include <sys/vnode.h> /* NULLVP*/ +#include <sys/mount.h> /* struct mount*/ +#include <sys/malloc.h> /* M_MOUNT*/ /* - * Set up the initial array of known filesystem types. + * GLOBALS */ -extern struct vfsops ufs_vfsops; -extern int ffs_mountroot(); -extern struct vfsops lfs_vfsops; -extern int lfs_mountroot(); -extern struct vfsops mfs_vfsops; -extern int mfs_mountroot(); -extern struct vfsops cd9660_vfsops; -extern int cd9660_mountroot(); -extern struct vfsops msdos_vfsops; -extern struct vfsops adosfs_vfsops; -extern struct vfsops nfs_vfsops; -extern int nfs_mountroot(); -extern struct vfsops afs_vfsops; -extern struct vfsops procfs_vfsops; -extern struct vfsops null_vfsops; -extern struct vfsops union_vfsops; -extern struct vfsops umap_vfsops; -extern struct vfsops portal_vfsops; -extern struct vfsops fdesc_vfsops; -extern struct vfsops kernfs_vfsops; /* - * Set up the filesystem operations for vnodes. + * These define the root filesystem, device, and root filesystem type. 
*/ -static struct vfsconf vfsconflist[] = { - - /* Fast Filesystem */ -#ifdef FFS - { &ufs_vfsops, "ufs", 1, 0, MNT_LOCAL, ffs_mountroot, NULL }, -#endif - - /* Log-based Filesystem */ -#ifdef LFS - { &lfs_vfsops, "lfs", 5, 0, MNT_LOCAL, lfs_mountroot, NULL }, -#endif - - /* Memory-based Filesystem */ -#ifdef MFS - { &mfs_vfsops, "mfs", 3, 0, MNT_LOCAL, mfs_mountroot, NULL }, -#endif - - /* ISO9660 (aka CDROM) Filesystem */ -#ifdef CD9660 - { &cd9660_vfsops, "cd9660", 14, 0, MNT_LOCAL, cd9660_mountroot, NULL }, -#endif - - /* MSDOS Filesystem */ -#ifdef MSDOS - { &msdos_vfsops, "msdos", 4, 0, MNT_LOCAL, NULL, NULL }, -#endif - - /* AmigaDOS Filesystem */ -#ifdef ADOSFS - { &adosfs_vfsops, "adosfs", 16, 0, MNT_LOCAL, NULL, NULL }, -#endif - - /* Sun-compatible Network Filesystem */ -#ifdef NFS - { &nfs_vfsops, "nfs", 2, 0, 0, nfs_mountroot, NULL }, -#endif - - /* Andrew Filesystem */ -#ifdef AFS - { &afs_vfsops, "andrewfs", 13, 0, 0, afs_mountroot, NULL }, -#endif - - /* /proc Filesystem */ -#ifdef PROCFS - { &procfs_vfsops, "procfs", 12, 0, 0, NULL, NULL }, -#endif - - /* Loopback (Minimal) Filesystem Layer */ -#ifdef NULLFS - { &null_vfsops, "loopback", 9, 0, 0, NULL, NULL }, -#endif - - /* Union (translucent) Filesystem */ -#ifdef UNION - { &union_vfsops, "union", 15, 0, 0, NULL, NULL }, -#endif - - /* User/Group Identifer Remapping Filesystem */ -#ifdef UMAPFS - { &umap_vfsops, "umap", 10, 0, 0, NULL, NULL }, -#endif - - /* Portal Filesystem */ -#ifdef PORTAL - { &portal_vfsops, "portal", 8, 0, 0, NULL, NULL }, -#endif - - /* File Descriptor Filesystem */ -#ifdef FDESC - { &fdesc_vfsops, "fdesc", 7, 0, 0, NULL, NULL }, -#endif - - /* Kernel Information Filesystem */ -#ifdef KERNFS - { &kernfs_vfsops, "kernfs", 11, 0, 0, NULL, NULL }, -#endif - -}; +struct mount *rootfs; +struct vnode *rootvnode; +char *mountrootfsname; /* - * Initially the size of the list, vfs_init will set maxvfsconf + * vfs_init() will set maxvfsconf * to the highest defined type number. 
*/ -int maxvfsconf = sizeof(vfsconflist) / sizeof (struct vfsconf); -struct vfsconf *vfsconf = vfsconflist; +int maxvfsconf; +struct vfsconf *vfsconf; /* + * Common root mount code shared by all filesystems + */ +#define ROOTNAME "root_device" + +/* + * vfs_mountrootfs + * + * Common entry point for root mounts + * + * PARAMETERS: + * fsname name of the filesystem + * + * RETURNS: 0 Success + * !0 error number (errno.h) * - * vfs_opv_descs enumerates the list of vnode classes, each with it's own - * vnode operation vector. It is consulted at system boot to build operation - * vectors. It is NULL terminated. + * LOCK STATE: + * ENTRY + * <no locks held> + * EXIT + * <no locks held> * + * NOTES: + * This code is currently supported only for use for + * the FFS file system type. This is a matter of + * fixing the other file systems, not this code! */ -extern struct vnodeopv_desc ffs_vnodeop_opv_desc; -extern struct vnodeopv_desc ffs_specop_opv_desc; -extern struct vnodeopv_desc ffs_fifoop_opv_desc; -extern struct vnodeopv_desc lfs_vnodeop_opv_desc; -extern struct vnodeopv_desc lfs_specop_opv_desc; -extern struct vnodeopv_desc lfs_fifoop_opv_desc; -extern struct vnodeopv_desc mfs_vnodeop_opv_desc; -extern struct vnodeopv_desc dead_vnodeop_opv_desc; -extern struct vnodeopv_desc fifo_vnodeop_opv_desc; -extern struct vnodeopv_desc spec_vnodeop_opv_desc; -extern struct vnodeopv_desc nfsv2_vnodeop_opv_desc; -extern struct vnodeopv_desc spec_nfsv2nodeop_opv_desc; -extern struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc; -extern struct vnodeopv_desc fdesc_vnodeop_opv_desc; -extern struct vnodeopv_desc portal_vnodeop_opv_desc; -extern struct vnodeopv_desc null_vnodeop_opv_desc; -extern struct vnodeopv_desc umap_vnodeop_opv_desc; -extern struct vnodeopv_desc kernfs_vnodeop_opv_desc; -extern struct vnodeopv_desc procfs_vnodeop_opv_desc; -extern struct vnodeopv_desc cd9660_vnodeop_opv_desc; -extern struct vnodeopv_desc cd9660_specop_opv_desc; -extern struct vnodeopv_desc 
cd9660_fifoop_opv_desc; -extern struct vnodeopv_desc union_vnodeop_opv_desc; - -struct vnodeopv_desc *vfs_opv_descs[] = { - &ffs_vnodeop_opv_desc, - &ffs_specop_opv_desc, -#ifdef FIFO - &ffs_fifoop_opv_desc, -#endif - &dead_vnodeop_opv_desc, -#ifdef FIFO - &fifo_vnodeop_opv_desc, -#endif - &spec_vnodeop_opv_desc, -#ifdef LFS - &lfs_vnodeop_opv_desc, - &lfs_specop_opv_desc, -#ifdef FIFO - &lfs_fifoop_opv_desc, -#endif -#endif -#ifdef MFS - &mfs_vnodeop_opv_desc, -#endif -#ifdef NFS - &nfsv2_vnodeop_opv_desc, - &spec_nfsv2nodeop_opv_desc, -#ifdef FIFO - &fifo_nfsv2nodeop_opv_desc, -#endif -#endif -#ifdef FDESC - &fdesc_vnodeop_opv_desc, -#endif -#ifdef PORTAL - &portal_vnodeop_opv_desc, -#endif -#ifdef NULLFS - &null_vnodeop_opv_desc, -#endif -#ifdef UMAPFS - &umap_vnodeop_opv_desc, -#endif -#ifdef KERNFS - &kernfs_vnodeop_opv_desc, -#endif -#ifdef PROCFS - &procfs_vnodeop_opv_desc, -#endif -#ifdef CD9660 - &cd9660_vnodeop_opv_desc, - &cd9660_specop_opv_desc, -#ifdef FIFO - &cd9660_fifoop_opv_desc, -#endif -#endif -#ifdef UNION - &union_vnodeop_opv_desc, -#endif - NULL -}; +int +vfs_mountrootfs(fsname) + char *fsname; +{ + struct mount *mp; + int err = 0; + struct proc *p = curproc; /* XXX */ + + /* + * New root mount structure + */ + err = vfs_rootmountalloc(fsname, ROOTNAME, &mp); + if (err) + return (err); + mp->mnt_flag |= MNT_ROOTFS; + + /* + * Attempt the mount + */ + err = VFS_MOUNT(mp, NULL, NULL, NULL, p); + if (err) + goto error_2; + + simple_lock(&mountlist_slock); + /* Add fs to list of mounted file systems*/ + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + simple_unlock(&mountlist_slock); + + vfs_unbusy(mp, p); + + /* root mount, update system time from FS specific data*/ + inittodr(mp->mnt_time); + + goto success; + + +error_2: /* mount error*/ + + vfs_unbusy(mp, p); + +error_1: /* lock error*/ + + /* free mount struct before failing*/ + free( mp, M_MOUNT); + +success: + return( err); +} diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c new 
file mode 100644 index 0000000..0b487fd --- /dev/null +++ b/sys/kern/vfs_export.c @@ -0,0 +1,2079 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + * $Id: vfs_subr.c,v 1.79 1997/03/04 18:31:56 bde Exp $ + */ + +/* + * External virtual filesystem routines + */ +#include "opt_ddb.h" +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/mount.h> +#include <sys/time.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/namei.h> +#include <sys/ucred.h> +#include <sys/buf.h> +#include <sys/errno.h> +#include <sys/malloc.h> +#include <sys/domain.h> +#include <sys/mbuf.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_pager.h> +#include <vm/vnode_pager.h> +#include <sys/sysctl.h> + +#include <miscfs/specfs/specdev.h> + +#ifdef DDB +extern void printlockedvnodes __P((void)); +#endif +static void vclean __P((struct vnode *vp, int flags, struct proc *p)); +extern void vgonel __P((struct vnode *vp, struct proc *p)); +unsigned long numvnodes; +extern void vfs_unmountroot __P((struct mount *rootfs)); +extern void vputrele __P((struct vnode *vp, int put)); + +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, +}; +int vttoif_tab[9] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, +}; + +/* + * Insq/Remq for the vnode usage lists. 
+ */ +#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) +#define bufremvn(bp) { \ + LIST_REMOVE(bp, b_vnbufs); \ + (bp)->b_vnbufs.le_next = NOLIST; \ +} +TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ +static u_long freevnodes = 0; + +struct mntlist mountlist; /* mounted filesystem list */ +struct simplelock mountlist_slock; +static struct simplelock mntid_slock; +struct simplelock mntvnode_slock; +struct simplelock vnode_free_list_slock; +static struct simplelock spechash_slock; + +int desiredvnodes; +SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, ""); + +static void vfs_free_addrlist __P((struct netexport *nep)); +static int vfs_free_netcred __P((struct radix_node *rn, void *w)); +static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, + struct export_args *argp)); + +/* + * Initialize the vnode management data structures. + */ +void +vntblinit() +{ + + desiredvnodes = maxproc + vm_object_cache_max; + simple_lock_init(&mntvnode_slock); + simple_lock_init(&mntid_slock); + simple_lock_init(&spechash_slock); + TAILQ_INIT(&vnode_free_list); + simple_lock_init(&vnode_free_list_slock); + CIRCLEQ_INIT(&mountlist); +} + +/* + * Mark a mount point as busy. Used to synchronize access and to delay + * unmounting. Interlock is not released on failure. + */ +int +vfs_busy(mp, flags, interlkp, p) + struct mount *mp; + int flags; + struct simplelock *interlkp; + struct proc *p; +{ + int lkflags; + + if (mp->mnt_flag & MNT_UNMOUNT) { + if (flags & LK_NOWAIT) + return (ENOENT); + mp->mnt_flag |= MNT_MWAIT; + if (interlkp) { + simple_unlock(interlkp); + } + /* + * Since all busy locks are shared except the exclusive + * lock granted when unmounting, the only place that a + * wakeup needs to be done is at the release of the + * exclusive lock at the end of dounmount. 
+ */ + tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); + if (interlkp) { + simple_lock(interlkp); + } + return (ENOENT); + } + lkflags = LK_SHARED; + if (interlkp) + lkflags |= LK_INTERLOCK; + if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) + panic("vfs_busy: unexpected lock failure"); + return (0); +} + +/* + * Free a busy filesystem. + */ +void +vfs_unbusy(mp, p) + struct mount *mp; + struct proc *p; +{ + + lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); +} + +/* + * Lookup a filesystem type, and if found allocate and initialize + * a mount structure for it. + * + * Devname is usually updated by mount(8) after booting. + */ +int +vfs_rootmountalloc(fstypename, devname, mpp) + char *fstypename; + char *devname; + struct mount **mpp; +{ + struct proc *p = curproc; /* XXX */ + struct vfsconf *vfsp; + struct mount *mp; + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) + return (ENODEV); + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); + (void)vfs_busy(mp, LK_NOWAIT, 0, p); + LIST_INIT(&mp->mnt_vnodelist); + mp->mnt_vfc = vfsp; + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_flag = MNT_RDONLY; + mp->mnt_vnodecovered = NULLVP; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_stat.f_mntonname[0] = '/'; + mp->mnt_stat.f_mntonname[1] = 0; + (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); + *mpp = mp; + return (0); +} + +/* + * Find an appropriate filesystem to use for the root. If a filesystem + * has not been preselected, walk through the list of known filesystems + * trying those that have mountroot routines, and try them until one + * works or we have tried them all. 
 */
#ifdef notdef /* XXX JH */
/*
 * Lite2-style root mount: prefer an explicitly registered mountroot
 * hook, otherwise try each configured filesystem's vfc_mountroot in
 * turn until one succeeds.  Returns ENODEV if none can mount root.
 */
int
lite2_vfs_mountroot(void)
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot)(void);
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 * Returns the mount point, or NULL if no mounted filesystem
 * matches both words of the fsid.  Takes mountlist_slock itself.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid for mount point mp.
 * val[0] encodes a pseudo device number, val[1] the filesystem type;
 * collisions with existing mounts are resolved by probing with
 * vfs_getvfs().  mntid_slock protects the static counter.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	/* Monotonically bumped minor-number source; guarded by mntid_slock. */
	static u_short xxxfs_mntid;

	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
	tfsid.val[1] = mtype;
	if (mountlist.cqh_first != (void *)&mountlist) {
		/* Probe until the candidate fsid is not already in use. */
		while (vfs_getvfs(&tfsid)) {
			tfsid.val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Set vnode attributes to VNOVAL (i.e. "unspecified" for a VOP_SETATTR).
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
	    vap->va_fsid = vap->va_fileid =
	    vap->va_blocksize = vap->va_rdev =
	    vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
	    vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
	    vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
	    vap->va_flags = vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 * On success *vpp holds a cleaned vnode with v_usecount == 1,
 * typed VNON and inserted on mount point mp's vnode list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;

	simple_lock(&vnode_free_list_slock);
retry:
	/*
	 * we allocate a new vnode if
	 *	1. we don't have any free
	 *		Pretty obvious, we actually used to panic, but that
	 *		is a silly thing to do.
	 *	2. we haven't filled our pool yet
	 *		We don't want to trash the incore (VM-)vnodecache.
	 *	3. if less than 1/4th of our vnodes are free.
	 *		We don't want to trash the namei cache either.
	 */
	if (freevnodes < (numvnodes >> 2) ||
	    numvnodes < desiredvnodes ||
	    vnode_free_list.tqh_first == NULL) {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) malloc((u_long) sizeof *vp,
		    M_VNODE, M_WAITOK);
		bzero((char *) vp, sizeof *vp);
		numvnodes++;
	} else {
		/* Find the first free vnode whose interlock we can grab. */
		for (vp = vnode_free_list.tqh_first;
		    vp != NULLVP; vp = vp->v_freelist.tqe_next) {
			if (simple_lock_try(&vp->v_interlock))
				break;
		}
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			simple_unlock(&vnode_free_list_slock);
			tablefull("vnode");
			*vpp = 0;
			return (ENFILE);
		}
		if (vp->v_usecount)
			panic("free vnode isn't");
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		if (vp->v_usage > 0) {
			/*
			 * Recently-used vnode: decay its usage count and
			 * rotate it to the tail instead of recycling it.
			 */
			simple_unlock(&vp->v_interlock);
			--vp->v_usage;
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			goto retry;
		}
		freevnodes--;

		/* see comment on why 0xdeadb is set at end of vgone (below) */
		vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb;
		simple_unlock(&vnode_free_list_slock);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD)
			vgonel(vp, p);
		else {
			simple_unlock(&vp->v_interlock);
		}

#ifdef DIAGNOSTIC
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		/* Scrub the recycled vnode's per-file state. */
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
		vp->v_usage = 0;
	}
	vp->v_type = VNON;
	cache_purge(vp);
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 * A NULL mp simply removes the vnode from its current mount list.
 * Takes mntvnode_slock itself; caller must not hold it.
 */
void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		/* Last write finished: wake anyone in vinvalbuf's VBWAIT loop. */
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 *
 * V_SAVE forces dirty data to disk first (via VOP_FSYNC);
 * V_SAVEMETA preserves buffers with negative logical block numbers
 * (indirect-block metadata).  May sleep; slpflag/slptimeo are passed
 * to tsleep() when waiting on a busy buffer.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
			return (error);
		if (vp->v_dirtyblkhd.lh_first != NULL)
			panic("vinvalbuf: dirty bufs");
	}

	s = splbio();
	for (;;) {
		/* Under V_SAVEMETA, skip leading metadata (b_lblkno < 0). */
		if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
		    (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = bp->b_vnbufs.le_next;
			if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
				continue;
			if (bp->b_flags & B_BUSY) {
				bp->b_flags |= B_WANTED;
				error = tsleep((caddr_t) bp,
				    slpflag | (PRIBIO + 1), "vinvalbuf",
				    slptimeo);
				if (error) {
					splx(s);
					return (error);
				}
				/* List may have changed while asleep; rescan. */
				break;
			}
			bremfree(bp);
			bp->b_flags |= B_BUSY;
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.
			 */
			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
				(void) VOP_BWRITE(bp);
				break;
			}
			bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF);
			brelse(bp);
		}
	}

	/* Wait for all writes in flight to drain (vwakeup signals us). */
	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	object = vp->v_object;
	if (object != NULL) {
		vm_object_page_remove(object, 0, object->size,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}
	if (!(flags & V_SAVEMETA) &&
	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
		panic("vinvalbuf: flush failed");
	return (0);
}

/*
 * Associate a buffer with a vnode.
 * Takes a hold reference on the vnode (VHOLD) and puts the buffer
 * on the vnode's clean list.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	if (bp->b_vp)
		panic("bgetvp: not free");
	VHOLD(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bufinsvn(bp, &vp->v_cleanblkhd);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode; drops the hold reference
 * taken by bgetvp().
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	int s;

	if (bp->b_vp == (struct vnode *) 0)
		panic("brelvp: NULL");
	/*
	 * Delete from old vnode list, if on one.
	 */
	s = splbio();
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	splx(s);

	vp = bp->b_vp;
	bp->b_vp = (struct vnode *) 0;
	HOLDRELE(vp);
}

/*
 * Associate a p-buffer with a vnode.
 * Unlike bgetvp(), no hold reference is taken and the buffer is not
 * put on the vnode's buffer lists.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
#if defined(DIAGNOSTIC)
	if (bp->b_vp)
		panic("pbgetvp: not free");
#endif
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
}

/*
 * Disassociate a p-buffer from a vnode.
+ */ +void +pbrelvp(bp) + register struct buf *bp; +{ + struct vnode *vp; + +#if defined(DIAGNOSTIC) + if (bp->b_vp == (struct vnode *) 0) + panic("pbrelvp: NULL"); +#endif + + bp->b_vp = (struct vnode *) 0; +} + +/* + * Reassign a buffer from one vnode to another. + * Used to assign file specific control information + * (indirect blocks) to the vnode to which they belong. + */ +void +reassignbuf(bp, newvp) + register struct buf *bp; + register struct vnode *newvp; +{ + int s; + + if (newvp == NULL) { + printf("reassignbuf: NULL"); + return; + } + + s = splbio(); + /* + * Delete from old vnode list, if on one. + */ + if (bp->b_vnbufs.le_next != NOLIST) + bufremvn(bp); + /* + * If dirty, put on list of dirty buffers; otherwise insert onto list + * of clean buffers. + */ + if (bp->b_flags & B_DELWRI) { + struct buf *tbp; + + tbp = newvp->v_dirtyblkhd.lh_first; + if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) { + bufinsvn(bp, &newvp->v_dirtyblkhd); + } else { + while (tbp->b_vnbufs.le_next && + (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) { + tbp = tbp->b_vnbufs.le_next; + } + LIST_INSERT_AFTER(tbp, bp, b_vnbufs); + } + } else { + bufinsvn(bp, &newvp->v_cleanblkhd); + } + splx(s); +} + +#ifndef DEVFS_ROOT +/* + * Create a vnode for a block device. + * Used for root filesystem, argdev, and swap areas. + * Also used for memory file system special devices. 
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV)
		return (0);
	error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = 0;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	/* If an alias already existed, discard the new vnode and use it. */
	if ((nvp = checkalias(vp, dev, (struct mount *) 0))) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}
#endif /* !DEVFS_ROOT */

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 *
 * Returns NULLVP when the caller should keep using nvp (no usable
 * alias existed); in that case nvp has been entered on the speclist.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		simple_lock(&vp->v_interlock);
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
			simple_unlock(&spechash_slock);
			goto loop;
		}
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON) {
		/* No live alias: give nvp its specinfo and hash it in. */
		MALLOC(nvp->v_specinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specflags = 0;
		simple_unlock(&spechash_slock);
		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0, p);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}
	/* First reference pulls the vnode off the free list. */
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		freevnodes--;
	}
	vp->v_usecount++;
	/*
	 * Create the VM object, if needed
	 */
	if ((vp->v_type == VREG) &&
	    ((vp->v_object == NULL) ||
	    (vp->v_object->flags & OBJ_VFS_REF) == 0)) {
		/*
		 * XXX vfs_object_create probably needs the interlock.
		 */
		simple_unlock(&vp->v_interlock);
		vfs_object_create(vp, curproc, curproc->p_ucred, 0);
		simple_lock(&vp->v_interlock);
	}
	if (flags & LK_TYPE_MASK) {
		/* On lock failure the reference just taken is dropped. */
		if (error = vn_lock(vp, flags | LK_INTERLOCK, p))
			vrele(vp);
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Stubs to use when there is no locking to be done on the underlying object.
 * A minimal shared lock is necessary to ensure that the underlying object
 * is not revoked while an operation is in progress. So, an active shared
 * count is maintained in an auxillary vnode lock structure.
 */
int
vop_nolock(ap)
	struct vop_lock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
#ifdef notyet
	/*
	 * This code cannot be used until all the non-locking filesystems
	 * (notably NFS) are converted to properly lock and release nodes.
	 * Also, certain vnode operations change the locking state within
	 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
	 * and symlink). Ideally these operations should not change the
	 * lock state, but should be changed to let the caller of the
	 * function unlock them. Otherwise all intermediate vnode layers
	 * (such as union, umapfs, etc) must catch these functions to do
	 * the necessary locking at their layer. Note that the inactive
	 * and lookup operations also change their lock state, but this
	 * cannot be avoided, so these two operations will always need
	 * to be handled in intermediate layers.
	 */
	struct vnode *vp = ap->a_vp;
	int vnflags, flags = ap->a_flags;

	if (vp->v_vnlock == NULL) {
		if ((flags & LK_TYPE_MASK) == LK_DRAIN)
			return (0);
		MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock),
		    M_VNODE, M_WAITOK);
		lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	}
	switch (flags & LK_TYPE_MASK) {
	case LK_DRAIN:
		vnflags = LK_DRAIN;
		break;
	case LK_EXCLUSIVE:
	case LK_SHARED:
		vnflags = LK_SHARED;
		break;
	case LK_UPGRADE:
	case LK_EXCLUPGRADE:
	case LK_DOWNGRADE:
		return (0);
	case LK_RELEASE:
	default:
		panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
	}
	if (flags & LK_INTERLOCK)
		vnflags |= LK_INTERLOCK;
	return(lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
#else /* for now */
	/*
	 * Since we are not using the lock manager, we must clear
	 * the interlock here.
	 */
	if (ap->a_flags & LK_INTERLOCK) {
		simple_unlock(&ap->a_vp->v_interlock);
	}
	return (0);
#endif
}

/*
 * Do the inverse of vop_nolock, handling the interlock in a compatible way.
 */
int
vop_nounlock(ap)
	struct vop_unlock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	if (vp->v_vnlock == NULL) {
		if (ap->a_flags & LK_INTERLOCK)
			simple_unlock(&ap->a_vp->v_interlock);
		return (0);
	}
	return (lockmgr(vp->v_vnlock, LK_RELEASE | ap->a_flags,
	    &ap->a_vp->v_interlock, ap->a_p));
}

/*
 * Return whether or not the node is in use.
+ */ +int +vop_noislocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + if (vp->v_vnlock == NULL) + return (0); + return (lockstatus(vp->v_vnlock)); +} + +/* #ifdef DIAGNOSTIC */ +/* + * Vnode reference, just increment the count + */ +void +vref(vp) + struct vnode *vp; +{ + simple_lock(&vp->v_interlock); + if (vp->v_usecount <= 0) + panic("vref used where vget required"); + + vp->v_usecount++; + + if ((vp->v_type == VREG) && + ((vp->v_object == NULL) || + ((vp->v_object->flags & OBJ_VFS_REF) == 0)) ) { + /* + * We need to lock to VP during the time that + * the object is created. This is necessary to + * keep the system from re-entrantly doing it + * multiple times. + * XXX vfs_object_create probably needs the interlock? + */ + simple_unlock(&vp->v_interlock); + vfs_object_create(vp, curproc, curproc->p_ucred, 0); + return; + } + simple_unlock(&vp->v_interlock); +} + +/* + * Vnode put/release. + * If count drops to zero, call inactive routine and return to freelist. 
+ */ +void +vputrele(vp, put) + struct vnode *vp; + int put; +{ + struct proc *p = curproc; /* XXX */ + +#ifdef DIAGNOSTIC + if (vp == NULL) + panic("vputrele: null vp"); +#endif + simple_lock(&vp->v_interlock); + vp->v_usecount--; + + if ((vp->v_usecount == 1) && + vp->v_object && + (vp->v_object->flags & OBJ_VFS_REF)) { + vp->v_object->flags &= ~OBJ_VFS_REF; + if (put) { + VOP_UNLOCK(vp, LK_INTERLOCK, p); + } else { + simple_unlock(&vp->v_interlock); + } + vm_object_deallocate(vp->v_object); + return; + } + + if (vp->v_usecount > 0) { + if (put) { + VOP_UNLOCK(vp, LK_INTERLOCK, p); + } else { + simple_unlock(&vp->v_interlock); + } + return; + } + + if (vp->v_usecount < 0) { +#ifdef DIAGNOSTIC + vprint("vputrele: negative ref count", vp); +#endif + panic("vputrele: negative ref cnt"); + } + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VAGE) { + vp->v_flag &= ~VAGE; + vp->v_usage = 0; + if(vp->v_tag != VT_TFS) + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + if(vp->v_tag != VT_TFS) + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + freevnodes++; + simple_unlock(&vnode_free_list_slock); + + /* + * If we are doing a vput, the node is already locked, and we must + * call VOP_INACTIVE with the node locked. So, in the case of + * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. + */ + if (put) { + simple_unlock(&vp->v_interlock); + VOP_INACTIVE(vp, p); + } else if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { + VOP_INACTIVE(vp, p); + } +} + +/* + * vput(), just unlock and vrele() + */ +void +vput(vp) + struct vnode *vp; +{ + vputrele(vp, 1); +} + +void +vrele(vp) + struct vnode *vp; +{ + vputrele(vp, 0); +} + +#ifdef DIAGNOSTIC +/* + * Page or buffer structure gets a reference. + */ +void +vhold(vp) + register struct vnode *vp; +{ + + simple_lock(&vp->v_interlock); + vp->v_holdcnt++; + simple_unlock(&vp->v_interlock); +} + +/* + * Page or buffer structure frees a reference. 
 */
void
holdrele(vp)
	register struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_holdcnt <= 0)
		panic("holdrele: holdcnt");
	vp->v_holdcnt--;
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, 1, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

/*
 * Flush vnodes for mount point mp, skipping skipvp if non-NULL.
 * flags: SKIPSYSTEM skips VSYSTEM vnodes, WRITECLOSE only flushes
 * regular files open for writing, FORCECLOSE kills active vnodes.
 * Returns EBUSY if any busy vnodes remain, else 0.
 */
int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over a vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		/* Release the filesystem's reference on the VM object. */
		if (vp->v_object && (vp->v_object->flags & OBJ_VFS_REF)) {
			simple_unlock(&vp->v_interlock);
			simple_unlock(&mntvnode_slock);
			vm_object_reference(vp->v_object);
			pager_cache(vp->v_object, FALSE);
			vp->v_object->flags &= ~OBJ_VFS_REF;
			vm_object_deallocate(vp->v_object);
			simple_lock(&mntvnode_slock);
			simple_lock(&vp->v_interlock);
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 * Called with the vnode interlock held; DOCLOSE in flags also
 * flushes buffers and closes active vnodes.
 */
static void
vclean(struct vnode *vp, int flags, struct proc *p)
{
	int active;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;
	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
	/*
	 * Clean out any buffers associated with the vnode.
	 */
	if (flags & DOCLOSE)
		vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");
	if (active)
		vrele(vp);
	cache_purge(vp);
	if (vp->v_vnlock) {
		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
			vprint("vclean: lock not drained", vp);
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
+ */ +int +vop_revoke(ap) + struct vop_revoke_args /* { + struct vnode *a_vp; + int a_flags; + } */ *ap; +{ + struct vnode *vp, *vq; + struct proc *p = curproc; /* XXX */ + +#ifdef DIAGNOSTIC + if ((ap->a_flags & REVOKEALL) == 0) + panic("vop_revoke"); +#endif + + vp = ap->a_vp; + simple_lock(&vp->v_interlock); + + if (vp->v_flag & VALIASED) { + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0); + return (0); + } + /* + * Ensure that vp will not be vgone'd while we + * are eliminating its aliases. + */ + vp->v_flag |= VXLOCK; + simple_unlock(&vp->v_interlock); + while (vp->v_flag & VALIASED) { + simple_lock(&spechash_slock); + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type || vp == vq) + continue; + simple_unlock(&spechash_slock); + vgone(vq); + break; + } + if (vq == NULLVP) { + simple_unlock(&spechash_slock); + } + } + /* + * Remove the lock so that vgone below will + * really eliminate the vnode after which time + * vgone will awaken any sleepers. + */ + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VXLOCK; + } + vgonel(vp, p); + return (0); +} + +/* + * Recycle an unused vnode to the front of the free list. + * Release the passed interlock if the vnode will be recycled. + */ +int +vrecycle(vp, inter_lkp, p) + struct vnode *vp; + struct simplelock *inter_lkp; + struct proc *p; +{ + + simple_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + if (inter_lkp) { + simple_unlock(inter_lkp); + } + vgonel(vp, p); + return (1); + } + simple_unlock(&vp->v_interlock); + return (0); +} + +/* + * Eliminate all activity associated with a vnode + * in preparation for reuse. 
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 * Leaves the vnode typed VBAD at the head of the free list,
 * ready for immediate reuse by getnewvnode().
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/* Tell the VM layer the backing vnode is going away. */
	if (vp->v_object) {
		vp->v_object->flags |= OBJ_VNODE_GONE;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		simple_lock(&spechash_slock);
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			/*
			 * If only one other alias remains (vx found, loop
			 * fell off the end), it is no longer aliased.
			 */
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		simple_unlock(&spechash_slock);
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list.  The test of the back
	 * pointer and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from the freelist to ensure
	 * that we do not try to move it here.
	 */
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
		    vnode_free_list.tqh_first != vp) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		}
		simple_unlock(&vnode_free_list_slock);
	}

	vp->v_type = VBAD;
}

/*
 * Lookup a vnode by device number.
 * Returns 1 and sets *vpp on a match, 0 otherwise.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Calculate the total number of references to a special device,
 * summing v_usecount over all aliases of the device.
 */
int
vcount(vp)
	register struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(label, vp)
	char *label;
	register struct vnode *vp;
{
	char buf[64];

	if (label != NULL)
		printf("%s: ", label);
	printf("type %s, usecount %d, writecount %d, refcount %ld,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	/* Collect flag names; &buf[1] below skips the leading '|'. */
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VALIASED)
		strcat(buf, "|VALIASED");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
void
printlockedvnodes()
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		for (vp = mp->mnt_vnodelist.lh_first;
		    vp != NULL;
		    vp = vp->v_mntvnodes.le_next) {
			if (VOP_ISLOCKED(vp))
				vprint((char *)0, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * Top level filesystem related information gathering.
+ */ +static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); + +static int +vfs_sysctl SYSCTL_HANDLER_ARGS +{ + int *name = (int *)arg1 - 1; /* XXX */ + u_int namelen = arg2 + 1; /* XXX */ + struct vfsconf *vfsp; + +#ifndef NO_COMPAT_PRELITE2 + /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ + if (namelen == 1) + return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); +#endif + +#ifdef notyet + /* all sysctl names at this level are at least name and field */ + if (namelen < 2) + return (ENOTDIR); /* overloaded */ + if (name[0] != VFS_GENERIC) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[0]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, + oldp, oldlenp, newp, newlen, p)); + } +#endif + switch (name[1]) { + case VFS_MAXTYPENUM: + if (namelen != 2) + return (ENOTDIR); + return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); + case VFS_CONF: + if (namelen != 3) + return (ENOTDIR); /* overloaded */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[2]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); + } + return (EOPNOTSUPP); +} + +SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, + "Generic filesystem"); + +#ifndef NO_COMPAT_PRELITE2 + +static int +sysctl_ovfs_conf SYSCTL_HANDLER_ARGS +{ + int error; + struct vfsconf *vfsp; + struct ovfsconf ovfs; + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ + strcpy(ovfs.vfc_name, vfsp->vfc_name); + ovfs.vfc_index = vfsp->vfc_typenum; + ovfs.vfc_refcount = vfsp->vfc_refcount; + ovfs.vfc_flags = vfsp->vfc_flags; + error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); + if (error) + return error; + } + return 0; +} + +#endif /* !NO_COMPAT_PRELITE2 */ + +int kinfo_vdebug = 1; +int kinfo_vgetfailed; + +#define KINFO_VNODESLOP 10 +/* + * Dump vnode list (via sysctl). 
+ * Copyout address of vnode followed by vnode. + */ +/* ARGSUSED */ +static int +sysctl_vnode SYSCTL_HANDLER_ARGS +{ + struct proc *p = curproc; /* XXX */ + struct mount *mp, *nmp; + struct vnode *nvp, *vp; + int error; + +#define VPTRSZ sizeof (struct vnode *) +#define VNODESZ sizeof (struct vnode) + + req->lock = 0; + if (!req->oldptr) /* Make an estimate */ + return (SYSCTL_OUT(req, 0, + (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } +again: + simple_lock(&mntvnode_slock); + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = nvp) { + /* + * Check that the vp is still associated with + * this filesystem. RACE: could have been + * recycled onto the same filesystem. + */ + if (vp->v_mount != mp) { + simple_unlock(&mntvnode_slock); + if (kinfo_vdebug) + printf("kinfo: vp changed\n"); + goto again; + } + nvp = vp->v_mntvnodes.le_next; + simple_unlock(&mntvnode_slock); + if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || + (error = SYSCTL_OUT(req, vp, VNODESZ))) + return (error); + simple_lock(&mntvnode_slock); + } + simple_unlock(&mntvnode_slock); + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + + return (0); +} + +SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_vnode, "S,vnode", ""); + +/* + * Check to see if a filesystem is mounted on a block device. 
 */
/*
 * Returns EBUSY if the device vnode (or any alias of it) already has a
 * filesystem mounted on it, 0 otherwise.  Aliases are scanned under
 * spechash_slock; a matching alias must have the same rdev and type.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_specflags & SI_MOUNTEDON)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specflags & SI_MOUNTEDON) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Unmount all filesystems. The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall()
{
	struct mount *mp, *nmp;
	struct proc *p = initproc;	/* XXX XXX should this be proc0? */
	int error;

	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		/* Save the predecessor now: dounmount() frees mp. */
		nmp = mp->mnt_list.cqe_prev;
		error = dounmount(mp, MNT_FORCE, p);
		if (error) {
			/* Best effort only: report the failure and go on. */
			printf("unmount of %s failed (",
			    mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		}
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
+ */ +static int +vfs_hang_addrlist(struct mount *mp, struct netexport *nep, + struct export_args *argp) +{ + register struct netcred *np; + register struct radix_node_head *rnh; + register int i; + struct radix_node *rn; + struct sockaddr *saddr, *smask = 0; + struct domain *dom; + int error; + + if (argp->ex_addrlen == 0) { + if (mp->mnt_flag & MNT_DEFEXPORTED) + return (EPERM); + np = &nep->ne_defexported; + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + mp->mnt_flag |= MNT_DEFEXPORTED; + return (0); + } + i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; + np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); + bzero((caddr_t) np, i); + saddr = (struct sockaddr *) (np + 1); + if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) + goto out; + if (saddr->sa_len > argp->ex_addrlen) + saddr->sa_len = argp->ex_addrlen; + if (argp->ex_masklen) { + smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); + error = copyin(argp->ex_addr, (caddr_t) smask, argp->ex_masklen); + if (error) + goto out; + if (smask->sa_len > argp->ex_masklen) + smask->sa_len = argp->ex_masklen; + } + i = saddr->sa_family; + if ((rnh = nep->ne_rtable[i]) == 0) { + /* + * Seems silly to initialize every AF when most are not used, + * do so on demand here + */ + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_family == i && dom->dom_rtattach) { + dom->dom_rtattach((void **) &nep->ne_rtable[i], + dom->dom_rtoffset); + break; + } + if ((rnh = nep->ne_rtable[i]) == 0) { + error = ENOBUFS; + goto out; + } + } + rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, + np->netc_rnodes); + if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ + error = EPERM; + goto out; + } + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + return (0); +out: + free(np, M_NETADDR); + return (error); +} + +/* ARGSUSED */ +static int 
+vfs_free_netcred(struct radix_node *rn, void *w) +{ + register struct radix_node_head *rnh = (struct radix_node_head *) w; + + (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); + free((caddr_t) rn, M_NETADDR); + return (0); +} + +/* + * Free the net address hash lists that are hanging off the mount points. + */ +static void +vfs_free_addrlist(struct netexport *nep) +{ + register int i; + register struct radix_node_head *rnh; + + for (i = 0; i <= AF_MAX; i++) + if ((rnh = nep->ne_rtable[i])) { + (*rnh->rnh_walktree) (rnh, vfs_free_netcred, + (caddr_t) rnh); + free((caddr_t) rnh, M_RTABLE); + nep->ne_rtable[i] = 0; + } +} + +int +vfs_export(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + + if (argp->ex_flags & MNT_DELEXPORT) { + vfs_free_addrlist(nep); + mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); + } + if (argp->ex_flags & MNT_EXPORTED) { + if ((error = vfs_hang_addrlist(mp, nep, argp))) + return (error); + mp->mnt_flag |= MNT_EXPORTED; + } + return (0); +} + +struct netcred * +vfs_export_lookup(mp, nep, nam) + register struct mount *mp; + struct netexport *nep; + struct mbuf *nam; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + struct sockaddr *saddr; + + np = NULL; + if (mp->mnt_flag & MNT_EXPORTED) { + /* + * Lookup in the export list first. + */ + if (nam != NULL) { + saddr = mtod(nam, struct sockaddr *); + rnh = nep->ne_rtable[saddr->sa_family]; + if (rnh != NULL) { + np = (struct netcred *) + (*rnh->rnh_matchaddr)((caddr_t)saddr, + rnh); + if (np && np->netc_rnodes->rn_flags & RNF_ROOT) + np = NULL; + } + } + /* + * If no address match, use the default if it exists. + */ + if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) + np = &nep->ne_defexported; + } + return (np); +} + +/* + * perform msync on all vnodes under a mount point + * the mount point must be locked. 
 */
/*
 * Push dirty VM pages of every vnode on the mount to their backing
 * store.  With flags != MNT_WAIT, locked vnodes are skipped rather
 * than waited for.  Restarts the scan if a vnode migrates off this
 * mount (same race noted in sysctl_vnode above).
 */
void
vfs_msync(struct mount *mp, int flags) {
	struct vnode *vp, *nvp;
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {

		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT))
			continue;
		if (vp->v_object &&
		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
			vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE);
		}
	}
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems might
 * afford the additional metadata buffering capability of the
 * VMIO code by making the device node be VMIO mode also.
 */
int
vfs_object_create(vp, p, cred, waslocked)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	int waslocked;
{
	struct vattr vat;
	vm_object_t object;
	int error = 0;

retry:
	if ((object = vp->v_object) == NULL) {
		if (vp->v_type == VREG) {
			/* Size the object from the file's current length. */
			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
				goto retn;
			(void) vnode_pager_alloc(vp,
				OFF_TO_IDX(round_page(vat.va_size)), 0, 0);
		} else {
			/*
			 * This simply allocates the biggest object possible
			 * for a VBLK vnode.  This should be fixed, but doesn't
			 * cause any problems (yet).
			 */
			(void) vnode_pager_alloc(vp, INT_MAX, 0, 0);
		}
		vp->v_object->flags |= OBJ_VFS_REF;
	} else {
		if (object->flags & OBJ_DEAD) {
			/*
			 * Object is being torn down: drop the vnode lock (if
			 * we hold it), wait for teardown, and start over.
			 */
			if (waslocked)
				VOP_UNLOCK(vp, 0, p);
			tsleep(object, PVM, "vodead", 0);
			if (waslocked)
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			goto retry;
		}
		/* Take the VFS reference exactly once per object. */
		if ((object->flags & OBJ_VFS_REF) == 0) {
			object->flags |= OBJ_VFS_REF;
			vm_object_reference(object);
		}
	}
	if (vp->v_object)
		vp->v_flag |= VVMIO;

retn:
	return error;
}
diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c
new file mode 100644
index 0000000..2997fe5
--- /dev/null
+++ b/sys/kern/vfs_extattr.c
@@ -0,0 +1,2756 @@
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + * $Id: vfs_syscalls.c,v 1.60 1997/03/23 03:36:35 bde Exp $ + */ + +/* + * XXX - The following is required because of some magic done + * in getdirentries() below which is only done if the translucent + * filesystem `UNION' is compiled into the kernel. This is broken, + * but I don't have time to study the code deeply enough to understand + * what's going on and determine an appropriate fix. -GAW + */ +#include "opt_union.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/namei.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/malloc.h> +#include <sys/dirent.h> + +#ifdef UNION +#include <miscfs/union/union.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <sys/sysctl.h> + +static int change_dir __P((struct nameidata *ndp, struct proc *p)); +static void checkdirs __P((struct vnode *olddp)); + +/* + * Virtual File System System Calls + */ + +/* + * Mount a file system. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +mount(p, uap, retval) + struct proc *p; + register struct mount_args /* { + syscallarg(char *) type; + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(caddr_t) data; + } */ *uap; + register_t *retval; +{ + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + int error, flag = 0; + struct vattr va; + u_long fstypenum; + struct nameidata nd; + char fstypename[MFSNAMELEN]; + + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (SCARG(uap, flags) & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + flag = mp->mnt_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + if ((SCARG(uap, flags) & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + mp->mnt_flag |= + SCARG(uap, flags) & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (vfs_busy(mp, LK_NOWAIT, 0, p)) { + vput(vp); + return (EBUSY); + } + VOP_UNLOCK(vp, 0, p); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. 
+ */ + if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) || + (va.va_uid != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag)))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (error = vinvalbuf(vp, V_SAVE, p->p_ucred, p, 0, 0)) + return (error); + if (vp->v_type != VDIR) { + vput(vp); + return (ENOTDIR); + } +#ifdef COMPAT_43 + /* + * Historically filesystem types were identified by number. If we + * get an integer for the filesystem type instead of a string, we + * check to see if it matches one of the historic filesystem types. + */ + fstypenum = (u_long)SCARG(uap, type); + if (fstypenum < maxvfsconf) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == fstypenum) + break; + if (vfsp == NULL) { + vput(vp); + return (ENODEV); + } + strncpy(fstypename, vfsp->vfc_name, MFSNAMELEN); + } else +#endif /* COMPAT_43 */ + if (error = copyinstr(SCARG(uap, type), fstypename, MFSNAMELEN, NULL)) { + vput(vp); + return (error); + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) { + vput(vp); + return (ENODEV); + } + if (vp->v_mountedhere != NULL) { + vput(vp); + return (EBUSY); + } + + /* + * Allocate and initialize the filesystem. 
+ */ + mp = (struct mount *)malloc((u_long)sizeof(struct mount), + M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); + (void)vfs_busy(mp, LK_NOWAIT, 0, p); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + vp->v_mountedhere = mp; + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = p->p_ucred->cr_uid; +update: + /* + * Set the mount level flags. + */ + if (SCARG(uap, flags) & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_flag |= MNT_WANTRDWR; + mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME); + mp->mnt_flag |= SCARG(uap, flags) & (MNT_NOSUID | MNT_NOEXEC | + MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE | + MNT_NOATIME); + /* + * Mount the filesystem. + */ + error = VFS_MOUNT(mp, SCARG(uap, path), SCARG(uap, data), &nd, p); + if (mp->mnt_flag & MNT_UPDATE) { + vrele(vp); + if (mp->mnt_flag & MNT_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_WANTRDWR); + if (error) + mp->mnt_flag = flag; + vfs_unbusy(mp, p); + return (error); + } + /* + * Put the new filesystem on the mount list after root. 
+ */ + cache_purge(vp); + if (!error) { + simple_lock(&mountlist_slock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + simple_unlock(&mountlist_slock); + checkdirs(vp); + VOP_UNLOCK(vp, 0, p); + vfs_unbusy(mp, p); + if (error = VFS_START(mp, 0, p)) + vrele(vp); + } else { + mp->mnt_vnodecovered->v_mountedhere = (struct mount *)0; + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, p); + free((caddr_t)mp, M_MOUNT); + vput(vp); + } + return (error); +} + +/* + * Scan all active processes to see if any of them have a current + * or root directory onto which the new filesystem has just been + * mounted. If so, replace them with the new mount point. + */ +static void +checkdirs(olddp) + struct vnode *olddp; +{ + struct filedesc *fdp; + struct vnode *newdp; + struct proc *p; + + if (olddp->v_usecount == 1) + return; + if (VFS_ROOT(olddp->v_mountedhere, &newdp)) + panic("mount: lost mount"); + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + fdp = p->p_fd; + if (fdp->fd_cdir == olddp) { + vrele(fdp->fd_cdir); + VREF(newdp); + fdp->fd_cdir = newdp; + } + if (fdp->fd_rdir == olddp) { + vrele(fdp->fd_rdir); + VREF(newdp); + fdp->fd_rdir = newdp; + } + } + if (rootvnode == olddp) { + vrele(rootvnode); + VREF(newdp); + rootvnode = newdp; + } + vput(newdp); +} + +/* + * Unmount a file system. + * + * Note: unmount takes a path to the vnode mounted on as argument, + * not special file (as before). 
 */
#ifndef _SYS_SYSPROTO_H_
struct unmount_args {
	char	*path;
	int	flags;
};
#endif
/*
 * unmount(2) system call: validate ownership and that the path names
 * the root of a mounted, non-root filesystem, then hand the real work
 * to dounmount().
 */
/* ARGSUSED */
int
unmount(p, uap, retval)
	struct proc *p;
	register struct unmount_args /* {
		syscallarg(char *) path;
		syscallarg(int) flags;
	} */ *uap;
	register_t *retval;
{
	register struct vnode *vp;
	struct mount *mp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
	    SCARG(uap, path), p);
	if (error = namei(&nd))
		return (error);
	vp = nd.ni_vp;
	mp = vp->v_mount;

	/*
	 * Only root, or the user that did the original mount is
	 * permitted to unmount this filesystem.
	 */
	if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) &&
	    (error = suser(p->p_ucred, &p->p_acflag))) {
		vput(vp);
		return (error);
	}

	/*
	 * Don't allow unmounting the root file system.
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		vput(vp);
		return (EINVAL);
	}

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vput(vp);
		return (EINVAL);
	}
	/* Drop the vnode before dounmount(); mp stays valid via mountlist. */
	vput(vp);
	return (dounmount(mp, SCARG(uap, flags), p));
}

/*
 * Do the actual file system unmount.
 */
/*
 * Core unmount path, shared by unmount(2) and vfs_unmountall().
 *
 * Drains the mount lock (LK_INTERLOCK hands off mountlist_slock to
 * lockmgr), flushes dirty pages and cached vnodes, then calls the
 * filesystem's VFS_UNMOUNT.  On success mp is removed from the mount
 * list and freed; on failure the mount lock is re-enabled and the
 * mount is left usable.  MNT_FORCE attempts the unmount even if the
 * pre-unmount VFS_SYNC failed.
 */
int
dounmount(mp, flags, p)
	register struct mount *mp;
	int flags;
	struct proc *p;
{
	struct vnode *coveredvp;
	int error;

	simple_lock(&mountlist_slock);
	mp->mnt_flag |= MNT_UNMOUNT;
	lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p);
	mp->mnt_flag &=~ MNT_ASYNC;
	vfs_msync(mp, MNT_NOWAIT);
	vnode_pager_umount(mp);	/* release cached vnodes */
	cache_purgevfs(mp);	/* remove cache entries for this file sys */
	if (((mp->mnt_flag & MNT_RDONLY) ||
	     (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) ||
	    (flags & MNT_FORCE))
		error = VFS_UNMOUNT(mp, flags, p);
	simple_lock(&mountlist_slock);
	if (error) {
		/* Unmount failed: re-enable the drained mount lock. */
		mp->mnt_flag &= ~MNT_UNMOUNT;
		lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK | LK_REENABLE,
		    &mountlist_slock, p);
		return (error);
	}
	CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		coveredvp->v_mountedhere = (struct mount *)0;
		vrele(coveredvp);
	}
	mp->mnt_vfc->vfc_refcount--;
	if (mp->mnt_vnodelist.lh_first != NULL)
		panic("unmount: dangling vnode");
	lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_slock, p);
	/* Wake anyone sleeping on the mount before it is freed. */
	if (mp->mnt_flag & MNT_MWAIT)
		wakeup((caddr_t)mp);
	free((caddr_t)mp, M_MOUNT);
	return (0);
}

/*
 * Sync each mounted filesystem.
 */
#ifndef _SYS_SYSPROTO_H_
struct sync_args {
        int     dummy;
};
#endif

#ifdef DEBUG
int syncprt = 0;
SYSCTL_INT(_debug, 0, syncprt, CTLFLAG_RW, &syncprt, 0, "");
#endif

/*
 * sync(2): walk the mount list and push dirty data of every writable
 * filesystem, without waiting for the writes (MNT_NOWAIT).  MNT_ASYNC
 * is temporarily cleared so the sync is issued synchronously-queued.
 */
/* ARGSUSED */
int
sync(p, uap, retval)
	struct proc *p;
	struct sync_args *uap;
	register_t *retval;
{
	register struct mount *mp, *nmp;
	int asyncflag;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			asyncflag = mp->mnt_flag & MNT_ASYNC;
			mp->mnt_flag &= ~MNT_ASYNC;
			vfs_msync(mp, MNT_NOWAIT);
			VFS_SYNC(mp, MNT_NOWAIT, p != NULL ? p->p_ucred : NOCRED, p);
			if (asyncflag)
				mp->mnt_flag |= MNT_ASYNC;
		}
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
#if 0
/*
 * XXX don't call vfs_bufstats() yet because that routine
 * was not imported in the Lite2 merge.
 */
#ifdef DIAGNOSTIC
	if (syncprt)
		vfs_bufstats();
#endif /* DIAGNOSTIC */
#endif
	return (0);
}

/*
 * Change filesystem quotas.
 */
#ifndef _SYS_SYSPROTO_H_
struct quotactl_args {
	char *path;
	int cmd;
	int uid;
	caddr_t arg;
};
#endif
/*
 * quotactl(2): resolve the path to its mount point and forward the
 * quota command to the filesystem via VFS_QUOTACTL.
 */
/* ARGSUSED */
int
quotactl(p, uap, retval)
	struct proc *p;
	register struct quotactl_args /* {
		syscallarg(char *) path;
		syscallarg(int) cmd;
		syscallarg(int) uid;
		syscallarg(caddr_t) arg;
	} */ *uap;
	register_t *retval;
{
	register struct mount *mp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
	if (error = namei(&nd))
		return (error);
	mp = nd.ni_vp->v_mount;
	/* NOTE(review): mp is used after vrele(); assumes the mount cannot
	 * disappear here — historic behavior, unguarded. */
	vrele(nd.ni_vp);
	return (VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
	    SCARG(uap, arg), p));
}

/*
 * Get filesystem statistics.
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +statfs(p, uap, retval) + struct proc *p; + register struct statfs_args /* { + syscallarg(char *) path; + syscallarg(struct statfs *) buf; + } */ *uap; + register_t *retval; +{ + register struct mount *mp; + register struct statfs *sp; + int error; + struct nameidata nd; + struct statfs sb; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + vrele(nd.ni_vp); + error = VFS_STATFS(mp, sp, p); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +fstatfs(p, uap, retval) + struct proc *p; + register struct fstatfs_args /* { + syscallarg(int) fd; + syscallarg(struct statfs *) buf; + } */ *uap; + register_t *retval; +{ + struct file *fp; + struct mount *mp; + register struct statfs *sp; + int error; + struct statfs sb; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + mp = ((struct vnode *)fp->f_data)->v_mount; + sp = &mp->mnt_stat; + error = VFS_STATFS(mp, sp, p); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get statistics on all filesystems. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int flags; +}; +#endif +int +getfsstat(p, uap, retval) + struct proc *p; + register struct getfsstat_args /* { + syscallarg(struct statfs *) buf; + syscallarg(long) bufsize; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + register struct mount *mp, *nmp; + register struct statfs *sp; + caddr_t sfsp; + long count, maxcount, error; + + maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); + sfsp = (caddr_t)SCARG(uap, buf); + count = 0; + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + if (sfsp && count < maxcount) { + sp = &mp->mnt_stat; + /* + * If MNT_NOWAIT is specified, do not refresh the + * fsstat cache. MNT_WAIT overrides MNT_NOWAIT. + */ + if (((SCARG(uap, flags) & MNT_NOWAIT) == 0 || + (SCARG(uap, flags) & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, p))) { + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + continue; + } + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = copyout((caddr_t)sp, sfsp, sizeof(*sp)); + if (error) { + vfs_unbusy(mp, p); + return (error); + } + sfsp += sizeof(*sp); + } + count++; + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + if (sfsp && count > maxcount) + *retval = maxcount; + else + *retval = count; + return (0); +} + +/* + * Change current working directory to a given file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fchdir(p, uap, retval) + struct proc *p; + struct fchdir_args /* { + syscallarg(int) fd; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + struct vnode *vp, *tdp; + struct mount *mp; + struct file *fp; + int error; + + if (error = getvnode(fdp, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VREF(vp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + while (!error && (mp = vp->v_mountedhere) != NULL) { + if (vfs_busy(mp, 0, 0, p)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, p); + if (error) + break; + vput(vp); + vp = tdp; + } + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, p); + vrele(fdp->fd_cdir); + fdp->fd_cdir = vp; + return (0); +} + +/* + * Change current working directory (``.''). + */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chdir(p, uap, retval) + struct proc *p; + struct chdir_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + vrele(fdp->fd_cdir); + fdp->fd_cdir = nd.ni_vp; + return (0); +} + +/* + * Change notion of root (``/'') directory. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chroot(p, uap, retval) + struct proc *p; + struct chroot_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + if (fdp->fd_rdir != NULL) + vrele(fdp->fd_rdir); + fdp->fd_rdir = nd.ni_vp; + return (0); +} + +/* + * Common routine for chroot and chdir. + */ +static int +change_dir(ndp, p) + register struct nameidata *ndp; + struct proc *p; +{ + struct vnode *vp; + int error; + + error = namei(ndp); + if (error) + return (error); + vp = ndp->ni_vp; + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + if (error) + vput(vp); + else + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Check permissions, allocate an open file structure, + * and call the device open routine if any. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif +int +open(p, uap, retval) + struct proc *p; + register struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + register struct vnode *vp; + int flags, cmode; + struct file *nfp; + int type, indx, error; + struct flock lf; + struct nameidata nd; + + error = falloc(p, &nfp, &indx); + if (error) + return (error); + fp = nfp; + flags = FFLAGS(SCARG(uap, flags)); + cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + p->p_dupfd = -indx - 1; /* XXX check for fdopen */ + error = vn_open(&nd, flags, cmode); + if (error) { + ffree(fp); + if ((error == ENODEV || error == ENXIO) && + p->p_dupfd >= 0 && /* XXX from fdopen */ + (error = + dupfdopen(fdp, indx, p->p_dupfd, flags, error)) == 0) { + *retval = indx; + return (0); + } + if (error == ERESTART) + error = EINTR; + fdp->fd_ofiles[indx] = NULL; + return (error); + } + p->p_dupfd = 0; + vp = nd.ni_vp; + + fp->f_flag = flags & FMASK; + fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + fp->f_ops = &vnops; + fp->f_data = (caddr_t)vp; + if (flags & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (flags & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((flags & FNONBLOCK) == 0) + type |= F_WAIT; + VOP_UNLOCK(vp, 0, p); + if (error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) { + (void) vn_close(vp, fp->f_flag, fp->f_cred, p); + ffree(fp); + fdp->fd_ofiles[indx] = NULL; + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + fp->f_flag |= FHASLOCK; + } + VOP_UNLOCK(vp, 0, p); + *retval = indx; + return (0); +} + +#ifdef COMPAT_43 +/* + * Create a file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif +int +ocreat(p, uap, retval) + struct proc *p; + register struct ocreat_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, mode) = SCARG(uap, mode); + SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; + return (open(p, &nuap, retval)); +} +#endif /* COMPAT_43 */ + +/* + * Create a special file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mknod_args { + char *path; + int mode; + int dev; +}; +#endif +/* ARGSUSED */ +int +mknod(p, uap, retval) + struct proc *p; + register struct mknod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + syscallarg(int) dev; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + int whiteout; + struct nameidata nd; + + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) + error = EEXIST; + else { + VATTR_NULL(&vattr); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + vattr.va_rdev = SCARG(uap, dev); + whiteout = 0; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFMT: /* used by badsect to flag bad sectors */ + vattr.va_type = VBAD; + break; + case S_IFCHR: + vattr.va_type = VCHR; + break; + case S_IFBLK: + vattr.va_type = VBLK; + break; + case S_IFWHT: + whiteout = 1; + break; + default: + error = EINVAL; + break; + } + } + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (whiteout) { + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); + if (error) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + } else { + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, + 
&nd.ni_cnd, &vattr); + } + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp) + vrele(vp); + } + return (error); +} + +/* + * Create a named pipe. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkfifo(p, uap, retval) + struct proc *p; + register struct mkfifo_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + if (nd.ni_vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VFIFO; + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + return (VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr)); +} + +/* + * Make a hard file link. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +link(p, uap, retval) + struct proc *p; + register struct link_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, link), p); + error = namei(&nd); + if (!error) { + if (nd.ni_vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, + LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + } + } + } + vrele(vp); + return (error); +} + +/* + * Make a symbolic link. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +symlink(p, uap, retval) + struct proc *p; + register struct symlink_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + char *path; + int error; + struct nameidata nd; + + MALLOC(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) + goto out; + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, link), p); + if (error = namei(&nd)) + goto out; + if (nd.ni_vp) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + error = EEXIST; + goto out; + } + VATTR_NULL(&vattr); + vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); +out: + FREE(path, M_NAMEI); + return (error); +} + +/* + * Delete a whiteout from the filesystem. + */ +/* ARGSUSED */ +int +undelete(p, uap, retval) + struct proc *p; + register struct undelete_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, + SCARG(uap, path), p); + error = namei(&nd); + if (error) + return (error); + + if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + return (EEXIST); + } + + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + return (error); +} + +/* + * Delete a name from the filesystem. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +unlink(p, uap, retval) + struct proc *p; + struct unlink_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + /* + * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? + */ + if (vp->v_flag & VROOT) + error = EBUSY; + else + (void) vnode_pager_uncache(vp, p); + } + + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULLVP) + vput(vp); + } + return (error); +} + +/* + * Reposition read/write file offset. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif +int +lseek(p, uap, retval) + struct proc *p; + register struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ *uap; + register_t *retval; /* XXX */ +{ + struct ucred *cred = p->p_ucred; + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vattr vattr; + int error; + + if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (ESPIPE); + switch (SCARG(uap, whence)) { + case L_INCR: + fp->f_offset += SCARG(uap, offset); + break; + case L_XTND: + error=VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p); + if (error) + return (error); + fp->f_offset = SCARG(uap, offset) + vattr.va_size; + break; + case L_SET: + fp->f_offset = SCARG(uap, offset); + break; + default: + return (EINVAL); + } + *(off_t *)retval = fp->f_offset; + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif +int +olseek(p, uap, retval) + struct proc *p; + register struct olseek_args /* { + syscallarg(int) fd; + syscallarg(long) offset; + syscallarg(int) whence; + } */ *uap; + register_t *retval; +{ + struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ nuap; + off_t qret; + int error; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, offset) = SCARG(uap, offset); + SCARG(&nuap, whence) = SCARG(uap, whence); + error = lseek(p, &nuap, (register_t *) &qret); + *(long *)retval = qret; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Check access permissions. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int flags; +}; +#endif +int +access(p, uap, retval) + struct proc *p; + register struct access_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + register struct ucred *cred = p->p_ucred; + register struct vnode *vp; + int error, flags, t_gid, t_uid; + struct nameidata nd; + + t_uid = cred->cr_uid; + t_gid = cred->cr_groups[0]; + cred->cr_uid = p->p_cred->p_ruid; + cred->cr_groups[0] = p->p_cred->p_rgid; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + goto out1; + vp = nd.ni_vp; + + /* Flags == 0 means only check for existence. */ + if (SCARG(uap, flags)) { + flags = 0; + if (SCARG(uap, flags) & R_OK) + flags |= VREAD; + if (SCARG(uap, flags) & W_OK) + flags |= VWRITE; + if (SCARG(uap, flags) & X_OK) + flags |= VEXEC; + if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, flags, cred, p); + } + vput(vp); +out1: + cred->cr_uid = t_uid; + cred->cr_groups[0] = t_gid; + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +ostat(p, uap, retval) + struct proc *p; + register struct ostat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; + register_t *retval; +{ + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Get file status; this version does not follow links. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +olstat(p, uap, retval) + struct proc *p; + register struct olstat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; + register_t *retval; +{ + struct vnode *vp, *dvp; + struct stat sb, sb1; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKPARENT, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + /* + * For symbolic links, always return the attributes of its + * containing directory, except for mode, size, and links. + */ + vp = nd.ni_vp; + dvp = nd.ni_dvp; + if (vp->v_type != VLNK) { + if (dvp == vp) + vrele(dvp); + else + vput(dvp); + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + } else { + error = vn_stat(dvp, &sb, p); + vput(dvp); + if (error) { + vput(vp); + return (error); + } + error = vn_stat(vp, &sb1, p); + vput(vp); + if (error) + return (error); + sb.st_mode &= ~S_IFDIR; + sb.st_mode |= S_IFLNK; + sb.st_nlink = sb1.st_nlink; + sb.st_size = sb1.st_size; + sb.st_blocks = sb1.st_blocks; + } + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Convert from an old to a new stat structure. 
+ */ +void +cvtstat(st, ost) + struct stat *st; + struct ostat *ost; +{ + + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (st->st_size < (quad_t)1 << 32) + ost->st_size = st->st_size; + else + ost->st_size = -2; + ost->st_atime = st->st_atime; + ost->st_mtime = st->st_mtime; + ost->st_ctime = st->st_ctime; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct stat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +stat(p, uap, retval) + struct proc *p; + register struct stat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; + register_t *retval; +{ + struct stat sb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Get file status; this version does not follow links. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +lstat(p, uap, retval) + struct proc *p; + register struct lstat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; + register_t *retval; +{ + int error; + struct vnode *vp, *dvp; + struct stat sb, sb1; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKPARENT, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + /* + * For symbolic links, always return the attributes of its containing + * directory, except for mode, size, inode number, and links. + */ + vp = nd.ni_vp; + dvp = nd.ni_dvp; + if (vp->v_type != VLNK) { + if (dvp == vp) + vrele(dvp); + else + vput(dvp); + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + } else { + error = vn_stat(dvp, &sb, p); + vput(dvp); + if (error) { + vput(vp); + return (error); + } + error = vn_stat(vp, &sb1, p); + vput(vp); + if (error) + return (error); + sb.st_mode &= ~S_IFDIR; + sb.st_mode |= S_IFLNK; + sb.st_nlink = sb1.st_nlink; + sb.st_size = sb1.st_size; + sb.st_blocks = sb1.st_blocks; + sb.st_ino = sb1.st_ino; + } + error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Get configurable pathname variables. + */ +#ifndef _SYS_SYSPROTO_H_ +struct pathconf_args { + char *path; + int name; +}; +#endif +/* ARGSUSED */ +int +pathconf(p, uap, retval) + struct proc *p; + register struct pathconf_args /* { + syscallarg(char *) path; + syscallarg(int) name; + } */ *uap; + register_t *retval; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval); + vput(nd.ni_vp); + return (error); +} + +/* + * Return target name of a symbolic link. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct readlink_args { + char *path; + char *buf; + int count; +}; +#endif +/* ARGSUSED */ +int +readlink(p, uap, retval) + struct proc *p; + register struct readlink_args /* { + syscallarg(char *) path; + syscallarg(char *) buf; + syscallarg(int) count; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct iovec aiov; + struct uio auio; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VLNK) + error = EINVAL; + else { + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + error = VOP_READLINK(vp, &auio, p->p_ucred); + } + vput(vp); + *retval = SCARG(uap, count) - auio.uio_resid; + return (error); +} + +/* + * Change flags of a file given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +chflags(p, uap, retval) + struct proc *p; + register struct chflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_flags = SCARG(uap, flags); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + vput(vp); + return (error); +} + +/* + * Change flags of a file given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + int flags; +}; +#endif +/* ARGSUSED */ +int +fchflags(p, uap, retval) + struct proc *p; + register struct fchflags_args /* { + syscallarg(int) fd; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_flags = SCARG(uap, flags); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Change mode of a file given path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +chmod(p, uap, retval) + struct proc *p; + register struct chmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_mode = SCARG(uap, mode) & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + vput(vp); + return (error); +} + +/* + * Change mode of a file given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif +/* ARGSUSED */ +int +fchmod(p, uap, retval) + struct proc *p; + register struct fchmod_args /* { + syscallarg(int) fd; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_mode = SCARG(uap, mode) & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Set ownership given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +chown(p, uap, retval) + struct proc *p; + register struct chown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_uid = SCARG(uap, uid); + vattr.va_gid = SCARG(uap, gid); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + vput(vp); + return (error); +} + +/* + * Set ownership given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchown_args { + int fd; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +fchown(p, uap, retval) + struct proc *p; + register struct fchown_args /* { + syscallarg(int) fd; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_uid = SCARG(uap, uid); + vattr.va_gid = SCARG(uap, gid); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct utimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +utimes(p, uap, retval) + struct proc *p; + register struct utimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct timeval tv[2]; + struct vattr vattr; + int error; + struct nameidata nd; + + VATTR_NULL(&vattr); + if (SCARG(uap, tptr) == NULL) { + microtime(&tv[0]); + tv[1] = tv[0]; + vattr.va_vaflags |= VA_UTIMES_NULL; + } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv, + sizeof (tv))) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + vattr.va_atime.tv_sec = tv[0].tv_sec; + vattr.va_atime.tv_nsec = tv[0].tv_usec * 1000; + vattr.va_mtime.tv_sec = tv[1].tv_sec; + vattr.va_mtime.tv_nsec = tv[1].tv_usec * 1000; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + vput(vp); + return (error); +} + +/* + * Truncate a file given its path name. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +truncate(p, uap, retval) + struct proc *p; + register struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + if (uap->length < 0) + return(EINVAL); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + vput(vp); + return (error); +} + +/* + * Truncate a file given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +ftruncate(p, uap, retval) + struct proc *p; + register struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (uap->length < 0) + return(EINVAL); + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FWRITE) == 0) + return (EINVAL); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, fp->f_cred, p); + } + VOP_UNLOCK(vp, 0, p); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Truncate a file given its path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif +/* ARGSUSED */ +int +otruncate(p, uap, retval) + struct proc *p; + register struct otruncate_args /* { + syscallarg(char *) path; + syscallarg(long) length; + } */ *uap; + register_t *retval; +{ + struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, length) = SCARG(uap, length); + return (truncate(p, &nuap, retval)); +} + +/* + * Truncate a file given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct oftruncate_args { + int fd; + long length; +}; +#endif +/* ARGSUSED */ +int +oftruncate(p, uap, retval) + struct proc *p; + register struct oftruncate_args /* { + syscallarg(int) fd; + syscallarg(long) length; + } */ *uap; + register_t *retval; +{ + struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, length) = SCARG(uap, length); + return (ftruncate(p, &nuap, retval)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Sync an open file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fsync(p, uap, retval) + struct proc *p; + struct fsync_args /* { + syscallarg(int) fd; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_object) { + vm_object_page_clean(vp->v_object, 0, 0 ,0, FALSE); + } + error = VOP_FSYNC(vp, fp->f_cred, + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC)) ? + MNT_NOWAIT : MNT_WAIT, p); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif +/* ARGSUSED */ +int +rename(p, uap, retval) + struct proc *p; + register struct rename_args /* { + syscallarg(char *) from; + syscallarg(char *) to; + } */ *uap; + register_t *retval; +{ + register struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + + NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, + SCARG(uap, from), p); + if (error = namei(&fromnd)) + return (error); + fvp = fromnd.ni_vp; + NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART, + UIO_USERSPACE, SCARG(uap, to), p); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + if (error = namei(&tond)) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same inode number with the same name in the same directory), + * then there is nothing to do. 
+ */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + VOP_LEASE(tdvp, p, p->p_ucred, LEASE_WRITE); + if (fromnd.ni_dvp != tdvp) + VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (tvp) { + VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE); + (void) vnode_pager_uncache(tvp, p); + } + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + } else { + VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + FREE(tond.ni_cnd.cn_pnbuf, M_NAMEI); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + FREE(fromnd.ni_cnd.cn_pnbuf, M_NAMEI); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkdir(p, uap, retval) + struct proc *p; + register struct mkdir_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + nd.ni_cnd.cn_flags |= WILLBEDIR; + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + vattr.va_mode = (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + if (!error) + vput(nd.ni_vp); + return (error); +} + +/* + * Remove a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +rmdir(p, uap, retval) + struct proc *p; + struct rmdir_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. 
+ */ + if (vp->v_flag & VROOT) + error = EBUSY; +out: + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + } + return (error); +} + +#ifdef COMPAT_43 +/* + * Read a block of directory entries in a file system independent format. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +ogetdirentries(p, uap, retval) + struct proc *p; + register struct ogetdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct file *fp; + struct uio auio, kuio; + struct iovec aiov, kiov; + struct dirent *dp, *edp; + caddr_t dirbuf; + int error, eofflag, readcnt; + long loff; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; +# if (BYTE_ORDER != LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen <= 0) { + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = auio.uio_offset; + } else +# endif + { + kuio = auio; + kuio.uio_iov = &kiov; + kuio.uio_segflg = UIO_SYSSPACE; + kiov.iov_len = SCARG(uap, count); + MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); + kiov.iov_base = dirbuf; + error = 
VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = kuio.uio_offset; + if (error == 0) { + readcnt = SCARG(uap, count) - kuio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of + * dp->d_namlen is our dp->d_type. + * The high MBZ byte of dp->d_namlen + * is our dp->d_namlen. + */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +# else + /* + * The dp->d_type is the high byte + * of the expected dp->d_namlen, + * so must be zero'ed. + */ + dp->d_type = 0; +# endif + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, &auio); + } + FREE(dirbuf, M_TEMP); + } + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + +#ifdef UNION +{ + if ((SCARG(uap, count) == auio.uio_resid) && + (vp->v_op == union_vnodeop_p)) { + struct vnode *lvp; + + lvp = union_dircache(vp, p); + if (lvp != NULLVP) { + struct vattr va; + + /* + * If the directory is opaque, + * then don't show lower entries + */ + error = VOP_GETATTR(vp, &va, fp->f_cred, p); + if (va.va_flags & OPAQUE) { + vput(lvp); + lvp = NULL; + } + } + + if (lvp != NULLVP) { + error = VOP_OPEN(lvp, FREAD, fp->f_cred, p); + if (error) { + vput(lvp); + return (error); + } + VOP_UNLOCK(lvp, 0, p); + fp->f_data = (caddr_t) lvp; + fp->f_offset = 0; + error = vn_close(vp, FREAD, fp->f_cred, p); + if (error) + return (error); + vp = lvp; + goto unionread; + } + } +} +#endif /* UNION */ + + if ((SCARG(uap, count) == auio.uio_resid) && + (vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = (caddr_t) vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + *retval = SCARG(uap, count) - 
auio.uio_resid; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Read a block of directory entries in a file system independent format. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +getdirentries(p, uap, retval) + struct proc *p; + register struct getdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + long loff; + int error, eofflag; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); + fp->f_offset = auio.uio_offset; + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + +#ifdef UNION +{ + if ((SCARG(uap, count) == auio.uio_resid) && + (vp->v_op == union_vnodeop_p)) { + struct vnode *lvp; + + lvp = union_dircache(vp, p); + if (lvp != NULLVP) { + struct vattr va; + + /* + * If the directory is opaque, + * then don't show lower entries + */ + error = VOP_GETATTR(vp, &va, fp->f_cred, p); + if (va.va_flags & OPAQUE) { + vput(lvp); + lvp = NULL; + } + } + + if (lvp != NULLVP) { + error = VOP_OPEN(lvp, FREAD, fp->f_cred, p); + if (error) { + vput(lvp); + return (error); + } + VOP_UNLOCK(lvp, 0, p); + fp->f_data = (caddr_t) lvp; + fp->f_offset = 0; + error = vn_close(vp, FREAD, fp->f_cred, p); + if (error) + return (error); + vp = lvp; + 
goto unionread; + } + } +} +#endif /* UNION */ + + if ((SCARG(uap, count) == auio.uio_resid) && + (vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = (caddr_t) vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + *retval = SCARG(uap, count) - auio.uio_resid; + return (error); +} + +/* + * Set the mode mask for creation of filesystem nodes. + */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif +int +umask(p, uap, retval) + struct proc *p; + struct umask_args /* { + syscallarg(int) newmask; + } */ *uap; + int *retval; /* XXX */ +{ + register struct filedesc *fdp; + + fdp = p->p_fd; + *retval = fdp->fd_cmask; + fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem + * away from vnode. + */ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +revoke(p, uap, retval) + struct proc *p; + register struct revoke_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p)) + goto out; + if (p->p_ucred->cr_uid != vattr.va_uid && + (error = suser(p->p_ucred, &p->p_acflag))) + goto out; + if (vp->v_usecount > 1 || (vp->v_flag & VALIASED)) + VOP_REVOKE(vp, REVOKEALL); +out: + vrele(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry. 
+ */ +int +getvnode(fdp, fd, fpp) + struct filedesc *fdp; + int fd; + struct file **fpp; +{ + struct file *fp; + + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) + return (EINVAL); + *fpp = fp; + return (0); +} diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c index b5abe58..21061e8 100644 --- a/sys/kern/vfs_init.c +++ b/sys/kern/vfs_init.c @@ -35,11 +35,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)vfs_init.c 8.5 (Berkeley) 5/11/95 + * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94 + * $Id: vfs_init.c,v 1.24 1997/02/22 09:39:32 peter Exp $ */ #include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> #include <sys/mount.h> #include <sys/time.h> #include <sys/vnode.h> @@ -49,6 +52,12 @@ #include <sys/buf.h> #include <sys/errno.h> #include <sys/malloc.h> +#include <sys/proc.h> + +static void vfs_op_init __P((void)); + +static void vfsinit __P((void *)); +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vfsinit, NULL) /* * Sigh, such primitive tools are these... @@ -59,8 +68,13 @@ #define DODEBUG(A) #endif -extern struct vnodeopv_desc *vfs_opv_descs[]; - /* a list of lists of vnodeops defns */ +struct vfsconf void_vfsconf; + +extern struct linker_set vfs_opv_descs_; +#define vfs_opv_descs ((struct vnodeopv_desc **)vfs_opv_descs_.ls_items) + +extern struct linker_set vfs_set; + extern struct vnodeop_desc *vfs_op_descs[]; /* and the operations they perform */ /* @@ -69,9 +83,7 @@ extern struct vnodeop_desc *vfs_op_descs[]; * extra level of indirection for arrays. It's an interesting * "feature" of C. */ -int vfs_opv_numops; - -typedef (*PFI)(); /* the standard Pointer to a Function returning an Int */ +static int vfs_opv_numops; /* * A miscellaneous routine. @@ -101,33 +113,35 @@ vn_default_error() * that is a(whole)nother story.) This is a feature. 
*/ void -vfs_opv_init() +vfs_opv_init(struct vnodeopv_desc **them) { int i, j, k; - int (***opv_desc_vector_p)(); - int (**opv_desc_vector)(); + vop_t ***opv_desc_vector_p; + vop_t **opv_desc_vector; struct vnodeopv_entry_desc *opve_descp; /* * Allocate the dynamic vectors and fill them in. */ - for (i=0; vfs_opv_descs[i]; i++) { - opv_desc_vector_p = vfs_opv_descs[i]->opv_desc_vector_p; + for (i=0; them[i]; i++) { + opv_desc_vector_p = them[i]->opv_desc_vector_p; /* * Allocate and init the vector, if it needs it. * Also handle backwards compatibility. */ if (*opv_desc_vector_p == NULL) { /* XXX - shouldn't be M_VNODE */ - MALLOC(*opv_desc_vector_p, PFI*, - vfs_opv_numops*sizeof(PFI), M_VNODE, M_WAITOK); - bzero (*opv_desc_vector_p, vfs_opv_numops*sizeof(PFI)); + MALLOC(*opv_desc_vector_p, vop_t **, + vfs_opv_numops * sizeof(vop_t *), M_VNODE, + M_WAITOK); + bzero(*opv_desc_vector_p, + vfs_opv_numops * sizeof(vop_t *)); DODEBUG(printf("vector at %x allocated\n", opv_desc_vector_p)); } opv_desc_vector = *opv_desc_vector_p; - for (j=0; vfs_opv_descs[i]->opv_desc_ops[j].opve_op; j++) { - opve_descp = &(vfs_opv_descs[i]->opv_desc_ops[j]); + for (j=0; them[i]->opv_desc_ops[j].opve_op; j++) { + opve_descp = &(them[i]->opv_desc_ops[j]); /* * Sanity check: is this operation listed @@ -166,8 +180,8 @@ vfs_opv_init() * with their default. (Sigh, an O(n^3) algorithm. I * could make it better, but that'd be work, and n is small.) */ - for (i = 0; vfs_opv_descs[i]; i++) { - opv_desc_vector = *(vfs_opv_descs[i]->opv_desc_vector_p); + for (i = 0; them[i]; i++) { + opv_desc_vector = *(them[i]->opv_desc_vector_p); /* * Force every operations vector to have a default routine. */ @@ -176,7 +190,7 @@ vfs_opv_init() } for (k = 0; k<vfs_opv_numops; k++) if (opv_desc_vector[k] == NULL) - opv_desc_vector[k] = + opv_desc_vector[k] = opv_desc_vector[VOFFSET(vop_default)]; } } @@ -184,7 +198,7 @@ vfs_opv_init() /* * Initialize known vnode operations vectors. 
*/ -void +static void vfs_op_init() { int i; @@ -216,10 +230,13 @@ struct vattr va_null; /* * Initialize the vnode structures and initialize each file system type. */ -vfsinit() +/* ARGSUSED*/ +static void +vfsinit(dummy) + void *dummy; { - struct vfsconf *vfsp; - int i, maxtypenum; + struct vfsconf **vfc; + int maxtypenum; /* * Initialize the vnode table @@ -233,15 +250,19 @@ vfsinit() * Build vnode operation vectors. */ vfs_op_init(); - vfs_opv_init(); /* finish the job */ + vfs_opv_init(vfs_opv_descs); /* finish the job */ /* * Initialize each file system type. */ vattr_null(&va_null); maxtypenum = 0; - for (vfsp = vfsconf, i = 1; i <= maxvfsconf; i++, vfsp++) { - if (i < maxvfsconf) - vfsp->vfc_next = vfsp + 1; + vfc = (struct vfsconf **)vfs_set.ls_items; + vfsconf = *vfc; /* simulate Lite2 vfsconf array */ + while (*vfc) { + struct vfsconf *vfsp = *vfc; + + vfc++; + vfsp->vfc_next = *vfc; if (maxtypenum <= vfsp->vfc_typenum) maxtypenum = vfsp->vfc_typenum + 1; (*vfsp->vfc_vfsops->vfs_init)(vfsp); @@ -249,3 +270,30 @@ vfsinit() /* next vfc_typenum to be used */ maxvfsconf = maxtypenum; } + +/* + * kernel related system variables. + */ + +/* + * This goop is here to support a loadable NFS module... grumble... + */ +int (*lease_check_hook) __P((struct vop_lease_args *)) + = 0; +void (*lease_updatetime) __P((int)) + = 0; + +int +lease_check(ap) + struct vop_lease_args /* { + struct vnode *a_vp; + struct proc *a_p; + struct ucred *a_cred; + int a_flag; + } */ *ap; +{ + if (lease_check_hook) + return (*lease_check_hook)(ap); + else + return 0; +} diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c index 826fbfe..0c04b01 100644 --- a/sys/kern/vfs_lookup.c +++ b/sys/kern/vfs_lookup.c @@ -35,10 +35,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)vfs_lookup.c 8.10 (Berkeley) 5/27/95 + * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 + * $Id$ */ +#include "opt_ktrace.h" + #include <sys/param.h> +#include <sys/systm.h> #include <sys/syslimits.h> #include <sys/time.h> #include <sys/namei.h> @@ -105,10 +109,17 @@ namei(ndp) MALLOC(cnp->cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); if (ndp->ni_segflg == UIO_SYSSPACE) error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, - MAXPATHLEN, &ndp->ni_pathlen); + MAXPATHLEN, (u_int *)&ndp->ni_pathlen); else error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, - MAXPATHLEN, &ndp->ni_pathlen); + MAXPATHLEN, (u_int *)&ndp->ni_pathlen); + + /* + * Don't allow empty pathnames. + */ + if (!error && *cnp->cn_pnbuf == '\0') + error = ENOENT; + if (error) { free(cnp->cn_pnbuf, M_NAMEI); ndp->ni_vp = NULL; @@ -143,7 +154,8 @@ namei(ndp) VREF(dp); } ndp->ni_startdir = dp; - if (error = lookup(ndp)) { + error = lookup(ndp); + if (error) { FREE(cnp->cn_pnbuf, M_NAMEI); return (error); } @@ -176,7 +188,8 @@ namei(ndp) auio.uio_segflg = UIO_SYSSPACE; auio.uio_procp = (struct proc *)0; auio.uio_resid = MAXPATHLEN; - if (error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred)) { + error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); + if (error) { if (ndp->ni_pathlen > 1) free(cp, M_NAMEI); break; @@ -226,7 +239,7 @@ namei(ndp) * the target is returned locked, otherwise it is returned unlocked. * When creating or renaming and LOCKPARENT is specified, the target may not * be ".". When deleting and LOCKPARENT is specified, the target may be ".". 
- * + * * Overall outline of lookup: * * dirloop: @@ -254,6 +267,7 @@ lookup(ndp) int docache; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ + int trailing_slash; int error = 0; struct componentname *cnp = &ndp->ni_cnd; struct proc *p = cnp->cn_proc; @@ -264,7 +278,8 @@ lookup(ndp) wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || - (wantparent && cnp->cn_nameiop != CREATE)) + (wantparent && cnp->cn_nameiop != CREATE && + cnp->cn_nameiop != LOOKUP)) docache = 0; rdonly = cnp->cn_flags & RDONLY; ndp->ni_dvp = NULL; @@ -300,6 +315,25 @@ dirloop: #endif ndp->ni_pathlen -= cnp->cn_namelen; ndp->ni_next = cp; + + /* + * Replace multiple slashes by a single slash and trailing slashes + * by a null. This must be done before VOP_LOOKUP() because some + * fs's don't know about trailing slashes. Remember if there were + * trailing slashes to handle symlinks, existing non-directories + * and non-existing files that won't be directories specially later. + */ + trailing_slash = 0; + while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { + cp++; + ndp->ni_pathlen--; + if (*cp == '\0') { + trailing_slash = 1; + *ndp->ni_next = '\0'; /* XXX for direnter() ... */ + } + } + ndp->ni_next = cp; + cnp->cn_flags |= MAKEENTRY; if (*cp == '\0' && docache == 0) cnp->cn_flags &= ~MAKEENTRY; @@ -404,6 +438,11 @@ unionlookup: error = EROFS; goto bad; } + if (*cp == '\0' && trailing_slash && + !(cnp->cn_flags & WILLBEDIR)) { + error = ENOENT; + goto bad; + } /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the @@ -431,6 +470,7 @@ unionlookup: } dp = ndp->ni_vp; + /* * Check to see if the vnode has been mounted on; * if so find the root of the mounted file system. 
@@ -451,11 +491,20 @@ unionlookup: * Check for symbolic link */ if ((dp->v_type == VLNK) && - ((cnp->cn_flags & FOLLOW) || *ndp->ni_next == '/')) { + ((cnp->cn_flags & FOLLOW) || trailing_slash || + *ndp->ni_next == '/')) { cnp->cn_flags |= ISSYMLINK; return (0); } + /* + * Check for bogus trailing slashes. + */ + if (trailing_slash && dp->v_type != VDIR) { + error = ENOTDIR; + goto bad2; + } + nextname: /* * Not a symbolic link. If more pathname, diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c new file mode 100644 index 0000000..779a1c4 --- /dev/null +++ b/sys/kern/vfs_mount.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 1995 Artisoft, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_conf.c 8.8 (Berkeley) 3/31/94 + * $Id$ + */ + +/* + * PURPOSE: This file abstracts the root mounting interface from + * the per file system semantics for handling mounts, + * the overall intent of which is to move the BSD + * internals dependence out of the FS code, both to + * make the FS code more portable and to free up some + * of the BSD internals so that they may more easily + * be changed. + * + * NOTE1: Code is single entry/single exit to aid debugging + * and conversion for kernel multithreading. + * + * NOTE2: Code notes lock state in headers on entry and exit + * as an aid to conversion for kernel multithreading + * on SMP reentrancy + */ +#include <sys/param.h> /* dev_t (types.h)*/ +#include <sys/systm.h> /* rootvp*/ +#include <sys/proc.h> /* curproc*/ +#include <sys/vnode.h> /* NULLVP*/ +#include <sys/mount.h> /* struct mount*/ +#include <sys/malloc.h> /* M_MOUNT*/ + +/* + * GLOBALS + */ + +/* + * These define the root filesystem, device, and root filesystem type. + */ +struct mount *rootfs; +struct vnode *rootvnode; +char *mountrootfsname; + +/* + * vfs_init() will set maxvfsconf + * to the highest defined type number. 
+ */ +int maxvfsconf; +struct vfsconf *vfsconf; + +/* + * Common root mount code shared by all filesystems + */ +#define ROOTNAME "root_device" + +/* + * vfs_mountrootfs + * + * Common entry point for root mounts + * + * PARAMETERS: + * fsname name of the filesystem + * + * RETURNS: 0 Success + * !0 error number (errno.h) + * + * LOCK STATE: + * ENTRY + * <no locks held> + * EXIT + * <no locks held> + * + * NOTES: + * This code is currently supported only for use for + * the FFS file system type. This is a matter of + * fixing the other file systems, not this code! + */ +int +vfs_mountrootfs(fsname) + char *fsname; +{ + struct mount *mp; + int err = 0; + struct proc *p = curproc; /* XXX */ + + /* + * New root mount structure + */ + err = vfs_rootmountalloc(fsname, ROOTNAME, &mp); + if (err) + return (err); + mp->mnt_flag |= MNT_ROOTFS; + + /* + * Attempt the mount + */ + err = VFS_MOUNT(mp, NULL, NULL, NULL, p); + if (err) + goto error_2; + + simple_lock(&mountlist_slock); + /* Add fs to list of mounted file systems*/ + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + simple_unlock(&mountlist_slock); + + vfs_unbusy(mp, p); + + /* root mount, update system time from FS specific data*/ + inittodr(mp->mnt_time); + + goto success; + + +error_2: /* mount error*/ + + vfs_unbusy(mp, p); + +error_1: /* lock error*/ + + /* free mount struct before failing*/ + free( mp, M_MOUNT); + +success: + return( err); +} diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index f891e02..0b487fd 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -36,14 +36,19 @@ * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + * $Id: vfs_subr.c,v 1.79 1997/03/04 18:31:56 bde Exp $ */ /* * External virtual filesystem routines */ +#include "opt_ddb.h" +#include "opt_devfs.h" #include <sys/param.h> #include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/file.h> #include <sys/proc.h> #include <sys/mount.h> #include <sys/time.h> @@ -58,15 +63,29 @@ #include <sys/mbuf.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_pager.h> +#include <vm/vnode_pager.h> #include <sys/sysctl.h> #include <miscfs/specfs/specdev.h> +#ifdef DDB +extern void printlockedvnodes __P((void)); +#endif +static void vclean __P((struct vnode *vp, int flags, struct proc *p)); +extern void vgonel __P((struct vnode *vp, struct proc *p)); +unsigned long numvnodes; +extern void vfs_unmountroot __P((struct mount *rootfs)); +extern void vputrele __P((struct vnode *vp, int put)); + enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; -int vttoif_tab[9] = { +int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; @@ -80,13 +99,23 @@ int vttoif_tab[9] = { (bp)->b_vnbufs.le_next = NOLIST; \ } TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ -struct mntlist mountlist; /* mounted filesystem list */ +static u_long freevnodes = 0; + +struct mntlist mountlist; /* mounted filesystem list */ struct simplelock mountlist_slock; static struct simplelock mntid_slock; struct simplelock mntvnode_slock; struct simplelock vnode_free_list_slock; static struct simplelock spechash_slock; +int desiredvnodes; +SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, ""); + +static void vfs_free_addrlist __P((struct netexport *nep)); +static int vfs_free_netcred __P((struct radix_node *rn, void *w)); +static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, + 
struct export_args *argp)); + /* * Initialize the vnode management data structures. */ @@ -94,6 +123,7 @@ void vntblinit() { + desiredvnodes = maxproc + vm_object_cache_max; simple_lock_init(&mntvnode_slock); simple_lock_init(&mntid_slock); simple_lock_init(&spechash_slock); @@ -119,17 +149,19 @@ vfs_busy(mp, flags, interlkp, p) if (flags & LK_NOWAIT) return (ENOENT); mp->mnt_flag |= MNT_MWAIT; - if (interlkp) + if (interlkp) { simple_unlock(interlkp); + } /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ - sleep((caddr_t)mp, PVFS); - if (interlkp) + tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); + if (interlkp) { simple_lock(interlkp); + } return (ENOENT); } lkflags = LK_SHARED; @@ -187,6 +219,7 @@ vfs_rootmountalloc(fstypename, devname, mpp) mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_stat.f_mntonname[0] = '/'; + mp->mnt_stat.f_mntonname[1] = 0; (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); *mpp = mp; return (0); @@ -198,15 +231,16 @@ vfs_rootmountalloc(fstypename, devname, mpp) * trying those that have mountroot routines, and try them until one * works or we have tried them all. */ +#ifdef notdef /* XXX JH */ int -vfs_mountroot() +lite2_vfs_mountroot(void) { struct vfsconf *vfsp; - extern int (*mountroot)(void); + extern int (*lite2_mountroot)(void); int error; - if (mountroot != NULL) - return ((*mountroot)()); + if (lite2_mountroot != NULL) + return ((*lite2_mountroot)()); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { if (vfsp->vfc_mountroot == NULL) continue; @@ -216,6 +250,7 @@ vfs_mountroot() } return (ENODEV); } +#endif /* * Lookup a mount point by filesystem identifier. 
@@ -228,15 +263,15 @@ vfs_getvfs(fsid) simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; - mp = mp->mnt_list.cqe_next) { + mp = mp->mnt_list.cqe_next) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { simple_unlock(&mountlist_slock); return (mp); - } + } } simple_unlock(&mountlist_slock); - return ((struct mount *)0); + return ((struct mount *) 0); } /* @@ -246,12 +281,12 @@ void vfs_getnewfsid(mp) struct mount *mp; { -static u_short xxxfs_mntid; + static u_short xxxfs_mntid; fsid_t tfsid; int mtype; - simple_lock(&mntid_slock); + simple_lock(&mntid_slock); mtype = mp->mnt_vfc->vfc_typenum; mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0); mp->mnt_stat.f_fsid.val[1] = mtype; @@ -278,25 +313,22 @@ vattr_null(vap) { vap->va_type = VNON; - vap->va_size = vap->va_bytes = VNOVAL; + vap->va_size = VNOVAL; + vap->va_bytes = VNOVAL; vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid = - vap->va_fsid = vap->va_fileid = - vap->va_blocksize = vap->va_rdev = - vap->va_atime.ts_sec = vap->va_atime.ts_nsec = - vap->va_mtime.ts_sec = vap->va_mtime.ts_nsec = - vap->va_ctime.ts_sec = vap->va_ctime.ts_nsec = - vap->va_flags = vap->va_gen = VNOVAL; + vap->va_fsid = vap->va_fileid = + vap->va_blocksize = vap->va_rdev = + vap->va_atime.tv_sec = vap->va_atime.tv_nsec = + vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec = + vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec = + vap->va_flags = vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Routines having to do with the management of the vnode table. */ -extern int (**dead_vnodeop_p)(); -static void vclean __P((struct vnode *vp, int flag, struct proc *p)); -extern void vgonel __P((struct vnode *vp, struct proc *p)); -long numvnodes; -extern struct vattr va_null; +extern vop_t **dead_vnodeop_p; /* * Return the next vnode from the free list. 
@@ -305,23 +337,31 @@ int getnewvnode(tag, mp, vops, vpp) enum vtagtype tag; struct mount *mp; - int (**vops)(); + vop_t **vops; struct vnode **vpp; { struct proc *p = curproc; /* XXX */ struct vnode *vp; - int s; - int cnt; -top: simple_lock(&vnode_free_list_slock); - if ((vnode_free_list.tqh_first == NULL && - numvnodes < 2 * desiredvnodes) || - numvnodes < desiredvnodes) { +retry: + /* + * we allocate a new vnode if + * 1. we don't have any free + * Pretty obvious, we actually used to panic, but that + * is a silly thing to do. + * 2. we havn't filled our pool yet + * We don't want to trash the incore (VM-)vnodecache. + * 3. if less that 1/4th of our vnodes are free. + * We don't want to trash the namei cache either. + */ + if (freevnodes < (numvnodes >> 2) || + numvnodes < desiredvnodes || + vnode_free_list.tqh_first == NULL) { simple_unlock(&vnode_free_list_slock); - vp = (struct vnode *)malloc((u_long)sizeof *vp, + vp = (struct vnode *) malloc((u_long) sizeof *vp, M_VNODE, M_WAITOK); - bzero((char *)vp, sizeof *vp); + bzero((char *) vp, sizeof *vp); numvnodes++; } else { for (vp = vnode_free_list.tqh_first; @@ -343,31 +383,45 @@ top: if (vp->v_usecount) panic("free vnode isn't"); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + if (vp->v_usage > 0) { + simple_unlock(&vp->v_interlock); + --vp->v_usage; + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + goto retry; + } + freevnodes--; + /* see comment on why 0xdeadb is set at end of vgone (below) */ - vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb; + vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb; simple_unlock(&vnode_free_list_slock); vp->v_lease = NULL; if (vp->v_type != VBAD) vgonel(vp, p); - else + else { simple_unlock(&vp->v_interlock); + } + #ifdef DIAGNOSTIC - if (vp->v_data) - panic("cleaned vnode isn't"); - s = splbio(); - if (vp->v_numoutput) - panic("Clean vnode has pending I/O's"); - splx(s); + { + int s; + + if (vp->v_data) + panic("cleaned vnode isn't"); + s = splbio(); + if 
(vp->v_numoutput) + panic("Clean vnode has pending I/O's"); + splx(s); + } #endif vp->v_flag = 0; vp->v_lastr = 0; - vp->v_ralen = 0; - vp->v_maxra = 0; vp->v_lastw = 0; vp->v_lasta = 0; vp->v_cstart = 0; vp->v_clen = 0; vp->v_socket = 0; + vp->v_writecount = 0; /* XXX */ + vp->v_usage = 0; } vp->v_type = VNON; cache_purge(vp); @@ -385,8 +439,8 @@ top: */ void insmntque(vp, mp) - struct vnode *vp; - struct mount *mp; + register struct vnode *vp; + register struct mount *mp; { simple_lock(&mntvnode_slock); @@ -398,8 +452,11 @@ insmntque(vp, mp) /* * Insert into list of vnodes for the new mount point, if available. */ - if ((vp->v_mount = mp) != NULL) - LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); + if ((vp->v_mount = mp) == NULL) { + simple_unlock(&mntvnode_slock); + return; + } + LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); simple_unlock(&mntvnode_slock); } @@ -413,14 +470,13 @@ vwakeup(bp) register struct vnode *vp; bp->b_flags &= ~B_WRITEINPROG; - if (vp = bp->b_vp) { - if (--vp->v_numoutput < 0) + if ((vp = bp->b_vp)) { + vp->v_numoutput--; + if (vp->v_numoutput < 0) panic("vwakeup: neg numoutput"); - if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) { - if (vp->v_numoutput < 0) - panic("vwakeup: neg numoutput 2"); + if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { vp->v_flag &= ~VBWAIT; - wakeup((caddr_t)&vp->v_numoutput); + wakeup((caddr_t) &vp->v_numoutput); } } } @@ -440,15 +496,18 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) register struct buf *bp; struct buf *nbp, *blist; int s, error; + vm_object_t object; if (flags & V_SAVE) { - if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) + if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p))) return (error); if (vp->v_dirtyblkhd.lh_first != NULL) panic("vinvalbuf: dirty bufs"); } + + s = splbio(); for (;;) { - if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA) + if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA)) while (blist && blist->b_lblkno < 0) blist = 
blist->b_vnbufs.le_next; if (!blist && (blist = vp->v_dirtyblkhd.lh_first) && @@ -460,35 +519,51 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) for (bp = blist; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; - if (flags & V_SAVEMETA && bp->b_lblkno < 0) + if ((flags & V_SAVEMETA) && bp->b_lblkno < 0) continue; - s = splbio(); if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; - error = tsleep((caddr_t)bp, - slpflag | (PRIBIO + 1), "vinvalbuf", - slptimeo); - splx(s); - if (error) + error = tsleep((caddr_t) bp, + slpflag | (PRIBIO + 1), "vinvalbuf", + slptimeo); + if (error) { + splx(s); return (error); + } break; } bremfree(bp); bp->b_flags |= B_BUSY; - splx(s); /* - * XXX Since there are no node locks for NFS, I believe - * there is a slight chance that a delayed write will - * occur while sleeping just above, so check for it. + * XXX Since there are no node locks for NFS, I + * believe there is a slight chance that a delayed + * write will occur while sleeping just above, so + * check for it. */ if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) { (void) VOP_BWRITE(bp); break; } - bp->b_flags |= B_INVAL; + bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF); brelse(bp); } } + + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); + } + + splx(s); + + /* + * Destroy the copy in the VM cache, too. + */ + object = vp->v_object; + if (object != NULL) { + vm_object_page_remove(object, 0, object->size, + (flags & V_SAVE) ? TRUE : FALSE); + } if (!(flags & V_SAVEMETA) && (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first)) panic("vinvalbuf: flush failed"); @@ -503,6 +578,7 @@ bgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { + int s; if (bp->b_vp) panic("bgetvp: not free"); @@ -515,7 +591,9 @@ bgetvp(vp, bp) /* * Insert onto list for new vnode. 
*/ + s = splbio(); bufinsvn(bp, &vp->v_cleanblkhd); + splx(s); } /* @@ -526,20 +604,60 @@ brelvp(bp) register struct buf *bp; { struct vnode *vp; + int s; if (bp->b_vp == (struct vnode *) 0) panic("brelvp: NULL"); /* * Delete from old vnode list, if on one. */ + s = splbio(); if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); + splx(s); + vp = bp->b_vp; bp->b_vp = (struct vnode *) 0; HOLDRELE(vp); } /* + * Associate a p-buffer with a vnode. + */ +void +pbgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ +#if defined(DIAGNOSTIC) + if (bp->b_vp) + panic("pbgetvp: not free"); +#endif + bp->b_vp = vp; + if (vp->v_type == VBLK || vp->v_type == VCHR) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; +} + +/* + * Disassociate a p-buffer from a vnode. + */ +void +pbrelvp(bp) + register struct buf *bp; +{ + struct vnode *vp; + +#if defined(DIAGNOSTIC) + if (bp->b_vp == (struct vnode *) 0) + panic("pbrelvp: NULL"); +#endif + + bp->b_vp = (struct vnode *) 0; +} + +/* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. @@ -549,28 +667,43 @@ reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { - register struct buflists *listheadp; + int s; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } + + s = splbio(); /* * Delete from old vnode list, if on one. */ if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); /* - * If dirty, put on list of dirty buffers; - * otherwise insert onto list of clean buffers. + * If dirty, put on list of dirty buffers; otherwise insert onto list + * of clean buffers. 
*/ - if (bp->b_flags & B_DELWRI) - listheadp = &newvp->v_dirtyblkhd; - else - listheadp = &newvp->v_cleanblkhd; - bufinsvn(bp, listheadp); + if (bp->b_flags & B_DELWRI) { + struct buf *tbp; + + tbp = newvp->v_dirtyblkhd.lh_first; + if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) { + bufinsvn(bp, &newvp->v_dirtyblkhd); + } else { + while (tbp->b_vnbufs.le_next && + (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) { + tbp = tbp->b_vnbufs.le_next; + } + LIST_INSERT_AFTER(tbp, bp, b_vnbufs); + } + } else { + bufinsvn(bp, &newvp->v_cleanblkhd); + } + splx(s); } +#ifndef DEVFS_ROOT /* * Create a vnode for a block device. * Used for root filesystem, argdev, and swap areas. @@ -585,24 +718,23 @@ bdevvp(dev, vpp) struct vnode *nvp; int error; - if (dev == NODEV) { - *vpp = NULLVP; - return (ENODEV); - } - error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); + if (dev == NODEV) + return (0); + error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp); if (error) { - *vpp = NULLVP; + *vpp = 0; return (error); } vp = nvp; vp->v_type = VBLK; - if (nvp = checkalias(vp, dev, (struct mount *)0)) { + if ((nvp = checkalias(vp, dev, (struct mount *) 0))) { vput(vp); vp = nvp; } *vpp = vp; return (0); } +#endif /* !DEVFS_ROOT */ /* * Check to see if the new vnode represents a special device @@ -648,7 +780,7 @@ loop: } if (vp == NULL || vp->v_tag != VT_NON) { MALLOC(nvp->v_specinfo, struct specinfo *, - sizeof(struct specinfo), M_VNODE, M_WAITOK); + sizeof(struct specinfo), M_VNODE, M_WAITOK); nvp->v_rdev = nvp_rdev; nvp->v_hashchain = vpp; nvp->v_specnext = *vpp; @@ -683,7 +815,7 @@ loop: */ int vget(vp, flags, p) - struct vnode *vp; + register struct vnode *vp; int flags; struct proc *p; { @@ -695,8 +827,9 @@ vget(vp, flags, p) * return failure. Cleaning is determined by checking that * the VXLOCK flag is set. 
*/ - if ((flags & LK_INTERLOCK) == 0) + if ((flags & LK_INTERLOCK) == 0) { simple_lock(&vp->v_interlock); + } if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); @@ -707,8 +840,22 @@ vget(vp, flags, p) simple_lock(&vnode_free_list_slock); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); simple_unlock(&vnode_free_list_slock); + freevnodes--; } vp->v_usecount++; + /* + * Create the VM object, if needed + */ + if ((vp->v_type == VREG) && + ((vp->v_object == NULL) || + (vp->v_object->flags & OBJ_VFS_REF) == 0)) { + /* + * XXX vfs_object_create probably needs the interlock. + */ + simple_unlock(&vp->v_interlock); + vfs_object_create(vp, curproc, curproc->p_ucred, 0); + simple_lock(&vp->v_interlock); + } if (flags & LK_TYPE_MASK) { if (error = vn_lock(vp, flags | LK_INTERLOCK, p)) vrele(vp); @@ -781,14 +928,15 @@ vop_nolock(ap) * Since we are not using the lock manager, we must clear * the interlock here. */ - if (ap->a_flags & LK_INTERLOCK) + if (ap->a_flags & LK_INTERLOCK) { simple_unlock(&ap->a_vp->v_interlock); + } return (0); #endif } /* - * Decrement the active use count. + * Do the inverse of vop_nolock, handling the interlock in a compatible way. */ int vop_nounlock(ap) @@ -800,9 +948,13 @@ vop_nounlock(ap) { struct vnode *vp = ap->a_vp; - if (vp->v_vnlock == NULL) + if (vp->v_vnlock == NULL) { + if (ap->a_flags & LK_INTERLOCK) + simple_unlock(&ap->a_vp->v_interlock); return (0); - return (lockmgr(vp->v_vnlock, LK_RELEASE, NULL, ap->a_p)); + } + return (lockmgr(vp->v_vnlock, LK_RELEASE | ap->a_flags, + &ap->a_vp->v_interlock, ap->a_p)); } /* @@ -821,91 +973,124 @@ vop_noislocked(ap) return (lockstatus(vp->v_vnlock)); } +/* #ifdef DIAGNOSTIC */ /* - * Vnode reference. 
+ * Vnode reference, just increment the count */ void vref(vp) struct vnode *vp; { - simple_lock(&vp->v_interlock); if (vp->v_usecount <= 0) panic("vref used where vget required"); + vp->v_usecount++; + + if ((vp->v_type == VREG) && + ((vp->v_object == NULL) || + ((vp->v_object->flags & OBJ_VFS_REF) == 0)) ) { + /* + * We need to lock to VP during the time that + * the object is created. This is necessary to + * keep the system from re-entrantly doing it + * multiple times. + * XXX vfs_object_create probably needs the interlock? + */ + simple_unlock(&vp->v_interlock); + vfs_object_create(vp, curproc, curproc->p_ucred, 0); + return; + } simple_unlock(&vp->v_interlock); } /* - * vput(), just unlock and vrele() + * Vnode put/release. + * If count drops to zero, call inactive routine and return to freelist. */ void -vput(vp) +vputrele(vp, put) struct vnode *vp; + int put; { struct proc *p = curproc; /* XXX */ -#ifdef DIGANOSTIC +#ifdef DIAGNOSTIC if (vp == NULL) - panic("vput: null vp"); + panic("vputrele: null vp"); #endif simple_lock(&vp->v_interlock); vp->v_usecount--; + + if ((vp->v_usecount == 1) && + vp->v_object && + (vp->v_object->flags & OBJ_VFS_REF)) { + vp->v_object->flags &= ~OBJ_VFS_REF; + if (put) { + VOP_UNLOCK(vp, LK_INTERLOCK, p); + } else { + simple_unlock(&vp->v_interlock); + } + vm_object_deallocate(vp->v_object); + return; + } + if (vp->v_usecount > 0) { - simple_unlock(&vp->v_interlock); - VOP_UNLOCK(vp, 0, p); + if (put) { + VOP_UNLOCK(vp, LK_INTERLOCK, p); + } else { + simple_unlock(&vp->v_interlock); + } return; } + + if (vp->v_usecount < 0) { #ifdef DIAGNOSTIC - if (vp->v_usecount < 0 || vp->v_writecount != 0) { - vprint("vput: bad ref count", vp); - panic("vput: ref cnt"); - } + vprint("vputrele: negative ref count", vp); #endif - /* - * insert at tail of LRU list - */ + panic("vputrele: negative ref cnt"); + } simple_lock(&vnode_free_list_slock); - TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + if (vp->v_flag & VAGE) { + vp->v_flag 
&= ~VAGE; + vp->v_usage = 0; + if(vp->v_tag != VT_TFS) + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + if(vp->v_tag != VT_TFS) + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + freevnodes++; simple_unlock(&vnode_free_list_slock); - simple_unlock(&vp->v_interlock); - VOP_INACTIVE(vp, p); + + /* + * If we are doing a vput, the node is already locked, and we must + * call VOP_INACTIVE with the node locked. So, in the case of + * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. + */ + if (put) { + simple_unlock(&vp->v_interlock); + VOP_INACTIVE(vp, p); + } else if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { + VOP_INACTIVE(vp, p); + } } /* - * Vnode release. - * If count drops to zero, call inactive routine and return to freelist. + * vput(), just unlock and vrele() */ void -vrele(vp) +vput(vp) struct vnode *vp; { - struct proc *p = curproc; /* XXX */ + vputrele(vp, 1); +} -#ifdef DIAGNOSTIC - if (vp == NULL) - panic("vrele: null vp"); -#endif - simple_lock(&vp->v_interlock); - vp->v_usecount--; - if (vp->v_usecount > 0) { - simple_unlock(&vp->v_interlock); - return; - } -#ifdef DIAGNOSTIC - if (vp->v_usecount < 0 || vp->v_writecount != 0) { - vprint("vrele: bad ref count", vp); - panic("vrele: ref cnt"); - } -#endif - /* - * insert at tail of LRU list - */ - simple_lock(&vnode_free_list_slock); - TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); - simple_unlock(&vnode_free_list_slock); - if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) - VOP_INACTIVE(vp, p); +void +vrele(vp) + struct vnode *vp; +{ + vputrele(vp, 0); } #ifdef DIAGNOSTIC @@ -947,8 +1132,8 @@ holdrele(vp) * that are found. 
*/ #ifdef DIAGNOSTIC -int busyprt = 0; /* print out busy vnodes */ -struct ctldebug debug1 = { "busyprt", &busyprt }; +static int busyprt = 0; /* print out busy vnodes */ +SYSCTL_INT(_debug, 1, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int @@ -964,6 +1149,10 @@ vflush(mp, skipvp, flags) simple_lock(&mntvnode_slock); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { + /* + * Make sure this vnode wasn't reclaimed in getnewvnode(). + * Start over if it has (it won't be on the list anymore). + */ if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; @@ -982,17 +1171,29 @@ loop: continue; } /* - * If WRITECLOSE is set, only flush out regular file - * vnodes open for writing. + * If WRITECLOSE is set, only flush out regular file vnodes + * open for writing. */ if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) { simple_unlock(&vp->v_interlock); continue; } + + if (vp->v_object && (vp->v_object->flags & OBJ_VFS_REF)) { + simple_unlock(&vp->v_interlock); + simple_unlock(&mntvnode_slock); + vm_object_reference(vp->v_object); + pager_cache(vp->v_object, FALSE); + vp->v_object->flags &= ~OBJ_VFS_REF; + vm_object_deallocate(vp->v_object); + simple_lock(&mntvnode_slock); + simple_lock(&vp->v_interlock); + } + /* - * With v_usecount == 0, all we need to do is clear - * out the vnode data structures and we are done. + * With v_usecount == 0, all we need to do is clear out the + * vnode data structures and we are done. */ if (vp->v_usecount == 0) { simple_unlock(&mntvnode_slock); @@ -1000,10 +1201,11 @@ loop: simple_lock(&mntvnode_slock); continue; } + /* - * If FORCECLOSE is set, forcibly close the vnode. - * For block or character devices, revert to an - * anonymous device. For all other files, just kill them. + * If FORCECLOSE is set, forcibly close the vnode. For block + * or character devices, revert to an anonymous device. For + * all other files, just kill them. 
*/ if (flags & FORCECLOSE) { simple_unlock(&mntvnode_slock); @@ -1012,7 +1214,7 @@ loop: } else { vclean(vp, 0, p); vp->v_op = spec_vnodeop_p; - insmntque(vp, (struct mount *)0); + insmntque(vp, (struct mount *) 0); } simple_lock(&mntvnode_slock); continue; @@ -1032,27 +1234,22 @@ loop: /* * Disassociate the underlying file system from a vnode. - * The vnode interlock is held on entry. */ static void -vclean(vp, flags, p) - struct vnode *vp; - int flags; - struct proc *p; +vclean(struct vnode *vp, int flags, struct proc *p) { int active; /* - * Check to see if the vnode is in use. - * If so we have to reference it before we clean it out - * so that its count cannot fall to zero and generate a - * race against ourselves to recycle it. + * Check to see if the vnode is in use. If so we have to reference it + * before we clean it out so that its count cannot fall to zero and + * generate a race against ourselves to recycle it. */ - if (active = vp->v_usecount) + if ((active = vp->v_usecount)) vp->v_usecount++; /* - * Prevent the vnode from being recycled or - * brought into use while we clean it out. + * Prevent the vnode from being recycled or brought into use while we + * clean it out. */ if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); @@ -1109,12 +1306,12 @@ vclean(vp, flags, p) vp->v_flag &= ~VXLOCK; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; - wakeup((caddr_t)vp); + wakeup((caddr_t) vp); } } /* - * Eliminate all activity associated with the requested vnode + * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. 
*/ int @@ -1162,8 +1359,9 @@ vop_revoke(ap) vgone(vq); break; } - if (vq == NULLVP) + if (vq == NULLVP) { simple_unlock(&spechash_slock); + } } /* * Remove the lock so that vgone below will @@ -1190,8 +1388,9 @@ vrecycle(vp, inter_lkp, p) simple_lock(&vp->v_interlock); if (vp->v_usecount == 0) { - if (inter_lkp) + if (inter_lkp) { simple_unlock(inter_lkp); + } vgonel(vp, p); return (1); } @@ -1205,7 +1404,7 @@ vrecycle(vp, inter_lkp, p) */ void vgone(vp) - struct vnode *vp; + register struct vnode *vp; { struct proc *p = curproc; /* XXX */ @@ -1234,6 +1433,11 @@ vgonel(vp, p) tsleep((caddr_t)vp, PINOD, "vgone", 0); return; } + + if (vp->v_object) { + vp->v_object->flags |= OBJ_VNODE_GONE; + } + /* * Clean out the filesystem specific data. */ @@ -1281,6 +1485,7 @@ vgonel(vp, p) FREE(vp->v_specinfo, M_VNODE); vp->v_specinfo = NULL; } + /* * If it is on the freelist and not already at the head, * move it to the head of the list. The test of the back @@ -1297,12 +1502,13 @@ vgonel(vp, p) if (vp->v_usecount == 0) { simple_lock(&vnode_free_list_slock); if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) && - vnode_free_list.tqh_first != vp) { + vnode_free_list.tqh_first != vp) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } simple_unlock(&vnode_free_list_slock); } + vp->v_type = VBAD; } @@ -1315,7 +1521,7 @@ vfinddev(dev, type, vpp) enum vtype type; struct vnode **vpp; { - struct vnode *vp; + register struct vnode *vp; int rc = 0; simple_lock(&spechash_slock); @@ -1335,7 +1541,7 @@ vfinddev(dev, type, vpp) */ int vcount(vp) - struct vnode *vp; + register struct vnode *vp; { struct vnode *vq, *vnext; int count; @@ -1366,7 +1572,7 @@ loop: * Print out a description of a vnode. 
*/ static char *typename[] = - { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" }; +{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) @@ -1377,9 +1583,9 @@ vprint(label, vp) if (label != NULL) printf("%s: ", label); - printf("type %s, usecount %d, writecount %d, refcount %d,", - typename[vp->v_type], vp->v_usecount, vp->v_writecount, - vp->v_holdcnt); + printf("type %s, usecount %d, writecount %d, refcount %ld,", + typename[vp->v_type], vp->v_usecount, vp->v_writecount, + vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); @@ -1405,7 +1611,7 @@ vprint(label, vp) } } -#ifdef DEBUG +#ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. @@ -1441,19 +1647,22 @@ printlockedvnodes() /* * Top level filesystem related information gathering. */ -int -vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); + +static int +vfs_sysctl SYSCTL_HANDLER_ARGS { - struct ctldebug *cdp; + int *name = (int *)arg1 - 1; /* XXX */ + u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; +#ifndef NO_COMPAT_PRELITE2 + /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ + if (namelen == 1) + return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); +#endif + +#ifdef notyet /* all sysctl names at this level are at least name and field */ if (namelen < 2) return (ENOTDIR); /* overloaded */ @@ -1466,58 +1675,83 @@ vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, oldp, oldlenp, newp, newlen, p)); } +#endif switch (name[1]) { case VFS_MAXTYPENUM: - return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf)); + if (namelen != 2) + return (ENOTDIR); + return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: - if (namelen < 3) + if (namelen != 3) return (ENOTDIR); /* overloaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); - return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp, - sizeof(struct vfsconf))); + return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); } return (EOPNOTSUPP); } +SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, + "Generic filesystem"); + +#ifndef NO_COMPAT_PRELITE2 + +static int +sysctl_ovfs_conf SYSCTL_HANDLER_ARGS +{ + int error; + struct vfsconf *vfsp; + struct ovfsconf ovfs; + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ + strcpy(ovfs.vfc_name, vfsp->vfc_name); + ovfs.vfc_index = vfsp->vfc_typenum; + ovfs.vfc_refcount = vfsp->vfc_refcount; + ovfs.vfc_flags = vfsp->vfc_flags; + error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); + if (error) + return error; + } + return 0; +} + +#endif /* !NO_COMPAT_PRELITE2 */ + int kinfo_vdebug = 1; int kinfo_vgetfailed; + #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. 
*/ /* ARGSUSED */ -int -sysctl_vnode(where, sizep, p) - char *where; - size_t *sizep; - struct proc *p; +static int +sysctl_vnode SYSCTL_HANDLER_ARGS { + struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *nvp, *vp; - char *bp = where, *savebp; - char *ewhere; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) - if (where == NULL) { - *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); - return (0); - } - ewhere = where + *sizep; - + + req->lock = 0; + if (!req->oldptr) /* Make an estimate */ + return (SYSCTL_OUT(req, 0, + (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); + simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { nmp = mp->mnt_list.cqe_next; continue; } - savebp = bp; again: simple_lock(&mntvnode_slock); for (vp = mp->mnt_vnodelist.lh_first; @@ -1532,20 +1766,13 @@ again: simple_unlock(&mntvnode_slock); if (kinfo_vdebug) printf("kinfo: vp changed\n"); - bp = savebp; goto again; } nvp = vp->v_mntvnodes.le_next; - if (bp + VPTRSZ + VNODESZ > ewhere) { - simple_unlock(&mntvnode_slock); - *sizep = bp - where; - return (ENOMEM); - } simple_unlock(&mntvnode_slock); - if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) || - (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) + if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || + (error = SYSCTL_OUT(req, vp, VNODESZ))) return (error); - bp += VPTRSZ + VNODESZ; simple_lock(&mntvnode_slock); } simple_unlock(&mntvnode_slock); @@ -1555,10 +1782,12 @@ again: } simple_unlock(&mountlist_slock); - *sizep = bp - where; return (0); } +SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_vnode, "S,vnode", ""); + /* * Check to see if a filesystem is mounted on a block device. 
*/ @@ -1595,14 +1824,23 @@ void vfs_unmountall() { struct mount *mp, *nmp; - struct proc *p = curproc; /* XXX */ + struct proc *p = initproc; /* XXX XXX should this be proc0? */ + int error; /* * Since this only runs when rebooting, it is not interlocked. */ for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_prev; - (void) dounmount(mp, MNT_FORCE, p); + error = dounmount(mp, MNT_FORCE, p); + if (error) { + printf("unmount of %s failed (", + mp->mnt_stat.f_mntonname); + if (error == EBUSY) + printf("BUSY)\n"); + else + printf("%d)\n", error); + } } } @@ -1611,10 +1849,8 @@ vfs_unmountall() * Called by ufs_mount() to set up the lists of export addresses. */ static int -vfs_hang_addrlist(mp, nep, argp) - struct mount *mp; - struct netexport *nep; - struct export_args *argp; +vfs_hang_addrlist(struct mount *mp, struct netexport *nep, + struct export_args *argp) { register struct netcred *np; register struct radix_node_head *rnh; @@ -1635,16 +1871,16 @@ vfs_hang_addrlist(mp, nep, argp) return (0); } i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; - np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK); - bzero((caddr_t)np, i); - saddr = (struct sockaddr *)(np + 1); - if (error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen)) + np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); + bzero((caddr_t) np, i); + saddr = (struct sockaddr *) (np + 1); + if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) goto out; if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (argp->ex_masklen) { - smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen); - error = copyin(argp->ex_addr, (caddr_t)smask, argp->ex_masklen); + smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); + error = copyin(argp->ex_addr, (caddr_t) smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) @@ -1653,13 +1889,13 @@ vfs_hang_addrlist(mp, nep, argp) i 
= saddr->sa_family; if ((rnh = nep->ne_rtable[i]) == 0) { /* - * Seems silly to initialize every AF when most are not - * used, do so on demand here + * Seems silly to initialize every AF when most are not used, + * do so on demand here */ for (dom = domains; dom; dom = dom->dom_next) if (dom->dom_family == i && dom->dom_rtattach) { - dom->dom_rtattach((void **)&nep->ne_rtable[i], - dom->dom_rtoffset); + dom->dom_rtattach((void **) &nep->ne_rtable[i], + dom->dom_rtoffset); break; } if ((rnh = nep->ne_rtable[i]) == 0) { @@ -1667,23 +1903,11 @@ vfs_hang_addrlist(mp, nep, argp) goto out; } } - rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh, - np->netc_rnodes); - if (rn == 0) { - /* - * One of the reasons that rnh_addaddr may fail is that - * the entry already exists. To check for this case, we - * look up the entry to see if it is there. If so, we - * do not need to make a new entry but do return success. - */ - free(np, M_NETADDR); - rn = (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh); - if (rn != 0 && (rn->rn_flags & RNF_ROOT) == 0 && - ((struct netcred *)rn)->netc_exflags == argp->ex_flags && - !bcmp((caddr_t)&((struct netcred *)rn)->netc_anon, - (caddr_t)&argp->ex_anon, sizeof(struct ucred))) - return (0); - return (EPERM); + rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, + np->netc_rnodes); + if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ + error = EPERM; + goto out; } np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; @@ -1696,14 +1920,12 @@ out: /* ARGSUSED */ static int -vfs_free_netcred(rn, w) - struct radix_node *rn; - caddr_t w; +vfs_free_netcred(struct radix_node *rn, void *w) { - register struct radix_node_head *rnh = (struct radix_node_head *)w; + register struct radix_node_head *rnh = (struct radix_node_head *) w; - (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh); - free((caddr_t)rn, M_NETADDR); + (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); + free((caddr_t) rn, M_NETADDR); return 
(0); } @@ -1711,17 +1933,16 @@ vfs_free_netcred(rn, w) * Free the net address hash lists that are hanging off the mount points. */ static void -vfs_free_addrlist(nep) - struct netexport *nep; +vfs_free_addrlist(struct netexport *nep) { register int i; register struct radix_node_head *rnh; for (i = 0; i <= AF_MAX; i++) - if (rnh = nep->ne_rtable[i]) { - (*rnh->rnh_walktree)(rnh, vfs_free_netcred, - (caddr_t)rnh); - free((caddr_t)rnh, M_RTABLE); + if ((rnh = nep->ne_rtable[i])) { + (*rnh->rnh_walktree) (rnh, vfs_free_netcred, + (caddr_t) rnh); + free((caddr_t) rnh, M_RTABLE); nep->ne_rtable[i] = 0; } } @@ -1739,7 +1960,7 @@ vfs_export(mp, nep, argp) mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); } if (argp->ex_flags & MNT_EXPORTED) { - if (error = vfs_hang_addrlist(mp, nep, argp)) + if ((error = vfs_hang_addrlist(mp, nep, argp))) return (error); mp->mnt_flag |= MNT_EXPORTED; } @@ -1780,3 +2001,79 @@ vfs_export_lookup(mp, nep, nam) } return (np); } + +/* + * perform msync on all vnodes under a mount point + * the mount point must be locked. + */ +void +vfs_msync(struct mount *mp, int flags) { + struct vnode *vp, *nvp; +loop: + for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { + + if (vp->v_mount != mp) + goto loop; + nvp = vp->v_mntvnodes.le_next; + if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT)) + continue; + if (vp->v_object && + (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { + vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE); + } + } +} + +/* + * Create the VM object needed for VMIO and mmap support. This + * is done for all VREG files in the system. Some filesystems might + * afford the additional metadata buffering capability of the + * VMIO code by making the device node be VMIO mode also. 
+ */ +int +vfs_object_create(vp, p, cred, waslocked) + struct vnode *vp; + struct proc *p; + struct ucred *cred; + int waslocked; +{ + struct vattr vat; + vm_object_t object; + int error = 0; + +retry: + if ((object = vp->v_object) == NULL) { + if (vp->v_type == VREG) { + if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0) + goto retn; + (void) vnode_pager_alloc(vp, + OFF_TO_IDX(round_page(vat.va_size)), 0, 0); + } else { + /* + * This simply allocates the biggest object possible + * for a VBLK vnode. This should be fixed, but doesn't + * cause any problems (yet). + */ + (void) vnode_pager_alloc(vp, INT_MAX, 0, 0); + } + vp->v_object->flags |= OBJ_VFS_REF; + } else { + if (object->flags & OBJ_DEAD) { + if (waslocked) + VOP_UNLOCK(vp, 0, p); + tsleep(object, PVM, "vodead", 0); + if (waslocked) + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + goto retry; + } + if ((object->flags & OBJ_VFS_REF) == 0) { + object->flags |= OBJ_VFS_REF; + vm_object_reference(object); + } + } + if (vp->v_object) + vp->v_flag |= VVMIO; + +retn: + return error; +} diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 0cf7680..2997fe5 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -35,16 +35,30 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95 + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + * $Id: vfs_syscalls.c,v 1.60 1997/03/23 03:36:35 bde Exp $ */ +/* + * XXX - The following is required because of some magic done + * in getdirentries() below which is only done if the translucent + * filesystem `UNION' is compiled into the kernel. This is broken, + * but I don't have time to study the code deeply enough to understand + * what's going on and determine an appropriate fix. 
-GAW + */ +#include "opt_union.h" + #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> #include <sys/namei.h> #include <sys/filedesc.h> #include <sys/kernel.h> +#include <sys/fcntl.h> #include <sys/file.h> #include <sys/stat.h> +#include <sys/unistd.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/proc.h> @@ -52,9 +66,14 @@ #include <sys/malloc.h> #include <sys/dirent.h> -#include <sys/syscallargs.h> +#ifdef UNION +#include <miscfs/union/union.h> +#endif #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> #include <sys/sysctl.h> static int change_dir __P((struct nameidata *ndp, struct proc *p)); @@ -67,6 +86,14 @@ static void checkdirs __P((struct vnode *olddp)); /* * Mount a file system. */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif /* ARGSUSED */ int mount(p, uap, retval) @@ -82,7 +109,7 @@ mount(p, uap, retval) struct vnode *vp; struct mount *mp; struct vfsconf *vfsp; - int error, flag; + int error, flag = 0; struct vattr va; u_long fstypenum; struct nameidata nd; @@ -228,9 +255,10 @@ update: else if (mp->mnt_flag & MNT_RDONLY) mp->mnt_flag |= MNT_WANTRDWR; mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | - MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC); + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME); mp->mnt_flag |= SCARG(uap, flags) & (MNT_NOSUID | MNT_NOEXEC | - MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC); + MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE | + MNT_NOATIME); /* * Mount the filesystem. */ @@ -313,6 +341,12 @@ checkdirs(olddp) * Note: unmount takes a path to the vnode mounted on as argument, * not special file (as before). 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct unmount_args { + char *path; + int flags; +}; +#endif /* ARGSUSED */ int unmount(p, uap, retval) @@ -380,6 +414,7 @@ dounmount(mp, flags, p) mp->mnt_flag |= MNT_UNMOUNT; lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p); mp->mnt_flag &=~ MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); vnode_pager_umount(mp); /* release cached vnodes */ cache_purgevfs(mp); /* remove cache entries for this file sys */ if (((mp->mnt_flag & MNT_RDONLY) || @@ -411,16 +446,22 @@ dounmount(mp, flags, p) /* * Sync each mounted filesystem. */ +#ifndef _SYS_SYSPROTO_H_ +struct sync_args { + int dummy; +}; +#endif + #ifdef DEBUG int syncprt = 0; -struct ctldebug debug0 = { "syncprt", &syncprt }; +SYSCTL_INT(_debug, 0, syncprt, CTLFLAG_RW, &syncprt, 0, ""); #endif /* ARGSUSED */ int sync(p, uap, retval) struct proc *p; - void *uap; + struct sync_args *uap; register_t *retval; { register struct mount *mp, *nmp; @@ -435,7 +476,8 @@ sync(p, uap, retval) if ((mp->mnt_flag & MNT_RDONLY) == 0) { asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; - VFS_SYNC(mp, MNT_NOWAIT, p->p_ucred, p); + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_NOWAIT, p != NULL ? p->p_ucred : NOCRED, p); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; } @@ -444,16 +486,30 @@ sync(p, uap, retval) vfs_unbusy(mp, p); } simple_unlock(&mountlist_slock); +#if 0 +/* + * XXX don't call vfs_bufstats() yet because that routine + * was not imported in the Lite2 merge. + */ #ifdef DIAGNOSTIC if (syncprt) vfs_bufstats(); #endif /* DIAGNOSTIC */ +#endif return (0); } /* * Change filesystem quotas. */ +#ifndef _SYS_SYSPROTO_H_ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +#endif /* ARGSUSED */ int quotactl(p, uap, retval) @@ -482,6 +538,12 @@ quotactl(p, uap, retval) /* * Get filesystem statistics. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif /* ARGSUSED */ int statfs(p, uap, retval) @@ -496,6 +558,7 @@ statfs(p, uap, retval) register struct statfs *sp; int error; struct nameidata nd; + struct statfs sb; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); if (error = namei(&nd)) @@ -503,15 +566,27 @@ statfs(p, uap, retval) mp = nd.ni_vp->v_mount; sp = &mp->mnt_stat; vrele(nd.ni_vp); - if (error = VFS_STATFS(mp, sp, p)) + error = VFS_STATFS(mp, sp, p); + if (error) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); } /* * Get filesystem statistics. */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif /* ARGSUSED */ int fstatfs(p, uap, retval) @@ -526,20 +601,34 @@ fstatfs(p, uap, retval) struct mount *mp; register struct statfs *sp; int error; + struct statfs sb; if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) return (error); mp = ((struct vnode *)fp->f_data)->v_mount; sp = &mp->mnt_stat; - if (error = VFS_STATFS(mp, sp, p)) + error = VFS_STATFS(mp, sp, p); + if (error) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); } /* * Get statistics on all filesystems. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int flags; +}; +#endif int getfsstat(p, uap, retval) struct proc *p; @@ -579,8 +668,11 @@ getfsstat(p, uap, retval) continue; } sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; - if (error = copyout((caddr_t)sp, sfsp, sizeof(*sp))) + error = copyout((caddr_t)sp, sfsp, sizeof(*sp)); + if (error) { + vfs_unbusy(mp, p); return (error); + } sfsp += sizeof(*sp); } count++; @@ -599,6 +691,11 @@ getfsstat(p, uap, retval) /* * Change current working directory to a given file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif /* ARGSUSED */ int fchdir(p, uap, retval) @@ -646,6 +743,11 @@ fchdir(p, uap, retval) /* * Change current working directory (``.''). */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif /* ARGSUSED */ int chdir(p, uap, retval) @@ -671,6 +773,11 @@ chdir(p, uap, retval) /* * Change notion of root (``/'') directory. */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif /* ARGSUSED */ int chroot(p, uap, retval) @@ -684,7 +791,8 @@ chroot(p, uap, retval) int error; struct nameidata nd; - if (error = suser(p->p_ucred, &p->p_acflag)) + error = suser(p->p_ucred, &p->p_acflag); + if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), p); @@ -707,7 +815,8 @@ change_dir(ndp, p) struct vnode *vp; int error; - if (error = namei(ndp)) + error = namei(ndp); + if (error) return (error); vp = ndp->ni_vp; if (vp->v_type != VDIR) @@ -725,6 +834,13 @@ change_dir(ndp, p) * Check permissions, allocate an open file structure, * and call the device open routine if any. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif int open(p, uap, retval) struct proc *p; @@ -743,16 +859,17 @@ open(p, uap, retval) int type, indx, error; struct flock lf; struct nameidata nd; - extern struct fileops vnops; - if (error = falloc(p, &nfp, &indx)) + error = falloc(p, &nfp, &indx); + if (error) return (error); fp = nfp; flags = FFLAGS(SCARG(uap, flags)); cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); p->p_dupfd = -indx - 1; /* XXX check for fdopen */ - if (error = vn_open(&nd, flags, cmode)) { + error = vn_open(&nd, flags, cmode); + if (error) { ffree(fp); if ((error == ENODEV || error == ENXIO) && p->p_dupfd >= 0 && /* XXX from fdopen */ @@ -768,8 +885,9 @@ open(p, uap, retval) } p->p_dupfd = 0; vp = nd.ni_vp; + fp->f_flag = flags & FMASK; - fp->f_type = DTYPE_VNODE; + fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); fp->f_ops = &vnops; fp->f_data = (caddr_t)vp; if (flags & (O_EXLOCK | O_SHLOCK)) { @@ -802,10 +920,16 @@ open(p, uap, retval) /* * Create a file. */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif int -compat_43_creat(p, uap, retval) +ocreat(p, uap, retval) struct proc *p; - register struct compat_43_creat_args /* { + register struct ocreat_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; @@ -827,6 +951,13 @@ compat_43_creat(p, uap, retval) /* * Create a special file. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct mknod_args { + char *path; + int mode; + int dev; +}; +#endif /* ARGSUSED */ int mknod(p, uap, retval) @@ -844,7 +975,8 @@ mknod(p, uap, retval) int whiteout; struct nameidata nd; - if (error = suser(p->p_ucred, &p->p_acflag)) + error = suser(p->p_ucred, &p->p_acflag); + if (error) return (error); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); if (error = namei(&nd)) @@ -902,6 +1034,12 @@ mknod(p, uap, retval) /* * Create a named pipe. */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif /* ARGSUSED */ int mkfifo(p, uap, retval) @@ -916,9 +1054,6 @@ mkfifo(p, uap, retval) int error; struct nameidata nd; -#ifndef FIFO - return (EOPNOTSUPP); -#else NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); if (error = namei(&nd)) return (error); @@ -936,12 +1071,17 @@ mkfifo(p, uap, retval) vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); return (VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr)); -#endif /* FIFO */ } /* * Make a hard file link. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif /* ARGSUSED */ int link(p, uap, retval) @@ -960,20 +1100,13 @@ link(p, uap, retval) if (error = namei(&nd)) return (error); vp = nd.ni_vp; - if (vp->v_type != VDIR || - (error = suser(p->p_ucred, &p->p_acflag)) == 0) { - nd.ni_cnd.cn_nameiop = CREATE; - nd.ni_cnd.cn_flags = LOCKPARENT; - nd.ni_dirp = SCARG(uap, link); - if ((error = namei(&nd)) == 0) { - if (nd.ni_vp != NULL) - error = EEXIST; - if (!error) { - VOP_LEASE(nd.ni_dvp, p, p->p_ucred, - LEASE_WRITE); - VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); - error = VOP_LINK(vp, nd.ni_dvp, &nd.ni_cnd); - } else { + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, link), p); + error = namei(&nd); + if (!error) { + if (nd.ni_vp != NULL) { VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); @@ -981,6 +1114,12 @@ link(p, uap, retval) vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, + LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); } } } @@ -991,6 +1130,12 @@ link(p, uap, retval) /* * Make a symbolic link. */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif /* ARGSUSED */ int symlink(p, uap, retval) @@ -1073,6 +1218,11 @@ undelete(p, uap, retval) /* * Delete a name from the filesystem. */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif /* ARGSUSED */ int unlink(p, uap, retval) @@ -1093,15 +1243,18 @@ unlink(p, uap, retval) VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - if (vp->v_type != VDIR || - (error = suser(p->p_ucred, &p->p_acflag)) == 0) { + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { /* * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? 
*/ if (vp->v_flag & VROOT) error = EBUSY; else - (void)vnode_pager_uncache(vp); + (void) vnode_pager_uncache(vp, p); } if (!error) { @@ -1122,6 +1275,14 @@ unlink(p, uap, retval) /* * Reposition read/write file offset. */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif int lseek(p, uap, retval) struct proc *p; @@ -1131,7 +1292,7 @@ lseek(p, uap, retval) syscallarg(off_t) offset; syscallarg(int) whence; } */ *uap; - register_t *retval; + register_t *retval; /* XXX */ { struct ucred *cred = p->p_ucred; register struct filedesc *fdp = p->p_fd; @@ -1149,8 +1310,8 @@ lseek(p, uap, retval) fp->f_offset += SCARG(uap, offset); break; case L_XTND: - if (error = - VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p)) + error=VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p); + if (error) return (error); fp->f_offset = SCARG(uap, offset) + vattr.va_size; break; @@ -1168,10 +1329,17 @@ lseek(p, uap, retval) /* * Reposition read/write file offset. */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif int -compat_43_lseek(p, uap, retval) +olseek(p, uap, retval) struct proc *p; - register struct compat_43_lseek_args /* { + register struct olseek_args /* { syscallarg(int) fd; syscallarg(long) offset; syscallarg(int) whence; @@ -1190,7 +1358,7 @@ compat_43_lseek(p, uap, retval) SCARG(&nuap, fd) = SCARG(uap, fd); SCARG(&nuap, offset) = SCARG(uap, offset); SCARG(&nuap, whence) = SCARG(uap, whence); - error = lseek(p, &nuap, &qret); + error = lseek(p, &nuap, (register_t *) &qret); *(long *)retval = qret; return (error); } @@ -1199,6 +1367,12 @@ compat_43_lseek(p, uap, retval) /* * Check access permissions. */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int flags; +}; +#endif int access(p, uap, retval) struct proc *p; @@ -1246,11 +1420,17 @@ out1: /* * Get file status; this version follows links. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif /* ARGSUSED */ int -compat_43_stat(p, uap, retval) +ostat(p, uap, retval) struct proc *p; - register struct compat_43_stat_args /* { + register struct ostat_args /* { syscallarg(char *) path; syscallarg(struct ostat *) ub; } */ *uap; @@ -1277,11 +1457,17 @@ compat_43_stat(p, uap, retval) /* * Get file status; this version does not follow links. */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif /* ARGSUSED */ int -compat_43_lstat(p, uap, retval) +olstat(p, uap, retval) struct proc *p; - register struct compat_43_lstat_args /* { + register struct olstat_args /* { syscallarg(char *) path; syscallarg(struct ostat *) ub; } */ *uap; @@ -1367,6 +1553,12 @@ cvtstat(st, ost) /* * Get file status; this version follows links. */ +#ifndef _SYS_SYSPROTO_H_ +struct stat_args { + char *path; + struct stat *ub; +}; +#endif /* ARGSUSED */ int stat(p, uap, retval) @@ -1396,6 +1588,12 @@ stat(p, uap, retval) /* * Get file status; this version does not follow links. */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif /* ARGSUSED */ int lstat(p, uap, retval) @@ -1455,6 +1653,12 @@ lstat(p, uap, retval) /* * Get configurable pathname variables. */ +#ifndef _SYS_SYSPROTO_H_ +struct pathconf_args { + char *path; + int name; +}; +#endif /* ARGSUSED */ int pathconf(p, uap, retval) @@ -1480,6 +1684,13 @@ pathconf(p, uap, retval) /* * Return target name of a symbolic link. */ +#ifndef _SYS_SYSPROTO_H_ +struct readlink_args { + char *path; + char *buf; + int count; +}; +#endif /* ARGSUSED */ int readlink(p, uap, retval) @@ -1524,6 +1735,12 @@ readlink(p, uap, retval) /* * Change flags of a file given a path name. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + char *path; + int flags; +}; +#endif /* ARGSUSED */ int chflags(p, uap, retval) @@ -1555,6 +1772,12 @@ chflags(p, uap, retval) /* * Change flags of a file given a file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + int flags; +}; +#endif /* ARGSUSED */ int fchflags(p, uap, retval) @@ -1585,6 +1808,12 @@ fchflags(p, uap, retval) /* * Change mode of a file given path name. */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif /* ARGSUSED */ int chmod(p, uap, retval) @@ -1616,6 +1845,12 @@ chmod(p, uap, retval) /* * Change mode of a file given a file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif /* ARGSUSED */ int fchmod(p, uap, retval) @@ -1646,6 +1881,13 @@ fchmod(p, uap, retval) /* * Set ownership given a path name. */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif /* ARGSUSED */ int chown(p, uap, retval) @@ -1679,6 +1921,13 @@ chown(p, uap, retval) /* * Set ownership given a file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct fchown_args { + int fd; + int uid; + int gid; +}; +#endif /* ARGSUSED */ int fchown(p, uap, retval) @@ -1711,6 +1960,12 @@ fchown(p, uap, retval) /* * Set the access and modification times of a file. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct utimes_args { + char *path; + struct timeval *tptr; +}; +#endif /* ARGSUSED */ int utimes(p, uap, retval) @@ -1741,10 +1996,10 @@ utimes(p, uap, retval) vp = nd.ni_vp; VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - vattr.va_atime.ts_sec = tv[0].tv_sec; - vattr.va_atime.ts_nsec = tv[0].tv_usec * 1000; - vattr.va_mtime.ts_sec = tv[1].tv_sec; - vattr.va_mtime.ts_nsec = tv[1].tv_usec * 1000; + vattr.va_atime.tv_sec = tv[0].tv_sec; + vattr.va_atime.tv_nsec = tv[0].tv_usec * 1000; + vattr.va_mtime.tv_sec = tv[1].tv_sec; + vattr.va_mtime.tv_nsec = tv[1].tv_usec * 1000; error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); vput(vp); return (error); @@ -1753,6 +2008,13 @@ utimes(p, uap, retval) /* * Truncate a file given its path name. */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif /* ARGSUSED */ int truncate(p, uap, retval) @@ -1769,6 +2031,8 @@ truncate(p, uap, retval) int error; struct nameidata nd; + if (uap->length < 0) + return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); if (error = namei(&nd)) return (error); @@ -1790,6 +2054,13 @@ truncate(p, uap, retval) /* * Truncate a file given a file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +#endif /* ARGSUSED */ int ftruncate(p, uap, retval) @@ -1806,6 +2077,8 @@ ftruncate(p, uap, retval) struct file *fp; int error; + if (uap->length < 0) + return(EINVAL); if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) return (error); if ((fp->f_flag & FWRITE) == 0) @@ -1828,11 +2101,17 @@ ftruncate(p, uap, retval) /* * Truncate a file given its path name. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif /* ARGSUSED */ int -compat_43_truncate(p, uap, retval) +otruncate(p, uap, retval) struct proc *p; - register struct compat_43_truncate_args /* { + register struct otruncate_args /* { syscallarg(char *) path; syscallarg(long) length; } */ *uap; @@ -1852,11 +2131,17 @@ compat_43_truncate(p, uap, retval) /* * Truncate a file given a file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct oftruncate_args { + int fd; + long length; +}; +#endif /* ARGSUSED */ int -compat_43_ftruncate(p, uap, retval) +oftruncate(p, uap, retval) struct proc *p; - register struct compat_43_ftruncate_args /* { + register struct oftruncate_args /* { syscallarg(int) fd; syscallarg(long) length; } */ *uap; @@ -1877,6 +2162,11 @@ compat_43_ftruncate(p, uap, retval) /* * Sync an open file. */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif /* ARGSUSED */ int fsync(p, uap, retval) @@ -1894,7 +2184,12 @@ fsync(p, uap, retval) return (error); vp = (struct vnode *)fp->f_data; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p); + if (vp->v_object) { + vm_object_page_clean(vp->v_object, 0, 0 ,0, FALSE); + } + error = VOP_FSYNC(vp, fp->f_cred, + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC)) ? + MNT_NOWAIT : MNT_WAIT, p); VOP_UNLOCK(vp, 0, p); return (error); } @@ -1903,6 +2198,12 @@ fsync(p, uap, retval) * Rename files. Source and destination must either both be directories, * or both not be directories. If target is a directory, it must be empty. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif /* ARGSUSED */ int rename(p, uap, retval) @@ -1924,7 +2225,12 @@ rename(p, uap, retval) fvp = fromnd.ni_vp; NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART, UIO_USERSPACE, SCARG(uap, to), p); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; if (error = namei(&tond)) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); vrele(fromnd.ni_dvp); vrele(fvp); @@ -1958,8 +2264,10 @@ out: VOP_LEASE(tdvp, p, p->p_ucred, LEASE_WRITE); if (fromnd.ni_dvp != tdvp) VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE); - if (tvp) + if (tvp) { VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE); + (void) vnode_pager_uncache(tvp, p); + } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); } else { @@ -1988,6 +2296,12 @@ out1: /* * Make a directory file. */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif /* ARGSUSED */ int mkdir(p, uap, retval) @@ -2004,6 +2318,7 @@ mkdir(p, uap, retval) struct nameidata nd; NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + nd.ni_cnd.cn_flags |= WILLBEDIR; if (error = namei(&nd)) return (error); vp = nd.ni_vp; @@ -2029,6 +2344,11 @@ mkdir(p, uap, retval) /* * Remove a directory file. */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif /* ARGSUSED */ int rmdir(p, uap, retval) @@ -2083,10 +2403,18 @@ out: /* * Read a block of directory entries in a file system independent format. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif int -compat_43_getdirentries(p, uap, retval) +ogetdirentries(p, uap, retval) struct proc *p; - register struct compat_43_getdirentries_args /* { + register struct ogetdirentries_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; @@ -2124,7 +2452,7 @@ unionread: # if (BYTE_ORDER != LITTLE_ENDIAN) if (vp->v_mount->mnt_maxsymlinklen <= 0) { error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, - (int *)0, (u_long *)0); + NULL, NULL); fp->f_offset = auio.uio_offset; } else # endif @@ -2136,7 +2464,7 @@ unionread: MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); kiov.iov_base = dirbuf; error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, - (int *)0, (u_long *)0); + NULL, NULL); fp->f_offset = kuio.uio_offset; if (error == 0) { readcnt = SCARG(uap, count) - kuio.uio_resid; @@ -2178,9 +2506,6 @@ unionread: #ifdef UNION { - extern int (**union_vnodeop_p)(); - extern struct vnode *union_dircache __P((struct vnode*, struct proc*)); - if ((SCARG(uap, count) == auio.uio_resid) && (vp->v_op == union_vnodeop_p)) { struct vnode *lvp; @@ -2240,6 +2565,14 @@ unionread: /* * Read a block of directory entries in a file system independent format. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif int getdirentries(p, uap, retval) struct proc *p; @@ -2276,8 +2609,7 @@ unionread: auio.uio_resid = SCARG(uap, count); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); loff = auio.uio_offset = fp->f_offset; - error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, - (int *)0, (u_long *)0); + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; VOP_UNLOCK(vp, 0, p); if (error) @@ -2285,9 +2617,6 @@ unionread: #ifdef UNION { - extern int (**union_vnodeop_p)(); - extern struct vnode *union_dircache __P((struct vnode*, struct proc*)); - if ((SCARG(uap, count) == auio.uio_resid) && (vp->v_op == union_vnodeop_p)) { struct vnode *lvp; @@ -2346,13 +2675,18 @@ unionread: /* * Set the mode mask for creation of filesystem nodes. */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif int umask(p, uap, retval) struct proc *p; struct umask_args /* { syscallarg(int) newmask; } */ *uap; - register_t *retval; + int *retval; /* XXX */ { register struct filedesc *fdp; @@ -2366,6 +2700,11 @@ umask(p, uap, retval) * Void all references to file by ripping underlying filesystem * away from vnode. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif /* ARGSUSED */ int revoke(p, uap, retval) @@ -2402,15 +2741,15 @@ out: int getvnode(fdp, fd, fpp) struct filedesc *fdp; - struct file **fpp; int fd; + struct file **fpp; { struct file *fp; if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) return (EBADF); - if (fp->f_type != DTYPE_VNODE) + if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) return (EINVAL); *fpp = fp; return (0); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 3cfc6fd..cb6c932 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -35,12 +35,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)vfs_vnops.c 8.14 (Berkeley) 6/15/95 + * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 + * $Id: vfs_vnops.c,v 1.33 1997/03/23 03:36:38 bde Exp $ */ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> +#include <sys/fcntl.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/buf.h> @@ -48,10 +50,22 @@ #include <sys/mount.h> #include <sys/namei.h> #include <sys/vnode.h> -#include <sys/ioctl.h> -#include <sys/tty.h> +#include <sys/filio.h> +#include <sys/ttycom.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vnode_pager.h> + +static int vn_closefile __P((struct file *fp, struct proc *p)); +static int vn_ioctl __P((struct file *fp, int com, caddr_t data, + struct proc *p)); +static int vn_read __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int vn_select __P((struct file *fp, int which, struct proc *p)); +static int vn_write __P((struct file *fp, struct uio *uio, + struct ucred *cred)); struct fileops vnops = { vn_read, vn_write, vn_ioctl, vn_select, vn_closefile }; @@ -60,6 +74,7 @@ struct fileops vnops = * Common code for vnode open operations. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. 
*/ +int vn_open(ndp, fmode, cmode) register struct nameidata *ndp; int fmode, cmode; @@ -76,7 +91,8 @@ vn_open(ndp, fmode, cmode) ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; if ((fmode & O_EXCL) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; - if (error = namei(ndp)) + error = namei(ndp); + if (error) return (error); if (ndp->ni_vp == NULL) { VATTR_NULL(vap); @@ -107,7 +123,8 @@ vn_open(ndp, fmode, cmode) } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = FOLLOW | LOCKLEAF; - if (error = namei(ndp)) + error = namei(ndp); + if (error) return (error); vp = ndp->ni_vp; } @@ -117,7 +134,8 @@ vn_open(ndp, fmode, cmode) } if ((fmode & O_CREAT) == 0) { if (fmode & FREAD) { - if (error = VOP_ACCESS(vp, VREAD, cred, p)) + error = VOP_ACCESS(vp, VREAD, cred, p); + if (error) goto bad; } if (fmode & (FWRITE | O_TRUNC)) { @@ -125,8 +143,11 @@ vn_open(ndp, fmode, cmode) error = EISDIR; goto bad; } - if ((error = vn_writechk(vp)) || - (error = VOP_ACCESS(vp, VWRITE, cred, p))) + error = vn_writechk(vp); + if (error) + goto bad; + error = VOP_ACCESS(vp, VWRITE, cred, p); + if (error) goto bad; } } @@ -136,11 +157,21 @@ vn_open(ndp, fmode, cmode) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */ VATTR_NULL(vap); vap->va_size = 0; - if (error = VOP_SETATTR(vp, vap, cred, p)) + error = VOP_SETATTR(vp, vap, cred, p); + if (error) goto bad; } - if (error = VOP_OPEN(vp, fmode, cred, p)) + error = VOP_OPEN(vp, fmode, cred, p); + if (error) goto bad; + /* + * Make sure that a VM object is created for VMIO support. + */ + if (vp->v_type == VREG) { + if ((error = vfs_object_create(vp, p, cred, 1)) != 0) + goto bad; + } + if (fmode & FWRITE) vp->v_writecount++; return (0); @@ -153,6 +184,7 @@ bad: * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. */ +int vn_writechk(vp) register struct vnode *vp; { @@ -162,7 +194,7 @@ vn_writechk(vp) * the vnode, try to free it up once. If * we fail, we can't allow writing. 
*/ - if ((vp->v_flag & VTEXT) && !vnode_pager_uncache(vp)) + if (vp->v_flag & VTEXT) return (ETXTBSY); return (0); } @@ -170,6 +202,7 @@ vn_writechk(vp) /* * Vnode close call */ +int vn_close(vp, flags, cred, p) register struct vnode *vp; int flags; @@ -188,6 +221,7 @@ vn_close(vp, flags, cred, p) /* * Package up an I/O request on a vnode into a uio and do it. */ +int vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) enum uio_rw rw; struct vnode *vp; @@ -233,6 +267,7 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) /* * File table vnode read routine. */ +static int vn_read(fp, uio, cred) struct file *fp; struct uio *uio; @@ -241,14 +276,46 @@ vn_read(fp, uio, cred) struct vnode *vp = (struct vnode *)fp->f_data; struct proc *p = uio->uio_procp; int count, error; + int flag, seq; VOP_LEASE(vp, p, cred, LEASE_READ); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); uio->uio_offset = fp->f_offset; count = uio->uio_resid; - error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0, - cred); + flag = 0; + if (fp->f_flag & FNONBLOCK) + flag |= IO_NDELAY; + + /* + * Sequential read heuristic. + * If we have been doing sequential input, + * a rewind operation doesn't turn off + * sequential input mode. + */ + if (((fp->f_offset == 0) && (fp->f_seqcount > 0)) || + (fp->f_offset == fp->f_nextread)) { + int tmpseq = fp->f_seqcount; + /* + * XXX we assume that the filesystem block size is + * the default. Not true, but still gives us a pretty + * good indicator of how sequential the read operations + * are. 
+ */ + tmpseq += ((count + BKVASIZE - 1) / BKVASIZE); + if (tmpseq >= CHAR_MAX) + tmpseq = CHAR_MAX; + fp->f_seqcount = tmpseq; + flag |= (fp->f_seqcount << 16); + } else { + if (fp->f_seqcount > 1) + fp->f_seqcount = 1; + else + fp->f_seqcount = 0; + } + + error = VOP_READ(vp, uio, flag, cred); fp->f_offset += count - uio->uio_resid; + fp->f_nextread = fp->f_offset; VOP_UNLOCK(vp, 0, p); return (error); } @@ -256,6 +323,7 @@ vn_read(fp, uio, cred) /* * File table vnode write routine. */ +static int vn_write(fp, uio, cred) struct file *fp; struct uio *uio; @@ -288,6 +356,7 @@ vn_write(fp, uio, cred) /* * File table vnode stat routine. */ +int vn_stat(vp, sb, p) struct vnode *vp; register struct stat *sb; @@ -344,17 +413,27 @@ vn_stat(vp, sb, p) sb->st_ctimespec = vap->va_ctime; sb->st_blksize = vap->va_blocksize; sb->st_flags = vap->va_flags; - sb->st_gen = vap->va_gen; + if (p->p_ucred->cr_uid != 0) + sb->st_gen = 0; + else + sb->st_gen = vap->va_gen; + +#if (S_BLKSIZE == 512) + /* Optimize this case */ + sb->st_blocks = vap->va_bytes >> 9; +#else sb->st_blocks = vap->va_bytes / S_BLKSIZE; +#endif return (0); } /* * File table vnode ioctl routine. 
*/ +static int vn_ioctl(fp, com, data, p) struct file *fp; - u_long com; + int com; caddr_t data; struct proc *p; { @@ -367,7 +446,8 @@ vn_ioctl(fp, com, data, p) case VREG: case VDIR: if (com == FIONREAD) { - if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p)) + error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); + if (error) return (error); *(int *)data = vattr.va_size - fp->f_offset; return (0); @@ -384,8 +464,15 @@ vn_ioctl(fp, com, data, p) case VBLK: error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p); if (error == 0 && com == TIOCSCTTY) { + + /* Do nothing if reassigning same control tty */ + if (p->p_session->s_ttyvp == vp) + return (0); + + /* Get rid of reference to old control tty */ if (p->p_session->s_ttyvp) vrele(p->p_session->s_ttyvp); + p->p_session->s_ttyvp = vp; VREF(vp); } @@ -396,6 +483,7 @@ vn_ioctl(fp, com, data, p) /* * File table vnode select routine. */ +static int vn_select(fp, which, p) struct file *fp; int which; @@ -407,6 +495,19 @@ vn_select(fp, which, p) } /* + * File table vnode close routine. + */ +static int +vn_closefile(fp, p) + struct file *fp; + struct proc *p; +{ + + return (vn_close(((struct vnode *)fp->f_data), fp->f_flag, + fp->f_cred, p)); +} + +/* * Check that the vnode is still valid, and if so * acquire requested lock. */ @@ -419,8 +520,9 @@ vn_lock(vp, flags, p) int error; do { - if ((flags & LK_INTERLOCK) == 0) + if ((flags & LK_INTERLOCK) == 0) { simple_lock(&vp->v_interlock); + } if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); @@ -435,15 +537,3 @@ vn_lock(vp, flags, p) } while (flags & LK_RETRY); return (error); } - -/* - * File table vnode close routine. 
- */ -vn_closefile(fp, p) - struct file *fp; - struct proc *p; -{ - - return (vn_close(((struct vnode *)fp->f_data), fp->f_flag, - fp->f_cred, p)); -} diff --git a/sys/kern/vnode_if.pl b/sys/kern/vnode_if.pl new file mode 100644 index 0000000..75f49a7 --- /dev/null +++ b/sys/kern/vnode_if.pl @@ -0,0 +1,459 @@ +#!/bin/sh - +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 +# $Id$ +# + +# Script to produce VFS front-end sugar. +# +# usage: vnode_if.sh srcfile +# (where srcfile is currently /sys/kern/vnode_if.src) +# +# These awk scripts are not particularly well written, specifically they +# don't use arrays well and figure out the same information repeatedly. +# Please rewrite them if you actually understand how to use awk. Note, +# they use nawk extensions and gawk's toupper. + +if [ $# -ne 1 ] ; then + echo 'usage: vnode_if.sh srcfile' + exit 1 +fi + +# Name of the source file. +SRC=$1 + +# Names of the created files. +CFILE=vnode_if.c +HEADER=vnode_if.h + +# Awk program (must support nawk extensions and gawk's "toupper") +# Use "awk" at Berkeley, "gawk" elsewhere. +AWK=awk + +# Print out header information for vnode_if.h. +cat << END_OF_LEADING_COMMENT > $HEADER +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +extern struct vnodeop_desc vop_default_desc; +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.h. +$AWK ' + NF == 0 || $0 ~ "^#" { + next; + } + { + # Get the function name. + name = $1; + uname = toupper(name); + + # Get the function arguments. + for (c1 = 0;; ++c1) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + a[c1] = $0; + } + + # Print out the vop_F_args structure. 
+ printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n", + name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%sa_%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("};\n"); + + # Print out extern declaration. + printf("extern struct vnodeop_desc %s_desc;\n", name); + + # Print out prototype. + printf("static int %s __P((\n", uname); + sep = ",\n"; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = "));\n"; + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep); + } + + # Print out inline struct. + printf("static inline int %s(", uname); + sep = ", "; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ")\n"; + c3 = split(a[c2], t); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s", substr(t[c3], beg, end - beg), sep); + } + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%s%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("{\n\tstruct %s_args a;\n\n", name); + printf("\ta.a_desc = VDESC(%s);\n", name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("a.a_%s = %s\n", + substr(t[c3], beg, end - beg), substr(t[c3], beg)); + } + c1 = split(a[0], t); + beg = match(t[c1], "[^*]"); + end = match(t[c1], ";"); + printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n", + substr(t[c1], beg, end - beg), name); + }' < $SRC >> $HEADER + +# Print out header 
information for vnode_if.c. +cat << END_OF_LEADING_COMMENT > $CFILE +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +#include <sys/param.h> +#include <sys/mount.h> +#include <sys/vnode.h> + +struct vnodeop_desc vop_default_desc = { + 0, + "default", + 0, + NULL, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; + +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.c. +$AWK 'function kill_surrounding_ws (s) { + sub (/^[ \t]*/, "", s); + sub (/[ \t]*$/, "", s); + return s; + } + + function read_args() { + numargs = 0; + while (getline ln) { + if (ln ~ /}/) { + break; + }; + + # Delete comments, if any. + gsub (/\/\*.*\*\//, "", ln); + + # Delete leading/trailing space. + ln = kill_surrounding_ws(ln); + + # Pick off direction. + if (1 == sub(/^INOUT[ \t]+/, "", ln)) + dir = "INOUT"; + else if (1 == sub(/^IN[ \t]+/, "", ln)) + dir = "IN"; + else if (1 == sub(/^OUT[ \t]+/, "", ln)) + dir = "OUT"; + else + bail("No IN/OUT direction for \"" ln "\"."); + + # check for "WILLRELE" + if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) { + rele = "WILLRELE"; + } else { + rele = "WONTRELE"; + }; + + # kill trailing ; + if (1 != sub (/;$/, "", ln)) { + bail("Missing end-of-line ; in \"" ln "\"."); + }; + + # pick off variable name + if (!(i = match(ln, /[A-Za-z0-9_]+$/))) { + bail("Missing var name \"a_foo\" in \"" ln "\"."); + }; + arg = substr (ln, i); + # Want to <<substr(ln, i) = "";>>, but nawk cannot. + # Hack around this. 
+ ln = substr(ln, 1, i-1); + + # what is left must be type + # (put clean it up some) + type = ln; + gsub (/[ \t]+/, " ", type); # condense whitespace + type = kill_surrounding_ws(type); + + # (boy this was easier in Perl) + + numargs++; + dirs[numargs] = dir; + reles[numargs] = rele; + types[numargs] = type; + args[numargs] = arg; + }; + } + + function generate_operation_vp_offsets() { + printf ("static int %s_vp_offsets[] = {\n", name); + # as a side effect, figure out the releflags + releflags = ""; + vpnum = 0; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode *") { + printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n", + name, args[i]); + if (reles[i] == "WILLRELE") { + releflags = releflags "|VDESC_VP" vpnum "_WILLRELE"; + }; + vpnum++; + }; + }; + sub (/^\|/, "", releflags); + print "\tVDESC_NO_OFFSET"; + print "};"; + } + + function find_arg_with_type (type) { + for (i=1; i<=numargs; i++) { + if (types[i] == type) { + return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")"; + }; + }; + return "VDESC_NO_OFFSET"; + } + + function generate_operation_desc() { + printf ("struct vnodeop_desc %s_desc = {\n", name); + # offset + printf ("\t0,\n"); + # printable name + printf ("\t\"%s\",\n", name); + # flags + vppwillrele = ""; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode **" && + (reles[i] == "WILLRELE")) { + vppwillrele = "|VDESC_VPP_WILLRELE"; + }; + }; + if (releflags == "") { + printf ("\t0%s,\n", vppwillrele); + } else { + printf ("\t%s%s,\n", releflags, vppwillrele); + }; + # vp offsets + printf ("\t%s_vp_offsets,\n", name); + # vpp (if any) + printf ("\t%s,\n", find_arg_with_type("struct vnode **")); + # cred (if any) + printf ("\t%s,\n", find_arg_with_type("struct ucred *")); + # proc (if any) + printf ("\t%s,\n", find_arg_with_type("struct proc *")); + # componentname + printf ("\t%s,\n", find_arg_with_type("struct componentname *")); + # transport layer information + printf ("\tNULL,\n};\n"); + } + + NF == 0 || $0 ~ 
"^#" { + next; + } + { + # get the function name + name = $1; + + # get the function arguments + read_args(); + + # Print out the vop_F_vp_offsets structure. This all depends + # on naming conventions and nothing else. + generate_operation_vp_offsets(); + + # Print out the vnodeop_desc structure. + generate_operation_desc(); + + printf "\n"; + + }' < $SRC >> $CFILE +# THINGS THAT DON'T WORK RIGHT YET. +# +# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as +# arguments. This means that these operations can't function successfully +# through a bypass routine. +# +# Bwrite and strategy will be replaced when the VM page/buffer cache +# integration happens. +# +# To get around this problem for now we handle these ops as special cases. + +cat << END_OF_SPECIAL_CASES >> $HEADER +#include <sys/buf.h> +struct vop_strategy_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_strategy_desc; +static int VOP_STRATEGY __P(( + struct buf *bp)); +static inline int VOP_STRATEGY(bp) + struct buf *bp; +{ + struct vop_strategy_args a; + + a.a_desc = VDESC(vop_strategy); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_strategy), &a)); +} + +struct vop_bwrite_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_bwrite_desc; +static int VOP_BWRITE __P(( + struct buf *bp)); +static inline int VOP_BWRITE(bp) + struct buf *bp; +{ + struct vop_bwrite_args a; + + a.a_desc = VDESC(vop_bwrite); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a)); +} +END_OF_SPECIAL_CASES + +cat << END_OF_SPECIAL_CASES >> $CFILE +static int vop_strategy_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_strategy_desc = { + 0, + "vop_strategy", + 0, + vop_strategy_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +static int vop_bwrite_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_bwrite_desc = { + 0, + 
"vop_bwrite", + 0, + vop_bwrite_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +END_OF_SPECIAL_CASES + +# Add the vfs_op_descs array to the C file. +$AWK ' + BEGIN { + printf("\nstruct vnodeop_desc *vfs_op_descs[] = {\n"); + printf("\t&vop_default_desc, /* MUST BE FIRST */\n"); + printf("\t&vop_strategy_desc, /* XXX: SPECIAL CASE */\n"); + printf("\t&vop_bwrite_desc, /* XXX: SPECIAL CASE */\n"); + } + END { + printf("\tNULL\n};\n"); + } + NF == 0 || $0 ~ "^#" { + next; + } + { + # Get the function name. + printf("\t&%s_desc,\n", $1); + + # Skip the function arguments. + for (;;) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + } + }' < $SRC >> $CFILE + diff --git a/sys/kern/vnode_if.sh b/sys/kern/vnode_if.sh index 8b74d83..75f49a7 100644 --- a/sys/kern/vnode_if.sh +++ b/sys/kern/vnode_if.sh @@ -1,9 +1,8 @@ #!/bin/sh - -copyright=' -/* - * Copyright (c) 1992, 1993, 1994, 1995 - * The Regents of the University of California. All rights reserved. - * +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: @@ -31,17 +30,20 @@ copyright=' # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. - * - * from: NetBSD: vnode_if.sh,v 1.7 1994/08/25 03:04:28 cgd Exp $ - */ -' -SCRIPT_ID='@(#)vnode_if.sh 8.7 (Berkeley) 5/11/95' +# +# @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 +# $Id$ +# # Script to produce VFS front-end sugar. # # usage: vnode_if.sh srcfile # (where srcfile is currently /sys/kern/vnode_if.src) # +# These awk scripts are not particularly well written, specifically they +# don't use arrays well and figure out the same information repeatedly. +# Please rewrite them if you actually understand how to use awk. 
Note, +# they use nawk extensions and gawk's toupper. if [ $# -ne 1 ] ; then echo 'usage: vnode_if.sh srcfile' @@ -49,180 +51,139 @@ if [ $# -ne 1 ] ; then fi # Name of the source file. -src=$1 +SRC=$1 # Names of the created files. -out_c=vnode_if.c -out_h=vnode_if.h - -# Awk program (must support nawk extensions) -# Use "awk" at Berkeley, "nawk" or "gawk" elsewhere. -awk=${AWK:-awk} - -# Does this awk have a "toupper" function? (i.e. is it GNU awk) -isgawk=`$awk 'BEGIN { print toupper("true"); exit; }' 2>/dev/null` - -# If this awk does not define "toupper" then define our own. -if [ "$isgawk" = TRUE ] ; then - # GNU awk provides it. - toupper= -else - # Provide our own toupper() - toupper=' -function toupper(str) { - _toupper_cmd = "echo "str" |tr a-z A-Z" - _toupper_cmd | getline _toupper_str; - close(_toupper_cmd); - return _toupper_str; -}' -fi +CFILE=vnode_if.c +HEADER=vnode_if.h -# -# This is the common part of all awk programs that read $src -# This parses the input for one function into the arrays: -# argdir, argtype, argname, willrele -# and calls "doit()" to generate output for the function. -# -# Input to this parser is pre-processed slightly by sed -# so this awk parser doesn't have to work so hard. The -# changes done by the sed pre-processing step are: -# insert a space beween * and pointer name -# replace semicolons with spaces -# -sed_prep='s:\*\([^\*/]\):\* \1:g -s/;/ /' -awk_parser=' -# Comment line -/^#/ { next; } -# First line of description -/^vop_/ { - name=$1; - argc=0; - next; -} -# Last line of description -/^}/ { - doit(); - next; -} -# Middle lines of description -{ - argdir[argc] = $1; i=2; - if ($2 == "WILLRELE") { - willrele[argc] = 1; - i++; - } else - willrele[argc] = 0; - argtype[argc] = $i; i++; - while (i < NF) { - argtype[argc] = argtype[argc]" "$i; - i++; - } - argname[argc] = $i; - argc++; - next; -} -' +# Awk program (must support nawk extensions and gawk's "toupper") +# Use "awk" at Berkeley, "gawk" elsewhere. 
+AWK=awk -# This is put after the copyright on each generated file. -warning=" +# Print out header information for vnode_if.h. +cat << END_OF_LEADING_COMMENT > $HEADER /* - * Warning: This file is generated automatically. - * (Modifications made here may easily be lost!) + * This file is produced automatically. + * Do not modify anything in here by hand. * - * Created by the script: - * ${SCRIPT_ID} + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 */ -" - -# Get rid of ugly spaces -space_elim='s:\([^/]\*\) :\1:g' - -# -# Redirect stdout to the H file. -# -echo "$0: Creating $out_h" 1>&2 -exec > $out_h -# Begin stuff -echo "$copyright" -echo "$warning" -echo ' extern struct vnodeop_desc vop_default_desc; -' - -# Body stuff -# This awk program needs toupper() so define it if necessary. -sed -e "$sed_prep" $src | $awk "$toupper"' -function doit() { - # Declare arg struct, descriptor. - printf("\nstruct %s_args {\n", name); - printf("\tstruct vnodeop_desc * a_desc;\n"); - for (i=0; i<argc; i++) { - printf("\t%s a_%s;\n", argtype[i], argname[i]); - } - printf("};\n"); - printf("extern struct vnodeop_desc %s_desc;\n", name); - # Define inline function. - printf("#define %s(", toupper(name)); - for (i=0; i<argc; i++) { - printf("%s", argname[i]); - if (i < (argc-1)) printf(", "); - } - printf(") _%s(", toupper(name)); - for (i=0; i<argc; i++) { - printf("%s", argname[i]); - if (i < (argc-1)) printf(", "); - } - printf(")\n"); - printf("static __inline int _%s(", toupper(name)); - for (i=0; i<argc; i++) { - printf("%s", argname[i]); - if (i < (argc-1)) printf(", "); - } - printf(")\n"); - for (i=0; i<argc; i++) { - printf("\t%s %s;\n", argtype[i], argname[i]); - } - printf("{\n\tstruct %s_args a;\n", name); - printf("\ta.a_desc = VDESC(%s);\n", name); - for (i=0; i<argc; i++) { - printf("\ta.a_%s = %s;\n", argname[i], argname[i]); +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.h. 
+$AWK ' + NF == 0 || $0 ~ "^#" { + next; } - printf("\treturn (VCALL(%s%s, VOFFSET(%s), &a));\n}\n", - argname[0], arg0special, name); -} -BEGIN { - arg0special=""; -} -END { - printf("\n/* Special cases: */\n#include <sys/buf.h>\n"); - argc=1; - argtype[0]="struct buf *"; - argname[0]="bp"; - arg0special="->b_vp"; - name="vop_strategy"; - doit(); - name="vop_bwrite"; - doit(); -} -'"$awk_parser" | sed -e "$space_elim" + { + # Get the function name. + name = $1; + uname = toupper(name); -# End stuff -echo ' -/* End of special cases. */' + # Get the function arguments. + for (c1 = 0;; ++c1) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + a[c1] = $0; + } + # Print out the vop_F_args structure. + printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n", + name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%sa_%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("};\n"); -# -# Redirect stdout to the C file. -# -echo "$0: Creating $out_c" 1>&2 -exec > $out_c + # Print out extern declaration. + printf("extern struct vnodeop_desc %s_desc;\n", name); + + # Print out prototype. + printf("static int %s __P((\n", uname); + sep = ",\n"; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = "));\n"; + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep); + } + + # Print out inline struct. 
+ printf("static inline int %s(", uname); + sep = ", "; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ")\n"; + c3 = split(a[c2], t); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s", substr(t[c3], beg, end - beg), sep); + } + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%s%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("{\n\tstruct %s_args a;\n\n", name); + printf("\ta.a_desc = VDESC(%s);\n", name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("a.a_%s = %s\n", + substr(t[c3], beg, end - beg), substr(t[c3], beg)); + } + c1 = split(a[0], t); + beg = match(t[c1], "[^*]"); + end = match(t[c1], ";"); + printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n", + substr(t[c1], beg, end - beg), name); + }' < $SRC >> $HEADER + +# Print out header information for vnode_if.c. +cat << END_OF_LEADING_COMMENT > $CFILE +/* + * This file is produced automatically. + * Do not modify anything in here by hand. 
+ * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ -# Begin stuff -echo "$copyright" -echo "$warning" -echo ' #include <sys/param.h> #include <sys/mount.h> #include <sys/vnode.h> @@ -238,107 +199,261 @@ struct vnodeop_desc vop_default_desc = { VDESC_NO_OFFSET, NULL, }; -' - -# Body stuff -sed -e "$sed_prep" $src | $awk ' -function do_offset(typematch) { - for (i=0; i<argc; i++) { - if (argtype[i] == typematch) { - printf("\tVOPARG_OFFSETOF(struct %s_args, a_%s),\n", - name, argname[i]); - return i; - }; - }; - print "\tVDESC_NO_OFFSET,"; - return -1; -} -function doit() { - # Define offsets array - printf("\nint %s_vp_offsets[] = {\n", name); - for (i=0; i<argc; i++) { - if (argtype[i] == "struct vnode *") { - printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n", - name, argname[i]); - } +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.c. +$AWK 'function kill_surrounding_ws (s) { + sub (/^[ \t]*/, "", s); + sub (/[ \t]*$/, "", s); + return s; } - print "\tVDESC_NO_OFFSET"; - print "};"; - # Define F_desc - printf("struct vnodeop_desc %s_desc = {\n", name); - # offset - printf ("\t0,\n"); - # printable name - printf ("\t\"%s\",\n", name); - # flags - printf("\t0"); - vpnum = 0; - for (i=0; i<argc; i++) { - if (willrele[i]) { - if (argdir[i] ~ /OUT/) { - printf(" | VDESC_VPP_WILLRELE"); + + function read_args() { + numargs = 0; + while (getline ln) { + if (ln ~ /}/) { + break; + }; + + # Delete comments, if any. + gsub (/\/\*.*\*\//, "", ln); + + # Delete leading/trailing space. + ln = kill_surrounding_ws(ln); + + # Pick off direction. 
+ if (1 == sub(/^INOUT[ \t]+/, "", ln)) + dir = "INOUT"; + else if (1 == sub(/^IN[ \t]+/, "", ln)) + dir = "IN"; + else if (1 == sub(/^OUT[ \t]+/, "", ln)) + dir = "OUT"; + else + bail("No IN/OUT direction for \"" ln "\"."); + + # check for "WILLRELE" + if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) { + rele = "WILLRELE"; } else { - printf(" | VDESC_VP%s_WILLRELE", vpnum); + rele = "WONTRELE"; }; - vpnum++; - } + + # kill trailing ; + if (1 != sub (/;$/, "", ln)) { + bail("Missing end-of-line ; in \"" ln "\"."); + }; + + # pick off variable name + if (!(i = match(ln, /[A-Za-z0-9_]+$/))) { + bail("Missing var name \"a_foo\" in \"" ln "\"."); + }; + arg = substr (ln, i); + # Want to <<substr(ln, i) = "";>>, but nawk cannot. + # Hack around this. + ln = substr(ln, 1, i-1); + + # what is left must be type + # (put clean it up some) + type = ln; + gsub (/[ \t]+/, " ", type); # condense whitespace + type = kill_surrounding_ws(type); + + # (boy this was easier in Perl) + + numargs++; + dirs[numargs] = dir; + reles[numargs] = rele; + types[numargs] = type; + args[numargs] = arg; + }; } - print ","; - # vp offsets - printf ("\t%s_vp_offsets,\n", name); - # vpp (if any) - do_offset("struct vnode **"); - # cred (if any) - do_offset("struct ucred *"); - # proc (if any) - do_offset("struct proc *"); - # componentname - do_offset("struct componentname *"); - # transport layer information - printf ("\tNULL,\n};\n"); -} -END { - printf("\n/* Special cases: */\n"); - argc=1; - argdir[0]="IN"; - argtype[0]="struct buf *"; - argname[0]="bp"; - willrele[0]=0; - name="vop_strategy"; - doit(); - name="vop_bwrite"; - doit(); + + function generate_operation_vp_offsets() { + printf ("static int %s_vp_offsets[] = {\n", name); + # as a side effect, figure out the releflags + releflags = ""; + vpnum = 0; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode *") { + printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n", + name, args[i]); + if (reles[i] == "WILLRELE") { + releflags = releflags 
"|VDESC_VP" vpnum "_WILLRELE"; + }; + vpnum++; + }; + }; + sub (/^\|/, "", releflags); + print "\tVDESC_NO_OFFSET"; + print "};"; + } + + function find_arg_with_type (type) { + for (i=1; i<=numargs; i++) { + if (types[i] == type) { + return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")"; + }; + }; + return "VDESC_NO_OFFSET"; + } + + function generate_operation_desc() { + printf ("struct vnodeop_desc %s_desc = {\n", name); + # offset + printf ("\t0,\n"); + # printable name + printf ("\t\"%s\",\n", name); + # flags + vppwillrele = ""; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode **" && + (reles[i] == "WILLRELE")) { + vppwillrele = "|VDESC_VPP_WILLRELE"; + }; + }; + if (releflags == "") { + printf ("\t0%s,\n", vppwillrele); + } else { + printf ("\t%s%s,\n", releflags, vppwillrele); + }; + # vp offsets + printf ("\t%s_vp_offsets,\n", name); + # vpp (if any) + printf ("\t%s,\n", find_arg_with_type("struct vnode **")); + # cred (if any) + printf ("\t%s,\n", find_arg_with_type("struct ucred *")); + # proc (if any) + printf ("\t%s,\n", find_arg_with_type("struct proc *")); + # componentname + printf ("\t%s,\n", find_arg_with_type("struct componentname *")); + # transport layer information + printf ("\tNULL,\n};\n"); + } + + NF == 0 || $0 ~ "^#" { + next; + } + { + # get the function name + name = $1; + + # get the function arguments + read_args(); + + # Print out the vop_F_vp_offsets structure. This all depends + # on naming conventions and nothing else. + generate_operation_vp_offsets(); + + # Print out the vnodeop_desc structure. + generate_operation_desc(); + + printf "\n"; + + }' < $SRC >> $CFILE +# THINGS THAT DON'T WORK RIGHT YET. +# +# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as +# arguments. This means that these operations can't function successfully +# through a bypass routine. +# +# Bwrite and strategy will be replaced when the VM page/buffer cache +# integration happens. 
+# +# To get around this problem for now we handle these ops as special cases. + +cat << END_OF_SPECIAL_CASES >> $HEADER +#include <sys/buf.h> +struct vop_strategy_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_strategy_desc; +static int VOP_STRATEGY __P(( + struct buf *bp)); +static inline int VOP_STRATEGY(bp) + struct buf *bp; +{ + struct vop_strategy_args a; + + a.a_desc = VDESC(vop_strategy); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_strategy), &a)); } -'"$awk_parser" | sed -e "$space_elim" -# End stuff -echo ' -/* End of special cases. */' +struct vop_bwrite_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_bwrite_desc; +static int VOP_BWRITE __P(( + struct buf *bp)); +static inline int VOP_BWRITE(bp) + struct buf *bp; +{ + struct vop_bwrite_args a; -# Add the vfs_op_descs array to the C file. -# Begin stuff -echo ' -struct vnodeop_desc *vfs_op_descs[] = { - &vop_default_desc, /* MUST BE FIRST */ - &vop_strategy_desc, /* XXX: SPECIAL CASE */ - &vop_bwrite_desc, /* XXX: SPECIAL CASE */ -' - -# Body stuff -sed -e "$sed_prep" $src | $awk ' -function doit() { - printf("\t&%s_desc,\n", name); + a.a_desc = VDESC(vop_bwrite); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a)); } -'"$awk_parser" +END_OF_SPECIAL_CASES -# End stuff -echo ' NULL +cat << END_OF_SPECIAL_CASES >> $CFILE +static int vop_strategy_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_strategy_desc = { + 0, + "vop_strategy", + 0, + vop_strategy_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +static int vop_bwrite_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_bwrite_desc = { + 0, + "vop_bwrite", + 0, + vop_bwrite_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, }; -' +END_OF_SPECIAL_CASES -exit 0 +# Add the vfs_op_descs array to the C file. 
+$AWK ' + BEGIN { + printf("\nstruct vnodeop_desc *vfs_op_descs[] = {\n"); + printf("\t&vop_default_desc, /* MUST BE FIRST */\n"); + printf("\t&vop_strategy_desc, /* XXX: SPECIAL CASE */\n"); + printf("\t&vop_bwrite_desc, /* XXX: SPECIAL CASE */\n"); + } + END { + printf("\tNULL\n};\n"); + } + NF == 0 || $0 ~ "^#" { + next; + } + { + # Get the function name. + printf("\t&%s_desc,\n", $1); + + # Skip the function arguments. + for (;;) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + } + }' < $SRC >> $CFILE -# Local Variables: -# tab-width: 4 -# End: diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src index 1e32f29..7e3338f 100644 --- a/sys/kern/vnode_if.src +++ b/sys/kern/vnode_if.src @@ -31,6 +31,7 @@ # SUCH DAMAGE. # # @(#)vnode_if.src 8.12 (Berkeley) 5/14/95 +# $Id: vnode_if.src,v 1.9.2000.1 1996/09/17 14:32:01 peter Exp $ # # @@ -255,8 +256,8 @@ vop_remove { #% link tdvp L U U # vop_link { - IN WILLRELE struct vnode *vp; - IN struct vnode *tdvp; + IN WILLRELE struct vnode *tdvp; + IN struct vnode *vp; IN struct componentname *cnp; }; @@ -385,6 +386,7 @@ vop_bmap { OUT struct vnode **vpp; IN daddr_t *bnp; OUT int *runp; + OUT int *runb; }; # @@ -486,6 +488,23 @@ vop_update { IN int waitfor; }; +vop_getpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int reqpage; + IN vm_ooffset_t offset; +}; + +vop_putpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int sync; + IN int *rtvals; + IN vm_ooffset_t offset; +}; + # # Needs work: no vp? # |