Diffstat (limited to 'sys/kern')
-rw-r--r--  sys/kern/Make.tags.inc     |    4
-rw-r--r--  sys/kern/Makefile          |   11
-rw-r--r--  sys/kern/imgact_aout.c     |  210
-rw-r--r--  sys/kern/imgact_elf.c      |  749
-rw-r--r--  sys/kern/imgact_gzip.c     |  378
-rw-r--r--  sys/kern/imgact_shell.c    |  141
-rw-r--r--  sys/kern/inflate.c         | 1072
-rw-r--r--  sys/kern/init_main.c       |  536
-rw-r--r--  sys/kern/init_sysent.c     | 1010
-rw-r--r--  sys/kern/init_sysvec.c     |   30
-rw-r--r--  sys/kern/kern_acct.c       |  250
-rw-r--r--  sys/kern/kern_clock.c      |  855
-rw-r--r--  sys/kern/kern_conf.c       |  208
-rw-r--r--  sys/kern/kern_descrip.c    |  506
-rw-r--r--  sys/kern/kern_exec.c       |  615
-rw-r--r--  sys/kern/kern_exit.c       |  236
-rw-r--r--  sys/kern/kern_fork.c       |  220
-rw-r--r--  sys/kern/kern_ktrace.c     |  156
-rw-r--r--  sys/kern/kern_lkm.c        |  957
-rw-r--r--  sys/kern/kern_lockf.c      |  796
-rw-r--r--  sys/kern/kern_malloc.c     |  136
-rw-r--r--  sys/kern/kern_mib.c        |  167
-rw-r--r--  sys/kern/kern_ntptime.c    |  269
-rw-r--r--  sys/kern/kern_opt.c        |   49
-rw-r--r--  sys/kern/kern_physio.c     |  226
-rw-r--r--  sys/kern/kern_proc.c       |  240
-rw-r--r--  sys/kern/kern_prot.c       |  407
-rw-r--r--  sys/kern/kern_random.c     |  515
-rw-r--r--  sys/kern/kern_resource.c   |  329
-rw-r--r--  sys/kern/kern_shutdown.c   |  445
-rw-r--r--  sys/kern/kern_sig.c        |  498
-rw-r--r--  sys/kern/kern_subr.c       |   57
-rw-r--r--  sys/kern/kern_synch.c      |  275
-rw-r--r--  sys/kern/kern_sysctl.c     | 1480
-rw-r--r--  sys/kern/kern_tc.c         | 1303
-rw-r--r--  sys/kern/kern_time.c       |  184
-rw-r--r--  sys/kern/kern_timeout.c    | 1303
-rw-r--r--  sys/kern/kern_xxx.c        |  230
-rw-r--r--  sys/kern/makesyscalls.sh   |  639
-rw-r--r--  sys/kern/md5c.c            |  331
-rw-r--r--  sys/kern/subr_autoconf.c   |   15
-rw-r--r--  sys/kern/subr_clist.c      |  694
-rw-r--r--  sys/kern/subr_disklabel.c  |  406
-rw-r--r--  sys/kern/subr_diskmbr.c    |  456
-rw-r--r--  sys/kern/subr_diskslice.c  | 1066
-rw-r--r--  sys/kern/subr_dkbad.c      |  159
-rw-r--r--  sys/kern/subr_log.c        |   93
-rw-r--r--  sys/kern/subr_param.c      |  186
-rw-r--r--  sys/kern/subr_prf.c        |  611
-rw-r--r--  sys/kern/subr_prof.c       |  283
-rw-r--r--  sys/kern/subr_rlist.c      |  311
-rw-r--r--  sys/kern/subr_rmap.c       |   81
-rw-r--r--  sys/kern/subr_trap.c       |  940
-rw-r--r--  sys/kern/subr_xxx.c        |  254
-rw-r--r--  sys/kern/sys_generic.c     |  347
-rw-r--r--  sys/kern/sys_pipe.c        | 1107
-rw-r--r--  sys/kern/sys_process.c     |  490
-rw-r--r--  sys/kern/sys_socket.c      |   30
-rw-r--r--  sys/kern/syscalls.c        |  247
-rw-r--r--  sys/kern/syscalls.conf     |   12
-rw-r--r--  sys/kern/syscalls.master   |  633
-rw-r--r--  sys/kern/sysv_ipc.c        |  297
-rw-r--r--  sys/kern/sysv_msg.c        | 1034
-rw-r--r--  sys/kern/sysv_sem.c        |  985
-rw-r--r--  sys/kern/sysv_shm.c        |  622
-rw-r--r--  sys/kern/tty.c             |  985
-rw-r--r--  sys/kern/tty_compat.c      |  326
-rw-r--r--  sys/kern/tty_conf.c        |  198
-rw-r--r--  sys/kern/tty_cons.c        |  353
-rw-r--r--  sys/kern/tty_pty.c         |  271
-rw-r--r--  sys/kern/tty_snoop.c       |  548
-rw-r--r--  sys/kern/tty_subr.c        |  689
-rw-r--r--  sys/kern/tty_tb.c          |    5
-rw-r--r--  sys/kern/tty_tty.c         |   61
-rw-r--r--  sys/kern/uipc_domain.c     |  179
-rw-r--r--  sys/kern/uipc_mbuf.c       |  327
-rw-r--r--  sys/kern/uipc_proto.c      |   36
-rw-r--r--  sys/kern/uipc_sockbuf.c    | 1018
-rw-r--r--  sys/kern/uipc_socket.c     |  191
-rw-r--r--  sys/kern/uipc_socket2.c    |  405
-rw-r--r--  sys/kern/uipc_syscalls.c   | 1095
-rw-r--r--  sys/kern/uipc_usrreq.c     |  234
-rw-r--r--  sys/kern/vfs_bio.c         | 2009
-rw-r--r--  sys/kern/vfs_cache.c       |   71
-rw-r--r--  sys/kern/vfs_cluster.c     |  997
-rw-r--r--  sys/kern/vfs_conf.c        |  299
-rw-r--r--  sys/kern/vfs_export.c      | 2079
-rw-r--r--  sys/kern/vfs_extattr.c     | 2756
-rw-r--r--  sys/kern/vfs_init.c        |  102
-rw-r--r--  sys/kern/vfs_lookup.c      |   65
-rw-r--r--  sys/kern/vfs_mount.c       |  153
-rw-r--r--  sys/kern/vfs_subr.c        |  809
-rw-r--r--  sys/kern/vfs_syscalls.c    |  491
-rw-r--r--  sys/kern/vfs_vnops.c       |  148
-rw-r--r--  sys/kern/vnode_if.pl       |  459
-rw-r--r--  sys/kern/vnode_if.sh       |  637
-rw-r--r--  sys/kern/vnode_if.src      |   23
97 files changed, 40170 insertions, 7231 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc
index 79cb83a..a09e484 100644
--- a/sys/kern/Make.tags.inc
+++ b/sys/kern/Make.tags.inc
@@ -1,4 +1,5 @@
-# @(#)Make.tags.inc 8.2 (Berkeley) 11/23/94
+# @(#)Make.tags.inc 8.1 (Berkeley) 6/11/93
+# $Id$
# Common files for "make tags".
# Included by the Makefile for each architecture.
@@ -9,7 +10,6 @@
COMM= /sys/conf/*.[ch] \
/sys/dev/*.[ch] /sys/dev/scsi/*.[ch] \
- /sys/isofs/*/*.[ch] \
/sys/kern/*.[ch] /sys/libkern/*.[ch] \
/sys/miscfs/*/*.[ch] \
/sys/net/*.[ch] /sys/netccitt/*.[ch] /sys/netinet/*.[ch] \
diff --git a/sys/kern/Makefile b/sys/kern/Makefile
index 3159d20..f42a44e 100644
--- a/sys/kern/Makefile
+++ b/sys/kern/Makefile
@@ -1,17 +1,20 @@
-# @(#)Makefile 8.3 (Berkeley) 2/14/95
+# @(#)Makefile 8.2 (Berkeley) 3/21/94
# Makefile for kernel tags files, init_sysent, etc.
-ARCH= hp300 i386 luna68k news3400 pmax sparc tahoe vax
+ARCH= i386 # luna68k news3400 pmax sparc tahoe vax
all:
@echo "make tags, make links or init_sysent.c only"
-init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscallargs.h: makesyscalls.sh syscalls.master
+init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscall-hide.h \
+../sys/sysproto.h: makesyscalls.sh syscalls.master
-mv -f init_sysent.c init_sysent.c.bak
-mv -f syscalls.c syscalls.c.bak
-mv -f ../sys/syscall.h ../sys/syscall.h.bak
- sh makesyscalls.sh syscalls.conf syscalls.master
+ -mv -f ../sys/syscall-hide.h ../sys/syscall-hide.h.bak
+ -mv -f ../sys/sysproto.h ../sys/sysproto.h.bak
+ sh makesyscalls.sh syscalls.master
# Kernel tags:
# Tags files are built in the top-level directory for each architecture,
diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c
new file mode 100644
index 0000000..4adbd05
--- /dev/null
+++ b/sys/kern/imgact_aout.c
@@ -0,0 +1,210 @@
+/*
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "opt_rlimit.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/resourcevar.h>
+#include <sys/exec.h>
+#include <sys/mman.h>
+#include <sys/imgact.h>
+#include <sys/imgact_aout.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/sysent.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+
+static int exec_aout_imgact __P((struct image_params *imgp));
+
+static int
+exec_aout_imgact(imgp)
+ struct image_params *imgp;
+{
+ const struct exec *a_out = (const struct exec *) imgp->image_header;
+ struct vmspace *vmspace = imgp->proc->p_vmspace;
+ vm_offset_t vmaddr;
+ unsigned long virtual_offset;
+ unsigned long file_offset;
+ unsigned long bss_size;
+ int error;
+
+ /*
+ * Linux and *BSD binaries look very much alike,
+ * only the machine id is different:
+ * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI.
+ * NetBSD is in network byte order.. ugh.
+ */
+ if (((a_out->a_magic >> 16) & 0xff) != 0x86 &&
+ ((a_out->a_magic >> 16) & 0xff) != 0 &&
+ ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86)
+ return -1;
+
+ /*
+ * Set file/virtual offset based on a.out variant.
+ * We do two cases: host byte order and network byte order
+ * (for NetBSD compatibility)
+ */
+ switch ((int)(a_out->a_magic & 0xffff)) {
+ case ZMAGIC:
+ virtual_offset = 0;
+ if (a_out->a_text) {
+ file_offset = PAGE_SIZE;
+ } else {
+ /* Bill's "screwball mode" */
+ file_offset = 0;
+ }
+ break;
+ case QMAGIC:
+ virtual_offset = PAGE_SIZE;
+ file_offset = 0;
+ break;
+ default:
+ /* NetBSD compatibility */
+ switch ((int)(ntohl(a_out->a_magic) & 0xffff)) {
+ case ZMAGIC:
+ case QMAGIC:
+ virtual_offset = PAGE_SIZE;
+ file_offset = 0;
+ break;
+ default:
+ return (-1);
+ }
+ }
+
+ bss_size = roundup(a_out->a_bss, PAGE_SIZE);
+
+ /*
+ * Check various fields in header for validity/bounds.
+ */
+ if (/* entry point must lie within the text region */
+ a_out->a_entry < virtual_offset ||
+ a_out->a_entry >= virtual_offset + a_out->a_text ||
+
+ /* text and data size must each be page rounded */
+ a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK)
+ return (-1);
+
+ /* text + data can't exceed file size */
+ if (a_out->a_data + a_out->a_text > imgp->attr->va_size)
+ return (EFAULT);
+
+ /*
+ * text/data/bss must not exceed limits
+ */
+ if (/* text can't exceed maximum text size */
+ a_out->a_text > MAXTSIZ ||
+
+ /* data + bss can't exceed maximum data size */
+ a_out->a_data + bss_size > MAXDSIZ ||
+
+ /* data + bss can't exceed rlimit */
+ a_out->a_data + bss_size >
+ imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur)
+ return (ENOMEM);
+
+ /* copy in arguments and/or environment from old process */
+ error = exec_extract_strings(imgp);
+ if (error)
+ return (error);
+
+ /*
+ * Destroy old process VM and create a new one (with a new stack)
+ */
+ exec_new_vmspace(imgp);
+
+ /*
+ * Map text/data read/execute
+ */
+ vmaddr = virtual_offset;
+ error =
+ vm_mmap(&vmspace->vm_map, /* map */
+ &vmaddr, /* address */
+ a_out->a_text + a_out->a_data, /* size */
+ VM_PROT_READ | VM_PROT_EXECUTE, /* protection */
+ VM_PROT_ALL, /* max protection */
+ MAP_PRIVATE | MAP_FIXED, /* flags */
+ (caddr_t)imgp->vp, /* vnode */
+ file_offset); /* offset */
+ if (error)
+ return (error);
+
+ /*
+ * allow writing of data
+ */
+ vm_map_protect(&vmspace->vm_map,
+ vmaddr + a_out->a_text,
+ vmaddr + a_out->a_text + a_out->a_data,
+ VM_PROT_ALL,
+ FALSE);
+
+ if (bss_size != 0) {
+ /*
+ * Allocate demand-zeroed area for uninitialized data
+ * "bss" = 'block started by symbol' - named after the IBM 7090
+ * instruction of the same name.
+ */
+ vmaddr = virtual_offset + a_out->a_text + a_out->a_data;
+ error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error)
+ return (error);
+ }
+
+ /* Fill in process VM information */
+ vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT;
+ vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t) virtual_offset;
+ vmspace->vm_daddr = (caddr_t) virtual_offset + a_out->a_text;
+
+ /* Fill in image_params */
+ imgp->interpreted = 0;
+ imgp->entry_addr = a_out->a_entry;
+
+ imgp->proc->p_sysent = &aout_sysvec;
+
+ /* Indicate that this file should not be modified */
+ imgp->vp->v_flag |= VTEXT;
+
+ return (0);
+}
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ * Since `const' objects end up in the text segment, TEXT_SET is the
+ * correct directive to use.
+ */
+static const struct execsw aout_execsw = { exec_aout_imgact, "a.out" };
+TEXT_SET(execsw_set, aout_execsw);
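
For comparison outside the kernel, a minimal user-space sketch of the same
flavor test follows. The ZMAGIC/QMAGIC octal values follow the classic
a.out.h definitions; the sample magic word in main() is hypothetical:

#include <stdio.h>
#include <arpa/inet.h>		/* ntohl() */

#define ZMAGIC	0413		/* demand-paged */
#define QMAGIC	0314		/* compact demand-paged */

static const char *
aout_flavor(unsigned long a_magic)
{
	unsigned mid = (a_magic >> 16) & 0xff;	/* machine-id byte */

	if (mid != 0x86 && mid != 0 &&
	    ((ntohl((unsigned)a_magic) >> 16) & 0xff) != 0x86)
		return "not *BSD/BSDI (e.g. Linux uses 0x64)";

	switch (a_magic & 0xffff) {		/* host byte order first */
	case ZMAGIC:
		return "ZMAGIC, host byte order";
	case QMAGIC:
		return "QMAGIC, host byte order";
	default:				/* NetBSD compatibility */
		switch (ntohl((unsigned)a_magic) & 0xffff) {
		case ZMAGIC:
		case QMAGIC:
			return "NetBSD, network byte order";
		default:
			return "unknown a.out variant";
		}
	}
}

int
main(void)
{
	printf("%s\n", aout_flavor(0x0086010bUL));	/* hypothetical ZMAGIC word */
	return 0;
}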
diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c
new file mode 100644
index 0000000..525d76d
--- /dev/null
+++ b/sys/kern/imgact_elf.c
@@ -0,0 +1,749 @@
+/*-
+ * Copyright (c) 1995-1996 Søren Schmidt
+ * Copyright (c) 1996 Peter Wemm
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer
+ * in this position and unchanged.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Id: imgact_elf.c,v 1.16 1997/02/22 09:38:56 peter Exp $
+ */
+
+#include "opt_rlimit.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/resourcevar.h>
+#include <sys/exec.h>
+#include <sys/mman.h>
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+#include <sys/kernel.h>
+#include <sys/sysent.h>
+#include <sys/fcntl.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
+#include <sys/sysproto.h>
+#include <sys/syscall.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_param.h>
+#include <vm/pmap.h>
+#include <sys/lock.h>
+#include <vm/vm_map.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_extern.h>
+
+#include <machine/md_var.h>
+#include <i386/linux/linux_syscall.h>
+#include <i386/linux/linux.h>
+
+#define MAX_PHDR 32 /* XXX enough ? */
+
+static int map_pages __P((struct vnode *vp, vm_offset_t offset, vm_offset_t *buf, vm_size_t size));
+static void unmap_pages __P((vm_offset_t buf, vm_size_t size));
+static int elf_check_permissions __P((struct proc *p, struct vnode *vp));
+static int elf_check_header __P((const Elf32_Ehdr *hdr, int type));
+static int elf_load_section __P((struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot));
+static int elf_load_file __P((struct proc *p, char *file, u_long *addr, u_long *entry));
+static int elf_freebsd_fixup __P((int **stack_base, struct image_params *imgp));
+int exec_elf_imgact __P((struct image_params *imgp));
+
+int elf_trace = 0;
+SYSCTL_INT(_debug, 1, elf_trace, CTLFLAG_RW, &elf_trace, 0, "");
+#define UPRINTF if (elf_trace) uprintf
+
+static struct sysentvec elf_freebsd_sysvec = {
+ SYS_MAXSYSCALL,
+ sysent,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ elf_freebsd_fixup,
+ sendsig,
+ sigcode,
+ &szsigcode,
+ 0,
+ "FreeBSD ELF"
+};
+
+static Elf32_Brandinfo freebsd_brand_info = {
+ "FreeBSD",
+ "",
+ "/usr/libexec/ld-elf.so.1",
+ &elf_freebsd_sysvec
+ };
+static Elf32_Brandinfo *elf_brand_list[MAX_BRANDS] = {
+ &freebsd_brand_info,
+ NULL, NULL, NULL,
+ NULL, NULL, NULL, NULL
+ };
+
+int
+elf_insert_brand_entry(Elf32_Brandinfo *entry)
+{
+ int i;
+
+ for (i=1; i<MAX_BRANDS; i++) {
+ if (elf_brand_list[i] == NULL) {
+ elf_brand_list[i] = entry;
+ break;
+ }
+ }
+ if (i == MAX_BRANDS)
+ return -1;
+ return 0;
+}
+
+int
+elf_remove_brand_entry(Elf32_Brandinfo *entry)
+{
+ int i;
+
+ for (i=1; i<MAX_BRANDS; i++) {
+ if (elf_brand_list[i] == entry) {
+ elf_brand_list[i] = NULL;
+ break;
+ }
+ }
+ if (i == MAX_BRANDS)
+ return -1;
+ return 0;
+}
+
+static int
+map_pages(struct vnode *vp, vm_offset_t offset,
+ vm_offset_t *buf, vm_size_t size)
+{
+ int error;
+ vm_offset_t kern_buf;
+ vm_size_t pageoff;
+
+ /*
+ * The request may not be aligned, and may even cross several
+ * page boundaries in the file...
+ */
+ pageoff = (offset & PAGE_MASK);
+ offset -= pageoff; /* start of first aligned page to map */
+ size += pageoff;
+ size = round_page(size); /* size of aligned pages to map */
+
+ if (error = vm_mmap(kernel_map,
+ &kern_buf,
+ size,
+ VM_PROT_READ,
+ VM_PROT_READ,
+ 0,
+ (caddr_t)vp,
+ offset))
+ return error;
+
+ *buf = kern_buf + pageoff;
+
+ return 0;
+}
+
+static void
+unmap_pages(vm_offset_t buf, vm_size_t size)
+{
+ vm_size_t pageoff;
+
+ pageoff = (buf & PAGE_MASK);
+ buf -= pageoff; /* start of first aligned page to map */
+ size += pageoff;
+ size = round_page(size);/* size of aligned pages to map */
+
+ vm_map_remove(kernel_map, buf, buf + size);
+}
+
+static int
+elf_check_permissions(struct proc *p, struct vnode *vp)
+{
+ struct vattr attr;
+ int error;
+
+ /*
+ * Check number of open-for-writes on the file and deny execution
+ * if there are any.
+ */
+ if (vp->v_writecount) {
+ return (ETXTBSY);
+ }
+
+ /* Get file attributes */
+ error = VOP_GETATTR(vp, &attr, p->p_ucred, p);
+ if (error)
+ return (error);
+
+ /*
+ * 1) Check if file execution is disabled for the filesystem that this
+ * file resides on.
+ * 2) Insure that at least one execute bit is on - otherwise root
+ * will always succeed, and we don't want that to happen unless the
+ * file really is executable.
+ * 3) Insure that the file is a regular file.
+ */
+ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
+ ((attr.va_mode & 0111) == 0) ||
+ (attr.va_type != VREG)) {
+ return (EACCES);
+ }
+
+ /*
+ * Zero length files can't be exec'd
+ */
+ if (attr.va_size == 0)
+ return (ENOEXEC);
+
+ /*
+ * Check for execute permission to file based on current credentials.
+ * Then call filesystem specific open routine (which does nothing
+ * in the general case).
+ */
+ error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
+ if (error)
+ return (error);
+
+ error = VOP_OPEN(vp, FREAD, p->p_ucred, p);
+ if (error)
+ return (error);
+
+ return (0);
+}
+
+static int
+elf_check_header(const Elf32_Ehdr *hdr, int type)
+{
+ if (!(hdr->e_ident[EI_MAG0] == ELFMAG0 &&
+ hdr->e_ident[EI_MAG1] == ELFMAG1 &&
+ hdr->e_ident[EI_MAG2] == ELFMAG2 &&
+ hdr->e_ident[EI_MAG3] == ELFMAG3))
+ return ENOEXEC;
+
+ if (hdr->e_machine != EM_386 && hdr->e_machine != EM_486)
+ return ENOEXEC;
+
+ if (hdr->e_type != type)
+ return ENOEXEC;
+
+ return 0;
+}
+
+static int
+elf_load_section(struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)
+{
+ size_t map_len;
+ vm_offset_t map_addr;
+ int error;
+ unsigned char *data_buf = 0;
+ size_t copy_len;
+
+ map_addr = trunc_page(vmaddr);
+
+ if (memsz > filsz)
+ map_len = trunc_page(offset+filsz) - trunc_page(offset);
+ else
+ map_len = round_page(offset+filsz) - trunc_page(offset);
+
+ if (error = vm_mmap (&vmspace->vm_map,
+ &map_addr,
+ map_len,
+ prot,
+ VM_PROT_ALL,
+ MAP_PRIVATE | MAP_FIXED,
+ (caddr_t)vp,
+ trunc_page(offset)))
+ return error;
+
+ if (memsz == filsz)
+ return 0;
+
+ /*
+ * We have to map the remaining bit of the file into the kernel's
+ * memory map, allocate some anonymous memory, and copy that last
+ * bit into it. The remaining space should be .bss...
+ */
+ copy_len = (offset + filsz) - trunc_page(offset + filsz);
+ map_addr = trunc_page(vmaddr + filsz);
+ map_len = round_page(vmaddr + memsz) - map_addr;
+
+ if (map_len != 0) {
+ if (error = vm_map_find(&vmspace->vm_map, NULL, 0,
+ &map_addr, map_len, FALSE,
+ VM_PROT_ALL, VM_PROT_ALL,0))
+ return error;
+ }
+
+ if (error = vm_mmap(kernel_map,
+ (vm_offset_t *)&data_buf,
+ PAGE_SIZE,
+ VM_PROT_READ,
+ VM_PROT_READ,
+ 0,
+ (caddr_t)vp,
+ trunc_page(offset + filsz)))
+ return error;
+
+ error = copyout(data_buf, (caddr_t)map_addr, copy_len);
+
+ vm_map_remove(kernel_map, (vm_offset_t)data_buf,
+ (vm_offset_t)data_buf + PAGE_SIZE);
+
+ /*
+ * set it to the specified protection
+ */
+ vm_map_protect(&vmspace->vm_map, map_addr, map_addr + map_len, prot,
+ FALSE);
+
+ UPRINTF("bss size %d (%x)\n", map_len-copy_len, map_len-copy_len);
+ return error;
+}
+
+static int
+elf_load_file(struct proc *p, char *file, u_long *addr, u_long *entry)
+{
+ Elf32_Ehdr *hdr = NULL;
+ Elf32_Phdr *phdr = NULL;
+ struct nameidata nd;
+ struct vmspace *vmspace = p->p_vmspace;
+ vm_prot_t prot = 0;
+ unsigned long text_size = 0, data_size = 0;
+ unsigned long text_addr = 0, data_addr = 0;
+ int header_size = 0;
+ int error, i;
+
+ NDINIT(&nd, LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, p);
+
+ if (error = namei(&nd))
+ goto fail;
+
+ if (nd.ni_vp == NULL) {
+ error = ENOEXEC;
+ goto fail;
+ }
+
+ /*
+ * Check permissions, modes, uid, etc on the file, and "open" it.
+ */
+ error = elf_check_permissions(p, nd.ni_vp);
+
+ /*
+ * No longer need this, and it prevents demand paging.
+ */
+ VOP_UNLOCK(nd.ni_vp, 0, p);
+
+ if (error)
+ goto fail;
+
+ /*
+ * Map in the header
+ */
+ if (error = map_pages(nd.ni_vp, 0, (vm_offset_t *)&hdr, sizeof(hdr)))
+ goto fail;
+
+ /*
+ * Do we have a valid ELF header ?
+ */
+ if (error = elf_check_header(hdr, ET_DYN))
+ goto fail;
+
+ /*
+ * ouch, need to bounds check in case user gives us a corrupted
+ * file with an insane header size
+ */
+ if (hdr->e_phnum > MAX_PHDR) { /* XXX: ever more than this? */
+ error = ENOEXEC;
+ goto fail;
+ }
+
+ header_size = hdr->e_phentsize * hdr->e_phnum;
+
+ if (error = map_pages(nd.ni_vp, hdr->e_phoff, (vm_offset_t *)&phdr,
+ header_size))
+ goto fail;
+
+ for (i = 0; i < hdr->e_phnum; i++) {
+ switch(phdr[i].p_type) {
+
+ case PT_NULL: /* NULL section */
+ UPRINTF ("ELF(file) PT_NULL section\n");
+ break;
+ case PT_LOAD: /* Loadable segment */
+ {
+ UPRINTF ("ELF(file) PT_LOAD section ");
+ if (phdr[i].p_flags & PF_X)
+ prot |= VM_PROT_EXECUTE;
+ if (phdr[i].p_flags & PF_W)
+ prot |= VM_PROT_WRITE;
+ if (phdr[i].p_flags & PF_R)
+ prot |= VM_PROT_READ;
+
+ if (error = elf_load_section(vmspace, nd.ni_vp,
+ phdr[i].p_offset,
+ (caddr_t)phdr[i].p_vaddr +
+ (*addr),
+ phdr[i].p_memsz,
+ phdr[i].p_filesz, prot))
+ goto fail;
+
+ /*
+ * Is this .text or .data ??
+ *
+ * We only handle one each of those yet XXX
+ */
+ if (hdr->e_entry >= phdr[i].p_vaddr &&
+ hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) {
+ text_addr = trunc_page(phdr[i].p_vaddr+(*addr));
+ text_size = round_page(phdr[i].p_memsz +
+ phdr[i].p_vaddr -
+ trunc_page(phdr[i].p_vaddr));
+ *entry=(unsigned long)hdr->e_entry+(*addr);
+ UPRINTF(".text <%08x,%08x> entry=%08x\n",
+ text_addr, text_size, *entry);
+ } else {
+ data_addr = trunc_page(phdr[i].p_vaddr+(*addr));
+ data_size = round_page(phdr[i].p_memsz +
+ phdr[i].p_vaddr -
+ trunc_page(phdr[i].p_vaddr));
+ UPRINTF(".data <%08x,%08x>\n",
+ data_addr, data_size);
+ }
+ }
+ break;
+
+ case PT_DYNAMIC:/* Dynamic link information */
+ UPRINTF ("ELF(file) PT_DYNAMIC section\n");
+ break;
+ case PT_INTERP: /* Path to interpreter */
+ UPRINTF ("ELF(file) PT_INTERP section\n");
+ break;
+ case PT_NOTE: /* Note section */
+ UPRINTF ("ELF(file) PT_NOTE section\n");
+ break;
+ case PT_SHLIB: /* Shared lib section */
+ UPRINTF ("ELF(file) PT_SHLIB section\n");
+ break;
+ case PT_PHDR: /* Program header table info */
+ UPRINTF ("ELF(file) PT_PHDR section\n");
+ break;
+ default:
+ UPRINTF ("ELF(file) %d section ??\n", phdr[i].p_type );
+ }
+ }
+
+fail:
+ if (phdr)
+ unmap_pages((vm_offset_t)phdr, header_size);
+ if (hdr)
+ unmap_pages((vm_offset_t)hdr, sizeof(hdr));
+
+ return error;
+}
+
+int
+exec_elf_imgact(struct image_params *imgp)
+{
+ const Elf32_Ehdr *hdr = (const Elf32_Ehdr *) imgp->image_header;
+ const Elf32_Phdr *phdr, *mapped_phdr = NULL;
+ Elf32_Auxargs *elf_auxargs = NULL;
+ struct vmspace *vmspace = imgp->proc->p_vmspace;
+ vm_prot_t prot = 0;
+ u_long text_size = 0, data_size = 0;
+ u_long text_addr = 0, data_addr = 0;
+ u_long addr, entry = 0, proghdr = 0;
+ int error, i, header_size = 0, interp_len = 0;
+ char *interp = NULL;
+ char *brand = NULL;
+ char path[MAXPATHLEN];
+
+ /*
+ * Do we have a valid ELF header ?
+ */
+ if (elf_check_header(hdr, ET_EXEC))
+ return -1;
+
+ /*
+ * From here on down, we return an errno, not -1, as we've
+ * detected an ELF file.
+ */
+
+ /*
+ * ouch, need to bounds check in case user gives us a corrupted
+ * file with an insane header size
+ */
+ if (hdr->e_phnum > MAX_PHDR) { /* XXX: ever more than this? */
+ return ENOEXEC;
+ }
+
+ header_size = hdr->e_phentsize * hdr->e_phnum;
+
+ if ((hdr->e_phoff > PAGE_SIZE) ||
+ (hdr->e_phoff + header_size) > PAGE_SIZE) {
+ /*
+ * Ouch ! we only get one page full of header...
+ * Try to map it in ourselves, and see how we go.
+ */
+ if (error = map_pages(imgp->vp, hdr->e_phoff,
+ (vm_offset_t *)&mapped_phdr, header_size))
+ return (error);
+ /*
+ * Save manual mapping for cleanup
+ */
+ phdr = mapped_phdr;
+ } else {
+ phdr = (const Elf32_Phdr*)
+ ((const char *)imgp->image_header + hdr->e_phoff);
+ }
+
+ /*
+ * From this point on, we may have resources that need to be freed.
+ */
+ if (error = exec_extract_strings(imgp))
+ goto fail;
+
+ exec_new_vmspace(imgp);
+
+ for (i = 0; i < hdr->e_phnum; i++) {
+ switch(phdr[i].p_type) {
+
+ case PT_NULL: /* NULL section */
+ UPRINTF ("ELF PT_NULL section\n");
+ break;
+ case PT_LOAD: /* Loadable segment */
+ {
+ UPRINTF ("ELF PT_LOAD section ");
+ if (phdr[i].p_flags & PF_X)
+ prot |= VM_PROT_EXECUTE;
+ if (phdr[i].p_flags & PF_W)
+ prot |= VM_PROT_WRITE;
+ if (phdr[i].p_flags & PF_R)
+ prot |= VM_PROT_READ;
+
+ if (error = elf_load_section(vmspace, imgp->vp,
+ phdr[i].p_offset,
+ (caddr_t)phdr[i].p_vaddr,
+ phdr[i].p_memsz,
+ phdr[i].p_filesz, prot))
+ goto fail;
+
+ /*
+ * Is this .text or .data ??
+ *
+ * We only handle one each of those yet XXX
+ */
+ if (hdr->e_entry >= phdr[i].p_vaddr &&
+ hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) {
+ text_addr = trunc_page(phdr[i].p_vaddr);
+ text_size = round_page(phdr[i].p_memsz +
+ phdr[i].p_vaddr -
+ text_addr);
+ entry = (u_long)hdr->e_entry;
+ UPRINTF(".text <%08x,%08x> entry=%08x\n",
+ text_addr, text_size, entry);
+ } else {
+ data_addr = trunc_page(phdr[i].p_vaddr);
+ data_size = round_page(phdr[i].p_memsz +
+ phdr[i].p_vaddr -
+ data_addr);
+ UPRINTF(".data <%08x,%08x>\n",
+ data_addr, data_size);
+ }
+ }
+ break;
+
+ case PT_DYNAMIC:/* Dynamic link information */
+ UPRINTF ("ELF PT_DYNAMIC section ??\n");
+ break;
+ case PT_INTERP: /* Path to interpreter */
+ UPRINTF ("ELF PT_INTERP section ");
+ if (phdr[i].p_filesz > MAXPATHLEN) {
+ error = ENOEXEC;
+ goto fail;
+ }
+ interp_len = MAXPATHLEN;
+ if (error = map_pages(imgp->vp, phdr[i].p_offset,
+ (vm_offset_t *)&interp, interp_len))
+ goto fail;
+ UPRINTF("<%s>\n", interp);
+ break;
+ case PT_NOTE: /* Note section */
+ UPRINTF ("ELF PT_NOTE section\n");
+ break;
+ case PT_SHLIB: /* Shared lib section */
+ UPRINTF ("ELF PT_SHLIB section\n");
+ break;
+ case PT_PHDR: /* Program header table info */
+ UPRINTF ("ELF PT_PHDR section <%x>\n", phdr[i].p_vaddr);
+ proghdr = phdr[i].p_vaddr;
+ break;
+ default:
+ UPRINTF ("ELF %d section ??\n", phdr[i].p_type);
+ }
+ }
+
+ vmspace->vm_tsize = text_size >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t)text_addr;
+ vmspace->vm_dsize = data_size >> PAGE_SHIFT;
+ vmspace->vm_daddr = (caddr_t)data_addr;
+
+ addr = 2*MAXDSIZ; /* May depend on OS type XXX */
+
+ imgp->entry_addr = entry;
+
+ /*
+ * So which kind (brand) of ELF binary do we have at hand
+ * FreeBSD, Linux, SVR4 or something else ??
+ * If it has an interpreter section, try that first
+ */
+ if (interp) {
+ for (i=0; i<MAX_BRANDS; i++) {
+ if (elf_brand_list[i] != NULL) {
+ if (!strcmp(interp, elf_brand_list[i]->interp_path)) {
+ imgp->proc->p_sysent =
+ elf_brand_list[i]->sysvec;
+ strcpy(path, elf_brand_list[i]->emul_path);
+ strcat(path, elf_brand_list[i]->interp_path);
+ UPRINTF("interpreter=<%s> %s\n",
+ elf_brand_list[i]->interp_path,
+ elf_brand_list[i]->emul_path);
+ break;
+ }
+ }
+ }
+ }
+
+ /*
+ * If there is no interpreter, or recognition of it
+ * failed, see if the binary is branded.
+ */
+ if (!interp || i == MAX_BRANDS) {
+ brand = (char *)&(hdr->e_ident[EI_BRAND]);
+ for (i=0; i<MAX_BRANDS; i++) {
+ if (elf_brand_list[i] != NULL) {
+ if (!strcmp(brand, elf_brand_list[i]->brand)) {
+ imgp->proc->p_sysent = elf_brand_list[i]->sysvec;
+ if (interp) {
+ strcpy(path, elf_brand_list[i]->emul_path);
+ strcat(path, elf_brand_list[i]->interp_path);
+ UPRINTF("interpreter=<%s> %s\n",
+ elf_brand_list[i]->interp_path,
+ elf_brand_list[i]->emul_path);
+ }
+ break;
+ }
+ }
+ }
+ }
+ if (i == MAX_BRANDS) {
+ uprintf("ELF binary type not known\n");
+ error = ENOEXEC;
+ goto fail;
+ }
+ if (interp) {
+ if (error = elf_load_file(imgp->proc,
+ path,
+ &addr, /* XXX */
+ &imgp->entry_addr)) {
+ uprintf("ELF interpreter %s not found\n", path);
+ goto fail;
+ }
+ }
+
+ UPRINTF("Executing %s binary\n", elf_brand_list[i]->brand);
+
+ /*
+ * Construct auxargs table (used by the fixup routine)
+ */
+ elf_auxargs = malloc(sizeof(Elf32_Auxargs), M_TEMP, M_WAITOK);
+ elf_auxargs->execfd = -1;
+ elf_auxargs->phdr = proghdr;
+ elf_auxargs->phent = hdr->e_phentsize;
+ elf_auxargs->phnum = hdr->e_phnum;
+ elf_auxargs->pagesz = PAGE_SIZE;
+ elf_auxargs->base = addr;
+ elf_auxargs->flags = 0;
+ elf_auxargs->entry = entry;
+ elf_auxargs->trace = elf_trace;
+
+ imgp->auxargs = elf_auxargs;
+ imgp->interpreted = 0;
+
+ /* don't allow modifying the file while we run it */
+ imgp->vp->v_flag |= VTEXT;
+
+fail:
+ if (mapped_phdr)
+ unmap_pages((vm_offset_t)mapped_phdr, header_size);
+ if (interp)
+ unmap_pages((vm_offset_t)interp, interp_len);
+
+ return error;
+}
+
+static int
+elf_freebsd_fixup(int **stack_base, struct image_params *imgp)
+{
+ Elf32_Auxargs *args = (Elf32_Auxargs *)imgp->auxargs;
+ int *pos;
+
+ pos = *stack_base + (imgp->argc + imgp->envc + 2);
+
+ if (args->trace) {
+ AUXARGS_ENTRY(pos, AT_DEBUG, 1);
+ }
+ if (args->execfd != -1) {
+ AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd);
+ }
+ AUXARGS_ENTRY(pos, AT_PHDR, args->phdr);
+ AUXARGS_ENTRY(pos, AT_PHENT, args->phent);
+ AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum);
+ AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz);
+ AUXARGS_ENTRY(pos, AT_FLAGS, args->flags);
+ AUXARGS_ENTRY(pos, AT_ENTRY, args->entry);
+ AUXARGS_ENTRY(pos, AT_BASE, args->base);
+ AUXARGS_ENTRY(pos, AT_NULL, 0);
+
+ free(imgp->auxargs, M_TEMP);
+ imgp->auxargs = NULL;
+
+ (*stack_base)--;
+ **stack_base = (int)imgp->argc;
+ return 0;
+}
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ * Since `const' objects end up in the text segment, TEXT_SET is the
+ * correct directive to use.
+ */
+const struct execsw elf_execsw = {exec_elf_imgact, "ELF"};
+TEXT_SET(execsw_set, elf_execsw);
+
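elf_freebsd_fixup() above lays the auxiliary vector out as (type, value)
integer pairs just past argc, argv and envp, terminated by AT_NULL. A
user-space sketch of that layout; the AT_* numbers and the addresses here
are illustrative stand-ins, not values taken from this commit:

#include <stdio.h>

/* Illustrative SVR4-style auxv type numbers; the kernel's own values
 * come from its ELF headers. */
#define AT_NULL		0
#define AT_PHDR		3
#define AT_PHENT	4
#define AT_PHNUM	5
#define AT_PAGESZ	6
#define AT_BASE		7
#define AT_ENTRY	9

/* Same shape as the kernel macro, but storing directly rather than
 * through suword(). */
#define AUXARGS_ENTRY(pos, id, val) \
	do { *(pos)++ = (id); *(pos)++ = (val); } while (0)

int
main(void)
{
	int stack[32], *pos = stack;

	/* Same ordering as elf_freebsd_fixup(); addresses are made up. */
	AUXARGS_ENTRY(pos, AT_PHDR, 0x08048034);
	AUXARGS_ENTRY(pos, AT_PHENT, 32);
	AUXARGS_ENTRY(pos, AT_PHNUM, 7);
	AUXARGS_ENTRY(pos, AT_PAGESZ, 4096);
	AUXARGS_ENTRY(pos, AT_ENTRY, 0x080480a0);
	AUXARGS_ENTRY(pos, AT_BASE, 0x20000000);
	AUXARGS_ENTRY(pos, AT_NULL, 0);

	for (int *p = stack; p < pos; p += 2)
		printf("type %2d -> %#x\n", p[0], (unsigned)p[1]);
	return 0;
}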
diff --git a/sys/kern/imgact_gzip.c b/sys/kern/imgact_gzip.c
new file mode 100644
index 0000000..9a3237f
--- /dev/null
+++ b/sys/kern/imgact_gzip.c
@@ -0,0 +1,378 @@
+/*
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@login.dkuug.dk> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * $Id$
+ *
+ * This module handles execution of a.out files which have been run through
+ * "gzip". This saves diskspace, but wastes cpu-cycles and VM.
+ *
+ * TODO:
+ * text-segments should be made R/O after being filled
+ * is the vm-stuff safe ?
+ * should handle the entire header of gzip'ed stuff.
+ * inflate isn't quite reentrant yet...
+ * error-handling is a mess...
+ * so is the rest...
+ * tidy up unnecessary includes
+ */
+
+#include "opt_rlimit.h"
+
+#include <sys/param.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_aout.h>
+#include <sys/kernel.h>
+#include <sys/mman.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/sysent.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+#include <sys/inflate.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+struct imgact_gzip {
+ struct image_params *ip;
+ struct exec a_out;
+ int error;
+ int where;
+ u_char *inbuf;
+ u_long offset;
+ u_long output;
+ u_long len;
+ int idx;
+ u_long virtual_offset, file_offset, file_end, bss_size;
+};
+
+static int exec_gzip_imgact __P((struct image_params *imgp));
+static int NextByte __P((void *vp));
+static int do_aout_hdr __P((struct imgact_gzip *));
+static int Flush __P((void *vp, u_char *, u_long siz));
+
+static int
+exec_gzip_imgact(imgp)
+ struct image_params *imgp;
+{
+ int error, error2 = 0;
+ const u_char *p = (const u_char *) imgp->image_header;
+ struct imgact_gzip igz;
+ struct inflate infl;
+ struct vmspace *vmspace;
+
+ /* If these four are not OK, it isn't a gzip file */
+ if (p[0] != 0x1f)
+ return -1; /* 0 Simply magic */
+ if (p[1] != 0x8b)
+ return -1; /* 1 Simply magic */
+ if (p[2] != 0x08)
+ return -1; /* 2 Compression method */
+ if (p[9] != 0x03)
+ return -1; /* 9 OS compressed on */
+
+ /*
+ * If this one contains anything but a comment or a filename marker,
+ * we don't want to chew on it
+ */
+ if (p[3] & ~(0x18))
+ return ENOEXEC; /* 3 Flags */
+
+ /* These are of no use to us */
+ /* 4-7 Timestamp */
+ /* 8 Extra flags */
+
+ bzero(&igz, sizeof igz);
+ bzero(&infl, sizeof infl);
+ infl.gz_private = (void *) &igz;
+ infl.gz_input = NextByte;
+ infl.gz_output = Flush;
+
+ igz.ip = imgp;
+ igz.idx = 10;
+
+ if (p[3] & 0x08) { /* skip a filename */
+ while (p[igz.idx++])
+ if (igz.idx >= PAGE_SIZE)
+ return ENOEXEC;
+ }
+ if (p[3] & 0x10) { /* skip a comment */
+ while (p[igz.idx++])
+ if (igz.idx >= PAGE_SIZE)
+ return ENOEXEC;
+ }
+ igz.len = imgp->attr->va_size;
+
+ error = inflate(&infl);
+
+ if ( !error ) {
+ vmspace = imgp->proc->p_vmspace;
+ error = vm_map_protect(&vmspace->vm_map,
+ (vm_offset_t) vmspace->vm_taddr,
+ (vm_offset_t) (vmspace->vm_taddr +
+ (vmspace->vm_tsize << PAGE_SHIFT)) ,
+ VM_PROT_READ|VM_PROT_EXECUTE,0);
+ }
+
+ if (igz.inbuf) {
+ error2 =
+ vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf,
+ (vm_offset_t) igz.inbuf + PAGE_SIZE);
+ }
+ if (igz.error || error || error2) {
+ printf("Output=%lu ", igz.output);
+ printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n",
+ error, igz.error, error2, igz.where);
+ }
+ if (igz.error)
+ return igz.error;
+ if (error)
+ return ENOEXEC;
+ if (error2)
+ return error2;
+ return 0;
+}
+
+static int
+do_aout_hdr(struct imgact_gzip * gz)
+{
+ int error;
+ struct vmspace *vmspace = gz->ip->proc->p_vmspace;
+ vm_offset_t vmaddr;
+
+ /*
+ * Set file/virtual offset based on a.out variant. We do two cases:
+ * host byte order and network byte order (for NetBSD compatibility)
+ */
+ switch ((int) (gz->a_out.a_magic & 0xffff)) {
+ case ZMAGIC:
+ gz->virtual_offset = 0;
+ if (gz->a_out.a_text) {
+ gz->file_offset = PAGE_SIZE;
+ } else {
+ /* Bill's "screwball mode" */
+ gz->file_offset = 0;
+ }
+ break;
+ case QMAGIC:
+ gz->virtual_offset = PAGE_SIZE;
+ gz->file_offset = 0;
+ break;
+ default:
+ /* NetBSD compatibility */
+ switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) {
+ case ZMAGIC:
+ case QMAGIC:
+ gz->virtual_offset = PAGE_SIZE;
+ gz->file_offset = 0;
+ break;
+ default:
+ gz->where = __LINE__;
+ return (-1);
+ }
+ }
+
+ gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE);
+
+ /*
+ * Check various fields in header for validity/bounds.
+ */
+ if ( /* entry point must lie within the text region */
+ gz->a_out.a_entry < gz->virtual_offset ||
+ gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text ||
+
+ /* text and data size must each be page rounded */
+ gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) {
+ gz->where = __LINE__;
+ return (-1);
+ }
+ /*
+ * text/data/bss must not exceed limits
+ */
+ if ( /* text can't exceed maximum text size */
+ gz->a_out.a_text > MAXTSIZ ||
+
+ /* data + bss can't exceed maximum data size */
+ gz->a_out.a_data + gz->bss_size > MAXDSIZ ||
+
+ /* data + bss can't exceed rlimit */
+ gz->a_out.a_data + gz->bss_size >
+ gz->ip->proc->p_rlimit[RLIMIT_DATA].rlim_cur) {
+ gz->where = __LINE__;
+ return (ENOMEM);
+ }
+ /* Find out how far we should go */
+ gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data;
+
+ /* copy in arguments and/or environment from old process */
+ error = exec_extract_strings(gz->ip);
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+ /*
+ * Destroy old process VM and create a new one (with a new stack)
+ */
+ exec_new_vmspace(gz->ip);
+
+ vmaddr = gz->virtual_offset;
+
+ error = vm_mmap(&vmspace->vm_map,
+ &vmaddr,
+ gz->a_out.a_text + gz->a_out.a_data,
+ VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED,
+ 0,
+ 0);
+
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+
+ if (gz->bss_size != 0) {
+ /*
+ * Allocate demand-zeroed area for uninitialized data.
+ * "bss" = 'block started by symbol' - named after the
+ * IBM 7090 instruction of the same name.
+ */
+ vmaddr = gz->virtual_offset + gz->a_out.a_text +
+ gz->a_out.a_data;
+ error = vm_map_find(&vmspace->vm_map,
+ NULL,
+ 0,
+ &vmaddr,
+ gz->bss_size,
+ FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error) {
+ gz->where = __LINE__;
+ return (error);
+ }
+ }
+ /* Fill in process VM information */
+ vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT;
+ vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT;
+ vmspace->vm_taddr = (caddr_t) gz->virtual_offset;
+ vmspace->vm_daddr = (caddr_t) gz->virtual_offset + gz->a_out.a_text;
+
+ /* Fill in image_params */
+ gz->ip->interpreted = 0;
+ gz->ip->entry_addr = gz->a_out.a_entry;
+
+ gz->ip->proc->p_sysent = &aout_sysvec;
+
+ return 0;
+}
+
+static int
+NextByte(void *vp)
+{
+ int error;
+ struct imgact_gzip *igz = (struct imgact_gzip *) vp;
+
+ if (igz->idx >= igz->len) {
+ igz->where = __LINE__;
+ return GZ_EOF;
+ }
+ if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) {
+ return igz->inbuf[(igz->idx++) - igz->offset];
+ }
+ if (igz->inbuf) {
+ error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf,
+ (vm_offset_t) igz->inbuf + PAGE_SIZE);
+ if (error) {
+ igz->where = __LINE__;
+ igz->error = error;
+ return GZ_EOF;
+ }
+ }
+ igz->offset = igz->idx & ~PAGE_MASK;
+
+ error = vm_mmap(kernel_map, /* map */
+ (vm_offset_t *) & igz->inbuf, /* address */
+ PAGE_SIZE, /* size */
+ VM_PROT_READ, /* protection */
+ VM_PROT_READ, /* max protection */
+ 0, /* flags */
+ (caddr_t) igz->ip->vp, /* vnode */
+ igz->offset); /* offset */
+ if (error) {
+ igz->where = __LINE__;
+ igz->error = error;
+ return GZ_EOF;
+ }
+ return igz->inbuf[(igz->idx++) - igz->offset];
+}
+
+static int
+Flush(void *vp, u_char * ptr, u_long siz)
+{
+ struct imgact_gzip *gz = (struct imgact_gzip *) vp;
+ u_char *p = ptr, *q;
+ int i;
+
+ /* First, find an a.out header */
+ if (gz->output < sizeof gz->a_out) {
+ q = (u_char *) & gz->a_out;
+ i = min(siz, sizeof gz->a_out - gz->output);
+ bcopy(p, q + gz->output, i);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ if (gz->output == sizeof gz->a_out) {
+ i = do_aout_hdr(gz);
+ if (i == -1) {
+ if (!gz->where)
+ gz->where = __LINE__;
+ gz->error = ENOEXEC;
+ return ENOEXEC;
+ } else if (i) {
+ gz->where = __LINE__;
+ gz->error = i;
+ return ENOEXEC;
+ }
+ if (gz->file_offset < sizeof gz->a_out) {
+ q = (u_char *) gz->virtual_offset + gz->output - gz->file_offset;
+ bcopy(&gz->a_out, q, sizeof gz->a_out - gz->file_offset);
+ }
+ }
+ }
+ /* Skip over zero-padded first PAGE if needed */
+ if (gz->output < gz->file_offset && (gz->output + siz) > gz->file_offset) {
+ i = min(siz, gz->file_offset - gz->output);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ }
+ if (gz->output >= gz->file_offset && gz->output < gz->file_end) {
+ i = min(siz, gz->file_end - gz->output);
+ q = (u_char *) gz->virtual_offset + gz->output - gz->file_offset;
+ bcopy(p, q, i);
+ gz->output += i;
+ p += i;
+ siz -= i;
+ }
+ gz->output += siz;
+ return 0;
+}
+
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ * Since `const' objects end up in the text segment, TEXT_SET is the
+ * correct directive to use.
+ */
+
+static const struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"};
+TEXT_SET(execsw_set, gzip_execsw);
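
The recognizer above accepts only the fixed ten-byte gzip header (magic
0x1f 0x8b, deflate method 0x08, Unix OS byte 0x03) plus the optional
NUL-terminated name and comment fields. The same walk as a standalone
sketch over an in-memory buffer, with a hypothetical sample header:

#include <stdio.h>
#include <stddef.h>

/* Returns the offset of the deflate stream, or -1 if unacceptable. */
static long
gzip_payload_offset(const unsigned char *p, size_t len)
{
	size_t idx = 10;			/* fixed header is 10 bytes */

	if (len < 10 || p[0] != 0x1f || p[1] != 0x8b)
		return -1;			/* magic */
	if (p[2] != 0x08)
		return -1;			/* compression method: deflate */
	if (p[9] != 0x03)
		return -1;			/* OS compressed on: Unix */
	if (p[3] & ~0x18)
		return -1;			/* only FNAME/FCOMMENT tolerated */
	if (p[3] & 0x08)			/* skip NUL-terminated file name */
		while (idx < len && p[idx++] != 0)
			;
	if (p[3] & 0x10)			/* skip NUL-terminated comment */
		while (idx < len && p[idx++] != 0)
			;
	return (long)idx;
}

int
main(void)
{
	/* Hypothetical header: magic, deflate, FNAME flag, name "a". */
	unsigned char hdr[] = { 0x1f, 0x8b, 0x08, 0x08, 0, 0, 0, 0, 0, 0x03,
	    'a', 0 };

	printf("payload starts at offset %ld\n",
	    gzip_payload_offset(hdr, sizeof hdr));
	return 0;
}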
diff --git a/sys/kern/imgact_shell.c b/sys/kern/imgact_shell.c
new file mode 100644
index 0000000..fb03011
--- /dev/null
+++ b/sys/kern/imgact_shell.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/resourcevar.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/kernel.h>
+#include <machine/endian.h>
+
+#if BYTE_ORDER == LITTLE_ENDIAN
+#define SHELLMAGIC 0x2123 /* #! */
+#else
+#define SHELLMAGIC 0x2321
+#endif
+
+#define MAXSHELLCMDLEN 64
+
+static int exec_shell_imgact __P((struct image_params *imgp));
+
+/*
+ * Shell interpreter image activator. An interpreter name beginning
+ * at imgp->stringbase is the minimal successful exit requirement.
+ */
+static int
+exec_shell_imgact(imgp)
+ struct image_params *imgp;
+{
+ const char *image_header = imgp->image_header;
+ const char *ihp, *line_endp;
+ char *interp;
+
+ /* a shell script? */
+ if (((const short *) image_header)[0] != SHELLMAGIC)
+ return(-1);
+
+ /*
+ * Don't allow a shell script to be the shell for a shell
+ * script. :-)
+ */
+ if (imgp->interpreted)
+ return(ENOEXEC);
+
+ imgp->interpreted = 1;
+
+ /*
+ * Copy shell name and arguments from image_header into string
+ * buffer.
+ */
+
+ /*
+ * Find end of line; return if the line is more than MAXSHELLCMDLEN long.
+ */
+ for (ihp = &image_header[2]; *ihp != '\n'; ++ihp) {
+ if (ihp >= &image_header[MAXSHELLCMDLEN])
+ return(ENOEXEC);
+ }
+ line_endp = ihp;
+
+ /* reset for another pass */
+ ihp = &image_header[2];
+
+ /* Skip over leading spaces - until the interpreter name */
+ while ((*ihp == ' ') || (*ihp == '\t')) ihp++;
+
+ /* copy the interpreter name */
+ interp = imgp->interpreter_name;
+ while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t'))
+ *interp++ = *ihp++;
+ *interp = '\0';
+
+ /* Disallow a null interpreter filename */
+ if (*imgp->interpreter_name == '\0')
+ return(ENOEXEC);
+
+ /* reset for another pass */
+ ihp = &image_header[2];
+
+ /* copy the interpreter name and arguments */
+ while (ihp < line_endp) {
+ /* Skip over leading spaces */
+ while ((*ihp == ' ') || (*ihp == '\t')) ihp++;
+
+ if (ihp < line_endp) {
+ /*
+ * Copy to end of token. No need to watch stringspace
+ * because this is at the front of the string buffer
+ * and the maximum shell command length is tiny.
+ */
+ while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) {
+ *imgp->stringp++ = *ihp++;
+ imgp->stringspace--;
+ }
+
+ *imgp->stringp++ = 0;
+ imgp->stringspace--;
+
+ imgp->argc++;
+ }
+ }
+
+ /* set argv[0] to point to original file name */
+ suword(imgp->uap->argv, (int)imgp->uap->fname);
+
+ return(0);
+}
+
+/*
+ * Tell kern_execve.c about it, with a little help from the linker.
+ * Since `const' objects end up in the text segment, TEXT_SET is the
+ * correct directive to use.
+ */
+static const struct execsw shell_execsw = { exec_shell_imgact, "#!" };
+TEXT_SET(execsw_set, shell_execsw);
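
The two-pass parse above (first find the end of the `#!' line, then pull
out the interpreter token) can be exercised as a standalone function. A
sketch reusing the 64-byte MAXSHELLCMDLEN bound, with a made-up input line:

#include <stdio.h>
#include <stddef.h>

#define MAXSHELLCMDLEN 64

/* Extracts the interpreter name; returns 0 on success, -1 otherwise. */
static int
shebang_interp(const char *hdr, char *out, size_t outlen)
{
	const char *ihp, *end;
	size_t n = 0;

	if (hdr[0] != '#' || hdr[1] != '!')
		return -1;
	for (end = hdr + 2; *end != '\n'; end++)	/* find end of line */
		if (*end == '\0' || end >= hdr + MAXSHELLCMDLEN)
			return -1;
	for (ihp = hdr + 2; *ihp == ' ' || *ihp == '\t'; ihp++)
		;					/* skip leading blanks */
	while (ihp < end && *ihp != ' ' && *ihp != '\t' && n + 1 < outlen)
		out[n++] = *ihp++;			/* copy the name */
	out[n] = '\0';
	return n ? 0 : -1;		/* a null interpreter name is refused */
}

int
main(void)
{
	char interp[MAXSHELLCMDLEN];

	if (shebang_interp("#! /bin/sh -x\n", interp, sizeof interp) == 0)
		printf("interpreter: %s\n", interp);
	return 0;
}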
diff --git a/sys/kern/inflate.c b/sys/kern/inflate.c
new file mode 100644
index 0000000..2024bc1
--- /dev/null
+++ b/sys/kern/inflate.c
@@ -0,0 +1,1072 @@
+/*
+ * Most parts of this file are not covered by:
+ * ----------------------------------------------------------------------------
+ * "THE BEER-WARE LICENSE" (Revision 42):
+ * <phk@login.dknet.dk> wrote this file. As long as you retain this notice you
+ * can do whatever you want with this stuff. If we meet some day, and you think
+ * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp
+ * ----------------------------------------------------------------------------
+ *
+ * $Id$
+ *
+ *
+ */
+
+#include <sys/param.h>
+#include <sys/inflate.h>
+#ifdef KERNEL
+#include <sys/systm.h>
+#endif
+#include <sys/mman.h>
+#include <sys/malloc.h>
+
+/* needed to make inflate() work */
+#define uch u_char
+#define ush u_short
+#define ulg u_long
+
+/* Stuff to make inflate() work */
+#ifdef KERNEL
+#define memzero(dest,len) bzero(dest,len)
+#endif
+#define NOMEMCPY
+#ifdef KERNEL
+#define FPRINTF printf
+#else
+extern void putstr (char *);
+#define FPRINTF putstr
+#endif
+
+#define FLUSH(x,y) { \
+ int foo = (*x->gz_output)(x->gz_private,x->gz_slide,y); \
+ if (foo) \
+ return foo; \
+ }
+
+static const int qflag = 0;
+
+#ifndef KERNEL /* want to use this file in kzip also */
+extern unsigned char *malloc (int, int, int);
+extern void free (void*, int);
+#endif
+
+/*
+ * This came from unzip-5.12. I have changed the flow to pass
+ * a structure pointer around, thus hopefully making it re-entrant.
+ * Poul-Henning
+ */
+
+/* inflate.c -- put in the public domain by Mark Adler
+ version c14o, 23 August 1994 */
+
+/* You can do whatever you like with this source file, though I would
+ prefer that if you modify it and redistribute it that you include
+ comments to that effect with your name and the date. Thank you.
+
+ History:
+ vers date who what
+ ---- --------- -------------- ------------------------------------
+ a ~~ Feb 92 M. Adler used full (large, one-step) lookup table
+ b1 21 Mar 92 M. Adler first version with partial lookup tables
+ b2 21 Mar 92 M. Adler fixed bug in fixed-code blocks
+ b3 22 Mar 92 M. Adler sped up match copies, cleaned up some
+ b4 25 Mar 92 M. Adler added prototypes; removed window[] (now
+ is the responsibility of unzip.h--also
+ changed name to slide[]), so needs diffs
+ for unzip.c and unzip.h (this allows
+ compiling in the small model on MSDOS);
+ fixed cast of q in huft_build();
+ b5 26 Mar 92 M. Adler got rid of unintended macro recursion.
+ b6 27 Mar 92 M. Adler got rid of nextbyte() routine. fixed
+ bug in inflate_fixed().
+ c1 30 Mar 92 M. Adler removed lbits, dbits environment variables.
+ changed BMAX to 16 for explode. Removed
+ OUTB usage, and replaced it with flush()--
+ this was a 20% speed improvement! Added
+ an explode.c (to replace unimplod.c) that
+ uses the huft routines here. Removed
+ register union.
+ c2 4 Apr 92 M. Adler fixed bug for file sizes a multiple of 32k.
+ c3 10 Apr 92 M. Adler reduced memory of code tables made by
+ huft_build significantly (factor of two to
+ three).
+ c4 15 Apr 92 M. Adler added NOMEMCPY to kill use of memcpy().
+ worked around a Turbo C optimization bug.
+ c5 21 Apr 92 M. Adler added the GZ_WSIZE #define to allow reducing
+ the 32K window size for specialized
+ applications.
+ c6 31 May 92 M. Adler added some typecasts to eliminate warnings
+ c7 27 Jun 92 G. Roelofs added some more typecasts (444: MSC bug).
+ c8 5 Oct 92 J-l. Gailly added ifdef'd code to deal with PKZIP bug.
+ c9 9 Oct 92 M. Adler removed a memory error message (~line 416).
+ c10 17 Oct 92 G. Roelofs changed ULONG/UWORD/byte to ulg/ush/uch,
+ removed old inflate, renamed inflate_entry
+ to inflate, added Mark's fix to a comment.
+ c10.5 14 Dec 92 M. Adler fix up error messages for incomplete trees.
+ c11 2 Jan 93 M. Adler fixed bug in detection of incomplete
+ tables, and removed assumption that EOB is
+ the longest code (bad assumption).
+ c12 3 Jan 93 M. Adler make tables for fixed blocks only once.
+ c13 5 Jan 93 M. Adler allow all zero length codes (pkzip 2.04c
+ outputs one zero length code for an empty
+ distance tree).
+ c14 12 Mar 93 M. Adler made inflate.c standalone with the
+ introduction of inflate.h.
+ c14b 16 Jul 93 G. Roelofs added (unsigned) typecast to w at 470.
+ c14c 19 Jul 93 J. Bush changed v[N_MAX], l[288], ll[28x+3x] arrays
+ to static for Amiga.
+ c14d 13 Aug 93 J-l. Gailly de-complicatified Mark's c[*p++]++ thing.
+ c14e 8 Oct 93 G. Roelofs changed memset() to memzero().
+ c14f 22 Oct 93 G. Roelofs renamed quietflg to qflag; made Trace()
+ conditional; added inflate_free().
+ c14g 28 Oct 93 G. Roelofs changed l/(lx+1) macro to pointer (Cray bug)
+ c14h 7 Dec 93 C. Ghisler huft_build() optimizations.
+ c14i 9 Jan 94 A. Verheijen set fixed_t{d,l} to NULL after freeing;
+ G. Roelofs check NEXTBYTE macro for GZ_EOF.
+ c14j 23 Jan 94 G. Roelofs removed Ghisler "optimizations"; ifdef'd
+ GZ_EOF check.
+ c14k 27 Feb 94 G. Roelofs added some typecasts to avoid warnings.
+ c14l 9 Apr 94 G. Roelofs fixed split comments on preprocessor lines
+ to avoid bug in Encore compiler.
+ c14m 7 Jul 94 P. Kienitz modified to allow assembler version of
+ inflate_codes() (define ASM_INFLATECODES)
+ c14n 22 Jul 94 G. Roelofs changed fprintf to FPRINTF for DLL versions
+ c14o 23 Aug 94 C. Spieler added a newline to a debug statement;
+ G. Roelofs added another typecast to avoid MSC warning
+ */
+
+
+/*
+ Inflate deflated (PKZIP's method 8 compressed) data. The compression
+ method searches for as much of the current string of bytes (up to a
+ length of 258) in the previous 32K bytes. If it doesn't find any
+ matches (of at least length 3), it codes the next byte. Otherwise, it
+ codes the length of the matched string and its distance backwards from
+ the current position. There is a single Huffman code that codes both
+ single bytes (called "literals") and match lengths. A second Huffman
+ code codes the distance information, which follows a length code. Each
+ length or distance code actually represents a base value and a number
+ of "extra" (sometimes zero) bits to get to add to the base value. At
+ the end of each deflated block is a special end-of-block (EOB) literal/
+ length code. The decoding process is basically: get a literal/length
+ code; if EOB then done; if a literal, emit the decoded byte; if a
+ length then get the distance and emit the referred-to bytes from the
+ sliding window of previously emitted data.
+
+ There are (currently) three kinds of inflate blocks: stored, fixed, and
+ dynamic. The compressor outputs a chunk of data at a time and decides
+ which method to use on a chunk-by-chunk basis. A chunk might typically
+ be 32K to 64K, uncompressed. If the chunk is uncompressible, then the
+ "stored" method is used. In this case, the bytes are simply stored as
+ is, eight bits per byte, with none of the above coding. The bytes are
+ preceded by a count, since there is no longer an EOB code.
+
+ If the data is compressible, then either the fixed or dynamic methods
+ are used. In the dynamic method, the compressed data is preceded by
+ an encoding of the literal/length and distance Huffman codes that are
+ to be used to decode this block. The representation is itself Huffman
+ coded, and so is preceded by a description of that code. These code
+ descriptions take up a little space, and so for small blocks, there is
+ a predefined set of codes, called the fixed codes. The fixed method is
+ used if the block ends up smaller that way (usually for quite small
+ chunks); otherwise the dynamic method is used. In the latter case, the
+ codes are customized to the probabilities in the current block and so
+ can code it much better than the pre-determined fixed codes can.
+
+ The Huffman codes themselves are decoded using a multi-level table
+ lookup, in order to maximize the speed of decoding plus the speed of
+ building the decoding tables. See the comments below that precede the
+ lbits and dbits tuning parameters.
+ */
+
+
+/*
+ Notes beyond the 1.93a appnote.txt:
+
+ 1. Distance pointers never point before the beginning of the output
+ stream.
+ 2. Distance pointers can point back across blocks, up to 32k away.
+ 3. There is an implied maximum of 7 bits for the bit length table and
+ 15 bits for the actual data.
+ 4. If only one code exists, then it is encoded using one bit. (Zero
+ would be more efficient, but perhaps a little confusing.) If two
+ codes exist, they are coded using one bit each (0 and 1).
+ 5. There is no way of sending zero distance codes--a dummy must be
+ sent if there are none. (History: a pre 2.0 version of PKZIP would
+ store blocks with no distance codes, but this was discovered to be
+ too harsh a criterion.) Valid only for 1.93a. 2.04c does allow
+ zero distance codes, which is sent as one code of zero bits in
+ length.
+ 6. There are up to 286 literal/length codes. Code 256 represents the
+ end-of-block. Note however that the static length tree defines
+ 288 codes just to fill out the Huffman codes. Codes 286 and 287
+ cannot be used though, since there is no length base or extra bits
+ defined for them. Similarly, there are up to 30 distance codes.
+ However, static trees define 32 codes (all 5 bits) to fill out the
+ Huffman codes, but the last two had better not show up in the data.
+ 7. Unzip can check dynamic Huffman blocks for complete code sets.
+ The exception is that a single code would not be complete (see #4).
+ 8. The five bits following the block type is really the number of
+ literal codes sent minus 257.
+ 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits
+ (1+6+6). Therefore, to output three times the length, you output
+ three codes (1+1+1), whereas to output four times the same length,
+ you only need two codes (1+3). Hmm.
+ 10. In the tree reconstruction algorithm, Code = Code + Increment
+ only if BitLength(i) is not zero. (Pretty obvious.)
+ 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19)
+ 12. Note: length code 284 can represent 227-258, but length code 285
+ really is 258. The last length deserves its own, short code
+ since it gets used a lot in very redundant files. The length
+ 258 is special since 258 - 3 (the min match length) is 255.
+ 13. The literal/length and distance code bit lengths are read as a
+ single stream of lengths. It is possible (and advantageous) for
+ a repeat code (16, 17, or 18) to go across the boundary between
+ the two sets of lengths.
+ */
+
+
+#define PKZIP_BUG_WORKAROUND /* PKZIP 1.93a problem--live with it */
+
+/*
+ inflate.h must supply the uch slide[GZ_WSIZE] array and the NEXTBYTE,
+ FLUSH() and memzero macros. If the window size is not 32K, it
+ should also define GZ_WSIZE. If INFMOD is defined, it can include
+ compiled functions to support the NEXTBYTE and/or FLUSH() macros.
+ There are defaults for NEXTBYTE and FLUSH() below for use as
+ examples of what those functions need to do. Normally, you would
+ also want FLUSH() to compute a crc on the data. inflate.h also
+ needs to provide these typedefs:
+
+ typedef unsigned char uch;
+ typedef unsigned short ush;
+ typedef unsigned long ulg;
+
+ This module uses the external functions malloc() and free() (and
+ probably memset() or bzero() in the memzero() macro). Their
+ prototypes are normally found in <string.h> and <stdlib.h>.
+ */
+#define INFMOD /* tell inflate.h to include code to be
+ * compiled */
+
+/* Huffman code lookup table entry--this entry is four bytes for machines
+ that have 16-bit pointers (e.g. PC's in the small or medium model).
+ Valid extra bits are 0..13. e == 15 is EOB (end of block), e == 16
+ means that v is a literal, 16 < e < 32 means that v is a pointer to
+ the next table, which codes e - 16 bits, and lastly e == 99 indicates
+ an unused code. If a code with e == 99 is looked up, this implies an
+ error in the data. */
+struct huft {
+ uch e; /* number of extra bits or operation */
+ uch b; /* number of bits in this code or subcode */
+ union {
+ ush n; /* literal, length base, or distance
+ * base */
+ struct huft *t; /* pointer to next level of table */
+ } v;
+};
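
A small sketch of how one such entry is interpreted, following the e-field
encoding described in the comment above; the sample entries are
hypothetical:

#include <stdio.h>

typedef unsigned char uch;
typedef unsigned short ush;

struct huft {
	uch e;			/* extra bits or operation code */
	uch b;			/* bits consumed by this (sub)code */
	union {
		ush n;		/* literal, length base, or distance base */
		struct huft *t;	/* next-level table */
	} v;
};

static void
describe(const struct huft *h)
{
	if (h->e == 99)
		printf("unused code (data error)\n");
	else if (h->e == 15)
		printf("end-of-block\n");
	else if (h->e == 16)
		printf("literal %u\n", h->v.n);
	else if (h->e > 16)
		printf("descend: next table codes %u bits\n", h->e - 16);
	else
		printf("base %u, %u extra bits follow\n", h->v.n, h->e);
}

int
main(void)
{
	struct huft lit = { 16, 8, { 'A' } };	/* hypothetical entries */
	struct huft eob = { 15, 7, { 0 } };

	describe(&lit);
	describe(&eob);
	return 0;
}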
+
+
+/* Function prototypes */
+static int huft_build __P((struct inflate *, unsigned *, unsigned, unsigned, const ush *, const ush *, struct huft **, int *));
+static int huft_free __P((struct inflate *, struct huft *));
+static int inflate_codes __P((struct inflate *, struct huft *, struct huft *, int, int));
+static int inflate_stored __P((struct inflate *));
+static int xinflate __P((struct inflate *));
+static int inflate_fixed __P((struct inflate *));
+static int inflate_dynamic __P((struct inflate *));
+static int inflate_block __P((struct inflate *, int *));
+
+/* The inflate algorithm uses a sliding 32K byte window on the uncompressed
+ stream to find repeated byte strings. This is implemented here as a
+ circular buffer. The index is updated simply by incrementing and then
+ and'ing with 0x7fff (32K-1). */
+/* It is left to other modules to supply the 32K area. It is assumed
+ to be usable as if it were declared "uch slide[32768];" or as just
+ "uch *slide;" and then malloc'ed in the latter case. The definition
+   must be in inflate.h, included above. */
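+
+/*
+ * Illustration only: the window-update idiom used throughout this file
+ * (write a byte, then flush and wrap when the 32K window fills).
+ */
+#if 0
+static void
+window_putbyte(struct inflate *glbl, uch c)
+{
+	glbl->gz_slide[glbl->gz_wp++] = c;
+	if (glbl->gz_wp == GZ_WSIZE) {		/* window full: flush it */
+		FLUSH(glbl, glbl->gz_wp);
+		glbl->gz_wp = 0;		/* and wrap around */
+	}
+}
+#endif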
+
+
+/* Tables for deflate from PKZIP's appnote.txt. */
+
+/* Order of the bit length code lengths */
+static const unsigned border[] = {
+ 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15};
+
+static const ush cplens[] = { /* Copy lengths for literal codes 257..285 */
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31,
+ 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0};
+ /* note: see note #13 above about the 258 in this list. */
+
+static const ush cplext[] = { /* Extra bits for literal codes 257..285 */
+ 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2,
+ 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */
+
+static const ush cpdist[] = { /* Copy offsets for distance codes 0..29 */
+ 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193,
+ 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145,
+ 8193, 12289, 16385, 24577};
+
+static const ush cpdext[] = { /* Extra bits for distance codes */
+ 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
+ 7, 7, 8, 8, 9, 9, 10, 10, 11, 11,
+ 12, 12, 13, 13};
+
+/* And'ing with mask[n] masks the lower n bits */
+static const ush mask[] = {
+ 0x0000,
+ 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff,
+ 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff
+};
+
+
+/* Macros for inflate() bit peeking and grabbing.
+ The usage is:
+
+ NEEDBITS(glbl,j)
+ x = b & mask[j];
+ DUMPBITS(j)
+
+ where NEEDBITS makes sure that b has at least j bits in it, and
+ DUMPBITS removes the bits from b. The macros use the variable k
+ for the number of bits in b. Normally, b and k are register
+   variables for speed, and are initialized at the beginning of a
+ routine that uses these macros from a global bit buffer and count.
+
+ In order to not ask for more bits than there are in the compressed
+ stream, the Huffman tables are constructed to only ask for just
+ enough bits to make up the end-of-block code (value 256). Then no
+ bytes need to be "returned" to the buffer at the end of the last
+ block. See the huft_build() routine.
+ */
+
+/*
+ * The following 2 were global variables.
+ * They are now fields of the inflate structure.
+ */
+
+#define NEEDBITS(glbl,n) { \
+ while(k<(n)) { \
+ int c=(*glbl->gz_input)(glbl->gz_private); \
+ if(c==GZ_EOF) \
+ return 1; \
+ b|=((ulg)c)<<k; \
+ k+=8; \
+ } \
+ }
+
+#define DUMPBITS(n) {b>>=(n);k-=(n);}
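+
+/*
+ * Concrete instance of the usage pattern above (this is how
+ * inflate_dynamic() reads the 4-bit count of bit length codes).
+ */
+#if 0
+	NEEDBITS(glbl, 4)
+	nb = 4 + ((unsigned) b & 0xf);
+	DUMPBITS(4)
+#endif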
+
+/*
+ Huffman code decoding is performed using a multi-level table lookup.
+ The fastest way to decode is to simply build a lookup table whose
+ size is determined by the longest code. However, the time it takes
+ to build this table can also be a factor if the data being decoded
+ is not very long. The most common codes are necessarily the
+ shortest codes, so those codes dominate the decoding time, and hence
+ the speed. The idea is you can have a shorter table that decodes the
+ shorter, more probable codes, and then point to subsidiary tables for
+ the longer codes. The time it costs to decode the longer codes is
+ then traded against the time it takes to make longer tables.
+
+  The results of this trade-off are in the variables lbits and dbits
+ below. lbits is the number of bits the first level table for literal/
+ length codes can decode in one step, and dbits is the same thing for
+ the distance codes. Subsequent tables are also less than or equal to
+ those sizes. These values may be adjusted either when all of the
+ codes are shorter than that, in which case the longest code length in
+ bits is used, or when the shortest code is *longer* than the requested
+ table size, in which case the length of the shortest code in bits is
+ used.
+
+ There are two different values for the two tables, since they code a
+ different number of possibilities each. The literal/length table
+ codes 286 possible values, or in a flat code, a little over eight
+ bits. The distance table codes 30 possible values, or a little less
+ than five bits, flat. The optimum values for speed end up being
+ about one bit more than those, so lbits is 8+1 and dbits is 5+1.
+ The optimum values may differ though from machine to machine, and
+ possibly even between compilers. Your mileage may vary.
+ */
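+
+/*
+ * Rough arithmetic behind that choice: a flat table for the longest
+ * possible code (15 bits) would need 1 << 15 = 32768 entries, while a
+ * 9-bit base table needs only 1 << 9 = 512, plus small sub-tables for
+ * the rare codes longer than 9 bits.
+ */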
+
+static const int lbits = 9; /* bits in base literal/length lookup table */
+static const int dbits = 6; /* bits in base distance lookup table */
+
+
+/* If BMAX needs to be larger than 16, then h and x[] should be ulg. */
+#define BMAX 16 /* maximum bit length of any code (16 for
+ * explode) */
+#define N_MAX 288 /* maximum number of codes in any set */
+
+/* Given a list of code lengths and a maximum table size, make a set of
+ tables to decode that set of codes. Return zero on success, one if
+ the given code set is incomplete (the tables are still built in this
+ case), two if the input is invalid (all zero length codes or an
+ oversubscribed set of lengths), and three if not enough memory.
+ The code with value 256 is special, and the tables are constructed
+ so that no bits beyond that code are fetched when that code is
+ decoded. */
+static int
+huft_build(glbl, b, n, s, d, e, t, m)
+ struct inflate *glbl;
+ unsigned *b; /* code lengths in bits (all assumed <= BMAX) */
+ unsigned n; /* number of codes (assumed <= N_MAX) */
+ unsigned s; /* number of simple-valued codes (0..s-1) */
+ const ush *d; /* list of base values for non-simple codes */
+ const ush *e; /* list of extra bits for non-simple codes */
+ struct huft **t; /* result: starting table */
+ int *m; /* maximum lookup bits, returns actual */
+{
+ unsigned a; /* counter for codes of length k */
+ unsigned c[BMAX + 1]; /* bit length count table */
+ unsigned el; /* length of EOB code (value 256) */
+ unsigned f; /* i repeats in table every f entries */
+ int g; /* maximum code length */
+ int h; /* table level */
+ register unsigned i; /* counter, current code */
+ register unsigned j; /* counter */
+ register int k; /* number of bits in current code */
+ int lx[BMAX + 1]; /* memory for l[-1..BMAX-1] */
+ int *l = lx + 1; /* stack of bits per table */
+ register unsigned *p; /* pointer into c[], b[], or v[] */
+ register struct huft *q;/* points to current table */
+ struct huft r; /* table entry for structure assignment */
+ struct huft *u[BMAX];/* table stack */
+ unsigned v[N_MAX]; /* values in order of bit length */
+ register int w; /* bits before this table == (l * h) */
+ unsigned x[BMAX + 1]; /* bit offsets, then code stack */
+ unsigned *xp; /* pointer into x */
+ int y; /* number of dummy codes added */
+ unsigned z; /* number of entries in current table */
+
+ /* Generate counts for each bit length */
+ el = n > 256 ? b[256] : BMAX; /* set length of EOB code, if any */
+#ifdef KERNEL
+ memzero((char *) c, sizeof(c));
+#else
+ for (i = 0; i < BMAX+1; i++)
+ c [i] = 0;
+#endif
+ p = b;
+ i = n;
+ do {
+ c[*p]++;
+ p++; /* assume all entries <= BMAX */
+ } while (--i);
+ if (c[0] == n) { /* null input--all zero length codes */
+ *t = (struct huft *) NULL;
+ *m = 0;
+ return 0;
+ }
+ /* Find minimum and maximum length, bound *m by those */
+ for (j = 1; j <= BMAX; j++)
+ if (c[j])
+ break;
+ k = j; /* minimum code length */
+ if ((unsigned) *m < j)
+ *m = j;
+ for (i = BMAX; i; i--)
+ if (c[i])
+ break;
+ g = i; /* maximum code length */
+ if ((unsigned) *m > i)
+ *m = i;
+
+ /* Adjust last length count to fill out codes, if needed */
+ for (y = 1 << j; j < i; j++, y <<= 1)
+ if ((y -= c[j]) < 0)
+ return 2; /* bad input: more codes than bits */
+ if ((y -= c[i]) < 0)
+ return 2;
+ c[i] += y;
+
+ /* Generate starting offsets into the value table for each length */
+ x[1] = j = 0;
+ p = c + 1;
+ xp = x + 2;
+ while (--i) { /* note that i == g from above */
+ *xp++ = (j += *p++);
+ }
+
+ /* Make a table of values in order of bit lengths */
+ p = b;
+ i = 0;
+ do {
+ if ((j = *p++) != 0)
+ v[x[j]++] = i;
+ } while (++i < n);
+
+ /* Generate the Huffman codes and for each, make the table entries */
+ x[0] = i = 0; /* first Huffman code is zero */
+ p = v; /* grab values in bit order */
+ h = -1; /* no tables yet--level -1 */
+ w = l[-1] = 0; /* no bits decoded yet */
+ u[0] = (struct huft *) NULL; /* just to keep compilers happy */
+ q = (struct huft *) NULL; /* ditto */
+ z = 0; /* ditto */
+
+ /* go through the bit lengths (k already is bits in shortest code) */
+ for (; k <= g; k++) {
+ a = c[k];
+ while (a--) {
+ /*
+ * here i is the Huffman code of length k bits for
+ * value *p
+ */
+ /* make tables up to required level */
+ while (k > w + l[h]) {
+ w += l[h++]; /* add bits already decoded */
+
+ /*
+ * compute minimum size table less than or
+ * equal to *m bits
+ */
+ z = (z = g - w) > (unsigned) *m ? *m : z; /* upper limit */
+				if ((f = 1 << (j = k - w)) > a + 1) {	/* try a k-w bit table:
+								 * too few codes for k-w
+								 * bit table */
+ f -= a + 1; /* deduct codes from
+ * patterns left */
+ xp = c + k;
+ while (++j < z) { /* try smaller tables up
+ * to z bits */
+ if ((f <<= 1) <= *++xp)
+ break; /* enough codes to use
+ * up j bits */
+ f -= *xp; /* else deduct codes
+ * from patterns */
+ }
+ }
+ if ((unsigned) w + j > el && (unsigned) w < el)
+ j = el - w; /* make EOB code end at
+ * table */
+ z = 1 << j; /* table entries for j-bit
+ * table */
+ l[h] = j; /* set table size in stack */
+
+ /* allocate and link in new table */
+ if ((q = (struct huft *) malloc((z + 1) * sizeof(struct huft), M_GZIP, M_WAITOK)) ==
+ (struct huft *) NULL) {
+ if (h)
+ huft_free(glbl, u[0]);
+ return 3; /* not enough memory */
+ }
+ glbl->gz_hufts += z + 1; /* track memory usage */
+ *t = q + 1; /* link to list for
+ * huft_free() */
+ *(t = &(q->v.t)) = (struct huft *) NULL;
+ u[h] = ++q; /* table starts after link */
+
+ /* connect to last table, if there is one */
+ if (h) {
+ x[h] = i; /* save pattern for
+ * backing up */
+ r.b = (uch) l[h - 1]; /* bits to dump before
+ * this table */
+ r.e = (uch) (16 + j); /* bits in this table */
+ r.v.t = q; /* pointer to this table */
+ j = (i & ((1 << w) - 1)) >> (w - l[h - 1]);
+ u[h - 1][j] = r; /* connect to last table */
+ }
+ }
+
+ /* set up table entry in r */
+ r.b = (uch) (k - w);
+ if (p >= v + n)
+ r.e = 99; /* out of values--invalid
+ * code */
+ else if (*p < s) {
+ r.e = (uch) (*p < 256 ? 16 : 15); /* 256 is end-of-block
+ * code */
+ r.v.n = *p++; /* simple code is just the
+ * value */
+ } else {
+ r.e = (uch) e[*p - s]; /* non-simple--look up
+ * in lists */
+ r.v.n = d[*p++ - s];
+ }
+
+ /* fill code-like entries with r */
+ f = 1 << (k - w);
+ for (j = i >> w; j < z; j += f)
+ q[j] = r;
+
+ /* backwards increment the k-bit code i */
+ for (j = 1 << (k - 1); i & j; j >>= 1)
+ i ^= j;
+ i ^= j;
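+			/*
+			 * e.g. for k == 3 this steps i through the
+			 * bit-reversed count 000,100,010,110,001,...
+			 */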
+
+ /* backup over finished tables */
+ while ((i & ((1 << w) - 1)) != x[h])
+ w -= l[--h]; /* don't need to update q */
+ }
+ }
+
+ /* return actual size of base table */
+ *m = l[0];
+
+ /* Return true (1) if we were given an incomplete table */
+ return y != 0 && g != 1;
+}
+
+static int
+huft_free(glbl, t)
+ struct inflate *glbl;
+ struct huft *t; /* table to free */
+/* Free the malloc'ed tables built by huft_build(), which makes a linked
+ list of the tables it made, with the links in a dummy first entry of
+ each table. */
+{
+ register struct huft *p, *q;
+
+ /* Go through linked list, freeing from the malloced (t[-1]) address. */
+ p = t;
+ while (p != (struct huft *) NULL) {
+ q = (--p)->v.t;
+ free(p, M_GZIP);
+ p = q;
+ }
+ return 0;
+}
+
+/* inflate (decompress) the codes in a deflated (compressed) block.
+ Return an error code or zero if it all goes ok. */
+static int
+inflate_codes(glbl, tl, td, bl, bd)
+ struct inflate *glbl;
+ struct huft *tl, *td;/* literal/length and distance decoder tables */
+ int bl, bd; /* number of bits decoded by tl[] and td[] */
+{
+ register unsigned e; /* table entry flag/number of extra bits */
+ unsigned n, d; /* length and index for copy */
+ unsigned w; /* current window position */
+ struct huft *t; /* pointer to table entry */
+ unsigned ml, md; /* masks for bl and bd bits */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local copies of globals */
+ b = glbl->gz_bb; /* initialize bit buffer */
+ k = glbl->gz_bk;
+ w = glbl->gz_wp; /* initialize window position */
+
+ /* inflate the coded data */
+ ml = mask[bl]; /* precompute masks for speed */
+ md = mask[bd];
+ while (1) { /* do until end of block */
+ NEEDBITS(glbl, (unsigned) bl)
+ if ((e = (t = tl + ((unsigned) b & ml))->e) > 16)
+ do {
+ if (e == 99)
+ return 1;
+ DUMPBITS(t->b)
+ e -= 16;
+ NEEDBITS(glbl, e)
+ } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16);
+ DUMPBITS(t->b)
+ if (e == 16) { /* then it's a literal */
+ glbl->gz_slide[w++] = (uch) t->v.n;
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ } else { /* it's an EOB or a length */
+ /* exit if end of block */
+ if (e == 15)
+ break;
+
+ /* get length of block to copy */
+ NEEDBITS(glbl, e)
+ n = t->v.n + ((unsigned) b & mask[e]);
+ DUMPBITS(e);
+
+ /* decode distance of block to copy */
+ NEEDBITS(glbl, (unsigned) bd)
+ if ((e = (t = td + ((unsigned) b & md))->e) > 16)
+ do {
+ if (e == 99)
+ return 1;
+ DUMPBITS(t->b)
+ e -= 16;
+ NEEDBITS(glbl, e)
+ } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16);
+ DUMPBITS(t->b)
+ NEEDBITS(glbl, e)
+ d = w - t->v.n - ((unsigned) b & mask[e]);
+ DUMPBITS(e)
+ /* do the copy */
+ do {
+ n -= (e = (e = GZ_WSIZE - ((d &= GZ_WSIZE - 1) > w ? d : w)) > n ? n : e);
+#ifndef NOMEMCPY
+ if (w - d >= e) { /* (this test assumes
+ * unsigned comparison) */
+ memcpy(glbl->gz_slide + w, glbl->gz_slide + d, e);
+ w += e;
+ d += e;
+ } else /* do it slow to avoid memcpy()
+ * overlap */
+#endif /* !NOMEMCPY */
+ do {
+ glbl->gz_slide[w++] = glbl->gz_slide[d++];
+ } while (--e);
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ } while (n);
+ }
+ }
+
+ /* restore the globals from the locals */
+ glbl->gz_wp = w; /* restore global window pointer */
+ glbl->gz_bb = b; /* restore global bit buffer */
+ glbl->gz_bk = k;
+
+ /* done */
+ return 0;
+}
+
+/* "decompress" an inflated type 0 (stored) block. */
+static int
+inflate_stored(glbl)
+ struct inflate *glbl;
+{
+ unsigned n; /* number of bytes in block */
+ unsigned w; /* current window position */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local copies of globals */
+ b = glbl->gz_bb; /* initialize bit buffer */
+ k = glbl->gz_bk;
+ w = glbl->gz_wp; /* initialize window position */
+
+ /* go to byte boundary */
+ n = k & 7;
+ DUMPBITS(n);
+
+ /* get the length and its complement */
+ NEEDBITS(glbl, 16)
+ n = ((unsigned) b & 0xffff);
+ DUMPBITS(16)
+ NEEDBITS(glbl, 16)
+ if (n != (unsigned) ((~b) & 0xffff))
+ return 1; /* error in compressed data */
+ DUMPBITS(16)
+ /* read and output the compressed data */
+ while (n--) {
+ NEEDBITS(glbl, 8)
+ glbl->gz_slide[w++] = (uch) b;
+ if (w == GZ_WSIZE) {
+ FLUSH(glbl, w);
+ w = 0;
+ }
+ DUMPBITS(8)
+ }
+
+ /* restore the globals from the locals */
+ glbl->gz_wp = w; /* restore global window pointer */
+ glbl->gz_bb = b; /* restore global bit buffer */
+ glbl->gz_bk = k;
+ return 0;
+}
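+
+/*
+ * Worked example of the stored-block layout handled above: after byte
+ * alignment, LEN and NLEN == ~LEN arrive LSB first, then LEN raw
+ * bytes.  A 5-byte stored block therefore starts 05 00 fa ff.
+ */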
+
+/* decompress an inflated type 1 (fixed Huffman codes) block. We should
+ either replace this with a custom decoder, or at least precompute the
+ Huffman tables. */
+static int
+inflate_fixed(glbl)
+ struct inflate *glbl;
+{
+ /* if first time, set up tables for fixed blocks */
+ if (glbl->gz_fixed_tl == (struct huft *) NULL) {
+ int i; /* temporary variable */
+ static unsigned l[288]; /* length list for huft_build */
+
+ /* literal table */
+ for (i = 0; i < 144; i++)
+ l[i] = 8;
+ for (; i < 256; i++)
+ l[i] = 9;
+ for (; i < 280; i++)
+ l[i] = 7;
+ for (; i < 288; i++) /* make a complete, but wrong code
+ * set */
+ l[i] = 8;
+ glbl->gz_fixed_bl = 7;
+ if ((i = huft_build(glbl, l, 288, 257, cplens, cplext,
+ &glbl->gz_fixed_tl, &glbl->gz_fixed_bl)) != 0) {
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ return i;
+ }
+ /* distance table */
+ for (i = 0; i < 30; i++) /* make an incomplete code
+ * set */
+ l[i] = 5;
+ glbl->gz_fixed_bd = 5;
+ if ((i = huft_build(glbl, l, 30, 0, cpdist, cpdext,
+ &glbl->gz_fixed_td, &glbl->gz_fixed_bd)) > 1) {
+ huft_free(glbl, glbl->gz_fixed_tl);
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ return i;
+ }
+ }
+ /* decompress until an end-of-block code */
+ return inflate_codes(glbl, glbl->gz_fixed_tl, glbl->gz_fixed_td, glbl->gz_fixed_bl, glbl->gz_fixed_bd) != 0;
+}
+
+/* decompress an inflated type 2 (dynamic Huffman codes) block. */
+static int
+inflate_dynamic(glbl)
+ struct inflate *glbl;
+{
+ int i; /* temporary variables */
+ unsigned j;
+ unsigned l; /* last length */
+ unsigned m; /* mask for bit lengths table */
+ unsigned n; /* number of lengths to get */
+ struct huft *tl; /* literal/length code table */
+ struct huft *td; /* distance code table */
+ int bl; /* lookup bits for tl */
+ int bd; /* lookup bits for td */
+ unsigned nb; /* number of bit length codes */
+ unsigned nl; /* number of literal/length codes */
+ unsigned nd; /* number of distance codes */
+#ifdef PKZIP_BUG_WORKAROUND
+ unsigned ll[288 + 32]; /* literal/length and distance code
+ * lengths */
+#else
+ unsigned ll[286 + 30]; /* literal/length and distance code
+ * lengths */
+#endif
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local bit buffer */
+ b = glbl->gz_bb;
+ k = glbl->gz_bk;
+
+ /* read in table lengths */
+ NEEDBITS(glbl, 5)
+ nl = 257 + ((unsigned) b & 0x1f); /* number of
+ * literal/length codes */
+ DUMPBITS(5)
+ NEEDBITS(glbl, 5)
+ nd = 1 + ((unsigned) b & 0x1f); /* number of distance codes */
+ DUMPBITS(5)
+ NEEDBITS(glbl, 4)
+ nb = 4 + ((unsigned) b & 0xf); /* number of bit length codes */
+ DUMPBITS(4)
+#ifdef PKZIP_BUG_WORKAROUND
+ if (nl > 288 || nd > 32)
+#else
+ if (nl > 286 || nd > 30)
+#endif
+ return 1; /* bad lengths */
+ /* read in bit-length-code lengths */
+ for (j = 0; j < nb; j++) {
+ NEEDBITS(glbl, 3)
+ ll[border[j]] = (unsigned) b & 7;
+ DUMPBITS(3)
+ }
+ for (; j < 19; j++)
+ ll[border[j]] = 0;
+
+ /* build decoding table for trees--single level, 7 bit lookup */
+ bl = 7;
+ if ((i = huft_build(glbl, ll, 19, 19, NULL, NULL, &tl, &bl)) != 0) {
+ if (i == 1)
+ huft_free(glbl, tl);
+ return i; /* incomplete code set */
+ }
+ /* read in literal and distance code lengths */
+ n = nl + nd;
+ m = mask[bl];
+ i = l = 0;
+ while ((unsigned) i < n) {
+ NEEDBITS(glbl, (unsigned) bl)
+ j = (td = tl + ((unsigned) b & m))->b;
+ DUMPBITS(j)
+ j = td->v.n;
+ if (j < 16) /* length of code in bits (0..15) */
+ ll[i++] = l = j; /* save last length in l */
+ else if (j == 16) { /* repeat last length 3 to 6 times */
+ NEEDBITS(glbl, 2)
+ j = 3 + ((unsigned) b & 3);
+ DUMPBITS(2)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = l;
+ } else if (j == 17) { /* 3 to 10 zero length codes */
+ NEEDBITS(glbl, 3)
+ j = 3 + ((unsigned) b & 7);
+ DUMPBITS(3)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = 0;
+ l = 0;
+ } else { /* j == 18: 11 to 138 zero length codes */
+ NEEDBITS(glbl, 7)
+ j = 11 + ((unsigned) b & 0x7f);
+ DUMPBITS(7)
+ if ((unsigned) i + j > n)
+ return 1;
+ while (j--)
+ ll[i++] = 0;
+ l = 0;
+ }
+ }
+
+ /* free decoding table for trees */
+ huft_free(glbl, tl);
+
+ /* restore the global bit buffer */
+ glbl->gz_bb = b;
+ glbl->gz_bk = k;
+
+ /* build the decoding tables for literal/length and distance codes */
+ bl = lbits;
+ i = huft_build(glbl, ll, nl, 257, cplens, cplext, &tl, &bl);
+ if (i != 0) {
+ if (i == 1 && !qflag) {
+ FPRINTF("(incomplete l-tree) ");
+ huft_free(glbl, tl);
+ }
+ return i; /* incomplete code set */
+ }
+ bd = dbits;
+ i = huft_build(glbl, ll + nl, nd, 0, cpdist, cpdext, &td, &bd);
+ if (i != 0) {
+ if (i == 1 && !qflag) {
+ FPRINTF("(incomplete d-tree) ");
+#ifdef PKZIP_BUG_WORKAROUND
+ i = 0;
+ }
+#else
+ huft_free(glbl, td);
+ }
+ huft_free(glbl, tl);
+ return i; /* incomplete code set */
+#endif
+ }
+ /* decompress until an end-of-block code */
+ if (inflate_codes(glbl, tl, td, bl, bd))
+ return 1;
+
+ /* free the decoding tables, return */
+ huft_free(glbl, tl);
+ huft_free(glbl, td);
+ return 0;
+}
+
+/* decompress an inflated block */
+static int
+inflate_block(glbl, e)
+ struct inflate *glbl;
+ int *e; /* last block flag */
+{
+ unsigned t; /* block type */
+ register ulg b; /* bit buffer */
+ register unsigned k; /* number of bits in bit buffer */
+
+ /* make local bit buffer */
+ b = glbl->gz_bb;
+ k = glbl->gz_bk;
+
+ /* read in last block bit */
+ NEEDBITS(glbl, 1)
+ * e = (int) b & 1;
+ DUMPBITS(1)
+ /* read in block type */
+ NEEDBITS(glbl, 2)
+ t = (unsigned) b & 3;
+ DUMPBITS(2)
+ /* restore the global bit buffer */
+ glbl->gz_bb = b;
+ glbl->gz_bk = k;
+
+ /* inflate that block type */
+ if (t == 2)
+ return inflate_dynamic(glbl);
+ if (t == 0)
+ return inflate_stored(glbl);
+ if (t == 1)
+ return inflate_fixed(glbl);
+ /* bad block type */
+ return 2;
+}
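+
+/*
+ * The 3-bit block header decoded above comes off the low end of the
+ * bit buffer: first the last-block flag, then two type bits
+ * (0 = stored, 1 = fixed, 2 = dynamic, 3 = invalid).
+ */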
+
+
+
+/* decompress an inflated entry */
+static int
+xinflate(glbl)
+ struct inflate *glbl;
+{
+ int e; /* last block flag */
+ int r; /* result code */
+ unsigned h; /* maximum struct huft's malloc'ed */
+
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+
+ /* initialize window, bit buffer */
+ glbl->gz_wp = 0;
+ glbl->gz_bk = 0;
+ glbl->gz_bb = 0;
+
+ /* decompress until the last block */
+ h = 0;
+ do {
+ glbl->gz_hufts = 0;
+ if ((r = inflate_block(glbl, &e)) != 0)
+ return r;
+ if (glbl->gz_hufts > h)
+ h = glbl->gz_hufts;
+ } while (!e);
+
+ /* flush out slide */
+ FLUSH(glbl, glbl->gz_wp);
+
+ /* return success */
+ return 0;
+}
+
+/* Nobody uses this - why not? */
+int
+inflate(glbl)
+ struct inflate *glbl;
+{
+ int i;
+#ifdef KERNEL
+ u_char *p = NULL;
+
+ if (!glbl->gz_slide)
+ p = glbl->gz_slide = malloc(GZ_WSIZE, M_GZIP, M_WAITOK);
+#endif
+ if (!glbl->gz_slide)
+#ifdef KERNEL
+ return(ENOMEM);
+#else
+ return 3; /* kzip expects 3 */
+#endif
+ i = xinflate(glbl);
+
+ if (glbl->gz_fixed_td != (struct huft *) NULL) {
+ huft_free(glbl, glbl->gz_fixed_td);
+ glbl->gz_fixed_td = (struct huft *) NULL;
+ }
+ if (glbl->gz_fixed_tl != (struct huft *) NULL) {
+ huft_free(glbl, glbl->gz_fixed_tl);
+ glbl->gz_fixed_tl = (struct huft *) NULL;
+ }
+#ifdef KERNEL
+ if (p == glbl->gz_slide) {
+ free(glbl->gz_slide, M_GZIP);
+ glbl->gz_slide = NULL;
+ }
+#endif
+ return i;
+}
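+
+/*
+ * A minimal usage sketch (illustration only): feed inflate() from a
+ * memory buffer via the gz_input callback.  Whatever FLUSH() consumes
+ * on output must be initialized as well -- see inflate.h; the output
+ * setup is elided here.
+ */
+#if 0
+struct membuf {
+	u_char *next;
+	u_char *end;
+};
+
+static int
+membuf_next(void *vp)			/* gz_input callback */
+{
+	struct membuf *mb = vp;
+
+	return (mb->next < mb->end ? *mb->next++ : GZ_EOF);
+}
+
+static int
+decompress(u_char *src, int len)
+{
+	struct membuf mb;
+	struct inflate infl;
+
+	mb.next = src;
+	mb.end = src + len;
+	bzero((caddr_t)&infl, sizeof infl);
+	infl.gz_private = &mb;
+	infl.gz_input = membuf_next;
+	/* ... output hook for FLUSH() goes here ... */
+	return (inflate(&infl));
+}
+#endif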
+/* ----------------------- END INFLATE.C */
diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c
index 61a0a14..f108547 100644
--- a/sys/kern/init_main.c
+++ b/sys/kern/init_main.c
@@ -1,4 +1,7 @@
/*
+ * Copyright (c) 1995 Terrence R. Lambert
+ * All rights reserved.
+ *
* Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
@@ -35,100 +38,270 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)init_main.c 8.16 (Berkeley) 5/14/95
+ * @(#)init_main.c 8.9 (Berkeley) 1/21/94
+ * $Id: init_main.c,v 1.58 1997/03/01 17:49:09 wosch Exp $
*/
+#include "opt_rlimit.h"
+#include "opt_devfs.h"
+
#include <sys/param.h>
+#include <sys/file.h>
#include <sys/filedesc.h>
-#include <sys/errno.h>
-#include <sys/exec.h>
#include <sys/kernel.h>
#include <sys/mount.h>
-#include <sys/map.h>
+#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/signalvar.h>
#include <sys/systm.h>
#include <sys/vnode.h>
-#include <sys/conf.h>
-#include <sys/buf.h>
-#include <sys/clist.h>
-#include <sys/device.h>
-#include <sys/protosw.h>
+#include <sys/sysent.h>
#include <sys/reboot.h>
-#include <sys/user.h>
-#include <sys/syscallargs.h>
-
-#include <ufs/ufs/quota.h>
+#include <sys/sysproto.h>
+#include <sys/vmmeter.h>
#include <machine/cpu.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/user.h>
+#include <sys/copyright.h>
-#ifdef HPFPLIB
-char copyright[] =
-"Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California.\nCopyright (c) 1992 Hewlett-Packard Company\nCopyright (c) 1992 Motorola Inc.\nAll rights reserved.\n\n";
-#else
-char copyright[] =
-"Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California. All rights reserved.\n\n";
-#endif
+extern struct linker_set sysinit_set; /* XXX */
+
+extern void __main __P((void));
+extern void main __P((void *framep));
/* Components of the first process -- never freed. */
-struct session session0;
-struct pgrp pgrp0;
+static struct session session0;
+static struct pgrp pgrp0;
struct proc proc0;
-struct pcred cred0;
-struct filedesc0 filedesc0;
-struct plimit limit0;
-struct vmspace vmspace0;
+static struct pcred cred0;
+static struct filedesc0 filedesc0;
+static struct plimit limit0;
+static struct vmspace vmspace0;
struct proc *curproc = &proc0;
-struct proc *initproc, *pageproc;
+struct proc *initproc;
-int cmask = CMASK;
+int cmask = CMASK;
extern struct user *proc0paddr;
-struct vnode *rootvp, *swapdev_vp;
+struct vnode *rootvp;
int boothowto;
+
struct timeval boottime;
+SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime,
+ CTLFLAG_RW, &boottime, timeval, "");
+
struct timeval runtime;
-static void start_init __P((struct proc *p, void *framep));
+/*
+ * Promiscuous argument pass for start_init()
+ *
+ * This is a kludge because we use a return from main() rather than a call
+ * to a new routine in locore.s to kick the kernel alive.
+ */
+static void *init_framep;
+
+
+#if __GNUC__ >= 2
+void __main() {}
+#endif
+
+
+/*
+ * This ensures that there is at least one entry so that the sysinit_set
+ * symbol is not undefined. A subsystem ID of SI_SUB_DUMMY is never
+ * executed.
+ */
+SYSINIT(placeholder, SI_SUB_DUMMY,SI_ORDER_ANY, NULL, NULL)
+
/*
* System startup; initialize the world, create process 0, mount root
* filesystem, and fork to create init and pagedaemon. Most of the
* hard work is done in the lower-level initialization routines including
* startup(), which does memory initialization and autoconfiguration.
+ *
+ * This allows simple addition of new kernel subsystems that require
+ * boot time initialization.  It also allows substitution of a subsystem
+ * (for instance, a scheduler, kernel profiler, or VM system) by an
+ * object module.  Finally, it allows for optional "kernel threads", like an LFS
+ * cleaner.
*/
+void
main(framep)
void *framep;
{
- register struct proc *p;
- register struct filedesc0 *fdp;
- register struct pdevinit *pdev;
- register int i;
- int s;
- register_t rval[2];
- extern struct pdevinit pdevinit[];
- extern void roundrobin __P((void *));
- extern void schedcpu __P((void *));
+
+ register struct sysinit **sipp; /* system initialization*/
+ register struct sysinit **xipp; /* interior loop of sort*/
+ register struct sysinit *save; /* bubble*/
+ int rval[2]; /* SI_TYPE_KTHREAD support*/
/*
- * Initialize the current process pointer (curproc) before
- * any possible traps/probes to simplify trap processing.
+ * Save the locore.s frame pointer for start_init().
*/
- p = &proc0;
- curproc = p;
+ init_framep = framep;
+
/*
- * Attempt to find console and initialize
- * in case of early panic or other messages.
+ * Perform a bubble sort of the system initialization objects by
+ * their subsystem (primary key) and order (secondary key).
+ *
+ * Since some things care about execution order, this is the
+ * operation which ensures continued function.
*/
- consinit();
- printf(copyright);
+ for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) {
+ for( xipp = sipp + 1; *xipp; xipp++) {
+ if( (*sipp)->subsystem < (*xipp)->subsystem ||
+ ( (*sipp)->subsystem == (*xipp)->subsystem &&
+ (*sipp)->order < (*xipp)->order))
+ continue; /* skip*/
+ save = *sipp;
+ *sipp = *xipp;
+ *xipp = save;
+ }
+ }
+
+ /*
+ * Traverse the (now) ordered list of system initialization tasks.
+ * Perform each task, and continue on to the next task.
+ *
+ * The last item on the list is expected to be the scheduler,
+ * which will not return.
+ */
+ for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) {
+ if( (*sipp)->subsystem == SI_SUB_DUMMY)
+ continue; /* skip dummy task(s)*/
+
+ switch( (*sipp)->type) {
+ case SI_TYPE_DEFAULT:
+ /* no special processing*/
+ (*((*sipp)->func))( (*sipp)->udata);
+ break;
+
+ case SI_TYPE_KTHREAD:
+ /* kernel thread*/
+ if (fork(&proc0, NULL, rval))
+ panic("fork kernel process");
+ if (rval[1]) {
+ (*((*sipp)->func))( (*sipp)->udata);
+ /*
+ * The call to start "init" returns
+ * here after the scheduler has been
+ * started, and returns to the caller
+ * in i386/i386/locore.s. This is a
+ * necessary part of initialization
+ * and is rather non-obvious.
+ *
+ * No other "kernel threads" should
+ * return here. Call panic() instead.
+ */
+ return;
+ }
+ break;
+
+ default:
+ panic( "init_main: unrecognized init type");
+ }
+ }
+
+ /* NOTREACHED*/
+}
+
+
+/*
+ * Start a kernel process. This is called after a fork() call in
+ * main() in the file kern/init_main.c.
+ *
+ * This function is used to start "internal" daemons.
+ */
+/* ARGSUSED*/
+void
+kproc_start(udata)
+ void *udata;
+{
+ struct kproc_desc *kp = udata;
+ struct proc *p = curproc;
+
+ /* save a global descriptor, if desired*/
+ if( kp->global_procpp != NULL)
+ *kp->global_procpp = p;
+
+ /* this is a non-swapped system process*/
+ p->p_flag |= P_INMEM | P_SYSTEM;
- vm_mem_init();
- kmeminit();
- cpu_startup();
+ /* set up arg0 for 'ps', et al*/
+ strcpy( p->p_comm, kp->arg0);
+
+	/* call the process's main()... */
+ (*kp->func)();
+
+ /* NOTREACHED */
+ panic("kproc_start: %s", kp->arg0);
+}
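+
+/*
+ * Hypothetical example of the pattern above: a subsystem declares a
+ * kproc_desc and lets kproc_start() launch its daemon at boot.  The
+ * field order follows the member usage in kproc_start(); the names
+ * "exampled"/example_main are made up for illustration.
+ */
+#ifdef notdef
+static void example_main __P((void));
+static struct proc *exampleproc;
+static struct kproc_desc example_kp = {
+	"exampled",		/* arg0: name shown by ps(1) */
+	example_main,		/* func: the daemon's main loop */
+	&exampleproc		/* global_procpp: saved proc pointer */
+};
+SYSINIT_KT(exampled, SI_SUB_KTHREAD_INIT, SI_ORDER_ANY, kproc_start, &example_kp)
+#endif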
+
+
+/*
+ ***************************************************************************
+ ****
+ **** The following SYSINIT's belong elsewhere, but have not yet
+ **** been moved.
+ ****
+ ***************************************************************************
+ */
+#ifdef OMIT
+/*
+ * Handled by vfs_mountroot (bad idea) at this time... should be
+ * done the same as 4.4Lite2.
+ */
+SYSINIT(swapinit, SI_SUB_SWAP, SI_ORDER_FIRST, swapinit, NULL)
+#endif /* OMIT*/
+
+static void print_caddr_t __P((void *data));
+static void
+print_caddr_t(data)
+ void *data;
+{
+ printf("%s", (char *)data);
+}
+SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright)
+
+
+/*
+ ***************************************************************************
+ ****
+ **** The two following SYSINIT's are proc0 specific glue code. I am not
+ **** convinced that they cannot be safely combined, but their order of
+ **** operation has been maintained the same as in the original init_main.c
+ **** for right now.
+ ****
+ **** These probably belong in init_proc.c or kern_proc.c, since they
+ **** deal with proc0 (the fork template process).
+ ****
+ ***************************************************************************
+ */
+/* ARGSUSED*/
+static void proc0_init __P((void *dummy));
+static void
+proc0_init(dummy)
+ void *dummy;
+{
+ register struct proc *p;
+ register struct filedesc0 *fdp;
+ register unsigned i;
+
+ /*
+ * Initialize the current process pointer (curproc) before
+ * any possible traps/probes to simplify trap processing.
+ */
+ p = &proc0;
+ curproc = p; /* XXX redundant*/
/*
* Initialize process and pgrp structures.
@@ -136,6 +309,11 @@ main(framep)
procinit();
/*
+ * Initialize sleep queue hash table
+ */
+ sleepinit();
+
+ /*
* Create process 0 (the swapper).
*/
LIST_INSERT_HEAD(&allproc, p, p_list);
@@ -148,9 +326,14 @@ main(framep)
session0.s_count = 1;
session0.s_leader = p;
+ p->p_sysent = &aout_sysvec;
+
p->p_flag = P_INMEM | P_SYSTEM;
p->p_stat = SRUN;
p->p_nice = NZERO;
+ p->p_rtprio.type = RTP_PRIO_NORMAL;
+ p->p_rtprio.prio = 0;
+
bcopy("swapper", p->p_comm, sizeof ("swapper"));
/* Create credentials. */
@@ -173,8 +356,10 @@ main(framep)
for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++)
limit0.pl_rlimit[i].rlim_cur =
limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY;
- limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = NOFILE;
- limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = MAXUPRC;
+ limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur =
+ limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles;
+ limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur =
+ limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc;
i = ptoa(cnt.v_free_count);
limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i;
limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i;
@@ -185,11 +370,22 @@ main(framep)
p->p_vmspace = &vmspace0;
vmspace0.vm_refcnt = 1;
pmap_pinit(&vmspace0.vm_pmap);
- vm_map_init(&p->p_vmspace->vm_map, round_page(VM_MIN_ADDRESS),
- trunc_page(VM_MAX_ADDRESS), TRUE);
+ vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS),
+ trunc_page(VM_MAXUSER_ADDRESS), TRUE);
vmspace0.vm_map.pmap = &vmspace0.vm_pmap;
p->p_addr = proc0paddr; /* XXX */
+#define INCOMPAT_LITES2
+#ifdef INCOMPAT_LITES2
+ /*
+ * proc0 needs to have a coherent frame base, too.
+ * This probably makes the identical call for the init proc
+ * that happens later unnecessary since it should inherit
+ * it during the fork.
+ */
+ cpu_set_init_frame(p, init_framep); /* XXX! */
+#endif /* INCOMPAT_LITES2*/
+
/*
* We continue to place resource usage info and signal
* actions in the user struct so they're pageable.
@@ -201,104 +397,127 @@ main(framep)
* Charge root for one process.
*/
(void)chgproccnt(0, 1);
+}
+SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL)
- rqinit();
-
- /* Configure virtual memory system, set vm rlimits. */
- vm_init_limits(p);
-
- /* Initialize the file systems. */
- vfsinit();
+/* ARGSUSED*/
+static void proc0_post __P((void *dummy));
+static void
+proc0_post(dummy)
+ void *dummy;
+{
+ struct timeval tv;
- /* Start real time and statistics clocks. */
- initclocks();
+ /*
+ * Now can look at time, having had a chance to verify the time
+ * from the file system. Reset p->p_rtime as it may have been
+ * munched in mi_switch() after the time got set.
+ */
+ gettime(&boottime);
+ proc0.p_stats->p_start = runtime = mono_time = boottime;
+ proc0.p_rtime.tv_sec = proc0.p_rtime.tv_usec = 0;
- /* Initialize mbuf's. */
- mbinit();
+ /*
+ * Give the ``random'' number generator a thump.
+ */
+ microtime(&tv);
+ srandom(tv.tv_sec ^ tv.tv_usec);
- /* Initialize clists. */
- clist_init();
+ /* Initialize signal state for process 0. */
+ siginit(&proc0);
+}
+SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL)
-#ifdef SYSVSHM
- /* Initialize System V style shared memory. */
- shminit();
-#endif
- /* Attach pseudo-devices. */
- for (pdev = pdevinit; pdev->pdev_attach != NULL; pdev++)
- (*pdev->pdev_attach)(pdev->pdev_count);
- /*
- * Initialize protocols. Block reception of incoming packets
- * until everything is ready.
- */
- s = splimp();
- ifinit();
- domaininit();
- splx(s);
-
-#ifdef GPROF
- /* Initialize kernel profiling. */
- kmstartup();
-#endif
+/*
+ ***************************************************************************
+ ****
+ **** The following SYSINIT's and glue code should be moved to the
+ **** respective files on a per subsystem basis.
+ ****
+ ***************************************************************************
+ */
+/* ARGSUSED*/
+static void sched_setup __P((void *dummy));
+static void
+sched_setup(dummy)
+ void *dummy;
+{
/* Kick off timeout driven events by calling first time. */
roundrobin(NULL);
schedcpu(NULL);
+}
+SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL)
+/* ARGSUSED*/
+static void xxx_vfs_mountroot __P((void *fsnamep));
+static void
+xxx_vfs_mountroot(fsnamep)
+ void *fsnamep;
+{
/* Mount the root file system. */
- if (vfs_mountroot())
+ if (vfs_mountrootfs(*((char **) fsnamep)))
panic("cannot mount root");
- mountlist.cqh_first->mnt_flag |= MNT_ROOTFS;
+}
+SYSINIT(mountroot, SI_SUB_ROOT, SI_ORDER_FIRST, xxx_vfs_mountroot, &mountrootfsname)
+
+/* ARGSUSED*/
+static void xxx_vfs_root_fdtab __P((void *dummy));
+static void
+xxx_vfs_root_fdtab(dummy)
+ void *dummy;
+{
+ register struct filedesc0 *fdp = &filedesc0;
/* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. */
if (VFS_ROOT(mountlist.cqh_first, &rootvnode))
panic("cannot find root vnode");
fdp->fd_fd.fd_cdir = rootvnode;
VREF(fdp->fd_fd.fd_cdir);
- VOP_UNLOCK(rootvnode, 0, p);
+ VOP_UNLOCK(rootvnode, 0, &proc0);
fdp->fd_fd.fd_rdir = NULL;
- swapinit();
+}
+SYSINIT(retrofit, SI_SUB_ROOT_FDTAB, SI_ORDER_FIRST, xxx_vfs_root_fdtab, NULL)
- /*
- * Now can look at time, having had a chance to verify the time
- * from the file system. Reset p->p_rtime as it may have been
- * munched in mi_switch() after the time got set.
- */
- p->p_stats->p_start = runtime = mono_time = boottime = time;
- p->p_rtime.tv_sec = p->p_rtime.tv_usec = 0;
- /* Initialize signal state for process 0. */
- siginit(p);
+/*
+ ***************************************************************************
+ ****
+ **** The following code probably belongs in another file, like
+ **** kern/init_init.c. It is here for two reasons only:
+ ****
+ **** 1) This code returns to startup the system; this is
+ **** abnormal for a kernel thread.
+ **** 2) This code promiscuously uses init_frame
+ ****
+ ***************************************************************************
+ */
- /* Create process 1 (init(8)). */
- if (fork(p, NULL, rval))
- panic("fork init");
- if (rval[1]) {
- start_init(curproc, framep);
- return;
- }
+static void kthread_init __P((void *dummy));
+SYSINIT_KT(init,SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kthread_init, NULL)
- /* Create process 2 (the pageout daemon). */
- if (fork(p, NULL, rval))
- panic("fork pager");
- if (rval[1]) {
- /*
- * Now in process 2.
- */
- p = curproc;
- pageproc = p;
- p->p_flag |= P_INMEM | P_SYSTEM; /* XXX */
- bcopy("pagedaemon", curproc->p_comm, sizeof ("pagedaemon"));
- vm_pageout();
- /* NOTREACHED */
- }
- /* The scheduler is an infinite loop. */
- scheduler();
- /* NOTREACHED */
+static void start_init __P((struct proc *p, void *framep));
+
+/* ARGSUSED*/
+static void
+kthread_init(dummy)
+ void *dummy;
+{
+
+ /* Create process 1 (init(8)). */
+ start_init(curproc, init_framep);
+
+ /*
+	 * This is the only kernel thread allowed to return to the
+ * caller!!!
+ */
+ return;
}
+
/*
* List of paths to try when searching for "init".
*/
@@ -306,6 +525,7 @@ static char *initpaths[] = {
"/sbin/init",
"/sbin/oinit",
"/sbin/init.bak",
+ "/stand/sysinstall",
NULL,
};
@@ -319,14 +539,8 @@ start_init(p, framep)
void *framep;
{
vm_offset_t addr;
- struct execve_args /* {
- syscallarg(char *) path;
- syscallarg(char **) argp;
- syscallarg(char **) envp;
- } */ args;
- int options, i, error;
- register_t retval[2];
- char flags[4] = "-", *flagsp;
+ struct execve_args args;
+ int options, i, retval[2], error;
char **pathp, *path, *ucp, **uap, *arg0, *arg1;
initproc = p;
@@ -343,66 +557,74 @@ start_init(p, framep)
/*
* Need just enough stack to hold the faked-up "execve()" arguments.
*/
- addr = trunc_page(VM_MAX_ADDRESS - PAGE_SIZE);
- if (vm_allocate(&p->p_vmspace->vm_map, &addr, PAGE_SIZE, FALSE) != 0)
+ addr = trunc_page(VM_MAXUSER_ADDRESS - PAGE_SIZE);
+ if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0)
panic("init: couldn't allocate argument space");
p->p_vmspace->vm_maxsaddr = (caddr_t)addr;
+ p->p_vmspace->vm_ssize = 1;
for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) {
/*
- * Construct the boot flag argument.
+ * Move out the boot flag argument.
*/
options = 0;
- flagsp = flags + 1;
ucp = (char *)USRSTACK;
+ (void)subyte(--ucp, 0); /* trailing zero */
if (boothowto & RB_SINGLE) {
- *flagsp++ = 's';
+ (void)subyte(--ucp, 's');
options = 1;
}
#ifdef notyet
if (boothowto & RB_FASTBOOT) {
- *flagsp++ = 'f';
+ (void)subyte(--ucp, 'f');
options = 1;
}
#endif
- /*
- * Move out the flags (arg 1), if necessary.
- */
- if (options != 0) {
- *flagsp++ = '\0';
- i = flagsp - flags;
- (void)copyout((caddr_t)flags, (caddr_t)(ucp -= i), i);
- arg1 = ucp;
- }
+
+#ifdef BOOTCDROM
+ (void)subyte(--ucp, 'C');
+ options = 1;
+#endif
+
+#if defined(DEVFS) && defined(DEVFS_ROOT)
+ (void)subyte(--ucp, 'd');
+ options = 1;
+#endif
+ if (options == 0)
+ (void)subyte(--ucp, '-');
+ (void)subyte(--ucp, '-'); /* leading hyphen */
+ arg1 = ucp;
/*
* Move out the file name (also arg 0).
*/
- i = strlen(path) + 1;
- (void)copyout((caddr_t)path, (caddr_t)(ucp -= i), i);
+ for (i = strlen(path) + 1; i >= 0; i--)
+ (void)subyte(--ucp, path[i]);
arg0 = ucp;
/*
* Move out the arg pointers.
*/
- uap = (char **)((long)ucp & ~ALIGNBYTES);
+ uap = (char **)((int)ucp & ~(NBPW-1));
(void)suword((caddr_t)--uap, 0); /* terminator */
- if (options != 0)
- (void)suword((caddr_t)--uap, (long)arg1);
- (void)suword((caddr_t)--uap, (long)arg0);
+ (void)suword((caddr_t)--uap, (int)arg1);
+ (void)suword((caddr_t)--uap, (int)arg0);
/*
* Point at the arguments.
*/
- SCARG(&args, path) = arg0;
- SCARG(&args, argp) = uap;
- SCARG(&args, envp) = NULL;
+ args.fname = arg0;
+ args.argv = uap;
+ args.envv = NULL;
/*
* Now try to exec the program. If can't for any reason
* other than it doesn't exist, complain.
+ *
+ * Otherwise return to main() which returns to btext
+ * which completes the system startup.
*/
- if ((error = execve(p, &args, retval)) == 0)
+ if ((error = execve(p, &args, &retval[0])) == 0)
return;
if (error != ENOENT)
printf("exec %s: error %d\n", path, error);
diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c
index 0bbdd20..6954a04 100644
--- a/sys/kern/init_sysent.c
+++ b/sys/kern/init_sysent.c
@@ -2,766 +2,286 @@
* System call switch table.
*
* DO NOT EDIT-- this file is automatically generated.
- * created from @(#)syscalls.master 8.6 (Berkeley) 3/30/95
+ * created from Id: syscalls.master,v 1.33 1997/02/22 09:39:21 peter Exp
*/
#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/signal.h>
-#include <sys/mount.h>
-#include <sys/syscallargs.h>
-int nosys();
-int exit();
-int fork();
-int read();
-int write();
-int open();
-int close();
-int wait4();
-int link();
-int unlink();
-int chdir();
-int fchdir();
-int mknod();
-int chmod();
-int chown();
-int obreak();
-int getfsstat();
-int getpid();
-int mount();
-int unmount();
-int setuid();
-int getuid();
-int geteuid();
-int ptrace();
-int recvmsg();
-int sendmsg();
-int recvfrom();
-int accept();
-int getpeername();
-int getsockname();
-int access();
-int chflags();
-int fchflags();
-int sync();
-int kill();
-int getppid();
-int dup();
-int pipe();
-int getegid();
-int profil();
-#ifdef KTRACE
-int ktrace();
-#else
-#endif
-int sigaction();
-int getgid();
-int sigprocmask();
-int getlogin();
-int setlogin();
-int acct();
-int sigpending();
-int sigaltstack();
-int ioctl();
-int reboot();
-int revoke();
-int symlink();
-int readlink();
-int execve();
-int umask();
-int chroot();
-int msync();
-int vfork();
-int sbrk();
-int sstk();
-int ovadvise();
-int munmap();
-int mprotect();
-int madvise();
-int mincore();
-int getgroups();
-int setgroups();
-int getpgrp();
-int setpgid();
-int setitimer();
-int swapon();
-int getitimer();
-int getdtablesize();
-int dup2();
-int fcntl();
-int select();
-int fsync();
-int setpriority();
-int socket();
-int connect();
-int getpriority();
-int sigreturn();
-int bind();
-int setsockopt();
-int listen();
-int sigsuspend();
-#ifdef TRACE
-int vtrace();
-#else
-#endif
-int gettimeofday();
-int getrusage();
-int getsockopt();
-#ifdef vax
-int resuba();
-#else
-#endif
-int readv();
-int writev();
-int settimeofday();
-int fchown();
-int fchmod();
-int rename();
-int flock();
-int mkfifo();
-int sendto();
-int shutdown();
-int socketpair();
-int mkdir();
-int rmdir();
-int utimes();
-int adjtime();
-int setsid();
-int quotactl();
-#ifdef NFS
-int nfssvc();
-#else
-#endif
-int statfs();
-int fstatfs();
-#ifdef NFS
-int getfh();
-#else
-#endif
-#if defined(SYSVSHM) && !defined(alpha)
-#else
-#endif
-int setgid();
-int setegid();
-int seteuid();
-#ifdef LFS
-int lfs_bmapv();
-int lfs_markv();
-int lfs_segclean();
-int lfs_segwait();
-#else
-#endif
-int stat();
-int fstat();
-int lstat();
-int pathconf();
-int fpathconf();
-int getrlimit();
-int setrlimit();
-int getdirentries();
-int mmap();
-int nosys();
-int lseek();
-int truncate();
-int ftruncate();
-int __sysctl();
-int mlock();
-int munlock();
-int undelete();
-#if defined(SYSVSHM) && 0
-int shmat();
-int shmctl();
-int shmdt();
-int shmget();
-#else
-#endif
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
#ifdef COMPAT_43
-#define compat_43(func) __CONCAT(compat_43_,func)
-
-int compat_43(creat)();
-int compat_43(lseek)();
-int compat_43(stat)();
-int compat_43(lstat)();
-#ifdef KTRACE
+#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)
#else
+#define compat(n, name) 0, (sy_call_t *)nosys
#endif
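+
+/*
+ * e.g. with COMPAT_43 defined, "{ compat(2,creat) }" below expands to
+ * "{ 2, (sy_call_t *)ocreat }"; without it, to a zero-argument nosys
+ * slot.
+ */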
-int compat_43(fstat)();
-int compat_43(getkerninfo)();
-int compat_43(getpagesize)();
-int compat_43(mmap)();
-int compat_43(wait)();
-int compat_43(gethostname)();
-int compat_43(sethostname)();
-int compat_43(accept)();
-int compat_43(send)();
-int compat_43(recv)();
-int compat_43(sigvec)();
-int compat_43(sigblock)();
-int compat_43(sigsetmask)();
-int compat_43(sigstack)();
-int compat_43(recvmsg)();
-int compat_43(sendmsg)();
-#ifdef TRACE
-#else
-#endif
-#ifdef vax
-#else
-#endif
-int compat_43(recvfrom)();
-int compat_43(setreuid)();
-int compat_43(setregid)();
-int compat_43(truncate)();
-int compat_43(ftruncate)();
-int compat_43(getpeername)();
-int compat_43(gethostid)();
-int compat_43(sethostid)();
-int compat_43(getrlimit)();
-int compat_43(setrlimit)();
-int compat_43(killpg)();
-int compat_43(quota)();
-int compat_43(getsockname)();
-#ifdef NFS
-#else
-#endif
-int compat_43(getdirentries)();
-#ifdef NFS
-#else
-#endif
-#if defined(SYSVSHM) && !defined(alpha)
-int compat_43(shmsys)();
-#else
-#endif
-#ifdef LFS
-#else
-#endif
-#if defined(SYSVSHM) && 0
-#else
-#endif
-
-#else /* COMPAT_43 */
-#define compat_43(func) nosys
-#endif /* COMPAT_43 */
-
-#define s(type) sizeof(type)
+/* The casts are bogus but will do for now. */
struct sysent sysent[] = {
- { 0, 0,
- nosys }, /* 0 = syscall */
- { 1, s(struct exit_args),
- exit }, /* 1 = exit */
- { 0, 0,
- fork }, /* 2 = fork */
- { 3, s(struct read_args),
- read }, /* 3 = read */
- { 3, s(struct write_args),
- write }, /* 4 = write */
- { 3, s(struct open_args),
- open }, /* 5 = open */
- { 1, s(struct close_args),
- close }, /* 6 = close */
- { 4, s(struct wait4_args),
- wait4 }, /* 7 = wait4 */
- { 2, s(struct compat_43_creat_args),
- compat_43(creat) }, /* 8 = compat_43 creat */
- { 2, s(struct link_args),
- link }, /* 9 = link */
- { 1, s(struct unlink_args),
- unlink }, /* 10 = unlink */
- { 0, 0,
- nosys }, /* 11 = obsolete execv */
- { 1, s(struct chdir_args),
- chdir }, /* 12 = chdir */
- { 1, s(struct fchdir_args),
- fchdir }, /* 13 = fchdir */
- { 3, s(struct mknod_args),
- mknod }, /* 14 = mknod */
- { 2, s(struct chmod_args),
- chmod }, /* 15 = chmod */
- { 3, s(struct chown_args),
- chown }, /* 16 = chown */
- { 1, s(struct obreak_args),
- obreak }, /* 17 = break */
- { 3, s(struct getfsstat_args),
- getfsstat }, /* 18 = getfsstat */
- { 3, s(struct compat_43_lseek_args),
- compat_43(lseek) }, /* 19 = compat_43 lseek */
- { 0, 0,
- getpid }, /* 20 = getpid */
- { 4, s(struct mount_args),
- mount }, /* 21 = mount */
- { 2, s(struct unmount_args),
- unmount }, /* 22 = unmount */
- { 1, s(struct setuid_args),
- setuid }, /* 23 = setuid */
- { 0, 0,
- getuid }, /* 24 = getuid */
- { 0, 0,
- geteuid }, /* 25 = geteuid */
- { 4, s(struct ptrace_args),
- ptrace }, /* 26 = ptrace */
- { 3, s(struct recvmsg_args),
- recvmsg }, /* 27 = recvmsg */
- { 3, s(struct sendmsg_args),
- sendmsg }, /* 28 = sendmsg */
- { 6, s(struct recvfrom_args),
- recvfrom }, /* 29 = recvfrom */
- { 3, s(struct accept_args),
- accept }, /* 30 = accept */
- { 3, s(struct getpeername_args),
- getpeername }, /* 31 = getpeername */
- { 3, s(struct getsockname_args),
- getsockname }, /* 32 = getsockname */
- { 2, s(struct access_args),
- access }, /* 33 = access */
- { 2, s(struct chflags_args),
- chflags }, /* 34 = chflags */
- { 2, s(struct fchflags_args),
- fchflags }, /* 35 = fchflags */
- { 0, 0,
- sync }, /* 36 = sync */
- { 2, s(struct kill_args),
- kill }, /* 37 = kill */
- { 2, s(struct compat_43_stat_args),
- compat_43(stat) }, /* 38 = compat_43 stat */
- { 0, 0,
- getppid }, /* 39 = getppid */
- { 2, s(struct compat_43_lstat_args),
- compat_43(lstat) }, /* 40 = compat_43 lstat */
- { 1, s(struct dup_args),
- dup }, /* 41 = dup */
- { 0, 0,
- pipe }, /* 42 = pipe */
- { 0, 0,
- getegid }, /* 43 = getegid */
- { 4, s(struct profil_args),
- profil }, /* 44 = profil */
-#ifdef KTRACE
- { 4, s(struct ktrace_args),
- ktrace }, /* 45 = ktrace */
-#else
- { 0, 0,
- nosys }, /* 45 = unimplemented ktrace */
-#endif
- { 3, s(struct sigaction_args),
- sigaction }, /* 46 = sigaction */
- { 0, 0,
- getgid }, /* 47 = getgid */
- { 2, s(struct sigprocmask_args),
- sigprocmask }, /* 48 = sigprocmask */
- { 2, s(struct getlogin_args),
- getlogin }, /* 49 = getlogin */
- { 1, s(struct setlogin_args),
- setlogin }, /* 50 = setlogin */
- { 1, s(struct acct_args),
- acct }, /* 51 = acct */
- { 0, 0,
- sigpending }, /* 52 = sigpending */
- { 2, s(struct sigaltstack_args),
- sigaltstack }, /* 53 = sigaltstack */
- { 3, s(struct ioctl_args),
- ioctl }, /* 54 = ioctl */
- { 1, s(struct reboot_args),
- reboot }, /* 55 = reboot */
- { 1, s(struct revoke_args),
- revoke }, /* 56 = revoke */
- { 2, s(struct symlink_args),
- symlink }, /* 57 = symlink */
- { 3, s(struct readlink_args),
- readlink }, /* 58 = readlink */
- { 3, s(struct execve_args),
- execve }, /* 59 = execve */
- { 1, s(struct umask_args),
- umask }, /* 60 = umask */
- { 1, s(struct chroot_args),
- chroot }, /* 61 = chroot */
- { 2, s(struct compat_43_fstat_args),
- compat_43(fstat) }, /* 62 = compat_43 fstat */
- { 4, s(struct compat_43_getkerninfo_args),
- compat_43(getkerninfo) }, /* 63 = compat_43 getkerninfo */
- { 0, 0,
- compat_43(getpagesize) }, /* 64 = compat_43 getpagesize */
- { 2, s(struct msync_args),
- msync }, /* 65 = msync */
- { 0, 0,
- vfork }, /* 66 = vfork */
- { 0, 0,
- nosys }, /* 67 = obsolete vread */
- { 0, 0,
- nosys }, /* 68 = obsolete vwrite */
- { 1, s(struct sbrk_args),
- sbrk }, /* 69 = sbrk */
- { 1, s(struct sstk_args),
- sstk }, /* 70 = sstk */
- { 6, s(struct compat_43_mmap_args),
- compat_43(mmap) }, /* 71 = compat_43 mmap */
- { 1, s(struct ovadvise_args),
- ovadvise }, /* 72 = vadvise */
- { 2, s(struct munmap_args),
- munmap }, /* 73 = munmap */
- { 3, s(struct mprotect_args),
- mprotect }, /* 74 = mprotect */
- { 3, s(struct madvise_args),
- madvise }, /* 75 = madvise */
- { 0, 0,
- nosys }, /* 76 = obsolete vhangup */
- { 0, 0,
- nosys }, /* 77 = obsolete vlimit */
- { 3, s(struct mincore_args),
- mincore }, /* 78 = mincore */
- { 2, s(struct getgroups_args),
- getgroups }, /* 79 = getgroups */
- { 2, s(struct setgroups_args),
- setgroups }, /* 80 = setgroups */
- { 0, 0,
- getpgrp }, /* 81 = getpgrp */
- { 2, s(struct setpgid_args),
- setpgid }, /* 82 = setpgid */
- { 3, s(struct setitimer_args),
- setitimer }, /* 83 = setitimer */
- { 0, 0,
- compat_43(wait) }, /* 84 = compat_43 wait */
- { 1, s(struct swapon_args),
- swapon }, /* 85 = swapon */
- { 2, s(struct getitimer_args),
- getitimer }, /* 86 = getitimer */
- { 2, s(struct compat_43_gethostname_args),
- compat_43(gethostname) }, /* 87 = compat_43 gethostname */
- { 2, s(struct compat_43_sethostname_args),
- compat_43(sethostname) }, /* 88 = compat_43 sethostname */
- { 0, 0,
- getdtablesize }, /* 89 = getdtablesize */
- { 2, s(struct dup2_args),
- dup2 }, /* 90 = dup2 */
- { 0, 0,
- nosys }, /* 91 = unimplemented getdopt */
- { 3, s(struct fcntl_args),
- fcntl }, /* 92 = fcntl */
- { 5, s(struct select_args),
- select }, /* 93 = select */
- { 0, 0,
- nosys }, /* 94 = unimplemented setdopt */
- { 1, s(struct fsync_args),
- fsync }, /* 95 = fsync */
- { 3, s(struct setpriority_args),
- setpriority }, /* 96 = setpriority */
- { 3, s(struct socket_args),
- socket }, /* 97 = socket */
- { 3, s(struct connect_args),
- connect }, /* 98 = connect */
- { 3, s(struct compat_43_accept_args),
- compat_43(accept) }, /* 99 = compat_43 accept */
- { 2, s(struct getpriority_args),
- getpriority }, /* 100 = getpriority */
- { 4, s(struct compat_43_send_args),
- compat_43(send) }, /* 101 = compat_43 send */
- { 4, s(struct compat_43_recv_args),
- compat_43(recv) }, /* 102 = compat_43 recv */
- { 1, s(struct sigreturn_args),
- sigreturn }, /* 103 = sigreturn */
- { 3, s(struct bind_args),
- bind }, /* 104 = bind */
- { 5, s(struct setsockopt_args),
- setsockopt }, /* 105 = setsockopt */
- { 2, s(struct listen_args),
- listen }, /* 106 = listen */
- { 0, 0,
- nosys }, /* 107 = obsolete vtimes */
- { 3, s(struct compat_43_sigvec_args),
- compat_43(sigvec) }, /* 108 = compat_43 sigvec */
- { 1, s(struct compat_43_sigblock_args),
- compat_43(sigblock) }, /* 109 = compat_43 sigblock */
- { 1, s(struct compat_43_sigsetmask_args),
- compat_43(sigsetmask) }, /* 110 = compat_43 sigsetmask */
- { 1, s(struct sigsuspend_args),
- sigsuspend }, /* 111 = sigsuspend */
- { 2, s(struct compat_43_sigstack_args),
- compat_43(sigstack) }, /* 112 = compat_43 sigstack */
- { 3, s(struct compat_43_recvmsg_args),
- compat_43(recvmsg) }, /* 113 = compat_43 recvmsg */
- { 3, s(struct compat_43_sendmsg_args),
- compat_43(sendmsg) }, /* 114 = compat_43 sendmsg */
-#ifdef TRACE
- { 2, s(struct vtrace_args),
- vtrace }, /* 115 = vtrace */
-#else
- { 0, 0,
- nosys }, /* 115 = obsolete vtrace */
-#endif
- { 2, s(struct gettimeofday_args),
- gettimeofday }, /* 116 = gettimeofday */
- { 2, s(struct getrusage_args),
- getrusage }, /* 117 = getrusage */
- { 5, s(struct getsockopt_args),
- getsockopt }, /* 118 = getsockopt */
-#ifdef vax
- { 1, s(struct resuba_args),
- resuba }, /* 119 = resuba */
-#else
- { 0, 0,
- nosys }, /* 119 = unimplemented resuba */
-#endif
- { 3, s(struct readv_args),
- readv }, /* 120 = readv */
- { 3, s(struct writev_args),
- writev }, /* 121 = writev */
- { 2, s(struct settimeofday_args),
- settimeofday }, /* 122 = settimeofday */
- { 3, s(struct fchown_args),
- fchown }, /* 123 = fchown */
- { 2, s(struct fchmod_args),
- fchmod }, /* 124 = fchmod */
- { 6, s(struct compat_43_recvfrom_args),
- compat_43(recvfrom) }, /* 125 = compat_43 recvfrom */
- { 2, s(struct compat_43_setreuid_args),
- compat_43(setreuid) }, /* 126 = compat_43 setreuid */
- { 2, s(struct compat_43_setregid_args),
- compat_43(setregid) }, /* 127 = compat_43 setregid */
- { 2, s(struct rename_args),
- rename }, /* 128 = rename */
- { 2, s(struct compat_43_truncate_args),
- compat_43(truncate) }, /* 129 = compat_43 truncate */
- { 2, s(struct compat_43_ftruncate_args),
- compat_43(ftruncate) }, /* 130 = compat_43 ftruncate */
- { 2, s(struct flock_args),
- flock }, /* 131 = flock */
- { 2, s(struct mkfifo_args),
- mkfifo }, /* 132 = mkfifo */
- { 6, s(struct sendto_args),
- sendto }, /* 133 = sendto */
- { 2, s(struct shutdown_args),
- shutdown }, /* 134 = shutdown */
- { 4, s(struct socketpair_args),
- socketpair }, /* 135 = socketpair */
- { 2, s(struct mkdir_args),
- mkdir }, /* 136 = mkdir */
- { 1, s(struct rmdir_args),
- rmdir }, /* 137 = rmdir */
- { 2, s(struct utimes_args),
- utimes }, /* 138 = utimes */
- { 0, 0,
- nosys }, /* 139 = obsolete 4.2 sigreturn */
- { 2, s(struct adjtime_args),
- adjtime }, /* 140 = adjtime */
- { 3, s(struct compat_43_getpeername_args),
- compat_43(getpeername) }, /* 141 = compat_43 getpeername */
- { 0, 0,
- compat_43(gethostid) }, /* 142 = compat_43 gethostid */
- { 1, s(struct compat_43_sethostid_args),
- compat_43(sethostid) }, /* 143 = compat_43 sethostid */
- { 2, s(struct compat_43_getrlimit_args),
- compat_43(getrlimit) }, /* 144 = compat_43 getrlimit */
- { 2, s(struct compat_43_setrlimit_args),
- compat_43(setrlimit) }, /* 145 = compat_43 setrlimit */
- { 2, s(struct compat_43_killpg_args),
- compat_43(killpg) }, /* 146 = compat_43 killpg */
- { 0, 0,
- setsid }, /* 147 = setsid */
- { 4, s(struct quotactl_args),
- quotactl }, /* 148 = quotactl */
- { 0, 0,
- compat_43(quota) }, /* 149 = compat_43 quota */
- { 3, s(struct compat_43_getsockname_args),
- compat_43(getsockname) }, /* 150 = compat_43 getsockname */
- { 0, 0,
- nosys }, /* 151 = unimplemented */
- { 0, 0,
- nosys }, /* 152 = unimplemented */
- { 0, 0,
- nosys }, /* 153 = unimplemented */
- { 0, 0,
- nosys }, /* 154 = unimplemented */
+ { 0, (sy_call_t *)nosys }, /* 0 = syscall */
+ { 1, (sy_call_t *)exit }, /* 1 = exit */
+ { 0, (sy_call_t *)fork }, /* 2 = fork */
+ { 3, (sy_call_t *)read }, /* 3 = read */
+ { 3, (sy_call_t *)write }, /* 4 = write */
+ { 3, (sy_call_t *)open }, /* 5 = open */
+ { 1, (sy_call_t *)close }, /* 6 = close */
+ { 4, (sy_call_t *)wait4 }, /* 7 = wait4 */
+ { compat(2,creat) }, /* 8 = old creat */
+ { 2, (sy_call_t *)link }, /* 9 = link */
+ { 1, (sy_call_t *)unlink }, /* 10 = unlink */
+ { 0, (sy_call_t *)nosys }, /* 11 = obsolete execv */
+ { 1, (sy_call_t *)chdir }, /* 12 = chdir */
+ { 1, (sy_call_t *)fchdir }, /* 13 = fchdir */
+ { 3, (sy_call_t *)mknod }, /* 14 = mknod */
+ { 2, (sy_call_t *)chmod }, /* 15 = chmod */
+ { 3, (sy_call_t *)chown }, /* 16 = chown */
+ { 1, (sy_call_t *)obreak }, /* 17 = break */
+ { 3, (sy_call_t *)getfsstat }, /* 18 = getfsstat */
+ { compat(3,lseek) }, /* 19 = old lseek */
+ { 0, (sy_call_t *)getpid }, /* 20 = getpid */
+ { 4, (sy_call_t *)mount }, /* 21 = mount */
+ { 2, (sy_call_t *)unmount }, /* 22 = unmount */
+ { 1, (sy_call_t *)setuid }, /* 23 = setuid */
+ { 0, (sy_call_t *)getuid }, /* 24 = getuid */
+ { 0, (sy_call_t *)geteuid }, /* 25 = geteuid */
+ { 4, (sy_call_t *)ptrace }, /* 26 = ptrace */
+ { 3, (sy_call_t *)recvmsg }, /* 27 = recvmsg */
+ { 3, (sy_call_t *)sendmsg }, /* 28 = sendmsg */
+ { 6, (sy_call_t *)recvfrom }, /* 29 = recvfrom */
+ { 3, (sy_call_t *)accept }, /* 30 = accept */
+ { 3, (sy_call_t *)getpeername }, /* 31 = getpeername */
+ { 3, (sy_call_t *)getsockname }, /* 32 = getsockname */
+ { 2, (sy_call_t *)access }, /* 33 = access */
+ { 2, (sy_call_t *)chflags }, /* 34 = chflags */
+ { 2, (sy_call_t *)fchflags }, /* 35 = fchflags */
+ { 0, (sy_call_t *)sync }, /* 36 = sync */
+ { 2, (sy_call_t *)kill }, /* 37 = kill */
+ { compat(2,stat) }, /* 38 = old stat */
+ { 0, (sy_call_t *)getppid }, /* 39 = getppid */
+ { compat(2,lstat) }, /* 40 = old lstat */
+ { 1, (sy_call_t *)dup }, /* 41 = dup */
+ { 0, (sy_call_t *)pipe }, /* 42 = pipe */
+ { 0, (sy_call_t *)getegid }, /* 43 = getegid */
+ { 4, (sy_call_t *)profil }, /* 44 = profil */
+ { 4, (sy_call_t *)ktrace }, /* 45 = ktrace */
+ { 3, (sy_call_t *)sigaction }, /* 46 = sigaction */
+ { 0, (sy_call_t *)getgid }, /* 47 = getgid */
+ { 2, (sy_call_t *)sigprocmask }, /* 48 = sigprocmask */
+ { 2, (sy_call_t *)getlogin }, /* 49 = getlogin */
+ { 1, (sy_call_t *)setlogin }, /* 50 = setlogin */
+ { 1, (sy_call_t *)acct }, /* 51 = acct */
+ { 0, (sy_call_t *)sigpending }, /* 52 = sigpending */
+ { 2, (sy_call_t *)sigaltstack }, /* 53 = sigaltstack */
+ { 3, (sy_call_t *)ioctl }, /* 54 = ioctl */
+ { 1, (sy_call_t *)reboot }, /* 55 = reboot */
+ { 1, (sy_call_t *)revoke }, /* 56 = revoke */
+ { 2, (sy_call_t *)symlink }, /* 57 = symlink */
+ { 3, (sy_call_t *)readlink }, /* 58 = readlink */
+ { 3, (sy_call_t *)execve }, /* 59 = execve */
+ { 1, (sy_call_t *)umask }, /* 60 = umask */
+ { 1, (sy_call_t *)chroot }, /* 61 = chroot */
+ { compat(2,fstat) }, /* 62 = old fstat */
+ { compat(4,getkerninfo) }, /* 63 = old getkerninfo */
+ { compat(0,getpagesize) }, /* 64 = old getpagesize */
+ { 3, (sy_call_t *)msync }, /* 65 = msync */
+ { 0, (sy_call_t *)vfork }, /* 66 = vfork */
+ { 0, (sy_call_t *)nosys }, /* 67 = obsolete vread */
+ { 0, (sy_call_t *)nosys }, /* 68 = obsolete vwrite */
+ { 1, (sy_call_t *)sbrk }, /* 69 = sbrk */
+ { 1, (sy_call_t *)sstk }, /* 70 = sstk */
+ { compat(6,mmap) }, /* 71 = old mmap */
+ { 1, (sy_call_t *)ovadvise }, /* 72 = vadvise */
+ { 2, (sy_call_t *)munmap }, /* 73 = munmap */
+ { 3, (sy_call_t *)mprotect }, /* 74 = mprotect */
+ { 3, (sy_call_t *)madvise }, /* 75 = madvise */
+ { 0, (sy_call_t *)nosys }, /* 76 = obsolete vhangup */
+ { 0, (sy_call_t *)nosys }, /* 77 = obsolete vlimit */
+ { 3, (sy_call_t *)mincore }, /* 78 = mincore */
+ { 2, (sy_call_t *)getgroups }, /* 79 = getgroups */
+ { 2, (sy_call_t *)setgroups }, /* 80 = setgroups */
+ { 0, (sy_call_t *)getpgrp }, /* 81 = getpgrp */
+ { 2, (sy_call_t *)setpgid }, /* 82 = setpgid */
+ { 3, (sy_call_t *)setitimer }, /* 83 = setitimer */
+ { compat(0,wait) }, /* 84 = old wait */
+ { 1, (sy_call_t *)swapon }, /* 85 = swapon */
+ { 2, (sy_call_t *)getitimer }, /* 86 = getitimer */
+ { compat(2,gethostname) }, /* 87 = old gethostname */
+ { compat(2,sethostname) }, /* 88 = old sethostname */
+ { 0, (sy_call_t *)getdtablesize }, /* 89 = getdtablesize */
+ { 2, (sy_call_t *)dup2 }, /* 90 = dup2 */
+ { 0, (sy_call_t *)nosys }, /* 91 = getdopt */
+ { 3, (sy_call_t *)fcntl }, /* 92 = fcntl */
+ { 5, (sy_call_t *)select }, /* 93 = select */
+ { 0, (sy_call_t *)nosys }, /* 94 = setdopt */
+ { 1, (sy_call_t *)fsync }, /* 95 = fsync */
+ { 3, (sy_call_t *)setpriority }, /* 96 = setpriority */
+ { 3, (sy_call_t *)socket }, /* 97 = socket */
+ { 3, (sy_call_t *)connect }, /* 98 = connect */
+ { compat(3,accept) }, /* 99 = old accept */
+ { 2, (sy_call_t *)getpriority }, /* 100 = getpriority */
+ { compat(4,send) }, /* 101 = old send */
+ { compat(4,recv) }, /* 102 = old recv */
+ { 1, (sy_call_t *)sigreturn }, /* 103 = sigreturn */
+ { 3, (sy_call_t *)bind }, /* 104 = bind */
+ { 5, (sy_call_t *)setsockopt }, /* 105 = setsockopt */
+ { 2, (sy_call_t *)listen }, /* 106 = listen */
+ { 0, (sy_call_t *)nosys }, /* 107 = obsolete vtimes */
+ { compat(3,sigvec) }, /* 108 = old sigvec */
+ { compat(1,sigblock) }, /* 109 = old sigblock */
+ { compat(1,sigsetmask) }, /* 110 = old sigsetmask */
+ { 1, (sy_call_t *)sigsuspend }, /* 111 = sigsuspend */
+ { compat(2,sigstack) }, /* 112 = old sigstack */
+ { compat(3,recvmsg) }, /* 113 = old recvmsg */
+ { compat(3,sendmsg) }, /* 114 = old sendmsg */
+ { 0, (sy_call_t *)nosys }, /* 115 = obsolete vtrace */
+ { 2, (sy_call_t *)gettimeofday }, /* 116 = gettimeofday */
+ { 2, (sy_call_t *)getrusage }, /* 117 = getrusage */
+ { 5, (sy_call_t *)getsockopt }, /* 118 = getsockopt */
+ { 0, (sy_call_t *)nosys }, /* 119 = resuba */
+ { 3, (sy_call_t *)readv }, /* 120 = readv */
+ { 3, (sy_call_t *)writev }, /* 121 = writev */
+ { 2, (sy_call_t *)settimeofday }, /* 122 = settimeofday */
+ { 3, (sy_call_t *)fchown }, /* 123 = fchown */
+ { 2, (sy_call_t *)fchmod }, /* 124 = fchmod */
+ { compat(6,recvfrom) }, /* 125 = old recvfrom */
+ { 2, (sy_call_t *)setreuid }, /* 126 = setreuid */
+ { 2, (sy_call_t *)setregid }, /* 127 = setregid */
+ { 2, (sy_call_t *)rename }, /* 128 = rename */
+ { compat(2,truncate) }, /* 129 = old truncate */
+ { compat(2,ftruncate) }, /* 130 = old ftruncate */
+ { 2, (sy_call_t *)flock }, /* 131 = flock */
+ { 2, (sy_call_t *)mkfifo }, /* 132 = mkfifo */
+ { 6, (sy_call_t *)sendto }, /* 133 = sendto */
+ { 2, (sy_call_t *)shutdown }, /* 134 = shutdown */
+ { 4, (sy_call_t *)socketpair }, /* 135 = socketpair */
+ { 2, (sy_call_t *)mkdir }, /* 136 = mkdir */
+ { 1, (sy_call_t *)rmdir }, /* 137 = rmdir */
+ { 2, (sy_call_t *)utimes }, /* 138 = utimes */
+ { 0, (sy_call_t *)nosys }, /* 139 = obsolete 4.2 sigreturn */
+ { 2, (sy_call_t *)adjtime }, /* 140 = adjtime */
+ { compat(3,getpeername) }, /* 141 = old getpeername */
+ { compat(0,gethostid) }, /* 142 = old gethostid */
+ { compat(1,sethostid) }, /* 143 = old sethostid */
+ { compat(2,getrlimit) }, /* 144 = old getrlimit */
+ { compat(2,setrlimit) }, /* 145 = old setrlimit */
+ { compat(2,killpg) }, /* 146 = old killpg */
+ { 0, (sy_call_t *)setsid }, /* 147 = setsid */
+ { 4, (sy_call_t *)quotactl }, /* 148 = quotactl */
+ { compat(0,quota) }, /* 149 = old quota */
+ { compat(3,getsockname) }, /* 150 = old getsockname */
+ { 0, (sy_call_t *)nosys }, /* 151 = sem_lock */
+ { 0, (sy_call_t *)nosys }, /* 152 = sem_wakeup */
+ { 0, (sy_call_t *)nosys }, /* 153 = asyncdaemon */
+ { 0, (sy_call_t *)nosys }, /* 154 = nosys */
#ifdef NFS
- { 2, s(struct nfssvc_args),
- nfssvc }, /* 155 = nfssvc */
+ { 2, (sy_call_t *)nfssvc }, /* 155 = nfssvc */
#else
- { 0, 0,
- nosys }, /* 155 = unimplemented nfssvc */
+ { 0, (sy_call_t *)nosys }, /* 155 = nosys */
#endif
- { 4, s(struct compat_43_getdirentries_args),
- compat_43(getdirentries) }, /* 156 = compat_43 getdirentries */
- { 2, s(struct statfs_args),
- statfs }, /* 157 = statfs */
- { 2, s(struct fstatfs_args),
- fstatfs }, /* 158 = fstatfs */
- { 0, 0,
- nosys }, /* 159 = unimplemented */
- { 0, 0,
- nosys }, /* 160 = unimplemented */
-#ifdef NFS
- { 2, s(struct getfh_args),
- getfh }, /* 161 = getfh */
-#else
- { 0, 0,
- nosys }, /* 161 = unimplemented getfh */
-#endif
- { 0, 0,
- nosys }, /* 162 = unimplemented getdomainname */
- { 0, 0,
- nosys }, /* 163 = unimplemented setdomainname */
- { 0, 0,
- nosys }, /* 164 = unimplemented */
- { 0, 0,
- nosys }, /* 165 = unimplemented */
- { 0, 0,
- nosys }, /* 166 = unimplemented */
- { 0, 0,
- nosys }, /* 167 = unimplemented */
- { 0, 0,
- nosys }, /* 168 = unimplemented */
- { 0, 0,
- nosys }, /* 169 = unimplemented semsys */
- { 0, 0,
- nosys }, /* 170 = unimplemented msgsys */
-#if defined(SYSVSHM) && !defined(alpha)
- { 4, s(struct compat_43_shmsys_args),
- compat_43(shmsys) }, /* 171 = compat_43 shmsys */
+ { compat(4,getdirentries) }, /* 156 = old getdirentries */
+ { 2, (sy_call_t *)statfs }, /* 157 = statfs */
+ { 2, (sy_call_t *)fstatfs }, /* 158 = fstatfs */
+ { 0, (sy_call_t *)nosys }, /* 159 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 160 = nosys */
+#if defined(NFS) && !defined (NFS_NOSERVER)
+ { 2, (sy_call_t *)getfh }, /* 161 = getfh */
#else
- { 0, 0,
- nosys }, /* 171 = unimplemented shmsys */
+ { 0, (sy_call_t *)nosys }, /* 161 = nosys */
#endif
- { 0, 0,
- nosys }, /* 172 = unimplemented */
- { 0, 0,
- nosys }, /* 173 = unimplemented */
- { 0, 0,
- nosys }, /* 174 = unimplemented */
- { 0, 0,
- nosys }, /* 175 = unimplemented */
- { 0, 0,
- nosys }, /* 176 = unimplemented */
- { 0, 0,
- nosys }, /* 177 = unimplemented */
- { 0, 0,
- nosys }, /* 178 = unimplemented */
- { 0, 0,
- nosys }, /* 179 = unimplemented */
- { 0, 0,
- nosys }, /* 180 = unimplemented */
- { 1, s(struct setgid_args),
- setgid }, /* 181 = setgid */
- { 1, s(struct setegid_args),
- setegid }, /* 182 = setegid */
- { 1, s(struct seteuid_args),
- seteuid }, /* 183 = seteuid */
+ { 2, (sy_call_t *)getdomainname }, /* 162 = getdomainname */
+ { 2, (sy_call_t *)setdomainname }, /* 163 = setdomainname */
+ { 1, (sy_call_t *)uname }, /* 164 = uname */
+ { 2, (sy_call_t *)sysarch }, /* 165 = sysarch */
+ { 3, (sy_call_t *)rtprio }, /* 166 = rtprio */
+ { 0, (sy_call_t *)nosys }, /* 167 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 168 = nosys */
+ { 5, (sy_call_t *)semsys }, /* 169 = semsys */
+ { 6, (sy_call_t *)msgsys }, /* 170 = msgsys */
+ { 4, (sy_call_t *)shmsys }, /* 171 = shmsys */
+ { 0, (sy_call_t *)nosys }, /* 172 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 173 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 174 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 175 = nosys */
+ { 1, (sy_call_t *)ntp_adjtime }, /* 176 = ntp_adjtime */
+ { 0, (sy_call_t *)nosys }, /* 177 = sfork */
+ { 0, (sy_call_t *)nosys }, /* 178 = getdescriptor */
+ { 0, (sy_call_t *)nosys }, /* 179 = setdescriptor */
+ { 0, (sy_call_t *)nosys }, /* 180 = nosys */
+ { 1, (sy_call_t *)setgid }, /* 181 = setgid */
+ { 1, (sy_call_t *)setegid }, /* 182 = setegid */
+ { 1, (sy_call_t *)seteuid }, /* 183 = seteuid */
#ifdef LFS
- { 3, s(struct lfs_bmapv_args),
- lfs_bmapv }, /* 184 = lfs_bmapv */
- { 3, s(struct lfs_markv_args),
- lfs_markv }, /* 185 = lfs_markv */
- { 2, s(struct lfs_segclean_args),
- lfs_segclean }, /* 186 = lfs_segclean */
- { 2, s(struct lfs_segwait_args),
- lfs_segwait }, /* 187 = lfs_segwait */
+ { 3, (sy_call_t *)lfs_bmapv }, /* 184 = lfs_bmapv */
+ { 3, (sy_call_t *)lfs_markv }, /* 185 = lfs_markv */
+ { 2, (sy_call_t *)lfs_segclean }, /* 186 = lfs_segclean */
+ { 2, (sy_call_t *)lfs_segwait }, /* 187 = lfs_segwait */
#else
- { 0, 0,
- nosys }, /* 184 = unimplemented lfs_bmapv */
- { 0, 0,
- nosys }, /* 185 = unimplemented lfs_markv */
- { 0, 0,
- nosys }, /* 186 = unimplemented lfs_segclean */
- { 0, 0,
- nosys }, /* 187 = unimplemented lfs_segwait */
-#endif
- { 2, s(struct stat_args),
- stat }, /* 188 = stat */
- { 2, s(struct fstat_args),
- fstat }, /* 189 = fstat */
- { 2, s(struct lstat_args),
- lstat }, /* 190 = lstat */
- { 2, s(struct pathconf_args),
- pathconf }, /* 191 = pathconf */
- { 2, s(struct fpathconf_args),
- fpathconf }, /* 192 = fpathconf */
- { 0, 0,
- nosys }, /* 193 = unimplemented */
- { 2, s(struct getrlimit_args),
- getrlimit }, /* 194 = getrlimit */
- { 2, s(struct setrlimit_args),
- setrlimit }, /* 195 = setrlimit */
- { 4, s(struct getdirentries_args),
- getdirentries }, /* 196 = getdirentries */
- { 7, s(struct mmap_args),
- mmap }, /* 197 = mmap */
- { 0, 0,
- nosys }, /* 198 = __syscall */
- { 4, s(struct lseek_args),
- lseek }, /* 199 = lseek */
- { 3, s(struct truncate_args),
- truncate }, /* 200 = truncate */
- { 3, s(struct ftruncate_args),
- ftruncate }, /* 201 = ftruncate */
- { 6, s(struct __sysctl_args),
- __sysctl }, /* 202 = __sysctl */
- { 2, s(struct mlock_args),
- mlock }, /* 203 = mlock */
- { 2, s(struct munlock_args),
- munlock }, /* 204 = munlock */
- { 1, s(struct undelete_args),
- undelete }, /* 205 = undelete */
- { 0, 0,
- nosys }, /* 206 = unimplemented */
- { 0, 0,
- nosys }, /* 207 = unimplemented */
- { 0, 0,
- nosys }, /* 208 = unimplemented */
- { 0, 0,
- nosys }, /* 209 = unimplemented */
- { 0, 0,
- nosys }, /* 210 = unimplemented */
- { 0, 0,
- nosys }, /* 211 = unimplemented */
- { 0, 0,
- nosys }, /* 212 = unimplemented */
- { 0, 0,
- nosys }, /* 213 = unimplemented */
- { 0, 0,
- nosys }, /* 214 = unimplemented */
- { 0, 0,
- nosys }, /* 215 = unimplemented */
- { 0, 0,
- nosys }, /* 216 = unimplemented */
- { 0, 0,
- nosys }, /* 217 = unimplemented */
- { 0, 0,
- nosys }, /* 218 = unimplemented */
- { 0, 0,
- nosys }, /* 219 = unimplemented */
- { 0, 0,
- nosys }, /* 220 = unimplemented semctl */
- { 0, 0,
- nosys }, /* 221 = unimplemented semget */
- { 0, 0,
- nosys }, /* 222 = unimplemented semop */
- { 0, 0,
- nosys }, /* 223 = unimplemented semconfig */
- { 0, 0,
- nosys }, /* 224 = unimplemented msgctl */
- { 0, 0,
- nosys }, /* 225 = unimplemented msgget */
- { 0, 0,
- nosys }, /* 226 = unimplemented msgsnd */
- { 0, 0,
- nosys }, /* 227 = unimplemented msgrcv */
-#if defined(SYSVSHM) && 0
- { 3, s(struct shmat_args),
- shmat }, /* 228 = shmat */
- { 3, s(struct shmctl_args),
- shmctl }, /* 229 = shmctl */
- { 1, s(struct shmdt_args),
- shmdt }, /* 230 = shmdt */
- { 3, s(struct shmget_args),
- shmget }, /* 231 = shmget */
-#else
- { 0, 0,
- nosys }, /* 228 = unimplemented shmat */
- { 0, 0,
- nosys }, /* 229 = unimplemented shmctl */
- { 0, 0,
- nosys }, /* 230 = unimplemented shmdt */
- { 0, 0,
- nosys }, /* 231 = unimplemented shmget */
+ { 0, (sy_call_t *)nosys }, /* 184 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 185 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 186 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 187 = nosys */
#endif
+ { 2, (sy_call_t *)stat }, /* 188 = stat */
+ { 2, (sy_call_t *)fstat }, /* 189 = fstat */
+ { 2, (sy_call_t *)lstat }, /* 190 = lstat */
+ { 2, (sy_call_t *)pathconf }, /* 191 = pathconf */
+ { 2, (sy_call_t *)fpathconf }, /* 192 = fpathconf */
+ { 0, (sy_call_t *)nosys }, /* 193 = nosys */
+ { 2, (sy_call_t *)getrlimit }, /* 194 = getrlimit */
+ { 2, (sy_call_t *)setrlimit }, /* 195 = setrlimit */
+ { 4, (sy_call_t *)getdirentries }, /* 196 = getdirentries */
+ { 8, (sy_call_t *)mmap }, /* 197 = mmap */
+ { 0, (sy_call_t *)nosys }, /* 198 = __syscall */
+ { 5, (sy_call_t *)lseek }, /* 199 = lseek */
+ { 4, (sy_call_t *)truncate }, /* 200 = truncate */
+ { 4, (sy_call_t *)ftruncate }, /* 201 = ftruncate */
+ { 6, (sy_call_t *)__sysctl }, /* 202 = __sysctl */
+ { 2, (sy_call_t *)mlock }, /* 203 = mlock */
+ { 2, (sy_call_t *)munlock }, /* 204 = munlock */
+ { 2, (sy_call_t *)utrace }, /* 205 = utrace */
+ { 1, (sy_call_t *)undelete }, /* 206 = undelete */
+ { 0, (sy_call_t *)nosys }, /* 207 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 208 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 209 = nosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 210 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 211 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 212 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 213 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 214 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 215 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 216 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 217 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 218 = lkmnosys */
+ { 0, (sy_call_t *)lkmnosys }, /* 219 = lkmnosys */
+ { 4, (sy_call_t *)__semctl }, /* 220 = __semctl */
+ { 3, (sy_call_t *)semget }, /* 221 = semget */
+ { 3, (sy_call_t *)semop }, /* 222 = semop */
+ { 1, (sy_call_t *)semconfig }, /* 223 = semconfig */
+ { 3, (sy_call_t *)msgctl }, /* 224 = msgctl */
+ { 2, (sy_call_t *)msgget }, /* 225 = msgget */
+ { 4, (sy_call_t *)msgsnd }, /* 226 = msgsnd */
+ { 5, (sy_call_t *)msgrcv }, /* 227 = msgrcv */
+ { 3, (sy_call_t *)shmat }, /* 228 = shmat */
+ { 3, (sy_call_t *)shmctl }, /* 229 = shmctl */
+ { 1, (sy_call_t *)shmdt }, /* 230 = shmdt */
+ { 3, (sy_call_t *)shmget }, /* 231 = shmget */
+ { 0, (sy_call_t *)nosys }, /* 232 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 233 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 234 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 235 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 236 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 237 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 238 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 239 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 240 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 241 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 242 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 243 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 244 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 245 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 246 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 247 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 248 = nosys */
+ { 0, (sy_call_t *)nosys }, /* 249 = nosys */
+ { 3, (sy_call_t *)minherit }, /* 250 = minherit */
+ { 1, (sy_call_t *)rfork }, /* 251 = rfork */
};
-
-int nsysent= sizeof(sysent) / sizeof(sysent[0]);
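The regenerated table reduces each entry to an argument count plus a cast function pointer, and the old hand-maintained nsysent count is gone; the table size now travels with the sysentvec below as SYS_MAXSYSCALL. The compat() macro is defined in the file's preamble, outside this hunk; the sketch below shows the conventional definition, which routes the old 4.3BSD entry points to their o-prefixed handlers only when COMPAT_43 is configured (treat this as an assumption about the elided preamble):

    #include <sys/sysent.h>         /* sy_call_t, struct sysent */

    #ifdef COMPAT_43
    #define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)
    #else
    #define compat(n, name) 0, (sy_call_t *)nosys
    #endif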
diff --git a/sys/kern/init_sysvec.c b/sys/kern/init_sysvec.c
new file mode 100644
index 0000000..379a1bf
--- /dev/null
+++ b/sys/kern/init_sysvec.c
@@ -0,0 +1,30 @@
+/*
+ * sysentvec for native FreeBSD a.out executable format.
+ *
+ * $Id$
+ */
+
+#include <sys/types.h>
+#include <sys/param.h>
+#include <sys/mount.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/syscall.h>
+#include <sys/signalvar.h>
+#include <machine/md_var.h>
+
+struct sysentvec aout_sysvec = {
+ SYS_MAXSYSCALL,
+ sysent,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ 0,
+ sendsig,
+ sigcode,
+ &szsigcode,
+ 0,
+ "FreeBSD a.out"
+};
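A sysentvec ties the syscall table to its size and to the machine-dependent signal delivery pieces (sendsig, sigcode, szsigcode), so each executable format can carry its own dispatch vector. A hedged sketch of how an a.out image activator would hook a process up to it; the p_sysent field matches <sys/sysent.h> of this vintage, but the activator body here is illustrative only:

    #include <sys/sysent.h>
    #include <sys/imgact.h>

    extern struct sysentvec aout_sysvec;

    static int
    exec_aout_imgact(struct image_params *imgp)
    {
            /* ... a.out header validation elided ... */

            /* Dispatch this process through the native a.out vector. */
            imgp->proc->p_sysent = &aout_sysvec;
            return (0);
    }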
diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c
index a23543c..f72d2d0 100644
--- a/sys/kern/kern_acct.c
+++ b/sys/kern/kern_acct.c
@@ -1,4 +1,5 @@
/*-
+ * Copyright (c) 1994 Christopher G. Demetriou
* Copyright (c) 1982, 1986, 1989, 1993
* The Regents of the University of California. All rights reserved.
* (c) UNIX System Laboratories, Inc.
@@ -35,91 +36,278 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * from: @(#)kern_acct.c 8.8 (Berkeley) 5/14/95
+ * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93
+ * $Id: kern_acct.c,v 1.14 1997/03/23 03:36:17 bde Exp $
*/
#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/vnode.h>
-#include <sys/file.h>
+#include <sys/fcntl.h>
#include <sys/syslog.h>
#include <sys/kernel.h>
+#include <sys/sysent.h>
+#include <sys/sysctl.h>
+#include <sys/namei.h>
+#include <sys/errno.h>
+#include <sys/acct.h>
+#include <sys/resourcevar.h>
+#include <sys/tty.h>
-acct(a1, a2, a3)
+/*
+ * The routines implemented in this file are described in:
+ * Leffler, et al.: The Design and Implementation of the 4.3BSD
+ *	UNIX Operating System (Addison-Wesley, 1989)
+ * on pages 62-63.
+ *
+ * Arguably, to simplify accounting operations, this mechanism should
+ * be replaced by one in which an accounting log file (similar to /dev/klog)
+ * is read by a user process, etc. However, that has its own problems.
+ */
+
+/*
+ * Internal accounting functions.
+ * The former's operation is described in Leffler, et al., and the latter
+ * was provided by UCB with the 4.4BSD-Lite release.
+ */
+static comp_t encode_comp_t __P((u_long, u_long));
+static void acctwatch __P((void *));
+
+/*
+ * Accounting vnode pointer, and saved vnode pointer.
+ */
+static struct vnode *acctp;
+static struct vnode *savacctp;
+
+/*
+ * Values associated with enabling and disabling accounting
+ */
+static int acctsuspend = 2; /* stop accounting when < 2% free space left */
+SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW,
+ &acctsuspend, 0, "");
+
+static int acctresume = 4; /* resume when free space risen to > 4% */
+SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW,
+ &acctresume, 0, "");
+
+static int acctchkfreq = 15; /* frequency (in seconds) to check space */
+SYSCTL_INT(_kern, OID_AUTO, acct_chkfreq, CTLFLAG_RW,
+ &acctchkfreq, 0, "");
+
+/*
+ * Accounting system call. Written based on the specification and
+ * previous implementation done by Mark Tinguely.
+ */
+int
+acct(a1, uap, a3)
struct proc *a1;
struct acct_args /* {
syscallarg(char *) path;
- } */ *a2;
+ } */ *uap;
int *a3;
{
+ struct proc *p = curproc; /* XXX */
+ struct nameidata nd;
+ int error;
+
+ /* Make sure that the caller is root. */
+ error = suser(p->p_ucred, &p->p_acflag);
+ if (error)
+ return (error);
+
/*
- * Body deleted.
+ * If accounting is to be started to a file, open that file for
+	 * writing and make sure it's a 'normal' file.
*/
- return (ENOSYS);
-}
+ if (SCARG(uap, path) != NULL) {
+ NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path),
+ p);
+ error = vn_open(&nd, FWRITE, 0);
+ if (error)
+ return (error);
+ VOP_UNLOCK(nd.ni_vp, 0, p);
+ if (nd.ni_vp->v_type != VREG) {
+ vn_close(nd.ni_vp, FWRITE, p->p_ucred, p);
+ return (EACCES);
+ }
+ }
-acct_process(a1)
- struct proc *a1;
-{
+ /*
+ * If accounting was previously enabled, kill the old space-watcher,
+	 * close the file, and (if no new file was specified) leave.
+ */
+ if (acctp != NULLVP || savacctp != NULLVP) {
+ untimeout(acctwatch, NULL);
+ error = vn_close((acctp != NULLVP ? acctp : savacctp), FWRITE,
+ p->p_ucred, p);
+ acctp = savacctp = NULLVP;
+ }
+ if (SCARG(uap, path) == NULL)
+ return (error);
/*
- * Body deleted.
+ * Save the new accounting file vnode, and schedule the new
+ * free space watcher.
*/
- return;
+ acctp = nd.ni_vp;
+ acctwatch(NULL);
+ return (error);
}
/*
- * Periodically check the file system to see if accounting
- * should be turned on or off. Beware the case where the vnode
- * has been vgone()'d out from underneath us, e.g. when the file
- * system containing the accounting file has been forcibly unmounted.
+ * Write out process accounting information on process exit.
+ * The data to be written out are specified in Leffler, et al.,
+ * and are enumerated below.  (They're also noted in the system
+ * "acct.h" header file.)
*/
+int
+acct_process(p)
+ struct proc *p;
+{
+ struct acct acct;
+ struct rusage *r;
+ struct timeval ut, st, tmp;
+ int t;
+ struct vnode *vp;
+
+ /* If accounting isn't enabled, don't bother */
+ vp = acctp;
+ if (vp == NULLVP)
+ return (0);
+
+ /*
+ * Get process accounting information.
+ */
+
+ /* (1) The name of the command that ran */
+ bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm);
+
+ /* (2) The amount of user and system time that was used */
+ calcru(p, &ut, &st, NULL);
+ acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec);
+ acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec);
+
+	/* (3) The elapsed time the command ran (and its starting time) */
+ acct.ac_btime = p->p_stats->p_start.tv_sec;
+ microtime(&tmp);
+ timevalsub(&tmp, &p->p_stats->p_start);
+ acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec);
+
+ /* (4) The average amount of memory used */
+ r = &p->p_stats->p_ru;
+ tmp = ut;
+ timevaladd(&tmp, &st);
+ t = tmp.tv_sec * hz + tmp.tv_usec / tick;
+ if (t)
+ acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t;
+ else
+ acct.ac_mem = 0;
+
+ /* (5) The number of disk I/O operations done */
+ acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0);
+
+ /* (6) The UID and GID of the process */
+ acct.ac_uid = p->p_cred->p_ruid;
+ acct.ac_gid = p->p_cred->p_rgid;
+
+ /* (7) The terminal from which the process was started */
+ if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp)
+ acct.ac_tty = p->p_pgrp->pg_session->s_ttyp->t_dev;
+ else
+ acct.ac_tty = NODEV;
+
+ /* (8) The boolean flags that tell how the process terminated, etc. */
+ acct.ac_flag = p->p_acflag;
+
+ /*
+ * Now, just write the accounting information to the file.
+ */
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ return (vn_rdwr(UIO_WRITE, vp, (caddr_t)&acct, sizeof (acct),
+ (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, p->p_ucred,
+ (int *)0, p));
+}
+
/*
- * Values associated with enabling and disabling accounting
+ * Encode_comp_t converts a time expressed in seconds and microseconds
+ * into ticks of 1/AHZ seconds.  The encoding is described in
+ * Leffler, et al., on page 63.
*/
-int acctsuspend = 2; /* stop accounting when < 2% free space left */
-int acctresume = 4; /* resume when free space risen to > 4% */
-int acctchkfreq = 15; /* frequency (in seconds) to check space */
+
+#define MANTSIZE 13 /* 13 bit mantissa. */
+#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */
+#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */
+
+static comp_t
+encode_comp_t(s, us)
+ u_long s, us;
+{
+ int exp, rnd;
+
+ exp = 0;
+ rnd = 0;
+ s *= AHZ;
+ s += us / (1000000 / AHZ); /* Maximize precision. */
+
+ while (s > MAXFRACT) {
+ rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */
+ s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */
+ exp++;
+ }
+
+ /* If we need to round up, do it (and handle overflow correctly). */
+ if (rnd && (++s > MAXFRACT)) {
+ s >>= EXPSIZE;
+ exp++;
+ }
+
+ /* Clean it up and polish it off. */
+ exp <<= MANTSIZE; /* Shift the exponent into place */
+ exp += s; /* and add on the mantissa. */
+ return (exp);
+}
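To see the encoding round-trip, a userland post-processor would peel the 3-bit base-8 exponent off the top of the 13-bit mantissa and scale back from 1/AHZ units. This decoder is a sketch reusing the MANTSIZE/EXPSIZE/MAXFRACT constants defined above; it is not part of the kernel source:

    #include <sys/acct.h>           /* comp_t, AHZ */

    static double
    decode_comp_t(comp_t c)
    {
            int exp = (c >> MANTSIZE) & ((1 << EXPSIZE) - 1);
            double t = (double)(c & MAXFRACT);

            while (exp-- > 0)
                    t *= (1 << EXPSIZE);    /* one base-8 exponent step */
            return (t / AHZ);               /* back to seconds */
    }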
/*
- * SHOULD REPLACE THIS WITH A DRIVER THAT CAN BE READ TO SIMPLIFY.
+ * Periodically check the file system to see if accounting
+ * should be turned on or off. Beware the case where the vnode
+ * has been vgone()'d out from underneath us, e.g. when the file
+ * system containing the accounting file has been forcibly unmounted.
*/
-struct vnode *acctp;
-struct vnode *savacctp;
-
/* ARGSUSED */
-void
+static void
acctwatch(a)
void *a;
{
struct statfs sb;
- if (savacctp) {
+ if (savacctp != NULLVP) {
if (savacctp->v_type == VBAD) {
(void) vn_close(savacctp, FWRITE, NOCRED, NULL);
- savacctp = NULL;
+ savacctp = NULLVP;
return;
}
(void)VFS_STATFS(savacctp->v_mount, &sb, (struct proc *)0);
if (sb.f_bavail > acctresume * sb.f_blocks / 100) {
acctp = savacctp;
- savacctp = NULL;
+ savacctp = NULLVP;
log(LOG_NOTICE, "Accounting resumed\n");
}
} else {
- if (acctp == NULL)
+ if (acctp == NULLVP)
return;
if (acctp->v_type == VBAD) {
(void) vn_close(acctp, FWRITE, NOCRED, NULL);
- acctp = NULL;
+ acctp = NULLVP;
return;
}
(void)VFS_STATFS(acctp->v_mount, &sb, (struct proc *)0);
if (sb.f_bavail <= acctsuspend * sb.f_blocks / 100) {
savacctp = acctp;
- acctp = NULL;
+ acctp = NULLVP;
log(LOG_NOTICE, "Accounting suspended\n");
}
}
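The hunk ends before the tail of acctwatch(); presumably, as in the stock function, the watcher re-arms itself so the free-space check repeats at the sysctl-tunable interval:

    /* Presumed tail of acctwatch(), below the context shown: */
    timeout(acctwatch, NULL, acctchkfreq * hz);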
diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c
index f42900c..171ed0e 100644
--- a/sys/kern/kern_clock.c
+++ b/sys/kern/kern_clock.c
@@ -36,8 +36,28 @@
* SUCH DAMAGE.
*
* @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
+ * $Id: kern_clock.c,v 1.34 1997/03/22 16:52:19 mpp Exp $
*/
+/* Portions of this software are covered by the following: */
+/******************************************************************************
+ * *
+ * Copyright (c) David L. Mills 1993, 1994 *
+ * *
+ * Permission to use, copy, modify, and distribute this software and its *
+ * documentation for any purpose and without fee is hereby granted, provided *
+ * that the above copyright notice appears in all copies and that both the *
+ * copyright notice and this permission notice appear in supporting *
+ * documentation, and that the name University of Delaware not be used in *
+ * advertising or publicity pertaining to distribution of the software *
+ * without specific, written prior permission. The University of Delaware *
+ * makes no representations about the suitability this software for any *
+ * purpose. It is provided "as is" without express or implied warranty. *
+ * *
+ *****************************************************************************/
+
+#include "opt_cpu.h" /* XXX */
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/dkstat.h>
@@ -45,13 +65,49 @@
#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/timex.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/sysctl.h>
#include <machine/cpu.h>
+#define CLOCK_HAIR /* XXX */
+#include <machine/clock.h>
#ifdef GPROF
#include <sys/gmon.h>
#endif
+static void initclocks __P((void *dummy));
+SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
+
+/* Exported to machdep.c. */
+struct callout *callfree, *callout;
+
+static struct callout calltodo;
+
+/* Some of these don't belong here, but it's easiest to concentrate them. */
+static long cp_time[CPUSTATES];
+long dk_seek[DK_NDRIVE];
+static long dk_time[DK_NDRIVE];
+long dk_wds[DK_NDRIVE];
+long dk_wpms[DK_NDRIVE];
+long dk_xfer[DK_NDRIVE];
+
+int dk_busy;
+int dk_ndrive = 0;
+char dk_names[DK_NDRIVE][DK_NAMELEN];
+
+long tk_cancc;
+long tk_nin;
+long tk_nout;
+long tk_rawcc;
+
/*
* Clock handling routines.
*
@@ -97,19 +153,278 @@
int stathz;
int profhz;
-int profprocs;
+static int profprocs;
int ticks;
static int psdiv, pscnt; /* prof => stat divider */
-int psratio; /* ratio: prof / stat */
+int psratio; /* ratio: prof / stat */
volatile struct timeval time;
volatile struct timeval mono_time;
/*
- * Initialize clock frequencies and start both clocks running.
+ * Phase/frequency-lock loop (PLL/FLL) definitions
+ *
+ * The following variables are read and set by the ntp_adjtime() system
+ * call.
+ *
+ * time_state shows the state of the system clock, with values defined
+ * in the timex.h header file.
+ *
+ * time_status shows the status of the system clock, with bits defined
+ * in the timex.h header file.
+ *
+ * time_offset is used by the PLL/FLL to adjust the system time in small
+ * increments.
+ *
+ * time_constant determines the bandwidth or "stiffness" of the PLL.
+ *
+ * time_tolerance determines maximum frequency error or tolerance of the
+ * CPU clock oscillator and is a property of the architecture; however,
+ * in principle it could change as result of the presence of external
+ * discipline signals, for instance.
+ *
+ * time_precision is usually equal to the kernel tick variable; however,
+ * in cases where a precision clock counter or external clock is
+ * available, the resolution can be much less than this and depend on
+ * whether the external clock is working or not.
+ *
+ * time_maxerror is initialized by a ntp_adjtime() call and increased by
+ * the kernel once each second to reflect the maximum error
+ * bound growth.
+ *
+ * time_esterror is set and read by the ntp_adjtime() call, but
+ * otherwise not used by the kernel.
+ */
+int time_status = STA_UNSYNC; /* clock status bits */
+int time_state = TIME_OK; /* clock state */
+long time_offset = 0; /* time offset (us) */
+long time_constant = 0; /* pll time constant */
+long time_tolerance = MAXFREQ; /* frequency tolerance (scaled ppm) */
+long time_precision = 1; /* clock precision (us) */
+long time_maxerror = MAXPHASE; /* maximum error (us) */
+long time_esterror = MAXPHASE; /* estimated error (us) */
+
+/*
+ * The following variables establish the state of the PLL/FLL and the
+ * residual time and frequency offset of the local clock. The scale
+ * factors are defined in the timex.h header file.
+ *
+ * time_phase and time_freq are the phase increment and the frequency
+ * increment, respectively, of the kernel time variable at each tick of
+ * the clock.
+ *
+ * time_freq is set via ntp_adjtime() from a value stored in a file when
+ * the synchronization daemon is first started. Its value is retrieved
+ * via ntp_adjtime() and written to the file about once per hour by the
+ * daemon.
+ *
+ * time_adj is the adjustment added to the value of tick at each timer
+ * interrupt and is recomputed from time_phase and time_freq at each
+ * seconds rollover.
+ *
+ * time_reftime is the second's portion of the system time on the last
+ * call to ntp_adjtime(). It is used to adjust the time_freq variable
+ * and to increase the time_maxerror as the time since last update
+ * increases.
+ */
+static long time_phase = 0; /* phase offset (scaled us) */
+long time_freq = 0; /* frequency offset (scaled ppm) */
+static long time_adj = 0; /* tick adjust (scaled 1 / hz) */
+static long time_reftime = 0; /* time at last adjustment (s) */
+
+#ifdef PPS_SYNC
+/*
+ * The following variables are used only if the kernel PPS discipline
+ * code is configured (PPS_SYNC). The scale factors are defined in the
+ * timex.h header file.
+ *
+ * pps_time contains the time at each calibration interval, as read by
+ * microtime(). pps_count counts the seconds of the calibration
+ * interval, the duration of which is nominally pps_shift in powers of
+ * two.
+ *
+ * pps_offset is the time offset produced by the time median filter
+ * pps_tf[], while pps_jitter is the dispersion (jitter) measured by
+ * this filter.
+ *
+ * pps_freq is the frequency offset produced by the frequency median
+ * filter pps_ff[], while pps_stabil is the dispersion (wander) measured
+ * by this filter.
+ *
+ * pps_usec is latched from a high resolution counter or external clock
+ * at pps_time. Here we want the hardware counter contents only, not the
+ * contents plus the time_tv.usec as usual.
+ *
+ * pps_valid counts the number of seconds since the last PPS update. It
+ * is used as a watchdog timer to disable the PPS discipline should the
+ * PPS signal be lost.
+ *
+ * pps_glitch counts the number of seconds since the beginning of an
+ * offset burst more than tick/2 from current nominal offset. It is used
+ * mainly to suppress error bursts due to priority conflicts between the
+ * PPS interrupt and timer interrupt.
+ *
+ * pps_intcnt counts the calibration intervals for use in the interval-
+ * adaptation algorithm. It's just too complicated for words.
+ */
+struct timeval pps_time; /* kernel time at last interval */
+long pps_offset = 0; /* pps time offset (us) */
+long pps_jitter = MAXTIME; /* pps time dispersion (jitter) (us) */
+long pps_tf[] = {0, 0, 0}; /* pps time offset median filter (us) */
+long pps_freq = 0; /* frequency offset (scaled ppm) */
+long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */
+long pps_ff[] = {0, 0, 0}; /* frequency offset median filter */
+long pps_usec = 0; /* microsec counter at last interval */
+long pps_valid = PPS_VALID; /* pps signal watchdog counter */
+int pps_glitch = 0; /* pps signal glitch counter */
+int pps_count = 0; /* calibration interval counter (s) */
+int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */
+int pps_intcnt = 0; /* intervals at current duration */
+
+/*
+ * PPS signal quality monitors
+ *
+ * pps_jitcnt counts the seconds that have been discarded because the
+ * jitter measured by the time median filter exceeds the limit MAXTIME
+ * (100 us).
+ *
+ * pps_calcnt counts the frequency calibration intervals, which are
+ * variable from 4 s to 256 s.
+ *
+ * pps_errcnt counts the calibration intervals which have been discarded
+ * because the wander exceeds the limit MAXFREQ (100 ppm) or where the
+ * calibration interval jitter exceeds two ticks.
+ *
+ * pps_stbcnt counts the calibration intervals that have been discarded
+ * because the frequency wander exceeds the limit MAXFREQ / 4 (25 us).
+ */
+long pps_jitcnt = 0; /* jitter limit exceeded */
+long pps_calcnt = 0; /* calibration intervals */
+long pps_errcnt = 0; /* calibration errors */
+long pps_stbcnt = 0; /* stability limit exceeded */
+#endif /* PPS_SYNC */
+
+/* XXX none of this stuff works under FreeBSD */
+#ifdef EXT_CLOCK
+/*
+ * External clock definitions
+ *
+ * The following definitions and declarations are used only if an
+ * external clock (HIGHBALL or TPRO) is configured on the system.
+ */
+#define CLOCK_INTERVAL 30 /* CPU clock update interval (s) */
+
+/*
+ * The clock_count variable is set to CLOCK_INTERVAL at each PPS
+ * interrupt and decremented once each second.
+ */
+int clock_count = 0; /* CPU clock counter */
+
+#ifdef HIGHBALL
+/*
+ * The clock_offset and clock_cpu variables are used by the HIGHBALL
+ * interface. The clock_offset variable defines the offset between
+ * system time and the HIGBALL counters. The clock_cpu variable contains
+ * the offset between the system clock and the HIGHBALL clock for use in
+ * disciplining the kernel time variable.
+ */
+extern struct timeval clock_offset; /* Highball clock offset */
+long clock_cpu = 0; /* CPU clock adjust */
+#endif /* HIGHBALL */
+#endif /* EXT_CLOCK */
+
+/*
+ * hardupdate() - local clock update
+ *
+ * This routine is called by ntp_adjtime() to update the local clock
+ * phase and frequency. The implementation is of an adaptive-parameter,
+ * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
+ * time and frequency offset estimates for each call. If the kernel PPS
+ * discipline code is configured (PPS_SYNC), the PPS signal itself
+ * determines the new time offset, instead of the calling argument.
+ * Presumably, calls to ntp_adjtime() occur only when the caller
+ * believes the local clock is valid within some bound (+-128 ms with
+ * NTP). If the caller's time is far different than the PPS time, an
+ * argument will ensue, and it's not clear who will lose.
+ *
+ * For uncompensated quartz crystal oscillators and nominal update
+ * intervals less than 1024 s, operation should be in phase-lock mode
+ * (STA_FLL = 0), where the loop is disciplined to phase. For update
+ * intervals greater than this, operation should be in frequency-lock
+ * mode (STA_FLL = 1), where the loop is disciplined to frequency.
+ *
+ * Note: splclock() is in effect.
*/
void
-initclocks()
+hardupdate(offset)
+ long offset;
+{
+ long ltemp, mtemp;
+
+ if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME))
+ return;
+ ltemp = offset;
+#ifdef PPS_SYNC
+ if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
+ ltemp = pps_offset;
+#endif /* PPS_SYNC */
+
+ /*
+ * Scale the phase adjustment and clamp to the operating range.
+ */
+ if (ltemp > MAXPHASE)
+ time_offset = MAXPHASE << SHIFT_UPDATE;
+ else if (ltemp < -MAXPHASE)
+ time_offset = -(MAXPHASE << SHIFT_UPDATE);
+ else
+ time_offset = ltemp << SHIFT_UPDATE;
+
+ /*
+ * Select whether the frequency is to be controlled and in which
+ * mode (PLL or FLL). Clamp to the operating range. Ugly
+ * multiply/divide should be replaced someday.
+ */
+ if (time_status & STA_FREQHOLD || time_reftime == 0)
+ time_reftime = time.tv_sec;
+ mtemp = time.tv_sec - time_reftime;
+ time_reftime = time.tv_sec;
+ if (time_status & STA_FLL) {
+ if (mtemp >= MINSEC) {
+ ltemp = ((time_offset / mtemp) << (SHIFT_USEC -
+ SHIFT_UPDATE));
+ if (ltemp < 0)
+ time_freq -= -ltemp >> SHIFT_KH;
+ else
+ time_freq += ltemp >> SHIFT_KH;
+ }
+ } else {
+ if (mtemp < MAXSEC) {
+ ltemp *= mtemp;
+ if (ltemp < 0)
+ time_freq -= -ltemp >> (time_constant +
+ time_constant + SHIFT_KF -
+ SHIFT_USEC);
+ else
+ time_freq += ltemp >> (time_constant +
+ time_constant + SHIFT_KF -
+ SHIFT_USEC);
+ }
+ }
+ if (time_freq > time_tolerance)
+ time_freq = time_tolerance;
+ else if (time_freq < -time_tolerance)
+ time_freq = -time_tolerance;
+}
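hardupdate() assumes splclock() protection, so its caller must raise the IPL around the call. A hedged sketch of the expected call site in ntp_adjtime() (kern_ntptime.c); the copyin/copyout of the struct timex is elided and the local names are illustrative:

    struct timex ntv;       /* copied in from the caller */
    int s;

    s = splclock();
    if (ntv.modes & MOD_OFFSET)
            hardupdate(ntv.offset); /* offset in microseconds */
    splx(s);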
+
+
+
+/*
+ * Initialize clock frequencies and start both clocks running.
+ */
+/* ARGSUSED*/
+static void
+initclocks(dummy)
+ void *dummy;
{
register int i;
@@ -138,9 +453,7 @@ hardclock(frame)
{
register struct callout *p1;
register struct proc *p;
- register int delta, needsoft;
- extern int tickdelta;
- extern long timedelta;
+ register int needsoft;
/*
* Update real-time timeout queue.
@@ -185,18 +498,181 @@ hardclock(frame)
statclock(frame);
/*
- * Increment the time-of-day. The increment is just ``tick'' unless
- * we are still adjusting the clock; see adjtime().
+ * Increment the time-of-day.
*/
ticks++;
- if (timedelta == 0)
- delta = tick;
- else {
- delta = tick + tickdelta;
- timedelta -= tickdelta;
+ {
+ int time_update;
+ struct timeval newtime = time;
+ long ltemp;
+
+ if (timedelta == 0) {
+ time_update = CPU_THISTICKLEN(tick);
+ } else {
+ time_update = CPU_THISTICKLEN(tick) + tickdelta;
+ timedelta -= tickdelta;
+ }
+ BUMPTIME(&mono_time, time_update);
+
+ /*
+ * Compute the phase adjustment. If the low-order bits
+ * (time_phase) of the update overflow, bump the high-order bits
+ * (time_update).
+ */
+ time_phase += time_adj;
+ if (time_phase <= -FINEUSEC) {
+ ltemp = -time_phase >> SHIFT_SCALE;
+ time_phase += ltemp << SHIFT_SCALE;
+ time_update -= ltemp;
+ }
+ else if (time_phase >= FINEUSEC) {
+ ltemp = time_phase >> SHIFT_SCALE;
+ time_phase -= ltemp << SHIFT_SCALE;
+ time_update += ltemp;
+ }
+
+ newtime.tv_usec += time_update;
+ /*
+ * On rollover of the second the phase adjustment to be used for
+ * the next second is calculated. Also, the maximum error is
+ * increased by the tolerance. If the PPS frequency discipline
+ * code is present, the phase is increased to compensate for the
+ * CPU clock oscillator frequency error.
+ *
+ * On a 32-bit machine and given parameters in the timex.h
+ * header file, the maximum phase adjustment is +-512 ms and
+	 * maximum frequency offset is (a tad less than) +-512 ppm. On a
+ * 64-bit machine, you shouldn't need to ask.
+ */
+ if (newtime.tv_usec >= 1000000) {
+ newtime.tv_usec -= 1000000;
+ newtime.tv_sec++;
+ time_maxerror += time_tolerance >> SHIFT_USEC;
+
+ /*
+ * Compute the phase adjustment for the next second. In
+ * PLL mode, the offset is reduced by a fixed factor
+ * times the time constant. In FLL mode the offset is
+ * used directly. In either mode, the maximum phase
+ * adjustment for each second is clamped so as to spread
+ * the adjustment over not more than the number of
+ * seconds between updates.
+ */
+ if (time_offset < 0) {
+ ltemp = -time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) <<
+ SHIFT_UPDATE;
+ time_offset += ltemp;
+ time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ -
+ SHIFT_UPDATE);
+ } else {
+ ltemp = time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) <<
+ SHIFT_UPDATE;
+ time_offset -= ltemp;
+ time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ -
+ SHIFT_UPDATE);
+ }
+
+ /*
+ * Compute the frequency estimate and additional phase
+ * adjustment due to frequency error for the next
+ * second. When the PPS signal is engaged, gnaw on the
+ * watchdog counter and update the frequency computed by
+ * the pll and the PPS signal.
+ */
+#ifdef PPS_SYNC
+ pps_valid++;
+ if (pps_valid == PPS_VALID) {
+ pps_jitter = MAXTIME;
+ pps_stabil = MAXFREQ;
+ time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+ STA_PPSWANDER | STA_PPSERROR);
+ }
+ ltemp = time_freq + pps_freq;
+#else
+ ltemp = time_freq;
+#endif /* PPS_SYNC */
+ if (ltemp < 0)
+ time_adj -= -ltemp >>
+ (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+ else
+ time_adj += ltemp >>
+ (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+
+#if SHIFT_HZ == 7
+ /*
+ * When the CPU clock oscillator frequency is not a
+ * power of two in Hz, the SHIFT_HZ is only an
+ * approximate scale factor. In the SunOS kernel, this
+	 * results in a PLL gain factor of 1/1.28 = 0.78 of what it
+ * should be. In the following code the overall gain is
+ * increased by a factor of 1.25, which results in a
+ * residual error less than 3 percent.
+ */
+ /* Same thing applies for FreeBSD --GAW */
+ if (hz == 100) {
+ if (time_adj < 0)
+ time_adj -= -time_adj >> 2;
+ else
+ time_adj += time_adj >> 2;
+ }
+#endif /* SHIFT_HZ */
+
+ /* XXX - this is really bogus, but can't be fixed until
+ xntpd's idea of the system clock is fixed to know how
+ the user wants leap seconds handled; in the mean time,
+ we assume that users of NTP are running without proper
+ leap second support (this is now the default anyway) */
+ /*
+ * Leap second processing. If in leap-insert state at
+ * the end of the day, the system clock is set back one
+ * second; if in leap-delete state, the system clock is
+ * set ahead one second. The microtime() routine or
+	 * external clock driver will ensure that reported time
+ * is always monotonic. The ugly divides should be
+ * replaced.
+ */
+ switch (time_state) {
+
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+
+ case TIME_INS:
+ if (newtime.tv_sec % 86400 == 0) {
+ newtime.tv_sec--;
+ time_state = TIME_OOP;
+ }
+ break;
+
+ case TIME_DEL:
+ if ((newtime.tv_sec + 1) % 86400 == 0) {
+ newtime.tv_sec++;
+ time_state = TIME_WAIT;
+ }
+ break;
+
+ case TIME_OOP:
+ time_state = TIME_WAIT;
+ break;
+
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ }
+ }
+ CPU_CLOCKUPDATE(&time, &newtime);
}
- BUMPTIME(&time, delta);
- BUMPTIME(&mono_time, delta);
/*
* Process callouts at a very low cpu priority, so we don't keep the
@@ -256,7 +732,7 @@ softclock()
*/
void
timeout(ftn, arg, ticks)
- void (*ftn) __P((void *));
+ timeout_t ftn;
void *arg;
register int ticks;
{
@@ -301,7 +777,7 @@ timeout(ftn, arg, ticks)
void
untimeout(ftn, arg)
- void (*ftn) __P((void *));
+ timeout_t ftn;
void *arg;
{
register struct callout *p, *t;
@@ -323,6 +799,17 @@ untimeout(ftn, arg)
splx(s);
}
+void
+gettime(struct timeval *tvp)
+{
+ int s;
+
+ s = splclock();
+ /* XXX should use microtime() iff tv_usec is used. */
+ *tvp = time;
+ splx(s);
+}
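The new gettime() trades tv_usec accuracy for cost: it returns a consistent snapshot under splclock() without the overhead of microtime(), so the seconds field is trustworthy while the microseconds may lag (hence the XXX). A minimal caller, for illustration; deadline_sec and handle_expiry() are hypothetical:

    struct timeval now;

    gettime(&now);                      /* cheap, second-accurate */
    if (now.tv_sec >= deadline_sec)     /* coarse, second-level check */
            handle_expiry();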
+
/*
* Compute number of hz until specified time. Used to
* compute third argument to timeout() from an absolute time.
@@ -331,28 +818,54 @@ int
hzto(tv)
struct timeval *tv;
{
- register long ticks, sec;
+ register unsigned long ticks;
+ register long sec, usec;
int s;
/*
- * If number of milliseconds will fit in 32 bit arithmetic,
- * then compute number of milliseconds to time and scale to
- * ticks. Otherwise just compute number of hz in time, rounding
- * times greater than representible to maximum value.
+ * If the number of usecs in the whole seconds part of the time
+ * difference fits in a long, then the total number of usecs will
+ * fit in an unsigned long. Compute the total and convert it to
+ * ticks, rounding up and adding 1 to allow for the current tick
+ * to expire. Rounding also depends on unsigned long arithmetic
+ * to avoid overflow.
+ *
+ * Otherwise, if the number of ticks in the whole seconds part of
+ * the time difference fits in a long, then convert the parts to
+ * ticks separately and add, using similar rounding methods and
+ * overflow avoidance. This method would work in the previous
+ * case but it is slightly slower and assumes that hz is integral.
*
- * Delta times less than 25 days can be computed ``exactly''.
- * Maximum value for any timeout in 10ms ticks is 250 days.
+ * Otherwise, round the time difference down to the maximum
+ * representable value.
+ *
+ * If ints have 32 bits, then the maximum value for any timeout in
+ * 10ms ticks is 248 days.
*/
- s = splhigh();
+ s = splclock();
sec = tv->tv_sec - time.tv_sec;
- if (sec <= 0x7fffffff / 1000 - 1000)
- ticks = ((tv->tv_sec - time.tv_sec) * 1000 +
- (tv->tv_usec - time.tv_usec) / 1000) / (tick / 1000);
- else if (sec <= 0x7fffffff / hz)
- ticks = sec * hz;
- else
- ticks = 0x7fffffff;
+ usec = tv->tv_usec - time.tv_usec;
splx(s);
+ if (usec < 0) {
+ sec--;
+ usec += 1000000;
+ }
+ if (sec < 0) {
+#ifdef DIAGNOSTIC
+ printf("hzto: negative time difference %ld sec %ld usec\n",
+ sec, usec);
+#endif
+ ticks = 1;
+ } else if (sec <= LONG_MAX / 1000000)
+ ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
+ / tick + 1;
+ else if (sec <= LONG_MAX / hz)
+ ticks = sec * hz
+ + ((unsigned long)usec + (tick - 1)) / tick + 1;
+ else
+ ticks = LONG_MAX;
+ if (ticks > INT_MAX)
+ ticks = INT_MAX;
return (ticks);
}
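A worked instance of the rounding above, assuming hz = 100 so tick = 10000 (microseconds per tick), with a request 25 ms in the future: the first branch applies, and the timeout can only fire late, never early:

    long sec = 0, usec = 25000, tick = 10000;   /* hz = 100 */
    unsigned long t;

    /* (0*1000000 + 25000 + 9999) / 10000 + 1  ==  3 + 1  ==  4 ticks */
    t = (sec * 1000000 + (unsigned long)usec + (tick - 1)) / tick + 1;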
@@ -399,8 +912,6 @@ stopprofclock(p)
}
}
-int dk_ndrive = DK_NDRIVE;
-
/*
* Statistics clock. Grab profile sample, and if divider reaches 0,
* do process and kernel statistics.
@@ -414,6 +925,10 @@ statclock(frame)
#endif
register struct proc *p;
register int i;
+ struct pstats *pstats;
+ long rss;
+ struct rusage *ru;
+ struct vmspace *vm;
if (CLKF_USERMODE(frame)) {
p = curproc;
@@ -505,18 +1020,29 @@ statclock(frame)
if (p->p_priority >= PUSER)
p->p_priority = p->p_usrpri;
}
+
+ /* Update resource usage integrals and maximums. */
+ if ((pstats = p->p_stats) != NULL &&
+ (ru = &pstats->p_ru) != NULL &&
+ (vm = p->p_vmspace) != NULL) {
+ ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024;
+ ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024;
+ ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024;
+ rss = vm->vm_pmap.pm_stats.resident_count *
+ PAGE_SIZE / 1024;
+ if (ru->ru_maxrss < rss)
+ ru->ru_maxrss = rss;
+ }
}
}
/*
* Return information about system clocks.
*/
-sysctl_clockrate(where, sizep)
- register char *where;
- size_t *sizep;
+static int
+sysctl_kern_clockrate SYSCTL_HANDLER_ARGS
{
struct clockinfo clkinfo;
-
/*
* Construct clockinfo structure.
*/
@@ -524,5 +1050,254 @@ sysctl_clockrate(where, sizep)
clkinfo.tick = tick;
clkinfo.profhz = profhz;
clkinfo.stathz = stathz ? stathz : hz;
- return (sysctl_rdstruct(where, sizep, NULL, &clkinfo, sizeof(clkinfo)));
+ return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
+}
+
+SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
+ 0, 0, sysctl_kern_clockrate, "S,clockinfo","");
+
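With the handler exported as a SYSCTL_PROC node, userland reads the same structure through sysctl(3) instead of the old string-copy interface. A sketch of a consumer program (userland code, not part of this file):

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <sys/time.h>
    #include <stdio.h>

    int
    main(void)
    {
            int mib[2] = { CTL_KERN, KERN_CLOCKRATE };
            struct clockinfo ci;
            size_t len = sizeof(ci);

            if (sysctl(mib, 2, &ci, &len, NULL, 0) == -1)
                    return (1);
            printf("hz=%d tick=%d stathz=%d profhz=%d\n",
                ci.hz, ci.tick, ci.stathz, ci.profhz);
            return (0);
    }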
+#ifdef PPS_SYNC
+/*
+ * hardpps() - discipline CPU clock oscillator to external PPS signal
+ *
+ * This routine is called at each PPS interrupt in order to discipline
+ * the CPU clock oscillator to the PPS signal. It measures the PPS phase
+ * and leaves it in a handy spot for the hardclock() routine. It
+ * integrates successive PPS phase differences and calculates the
+ * frequency offset. This is used in hardclock() to discipline the CPU
+ * clock oscillator so that intrinsic frequency error is cancelled out.
+ * The code requires the caller to capture the time and hardware counter
+ * value at the on-time PPS signal transition.
+ *
+ * Note that, on some Unix systems, this routine runs at an interrupt
+ * priority level higher than the timer interrupt routine hardclock().
+ * Therefore, the variables used are distinct from the hardclock()
+ * variables, with certain exceptions: the PPS frequency pps_freq
+ * and phase pps_offset variables are determined by this routine and
+ * updated atomically. The time_tolerance variable can be considered a
+ * constant, since it is infrequently changed, and then only when the
+ * PPS signal is disabled. The watchdog counter pps_valid is updated
+ * once per second by hardclock() and is atomically cleared in this
+ * routine.
+ */
+void
+hardpps(tvp, usec)
+ struct timeval *tvp; /* time at PPS */
+ long usec; /* hardware counter at PPS */
+{
+ long u_usec, v_usec, bigtick;
+ long cal_sec, cal_usec;
+
+ /*
+ * An occasional glitch can be produced when the PPS interrupt
+ * occurs in the hardclock() routine before the time variable is
+ * updated. Here the offset is discarded when the difference
+ * between it and the last one is greater than tick/2, but not
+ * if the interval since the first discard exceeds 30 s.
+ */
+ time_status |= STA_PPSSIGNAL;
+ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
+ pps_valid = 0;
+ u_usec = -tvp->tv_usec;
+ if (u_usec < -500000)
+ u_usec += 1000000;
+ v_usec = pps_offset - u_usec;
+ if (v_usec < 0)
+ v_usec = -v_usec;
+ if (v_usec > (tick >> 1)) {
+ if (pps_glitch > MAXGLITCH) {
+ pps_glitch = 0;
+ pps_tf[2] = u_usec;
+ pps_tf[1] = u_usec;
+ } else {
+ pps_glitch++;
+ u_usec = pps_offset;
+ }
+ } else
+ pps_glitch = 0;
+
+ /*
+ * A three-stage median filter is used to help deglitch the pps
+ * time. The median sample becomes the time offset estimate; the
+ * difference between the other two samples becomes the time
+ * dispersion (jitter) estimate.
+ */
+ pps_tf[2] = pps_tf[1];
+ pps_tf[1] = pps_tf[0];
+ pps_tf[0] = u_usec;
+ if (pps_tf[0] > pps_tf[1]) {
+ if (pps_tf[1] > pps_tf[2]) {
+ pps_offset = pps_tf[1]; /* 0 1 2 */
+ v_usec = pps_tf[0] - pps_tf[2];
+ } else if (pps_tf[2] > pps_tf[0]) {
+ pps_offset = pps_tf[0]; /* 2 0 1 */
+ v_usec = pps_tf[2] - pps_tf[1];
+ } else {
+ pps_offset = pps_tf[2]; /* 0 2 1 */
+ v_usec = pps_tf[0] - pps_tf[1];
+ }
+ } else {
+ if (pps_tf[1] < pps_tf[2]) {
+ pps_offset = pps_tf[1]; /* 2 1 0 */
+ v_usec = pps_tf[2] - pps_tf[0];
+ } else if (pps_tf[2] < pps_tf[0]) {
+ pps_offset = pps_tf[0]; /* 1 0 2 */
+ v_usec = pps_tf[1] - pps_tf[2];
+ } else {
+ pps_offset = pps_tf[2]; /* 1 2 0 */
+ v_usec = pps_tf[1] - pps_tf[0];
+ }
+ }
+ if (v_usec > MAXTIME)
+ pps_jitcnt++;
+ v_usec = (v_usec << PPS_AVG) - pps_jitter;
+ if (v_usec < 0)
+ pps_jitter -= -v_usec >> PPS_AVG;
+ else
+ pps_jitter += v_usec >> PPS_AVG;
+ if (pps_jitter > (MAXTIME >> 1))
+ time_status |= STA_PPSJITTER;
+
+ /*
+ * During the calibration interval adjust the starting time when
+ * the tick overflows. At the end of the interval compute the
+ * duration of the interval and the difference of the hardware
+ * counters at the beginning and end of the interval. This code
+	 * is deliciously complicated by the fact that valid differences may
+ * exceed the value of tick when using long calibration
+ * intervals and small ticks. Note that the counter can be
+ * greater than tick if caught at just the wrong instant, but
+ * the values returned and used here are correct.
+ */
+ bigtick = (long)tick << SHIFT_USEC;
+ pps_usec -= pps_freq;
+ if (pps_usec >= bigtick)
+ pps_usec -= bigtick;
+ if (pps_usec < 0)
+ pps_usec += bigtick;
+ pps_time.tv_sec++;
+ pps_count++;
+ if (pps_count < (1 << pps_shift))
+ return;
+ pps_count = 0;
+ pps_calcnt++;
+ u_usec = usec << SHIFT_USEC;
+ v_usec = pps_usec - u_usec;
+ if (v_usec >= bigtick >> 1)
+ v_usec -= bigtick;
+ if (v_usec < -(bigtick >> 1))
+ v_usec += bigtick;
+ if (v_usec < 0)
+ v_usec = -(-v_usec >> pps_shift);
+ else
+ v_usec = v_usec >> pps_shift;
+ pps_usec = u_usec;
+ cal_sec = tvp->tv_sec;
+ cal_usec = tvp->tv_usec;
+ cal_sec -= pps_time.tv_sec;
+ cal_usec -= pps_time.tv_usec;
+ if (cal_usec < 0) {
+ cal_usec += 1000000;
+ cal_sec--;
+ }
+ pps_time = *tvp;
+
+ /*
+ * Check for lost interrupts, noise, excessive jitter and
+ * excessive frequency error. The number of timer ticks during
+ * the interval may vary +-1 tick. Add to this a margin of one
+ * tick for the PPS signal jitter and maximum frequency
+ * deviation. If the limits are exceeded, the calibration
+ * interval is reset to the minimum and we start over.
+ */
+ u_usec = (long)tick << 1;
+ if (!((cal_sec == -1 && cal_usec > (1000000 - u_usec))
+ || (cal_sec == 0 && cal_usec < u_usec))
+ || v_usec > time_tolerance || v_usec < -time_tolerance) {
+ pps_errcnt++;
+ pps_shift = PPS_SHIFT;
+ pps_intcnt = 0;
+ time_status |= STA_PPSERROR;
+ return;
+ }
+
+ /*
+ * A three-stage median filter is used to help deglitch the pps
+ * frequency. The median sample becomes the frequency offset
+ * estimate; the difference between the other two samples
+ * becomes the frequency dispersion (stability) estimate.
+ */
+ pps_ff[2] = pps_ff[1];
+ pps_ff[1] = pps_ff[0];
+ pps_ff[0] = v_usec;
+ if (pps_ff[0] > pps_ff[1]) {
+ if (pps_ff[1] > pps_ff[2]) {
+ u_usec = pps_ff[1]; /* 0 1 2 */
+ v_usec = pps_ff[0] - pps_ff[2];
+ } else if (pps_ff[2] > pps_ff[0]) {
+ u_usec = pps_ff[0]; /* 2 0 1 */
+ v_usec = pps_ff[2] - pps_ff[1];
+ } else {
+ u_usec = pps_ff[2]; /* 0 2 1 */
+ v_usec = pps_ff[0] - pps_ff[1];
+ }
+ } else {
+ if (pps_ff[1] < pps_ff[2]) {
+ u_usec = pps_ff[1]; /* 2 1 0 */
+ v_usec = pps_ff[2] - pps_ff[0];
+ } else if (pps_ff[2] < pps_ff[0]) {
+ u_usec = pps_ff[0]; /* 1 0 2 */
+ v_usec = pps_ff[1] - pps_ff[2];
+ } else {
+ u_usec = pps_ff[2]; /* 1 2 0 */
+ v_usec = pps_ff[1] - pps_ff[0];
+ }
+ }
+
+ /*
+ * Here the frequency dispersion (stability) is updated. If it
+ * is less than one-fourth the maximum (MAXFREQ), the frequency
+ * offset is updated as well, but clamped to the tolerance. It
+ * will be processed later by the hardclock() routine.
+ */
+ v_usec = (v_usec >> 1) - pps_stabil;
+ if (v_usec < 0)
+ pps_stabil -= -v_usec >> PPS_AVG;
+ else
+ pps_stabil += v_usec >> PPS_AVG;
+ if (pps_stabil > MAXFREQ >> 2) {
+ pps_stbcnt++;
+ time_status |= STA_PPSWANDER;
+ return;
+ }
+ if (time_status & STA_PPSFREQ) {
+ if (u_usec < 0) {
+ pps_freq -= -u_usec >> PPS_AVG;
+ if (pps_freq < -time_tolerance)
+ pps_freq = -time_tolerance;
+ u_usec = -u_usec;
+ } else {
+ pps_freq += u_usec >> PPS_AVG;
+ if (pps_freq > time_tolerance)
+ pps_freq = time_tolerance;
+ }
+ }
+
+ /*
+ * Here the calibration interval is adjusted. If the maximum
+ * time difference is greater than tick / 4, reduce the interval
+ * by half. If this is not the case for four consecutive
+ * intervals, double the interval.
+ */
+ if (u_usec << pps_shift > bigtick >> 2) {
+ pps_intcnt = 0;
+ if (pps_shift > PPS_SHIFT)
+ pps_shift--;
+ } else if (pps_intcnt >= 4) {
+ pps_intcnt = 0;
+ if (pps_shift < PPS_SHIFTMAX)
+ pps_shift++;
+ } else
+ pps_intcnt++;
}
+#endif /* PPS_SYNC */
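
The three-stage median filter above appears twice: once for the PPS time
offset (pps_tf[]) and once for the PPS frequency offset (pps_ff[]). A
minimal standalone sketch of the idea, illustrative only and not part of
the patch: the median of the last three samples becomes the estimate, and
the spread between the largest and smallest becomes the dispersion
(jitter) estimate.

#include <stdio.h>

/* Median of three samples; *disp receives max - min (the spread). */
static long
median3(long a, long b, long c, long *disp)
{
        long lo = a, mid = b, hi = c, t;

        /* Three compare-and-swap steps fully sort three values. */
        if (lo > mid) { t = lo; lo = mid; mid = t; }
        if (mid > hi) { t = mid; mid = hi; hi = t; }
        if (lo > mid) { t = lo; lo = mid; mid = t; }

        *disp = hi - lo;
        return (mid);
}

int
main(void)
{
        long disp, m = median3(17, 3, 9, &disp);

        printf("median %ld, dispersion %ld\n", m, disp);  /* 9, 14 */
        return (0);
}
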
diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c
new file mode 100644
index 0000000..bee8b87
--- /dev/null
+++ b/sys/kern/kern_conf.c
@@ -0,0 +1,208 @@
+/*-
+ * Parts Copyright (c) 1995 Terrence R. Lambert
+ * Copyright (c) 1995 Julian R. Elischer
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Terrence R. Lambert.
+ * 4. The name Terrence R. Lambert may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY Julian R. Elischer ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include <sys/param.h>
+#include <sys/types.h>
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/vnode.h>
+
+#define NUMBDEV 128
+#define NUMCDEV 256
+#define bdevsw_ALLOCSTART (NUMBDEV/2)
+#define cdevsw_ALLOCSTART (NUMCDEV/2)
+
+struct bdevsw *bdevsw[NUMBDEV];
+int nblkdev = NUMBDEV;
+struct cdevsw *cdevsw[NUMCDEV];
+int nchrdev = NUMCDEV;
+
+
+
+/*
+ * Routine to determine if a device is a disk.
+ *
+ * KLUDGE XXX add flags to cdevsw entries for disks XXX
+ * A minimal stub routine can always return 0.
+ */
+int
+isdisk(dev, type)
+ dev_t dev;
+ int type;
+{
+
+ switch (major(dev)) {
+ case 15: /* VBLK: vn, VCHR: cd */
+ return (1);
+ case 0: /* wd */
+ case 2: /* fd */
+ case 4: /* sd */
+ case 6: /* cd */
+ case 7: /* mcd */
+ case 16: /* scd */
+ case 17: /* matcd */
+ case 18: /* ata */
+ case 19: /* wcd */
+ case 20: /* od */
+ case 22: /* gd */
+ if (type == VBLK)
+ return (1);
+ return (0);
+ case 3: /* wd */
+ case 9: /* fd */
+ case 13: /* sd */
+ case 29: /* mcd */
+ case 43: /* vn */
+ case 45: /* scd */
+ case 46: /* matcd */
+ case 69: /* wcd */
+ case 70: /* od */
+ case 78: /* gd */
+ if (type == VCHR)
+ return (1);
+ /* fall through */
+ default:
+ return (0);
+ }
+ /* NOTREACHED */
+}
+
+
+/*
+ * Routine to convert from character to block device number.
+ *
+ * A minimal stub routine can always return NODEV.
+ */
+dev_t
+chrtoblk(dev_t dev)
+{
+ struct bdevsw *bd;
+ struct cdevsw *cd;
+
+ if ((cd = cdevsw[major(dev)])) {
+ if ( (bd = cd->d_bdev) )
+ return(makedev(bd->d_maj,minor(dev)));
+ }
+ return(NODEV);
+}
+
+/*
+ * (re)place an entry in the bdevsw or cdevsw table
+ * return the slot used in major(*descrip)
+ */
+#define ADDENTRY(TTYPE,NXXXDEV,ALLOCSTART) \
+int TTYPE##_add(dev_t *descrip, \
+ struct TTYPE *newentry, \
+ struct TTYPE **oldentry) \
+{ \
+ int i ; \
+ if ( (int)*descrip == NODEV) { /* auto (0 is valid) */ \
+ /* \
+ * Search the table looking for a slot... \
+ */ \
+ for (i = ALLOCSTART; i < NXXXDEV; i++) \
+ if (TTYPE[i] == NULL) \
+ break; /* found one! */ \
+ /* out of allocable slots? */ \
+ if (i >= NXXXDEV) { \
+ return ENFILE; \
+ } \
+ } else { /* assign */ \
+ i = major(*descrip); \
+ if (i < 0 || i >= NXXXDEV) { \
+ return EINVAL; \
+ } \
+ } \
+ \
+ /* maybe save old */ \
+ if (oldentry) { \
+ *oldentry = TTYPE[i]; \
+ } \
+ if (newentry) \
+ newentry->d_maj = i; \
+ /* replace with new */ \
+ TTYPE[i] = newentry; \
+ \
+ /* done! let them know where we put it */ \
+ *descrip = makedev(i,0); \
+ return 0; \
+} \
+
+ADDENTRY(bdevsw, nblkdev,bdevsw_ALLOCSTART)
+ADDENTRY(cdevsw, nchrdev,cdevsw_ALLOCSTART)
+
+/*
+ * Fill in a block device's companion cdevsw entry, so the device can
+ * also be reached through the raw (character) interface.
+ */
+
+void
+cdevsw_make(struct bdevsw *from)
+{
+ struct cdevsw *to = from->d_cdev;
+
+ if (!to)
+ panic("No target cdevsw in bdevsw");
+ to->d_open = from->d_open;
+ to->d_close = from->d_close;
+ to->d_read = rawread;
+ to->d_write = rawwrite;
+ to->d_ioctl = from->d_ioctl;
+ to->d_stop = nostop;
+ to->d_reset = nullreset;
+ to->d_devtotty = nodevtotty;
+ to->d_select = seltrue;
+ to->d_mmap = nommap;
+ to->d_strategy = from->d_strategy;
+ to->d_name = from->d_name;
+ to->d_bdev = from;
+ to->d_maj = -1;
+}
+
+void
+bdevsw_add_generic(int bdev, int cdev, struct bdevsw *bdevsw)
+{
+ dev_t dev;
+ /*
+ * XXX hack alert.
+ */
+ if (isdisk(makedev(bdev, 0), VBLK) && bdevsw->d_flags != D_DISK) {
+ printf("bdevsw_add_generic: adding D_DISK flag for device %d\n",
+ bdev);
+ bdevsw->d_flags = D_DISK;
+ }
+ cdevsw_make(bdevsw);
+ dev = makedev(cdev, 0);
+ cdevsw_add(&dev, bdevsw->d_cdev, NULL);
+ dev = makedev(bdev, 0);
+ bdevsw_add(&dev, bdevsw , NULL);
+}
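
Usage sketch for the new device switch routines (the driver name is
hypothetical, not part of the patch). Passing a descriptor equal to
NODEV asks bdevsw_add()/cdevsw_add() to hunt for a free slot starting at
the midpoint of the table; passing makedev(maj, 0) claims that fixed
slot, replacing whatever entry was there (the old entry can be saved via
the third argument).

extern struct bdevsw foo_bdevsw;        /* hypothetical driver switch */

void
foo_drvinit(void)
{
        dev_t dev = NODEV;              /* NODEV requests auto-allocation */

        if (bdevsw_add(&dev, &foo_bdevsw, NULL) == 0)
                printf("foo: allocated block major %d\n", major(dev));
        else
                printf("foo: bdevsw table full\n");
}
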
diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c
index 3f2e424..a5c6d94 100644
--- a/sys/kern/kern_descrip.c
+++ b/sys/kern/kern_descrip.c
@@ -35,111 +35,105 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95
+ * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94
+ * $Id$
*/
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/conf.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
+#include <sys/sysctl.h>
#include <sys/vnode.h>
#include <sys/proc.h>
#include <sys/file.h>
-#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/stat.h>
-#include <sys/ioctl.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
#include <sys/fcntl.h>
#include <sys/malloc.h>
-#include <sys/syslog.h>
#include <sys/unistd.h>
#include <sys/resourcevar.h>
+#include <sys/pipe.h>
-#include <sys/mount.h>
-#include <sys/syscallargs.h>
+#include <vm/vm.h>
+#include <vm/vm_extern.h>
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
+
+static d_open_t fdopen;
+#define NUMFDESC 64
+
+#define CDEV_MAJOR 22
+static struct cdevsw fildesc_cdevsw =
+ { fdopen, noclose, noread, nowrite, /*22*/
+ noioc, nostop, nullreset, nodevtotty,/*fd(!=Fd)*/
+ noselect, nommap, nostrat };
+
+static int finishdup(struct filedesc *fdp, int old, int new, int *retval);
/*
* Descriptor management.
*/
struct filelist filehead; /* head of list of open files */
int nfiles; /* actual number of open files */
+extern int cmask;
/*
* System calls on descriptors.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct getdtablesize_args {
+ int dummy;
+};
+#endif
/* ARGSUSED */
int
getdtablesize(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct getdtablesize_args *uap;
+ int *retval;
{
- *retval = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
+ *retval = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
return (0);
}
/*
- * Duplicate a file descriptor.
- */
-/* ARGSUSED */
-int
-dup(p, uap, retval)
- struct proc *p;
- struct dup_args /* {
- syscallarg(u_int) fd;
- } */ *uap;
- register_t *retval;
-{
- register struct filedesc *fdp;
- u_int old;
- int new, error;
-
- old = SCARG(uap, fd);
- /*
- * XXX Compatibility
- */
- if (old &~ 077) {
- SCARG(uap, fd) &= 077;
- return (dup2(p, uap, retval));
- }
-
- fdp = p->p_fd;
- if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL)
- return (EBADF);
- if (error = fdalloc(p, 0, &new))
- return (error);
- return (finishdup(fdp, (int)old, new, retval));
-}
-
-/*
* Duplicate a file descriptor to a particular value.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct dup2_args {
+ u_int from;
+ u_int to;
+};
+#endif
/* ARGSUSED */
int
dup2(p, uap, retval)
struct proc *p;
- struct dup2_args /* {
- syscallarg(u_int) from;
- syscallarg(u_int) to;
- } */ *uap;
- register_t *retval;
+ struct dup2_args *uap;
+ int *retval;
{
register struct filedesc *fdp = p->p_fd;
- register int old = SCARG(uap, from), new = SCARG(uap, to);
+ register u_int old = uap->from, new = uap->to;
int i, error;
if (old >= fdp->fd_nfiles ||
fdp->fd_ofiles[old] == NULL ||
new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
- new >= maxfiles)
+ new >= maxfilesperproc)
return (EBADF);
if (old == new) {
*retval = new;
return (0);
}
if (new >= fdp->fd_nfiles) {
- if (error = fdalloc(p, new, &i))
+ if ((error = fdalloc(p, new, &i)))
return (error);
if (new != i)
panic("dup2: fdalloc");
@@ -155,20 +149,58 @@ dup2(p, uap, retval)
}
/*
+ * Duplicate a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct dup_args {
+ u_int fd;
+};
+#endif
+/* ARGSUSED */
+int
+dup(p, uap, retval)
+ struct proc *p;
+ struct dup_args *uap;
+ int *retval;
+{
+ register struct filedesc *fdp;
+ u_int old;
+ int new, error;
+
+ old = uap->fd;
+
+#if 0
+ /*
+ * XXX Compatibility
+ */
+ if (old &~ 077) { uap->fd &= 077; return (dup2(p, uap, retval)); }
+#endif
+
+ fdp = p->p_fd;
+ if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL)
+ return (EBADF);
+ if ((error = fdalloc(p, 0, &new)))
+ return (error);
+ return (finishdup(fdp, (int)old, new, retval));
+}
+
+/*
* The file control system call.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct fcntl_args {
+ int fd;
+ int cmd;
+ int arg;
+};
+#endif
/* ARGSUSED */
int
fcntl(p, uap, retval)
struct proc *p;
- register struct fcntl_args /* {
- syscallarg(int) fd;
- syscallarg(int) cmd;
- syscallarg(void *) arg;
- } */ *uap;
- register_t *retval;
+ register struct fcntl_args *uap;
+ int *retval;
{
- int fd = SCARG(uap, fd);
register struct filedesc *fdp = p->p_fd;
register struct file *fp;
register char *pop;
@@ -177,27 +209,27 @@ fcntl(p, uap, retval)
struct flock fl;
u_int newmin;
- if ((u_int)fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL)
+ if ((unsigned)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
return (EBADF);
- pop = &fdp->fd_ofileflags[fd];
- switch (SCARG(uap, cmd)) {
+ pop = &fdp->fd_ofileflags[uap->fd];
+ switch (uap->cmd) {
case F_DUPFD:
- newmin = (long)SCARG(uap, arg);
+ newmin = uap->arg;
if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur ||
- newmin >= maxfiles)
+ newmin >= maxfilesperproc)
return (EINVAL);
- if (error = fdalloc(p, newmin, &i))
+ if ((error = fdalloc(p, newmin, &i)))
return (error);
- return (finishdup(fdp, fd, i, retval));
+ return (finishdup(fdp, uap->fd, i, retval));
case F_GETFD:
*retval = *pop & 1;
return (0);
case F_SETFD:
- *pop = (*pop &~ 1) | ((long)SCARG(uap, arg) & 1);
+ *pop = (*pop &~ 1) | (uap->arg & 1);
return (0);
case F_GETFL:
@@ -206,7 +238,7 @@ fcntl(p, uap, retval)
case F_SETFL:
fp->f_flag &= ~FCNTLFLAGS;
- fp->f_flag |= FFLAGS((long)SCARG(uap, arg)) & FCNTLFLAGS;
+ fp->f_flag |= FFLAGS(uap->arg) & FCNTLFLAGS;
tmp = fp->f_flag & FNONBLOCK;
error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p);
if (error)
@@ -232,20 +264,19 @@ fcntl(p, uap, retval)
case F_SETOWN:
if (fp->f_type == DTYPE_SOCKET) {
- ((struct socket *)fp->f_data)->so_pgid =
- (long)SCARG(uap, arg);
+ ((struct socket *)fp->f_data)->so_pgid = uap->arg;
return (0);
}
- if ((long)SCARG(uap, arg) <= 0) {
- SCARG(uap, arg) = (void *)(-(long)SCARG(uap, arg));
+ if (uap->arg <= 0) {
+ uap->arg = -uap->arg;
} else {
- struct proc *p1 = pfind((long)SCARG(uap, arg));
+ struct proc *p1 = pfind(uap->arg);
if (p1 == 0)
return (ESRCH);
- SCARG(uap, arg) = (void *)(long)p1->p_pgrp->pg_id;
+ uap->arg = p1->p_pgrp->pg_id;
}
return ((*fp->f_ops->fo_ioctl)
- (fp, TIOCSPGRP, (caddr_t)&SCARG(uap, arg), p));
+ (fp, TIOCSPGRP, (caddr_t)&uap->arg, p));
case F_SETLKW:
flg |= F_WAIT;
@@ -256,8 +287,7 @@ fcntl(p, uap, retval)
return (EBADF);
vp = (struct vnode *)fp->f_data;
/* Copy in the lock structure */
- error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&fl,
- sizeof (fl));
+ error = copyin((caddr_t)uap->arg, (caddr_t)&fl, sizeof (fl));
if (error)
return (error);
if (fl.l_whence == SEEK_CUR)
@@ -289,16 +319,17 @@ fcntl(p, uap, retval)
return (EBADF);
vp = (struct vnode *)fp->f_data;
/* Copy in the lock structure */
- error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&fl,
- sizeof (fl));
+ error = copyin((caddr_t)uap->arg, (caddr_t)&fl, sizeof (fl));
if (error)
return (error);
+ if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK &&
+ fl.l_type != F_UNLCK)
+ return (EINVAL);
if (fl.l_whence == SEEK_CUR)
fl.l_start += fp->f_offset;
- if (error = VOP_ADVLOCK(vp, (caddr_t)p, F_GETLK, &fl, F_POSIX))
+ if ((error = VOP_ADVLOCK(vp,(caddr_t)p,F_GETLK,&fl,F_POSIX)))
return (error);
- return (copyout((caddr_t)&fl, (caddr_t)SCARG(uap, arg),
- sizeof (fl)));
+ return (copyout((caddr_t)&fl, (caddr_t)uap->arg, sizeof (fl)));
default:
return (EINVAL);
@@ -309,11 +340,10 @@ fcntl(p, uap, retval)
/*
* Common code for dup, dup2, and fcntl(F_DUPFD).
*/
-int
+static int
finishdup(fdp, old, new, retval)
register struct filedesc *fdp;
- register int old, new;
- register_t *retval;
+ register int old, new, *retval;
{
register struct file *fp;
@@ -330,21 +360,24 @@ finishdup(fdp, old, new, retval)
/*
* Close a file descriptor.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct close_args {
+ int fd;
+};
+#endif
/* ARGSUSED */
int
close(p, uap, retval)
struct proc *p;
- struct close_args /* {
- syscallarg(int) fd;
- } */ *uap;
- register_t *retval;
+ struct close_args *uap;
+ int *retval;
{
- int fd = SCARG(uap, fd);
register struct filedesc *fdp = p->p_fd;
register struct file *fp;
+ register int fd = uap->fd;
register u_char *pf;
- if ((u_int)fd >= fdp->fd_nfiles ||
+ if ((unsigned)fd >= fdp->fd_nfiles ||
(fp = fdp->fd_ofiles[fd]) == NULL)
return (EBADF);
pf = (u_char *)&fdp->fd_ofileflags[fd];
@@ -363,28 +396,31 @@ close(p, uap, retval)
/*
* Return status information about a file descriptor.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct ofstat_args {
+ int fd;
+ struct ostat *sb;
+};
+#endif
/* ARGSUSED */
int
-compat_43_fstat(p, uap, retval)
+ofstat(p, uap, retval)
struct proc *p;
- register struct compat_43_fstat_args /* {
- syscallarg(int) fd;
- syscallarg(struct ostat *) sb;
- } */ *uap;
- register_t *retval;
+ register struct ofstat_args *uap;
+ int *retval;
{
- int fd = SCARG(uap, fd);
register struct filedesc *fdp = p->p_fd;
register struct file *fp;
struct stat ub;
struct ostat oub;
int error;
- if ((u_int)fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL)
+ if ((unsigned)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
return (EBADF);
switch (fp->f_type) {
+ case DTYPE_FIFO:
case DTYPE_VNODE:
error = vn_stat((struct vnode *)fp->f_data, &ub, p);
break;
@@ -393,14 +429,19 @@ compat_43_fstat(p, uap, retval)
error = soo_stat((struct socket *)fp->f_data, &ub);
break;
+#ifndef OLD_PIPE
+ case DTYPE_PIPE:
+ error = pipe_stat((struct pipe *)fp->f_data, &ub);
+ break;
+#endif
+
default:
panic("ofstat");
/*NOTREACHED*/
}
cvtstat(&ub, &oub);
if (error == 0)
- error = copyout((caddr_t)&oub, (caddr_t)SCARG(uap, sb),
- sizeof (oub));
+ error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub));
return (error);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */
@@ -408,27 +449,30 @@ compat_43_fstat(p, uap, retval)
/*
* Return status information about a file descriptor.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct fstat_args {
+ int fd;
+ struct stat *sb;
+};
+#endif
/* ARGSUSED */
int
fstat(p, uap, retval)
struct proc *p;
- register struct fstat_args /* {
- syscallarg(int) fd;
- syscallarg(struct stat *) sb;
- } */ *uap;
- register_t *retval;
+ register struct fstat_args *uap;
+ int *retval;
{
- int fd = SCARG(uap, fd);
register struct filedesc *fdp = p->p_fd;
register struct file *fp;
struct stat ub;
int error;
- if ((u_int)fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL)
+ if ((unsigned)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
return (EBADF);
switch (fp->f_type) {
+ case DTYPE_FIFO:
case DTYPE_VNODE:
error = vn_stat((struct vnode *)fp->f_data, &ub, p);
break;
@@ -437,48 +481,59 @@ fstat(p, uap, retval)
error = soo_stat((struct socket *)fp->f_data, &ub);
break;
+#ifndef OLD_PIPE
+ case DTYPE_PIPE:
+ error = pipe_stat((struct pipe *)fp->f_data, &ub);
+ break;
+#endif
+
default:
panic("fstat");
/*NOTREACHED*/
}
if (error == 0)
- error = copyout((caddr_t)&ub, (caddr_t)SCARG(uap, sb),
- sizeof (ub));
+ error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub));
return (error);
}
/*
* Return pathconf information about a file descriptor.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct fpathconf_args {
+ int fd;
+ int name;
+};
+#endif
/* ARGSUSED */
int
fpathconf(p, uap, retval)
struct proc *p;
- register struct fpathconf_args /* {
- syscallarg(int) fd;
- syscallarg(int) name;
- } */ *uap;
- register_t *retval;
+ register struct fpathconf_args *uap;
+ int *retval;
{
- int fd = SCARG(uap, fd);
struct filedesc *fdp = p->p_fd;
struct file *fp;
struct vnode *vp;
- if ((u_int)fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL)
+ if ((unsigned)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
return (EBADF);
switch (fp->f_type) {
+#ifndef OLD_PIPE
+ case DTYPE_PIPE:
+#endif
case DTYPE_SOCKET:
- if (SCARG(uap, name) != _PC_PIPE_BUF)
+ if (uap->name != _PC_PIPE_BUF)
return (EINVAL);
*retval = PIPE_BUF;
return (0);
+ case DTYPE_FIFO:
case DTYPE_VNODE:
vp = (struct vnode *)fp->f_data;
- return (VOP_PATHCONF(vp, SCARG(uap, name), retval));
+ return (VOP_PATHCONF(vp, uap->name, retval));
default:
panic("fpathconf");
@@ -489,7 +544,8 @@ fpathconf(p, uap, retval)
/*
* Allocate a file descriptor for the process.
*/
-int fdexpand;
+static int fdexpand;
+SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, "");
int
fdalloc(p, want, result)
@@ -508,7 +564,7 @@ fdalloc(p, want, result)
* of want or fd_freefile. If that fails, consider
* expanding the ofile array.
*/
- lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
+ lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
for (;;) {
last = min(fdp->fd_nfiles, lim);
if ((i = want) < fdp->fd_freefile)
@@ -554,6 +610,7 @@ fdalloc(p, want, result)
fdp->fd_nfiles = nfiles;
fdexpand++;
}
+ return (0);
}
/*
@@ -567,13 +624,15 @@ fdavail(p, n)
{
register struct filedesc *fdp = p->p_fd;
register struct file **fpp;
- register int i, lim;
+ register int i, lim, last;
- lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles);
+ lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc);
if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0)
return (1);
+
+ last = min(fdp->fd_nfiles, lim);
fpp = &fdp->fd_ofiles[fdp->fd_freefile];
- for (i = fdp->fd_nfiles - fdp->fd_freefile; --i >= 0; fpp++)
+ for (i = last - fdp->fd_freefile; --i >= 0; fpp++)
if (*fpp == NULL && --n <= 0)
return (1);
return (0);
@@ -592,7 +651,7 @@ falloc(p, resultfp, resultfd)
register struct file *fp, *fq;
int error, i;
- if (error = fdalloc(p, 0, &i))
+ if ((error = fdalloc(p, 0, &i)))
return (error);
if (nfiles >= maxfiles) {
tablefull("file");
@@ -607,7 +666,7 @@ falloc(p, resultfp, resultfd)
nfiles++;
MALLOC(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK);
bzero(fp, sizeof(struct file));
- if (fq = p->p_fd->fd_ofiles[0]) {
+ if ((fq = p->p_fd->fd_ofiles[0])) {
LIST_INSERT_AFTER(fq, fp, f_list);
} else {
LIST_INSERT_HEAD(&filehead, fp, f_list);
@@ -615,6 +674,7 @@ falloc(p, resultfp, resultfd)
p->p_fd->fd_ofiles[i] = fp;
fp->f_count = 1;
fp->f_cred = p->p_ucred;
+ fp->f_seqcount = 1;
crhold(fp->f_cred);
if (resultfp)
*resultfp = fp;
@@ -630,8 +690,6 @@ void
ffree(fp)
register struct file *fp;
{
- register struct file *fq;
-
LIST_REMOVE(fp, f_list);
crfree(fp->f_cred);
#ifdef DIAGNOSTIC
@@ -642,6 +700,49 @@ ffree(fp)
}
/*
+ * Build a new filedesc structure.
+ */
+struct filedesc *
+fdinit(p)
+ struct proc *p;
+{
+ register struct filedesc0 *newfdp;
+ register struct filedesc *fdp = p->p_fd;
+
+ MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0),
+ M_FILEDESC, M_WAITOK);
+ bzero(newfdp, sizeof(struct filedesc0));
+ newfdp->fd_fd.fd_cdir = fdp->fd_cdir;
+ VREF(newfdp->fd_fd.fd_cdir);
+ newfdp->fd_fd.fd_rdir = fdp->fd_rdir;
+ if (newfdp->fd_fd.fd_rdir)
+ VREF(newfdp->fd_fd.fd_rdir);
+
+ /* Create the file descriptor table. */
+ newfdp->fd_fd.fd_refcnt = 1;
+ newfdp->fd_fd.fd_cmask = cmask;
+ newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles;
+ newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags;
+ newfdp->fd_fd.fd_nfiles = NDFILE;
+
+ newfdp->fd_fd.fd_freefile = 0;
+ newfdp->fd_fd.fd_lastfile = 0;
+
+ return (&newfdp->fd_fd);
+}
+
+/*
+ * Share a filedesc structure.
+ */
+struct filedesc *
+fdshare(p)
+ struct proc *p;
+{
+ p->p_fd->fd_refcnt++;
+ return (p->p_fd);
+}
+
+/*
* Copy a filedesc structure.
*/
struct filedesc *
@@ -720,6 +821,34 @@ fdfree(p)
}
/*
+ * Close any files that are marked close-on-exec.
+ */
+void
+fdcloseexec(p)
+ struct proc *p;
+{
+ struct filedesc *fdp = p->p_fd;
+ struct file **fpp;
+ char *fdfp;
+ register int i;
+
+ fpp = fdp->fd_ofiles;
+ fdfp = fdp->fd_ofileflags;
+ for (i = 0; i <= fdp->fd_lastfile; i++, fpp++, fdfp++)
+ if (*fpp != NULL && (*fdfp & UF_EXCLOSE)) {
+ if (*fdfp & UF_MAPPED)
+ (void) munmapfd(p, i);
+ (void) closef(*fpp, p);
+ *fpp = NULL;
+ *fdfp = 0;
+ if (i < fdp->fd_freefile)
+ fdp->fd_freefile = i;
+ }
+ while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL)
+ fdp->fd_lastfile--;
+}
+
+/*
* Internal form of close.
* Decrement reference count on file structure.
* Note: p may be NULL when closing a file
@@ -778,25 +907,26 @@ closef(fp, p)
* Just attempt to get a record lock of the requested type on
* the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0).
*/
+#ifndef _SYS_SYSPROTO_H_
+struct flock_args {
+ int fd;
+ int how;
+};
+#endif
/* ARGSUSED */
int
flock(p, uap, retval)
struct proc *p;
- register struct flock_args /* {
- syscallarg(int) fd;
- syscallarg(int) how;
- } */ *uap;
- register_t *retval;
+ register struct flock_args *uap;
+ int *retval;
{
- int fd = SCARG(uap, fd);
- int how = SCARG(uap, how);
register struct filedesc *fdp = p->p_fd;
register struct file *fp;
struct vnode *vp;
struct flock lf;
- if ((u_int)fd >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[fd]) == NULL)
+ if ((unsigned)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
return (EBADF);
if (fp->f_type != DTYPE_VNODE)
return (EOPNOTSUPP);
@@ -804,19 +934,19 @@ flock(p, uap, retval)
lf.l_whence = SEEK_SET;
lf.l_start = 0;
lf.l_len = 0;
- if (how & LOCK_UN) {
+ if (uap->how & LOCK_UN) {
lf.l_type = F_UNLCK;
fp->f_flag &= ~FHASLOCK;
return (VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK));
}
- if (how & LOCK_EX)
+ if (uap->how & LOCK_EX)
lf.l_type = F_WRLCK;
- else if (how & LOCK_SH)
+ else if (uap->how & LOCK_SH)
lf.l_type = F_RDLCK;
else
return (EBADF);
fp->f_flag |= FHASLOCK;
- if (how & LOCK_NB)
+ if (uap->how & LOCK_NB)
return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK));
return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT));
}
@@ -830,7 +960,7 @@ flock(p, uap, retval)
* references to this file will be direct to the other driver.
*/
/* ARGSUSED */
-int
+static int
fdopen(dev, mode, type, p)
dev_t dev;
int mode, type;
@@ -839,7 +969,7 @@ fdopen(dev, mode, type, p)
/*
* XXX Kludge: set curproc->p_dupfd to contain the value of the
- * the file descriptor being sought for duplication. The error
+ * the file descriptor being sought for duplication. The error
* return ensures that the vnode for this device will be released
* by vn_open. Open will detect this special error and take the
* actions in dupfdopen below. Other callers of vn_open or VOP_OPEN
@@ -928,3 +1058,89 @@ dupfdopen(fdp, indx, dfd, mode, error)
}
/* NOTREACHED */
}
+
+/*
+ * Get file structures.
+ */
+static int
+sysctl_kern_file SYSCTL_HANDLER_ARGS
+{
+ int error;
+ struct file *fp;
+
+ if (!req->oldptr) {
+ /*
+ * overestimate by 10 files
+ */
+ return (SYSCTL_OUT(req, 0, sizeof(filehead) +
+ (nfiles + 10) * sizeof(struct file)));
+ }
+
+ error = SYSCTL_OUT(req, (caddr_t)&filehead, sizeof(filehead));
+ if (error)
+ return (error);
+
+ /*
+ * followed by an array of file structures
+ */
+ for (fp = filehead.lh_first; fp != NULL; fp = fp->f_list.le_next) {
+ error = SYSCTL_OUT(req, (caddr_t)fp, sizeof (struct file));
+ if (error)
+ return (error);
+ }
+ return (0);
+}
+
+SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, 0, sysctl_kern_file, "S,file", "");
+
+SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc,
+ CTLFLAG_RW, &maxfilesperproc, 0, "");
+
+SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, "");
+
+static int fildesc_devsw_installed = 0;
+#ifdef DEVFS
+static void *devfs_token_stdin;
+static void *devfs_token_stdout;
+static void *devfs_token_stderr;
+static void *devfs_token_fildesc[NUMFDESC];
+#endif
+
+static void fildesc_drvinit(void *unused)
+{
+ dev_t dev;
+#ifdef DEVFS
+ int fd;
+#endif
+
+ if( ! fildesc_devsw_installed ) {
+ dev = makedev(CDEV_MAJOR,0);
+ cdevsw_add(&dev,&fildesc_cdevsw,NULL);
+ fildesc_devsw_installed = 1;
+#ifdef DEVFS
+ for (fd = 0; fd < NUMFDESC; fd++)
+ devfs_token_fildesc[fd] =
+ devfs_add_devswf(&fildesc_cdevsw, fd, DV_CHR,
+ UID_BIN, GID_BIN, 0666,
+ "fd/%d", fd);
+ devfs_token_stdin =
+ devfs_add_devswf(&fildesc_cdevsw, 0, DV_CHR,
+ UID_ROOT, GID_WHEEL, 0666,
+ "stdin", fd);
+ devfs_token_stdout =
+ devfs_add_devswf(&fildesc_cdevsw, 1, DV_CHR,
+ UID_ROOT, GID_WHEEL, 0666,
+ "stdout", fd);
+ devfs_token_stderr =
+ devfs_add_devswf(&fildesc_cdevsw, 2, DV_CHR,
+ UID_ROOT, GID_WHEEL, 0666,
+ "stderr", fd);
+#endif
+ }
+}
+
+SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,
+ fildesc_drvinit,NULL)
+
+
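
The F_GETFD/F_SETFD cases above keep the close-on-exec bit in the low
bit of fd_ofileflags[fd] (UF_EXCLOSE), and the new fdcloseexec() walks
that table from execve(). From userland the same bit is reached through
the standard fcntl interface; a small sketch:

#include <fcntl.h>

/*
 * Mark a descriptor close-on-exec.  The kernel records this as
 * UF_EXCLOSE in fd_ofileflags[fd]; fdcloseexec() then closes the
 * descriptor during execve().
 */
int
mark_cloexec(int fd)
{
        int flags = fcntl(fd, F_GETFD);

        if (flags == -1)
                return (-1);
        return (fcntl(fd, F_SETFD, flags | FD_CLOEXEC));
}
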
diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c
index fbb4444..21049a3 100644
--- a/sys/kern/kern_exec.c
+++ b/sys/kern/kern_exec.c
@@ -1,11 +1,6 @@
-/*-
- * Copyright (c) 1982, 1986, 1991, 1993
- * The Regents of the University of California. All rights reserved.
- * (c) UNIX System Laboratories, Inc.
- * All or some portions of this file are derived from material licensed
- * to the University of California by American Telephone and Telegraph
- * Co. or Unix System Laboratories, Inc. and are reproduced herein with
- * the permission of UNIX System Laboratories, Inc.
+/*
+ * Copyright (c) 1993, David Greenman
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -15,18 +10,11 @@
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
*
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
@@ -35,30 +23,597 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * from: @(#)kern_exec.c 8.1 (Berkeley) 6/10/93
+ * $Id$
*/
#include <sys/param.h>
-#include <sys/errno.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/filedesc.h>
+#include <sys/fcntl.h>
+#include <sys/acct.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/imgact_elf.h>
+#include <sys/wait.h>
#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/namei.h>
+#include <sys/sysent.h>
+#include <sys/syslog.h>
+#include <sys/shm.h>
+#include <sys/sysctl.h>
+#include <sys/vnode.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+#include <machine/reg.h>
+
+static int *exec_copyout_strings __P((struct image_params *));
+
+static int exec_check_permissions(struct image_params *);
/*
- * exec system call
+ * XXX trouble here if sizeof(caddr_t) != sizeof(int); other parts
+ * of the sysctl code also assume this, and that sizeof(int) == sizeof(long).
*/
+static struct ps_strings *ps_strings = PS_STRINGS;
+SYSCTL_INT(_kern, KERN_PS_STRINGS, ps_strings, 0, &ps_strings, 0, "");
+
+static caddr_t usrstack = (caddr_t)USRSTACK;
+SYSCTL_INT(_kern, KERN_USRSTACK, usrstack, 0, &usrstack, 0, "");
+
+/*
+ * execsw_set is constructed for us by the linker. Each of the items
+ * is a pointer to a `const struct execsw', hence the double pointer here.
+ */
+static const struct execsw **execsw =
+ (const struct execsw **)&execsw_set.ls_items[0];
+
+#ifndef _SYS_SYSPROTO_H_
struct execve_args {
- char *fname;
- char **argp;
- char **envp;
+ char *fname;
+ char **argv;
+ char **envv;
};
-/* ARGSUSED */
-execve(a1, a2, a3)
- struct proc *a1;
- struct execve_args *a2;
- int *a3;
+#endif
+
+/*
+ * execve() system call.
+ */
+int
+execve(p, uap, retval)
+ struct proc *p;
+ register struct execve_args *uap;
+ int *retval;
+{
+ struct nameidata nd, *ndp;
+ int *stack_base;
+ int error, len, i;
+ struct image_params image_params, *imgp;
+ struct vattr attr;
+
+ imgp = &image_params;
+
+ /*
+ * Initialize part of the common data
+ */
+ imgp->proc = p;
+ imgp->uap = uap;
+ imgp->attr = &attr;
+ imgp->image_header = NULL;
+ imgp->argc = imgp->envc = 0;
+ imgp->entry_addr = 0;
+ imgp->vmspace_destroyed = 0;
+ imgp->interpreted = 0;
+ imgp->interpreter_name[0] = '\0';
+ imgp->auxargs = NULL;
+
+ /*
+ * Allocate temporary demand zeroed space for argument and
+ * environment strings
+ */
+ imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX);
+ if (imgp->stringbase == NULL) {
+ error = ENOMEM;
+ goto exec_fail;
+ }
+ imgp->stringp = imgp->stringbase;
+ imgp->stringspace = ARG_MAX;
+
+ /*
+ * Translate the file name. namei() returns a vnode pointer
+ * in ni_vp among other things.
+ */
+ ndp = &nd;
+ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
+ UIO_USERSPACE, uap->fname, p);
+
+interpret:
+
+ error = namei(ndp);
+ if (error) {
+ kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX);
+ goto exec_fail;
+ }
+
+ imgp->vp = ndp->ni_vp;
+ if (imgp->vp == NULL) {
+ error = ENOEXEC;
+ goto exec_fail_dealloc;
+ }
+
+ /*
+ * Check file permissions (also 'opens' file)
+ */
+ error = exec_check_permissions(imgp);
+
+ /*
+ * Lose the lock on the vnode. It's no longer needed, and must not
+ * exist for the pagefault paging to work below.
+ */
+ VOP_UNLOCK(imgp->vp, 0, p);
+
+ if (error)
+ goto exec_fail_dealloc;
+
+ /*
+ * Map the image header (first page) of the file into
+ * kernel address space
+ */
+ error = vm_mmap(exech_map, /* map */
+ (vm_offset_t *)&imgp->image_header, /* address */
+ PAGE_SIZE, /* size */
+ VM_PROT_READ, /* protection */
+ VM_PROT_READ, /* max protection */
+ 0, /* flags */
+ (caddr_t)imgp->vp, /* vnode */
+ 0); /* offset */
+ if (error) {
+ uprintf("mmap failed: %d\n",error);
+ goto exec_fail_dealloc;
+ }
+
+ /*
+ * Loop through list of image activators, calling each one.
+ * If there is no match, the activator returns -1. If there
+ * is a match, but there was an error during the activation,
+ * the error is returned. Otherwise 0 means success. If the
+ * image is interpreted, loop back up and try activating
+ * the interpreter.
+ */
+ for (i = 0; execsw[i]; ++i) {
+ if (execsw[i]->ex_imgact)
+ error = (*execsw[i]->ex_imgact)(imgp);
+ else
+ continue;
+
+ if (error == -1)
+ continue;
+ if (error)
+ goto exec_fail_dealloc;
+ if (imgp->interpreted) {
+ /* free old vnode and name buffer */
+ vrele(ndp->ni_vp);
+ FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI);
+ if (vm_map_remove(exech_map, (vm_offset_t)imgp->image_header,
+ (vm_offset_t)imgp->image_header + PAGE_SIZE))
+ panic("execve: header dealloc failed (1)");
+
+ /* set new name to that of the interpreter */
+ NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME,
+ UIO_SYSSPACE, imgp->interpreter_name, p);
+ goto interpret;
+ }
+ break;
+ }
+ /* If we made it through all the activators and none matched, exit. */
+ if (error == -1) {
+ error = ENOEXEC;
+ goto exec_fail_dealloc;
+ }
+
+ /*
+ * Copy out strings (args and env) and initialize stack base
+ */
+ stack_base = exec_copyout_strings(imgp);
+ p->p_vmspace->vm_minsaddr = (char *)stack_base;
+
+ /*
+ * If custom stack fixup routine present for this process
+ * let it do the stack setup.
+ * Else stuff argument count as first item on stack
+ */
+ if (p->p_sysent->sv_fixup)
+ (*p->p_sysent->sv_fixup)(&stack_base, imgp);
+ else
+ suword(--stack_base, imgp->argc);
+
+ /* close files on exec */
+ fdcloseexec(p);
+
+ /* reset caught signals */
+ execsigs(p);
+
+ /* name this process - nameiexec(p, ndp) */
+ len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN);
+ bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len);
+ p->p_comm[len] = 0;
+
+ /*
+ * mark as execed, wake up the process that vforked (if any) and tell
+ * it that it now has its own resources back
+ */
+ p->p_flag |= P_EXEC;
+ if (p->p_pptr && (p->p_flag & P_PPWAIT)) {
+ p->p_flag &= ~P_PPWAIT;
+ wakeup((caddr_t)p->p_pptr);
+ }
+
+ /*
+ * Implement image setuid/setgid. Disallow if the process is
+ * being traced.
+ */
+ if ((attr.va_mode & (VSUID | VSGID)) &&
+ (p->p_flag & P_TRACED) == 0) {
+ /*
+ * Turn off syscall tracing for set-id programs, except for
+ * root.
+ */
+ if (p->p_tracep && suser(p->p_ucred, &p->p_acflag)) {
+ p->p_traceflag = 0;
+ vrele(p->p_tracep);
+ p->p_tracep = NULL;
+ }
+ /*
+ * Set the new credentials.
+ */
+ p->p_ucred = crcopy(p->p_ucred);
+ if (attr.va_mode & VSUID)
+ p->p_ucred->cr_uid = attr.va_uid;
+ if (attr.va_mode & VSGID)
+ p->p_ucred->cr_groups[0] = attr.va_gid;
+ p->p_flag |= P_SUGID;
+ } else {
+ if (p->p_ucred->cr_uid == p->p_cred->p_ruid &&
+ p->p_ucred->cr_gid == p->p_cred->p_rgid)
+ p->p_flag &= ~P_SUGID;
+ }
+
+ /*
+ * Implement correct POSIX saved-id behavior.
+ */
+ p->p_cred->p_svuid = p->p_ucred->cr_uid;
+ p->p_cred->p_svgid = p->p_ucred->cr_gid;
+
+ /*
+ * Store the vp for use in procfs
+ */
+ if (p->p_textvp) /* release old reference */
+ vrele(p->p_textvp);
+ VREF(ndp->ni_vp);
+ p->p_textvp = ndp->ni_vp;
+
+ /*
+ * If tracing the process, trap to debugger so breakpoints
+ * can be set before the program executes.
+ */
+ if (p->p_flag & P_TRACED)
+ psignal(p, SIGTRAP);
+
+ /* clear "fork but no exec" flag, as we _are_ execing */
+ p->p_acflag &= ~AFORK;
+
+ /* Set entry address */
+ setregs(p, imgp->entry_addr, (u_long)stack_base);
+
+ /*
+ * free various allocated resources
+ */
+ kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX);
+ if (vm_map_remove(exech_map, (vm_offset_t)imgp->image_header,
+ (vm_offset_t)imgp->image_header + PAGE_SIZE))
+ panic("execve: header dealloc failed (2)");
+ vrele(ndp->ni_vp);
+ FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI);
+
+ return (0);
+
+exec_fail_dealloc:
+ if (imgp->stringbase != NULL)
+ kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX);
+ if (imgp->image_header && imgp->image_header != (char *)-1)
+ if (vm_map_remove(exech_map, (vm_offset_t)imgp->image_header,
+ (vm_offset_t)imgp->image_header + PAGE_SIZE))
+ panic("execve: header dealloc failed (3)");
+ if (ndp->ni_vp)
+ vrele(ndp->ni_vp);
+ FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI);
+
+exec_fail:
+ if (imgp->vmspace_destroyed) {
+ /* sorry, no process left to return to; exit gracefully */
+ exit1(p, W_EXITCODE(0, SIGABRT));
+ /* NOT REACHED */
+ return(0);
+ } else {
+ return(error);
+ }
+}
+
+/*
+ * Destroy the old address space and allocate a new stack.
+ * The new stack is only SGROWSIZ large because it is grown
+ * automatically in trap.c.
+ */
+int
+exec_new_vmspace(imgp)
+ struct image_params *imgp;
+{
+ int error;
+ struct vmspace *vmspace = imgp->proc->p_vmspace;
+ caddr_t stack_addr = (caddr_t) (USRSTACK - SGROWSIZ);
+
+ imgp->vmspace_destroyed = 1;
+
+ /* Blow away entire process VM */
+ if (vmspace->vm_shm)
+ shmexit(imgp->proc);
+ pmap_remove_pages(&vmspace->vm_pmap, 0, USRSTACK);
+ vm_map_remove(&vmspace->vm_map, 0, USRSTACK);
+
+ /* Allocate a new stack */
+ error = vm_map_find(&vmspace->vm_map, NULL, 0, (vm_offset_t *)&stack_addr,
+ SGROWSIZ, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (error)
+ return(error);
+
+ vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT;
+
+ /* Initialize maximum stack address */
+ vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ;
+
+ return(0);
+}
+
+/*
+ * Copy out argument and environment strings from the old process
+ * address space into the temporary string buffer.
+ */
+int
+exec_extract_strings(imgp)
+ struct image_params *imgp;
+{
+ char **argv, **envv;
+ char *argp, *envp;
+ int error, length;
+
+ /*
+ * extract arguments first
+ */
+
+ argv = imgp->uap->argv;
+
+ if (argv) {
+ while ((argp = (caddr_t) fuword(argv++))) {
+ if (argp == (caddr_t) -1)
+ return (EFAULT);
+ if ((error = copyinstr(argp, imgp->stringp,
+ imgp->stringspace, &length))) {
+ if (error == ENAMETOOLONG)
+ return(E2BIG);
+ return (error);
+ }
+ imgp->stringspace -= length;
+ imgp->stringp += length;
+ imgp->argc++;
+ }
+ }
+
+ /*
+ * extract environment strings
+ */
+
+ envv = imgp->uap->envv;
+
+ if (envv) {
+ while ((envp = (caddr_t) fuword(envv++))) {
+ if (envp == (caddr_t) -1)
+ return (EFAULT);
+ if ((error = copyinstr(envp, imgp->stringp,
+ imgp->stringspace, &length))) {
+ if (error == ENAMETOOLONG)
+ return(E2BIG);
+ return (error);
+ }
+ imgp->stringspace -= length;
+ imgp->stringp += length;
+ imgp->envc++;
+ }
+ }
+
+ return (0);
+}
+
+/*
+ * Copy strings out to the new process address space, constructing
+ * new arg and env vector tables. Return a pointer to the base
+ * so that it can be used as the initial stack pointer.
+ */
+int *
+exec_copyout_strings(imgp)
+ struct image_params *imgp;
{
+ int argc, envc;
+ char **vectp;
+ char *stringp, *destp;
+ int *stack_base;
+ struct ps_strings *arginfo;
+ int szsigcode;
+
+ /*
+ * Calculate string base and vector table pointers.
+ * Also deal with signal trampoline code for this exec type.
+ */
+ arginfo = PS_STRINGS;
+ szsigcode = *(imgp->proc->p_sysent->sv_szsigcode);
+ destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE -
+ roundup((ARG_MAX - imgp->stringspace), sizeof(char *));
+
+ /*
+ * install sigcode
+ */
+ if (szsigcode)
+ copyout(imgp->proc->p_sysent->sv_sigcode,
+ ((caddr_t)arginfo - szsigcode), szsigcode);
+
+ /*
+ * If we have a valid auxargs ptr, prepare some room
+ * on the stack.
+ */
+ if (imgp->auxargs)
+ /*
+ * The '+ 2' is for the null pointers at the end of each of the
+ * arg and env vector sets, and 'AT_COUNT*2' is room for the
+ * ELF Auxargs data.
+ */
+ vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 +
+ AT_COUNT*2) * sizeof(char*));
+ else
+ /*
+ * The '+ 2' is for the null pointers at the end of each of the
+ * arg and env vector sets
+ */
+ vectp = (char **)
+ (destp - (imgp->argc + imgp->envc + 2) * sizeof(char*));
/*
- * Body deleted.
+ * vectp also becomes our initial stack base
*/
- return (ENOSYS);
+ stack_base = (int *)vectp;
+
+ stringp = imgp->stringbase;
+ argc = imgp->argc;
+ envc = imgp->envc;
+
+ /*
+ * Copy out strings - arguments and environment.
+ */
+ copyout(stringp, destp, ARG_MAX - imgp->stringspace);
+
+ /*
+ * Fill in "ps_strings" struct for ps, w, etc.
+ */
+ suword(&arginfo->ps_argvstr, (int)vectp);
+ suword(&arginfo->ps_nargvstr, argc);
+
+ /*
+ * Fill in argument portion of vector table.
+ */
+ for (; argc > 0; --argc) {
+ suword(vectp++, (int)destp);
+ while (*stringp++ != 0)
+ destp++;
+ destp++;
+ }
+
+ /* a null vector table pointer separates the argp's from the envp's */
+ suword(vectp++, 0);
+
+ suword(&arginfo->ps_envstr, (int)vectp);
+ suword(&arginfo->ps_nenvstr, envc);
+
+ /*
+ * Fill in environment portion of vector table.
+ */
+ for (; envc > 0; --envc) {
+ suword(vectp++, (int)destp);
+ while (*stringp++ != 0)
+ destp++;
+ destp++;
+ }
+
+ /* end of vector table is a null pointer */
+ suword(vectp, 0);
+
+ return (stack_base);
+}
+
+/*
+ * Check permissions of file to execute.
+ * Return 0 for success or error code on failure.
+ */
+static int
+exec_check_permissions(imgp)
+ struct image_params *imgp;
+{
+ struct proc *p = imgp->proc;
+ struct vnode *vp = imgp->vp;
+ struct vattr *attr = imgp->attr;
+ int error;
+
+ /*
+ * Check number of open-for-writes on the file and deny execution
+ * if there are any.
+ */
+ if (vp->v_writecount) {
+ return (ETXTBSY);
+ }
+
+ /* Get file attributes */
+ error = VOP_GETATTR(vp, attr, p->p_ucred, p);
+ if (error)
+ return (error);
+
+ /*
+ * 1) Check if file execution is disabled for the filesystem that this
+ * file resides on.
+ * 2) Ensure that at least one execute bit is on - otherwise root
+ * will always succeed, and we don't want that to happen unless
+ * the file really is executable.
+ * 3) Ensure that the file is a regular file.
+ */
+ if ((vp->v_mount->mnt_flag & MNT_NOEXEC) ||
+ ((attr->va_mode & 0111) == 0) ||
+ (attr->va_type != VREG)) {
+ return (EACCES);
+ }
+
+ /*
+ * Zero length files can't be exec'd
+ */
+ if (attr->va_size == 0)
+ return (ENOEXEC);
+
+ /*
+ * Disable setuid/setgid if the filesystem prohibits it or if
+ * the process is being traced.
+ */
+ if ((vp->v_mount->mnt_flag & MNT_NOSUID) || (p->p_flag & P_TRACED))
+ attr->va_mode &= ~(VSUID | VSGID);
+
+ /*
+ * Check for execute permission to file based on current credentials.
+ * Then call filesystem specific open routine (which does nothing
+ * in the general case).
+ */
+ error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
+ if (error)
+ return (error);
+
+ error = VOP_OPEN(vp, FREAD, p->p_ucred, p);
+ if (error)
+ return (error);
+
+ return (0);
}
diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c
index 4ed48ac..2f8074c 100644
--- a/sys/kern/kern_exit.c
+++ b/sys/kern/kern_exit.c
@@ -35,13 +35,16 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)kern_exit.c 8.10 (Berkeley) 2/23/95
+ * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94
+ * $Id: kern_exit.c,v 1.45 1997/02/22 09:39:04 peter Exp $
*/
+#include "opt_ktrace.h"
+
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/map.h>
-#include <sys/ioctl.h>
+#include <sys/sysproto.h>
+#include <sys/sysent.h>
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/time.h>
@@ -54,31 +57,48 @@
#include <sys/syslog.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
#include <sys/ptrace.h>
+#include <sys/acct.h> /* for acct_process() function prototype */
+#include <sys/filedesc.h>
+#include <sys/shm.h>
+#include <sys/sem.h>
-#include <machine/cpu.h>
#ifdef COMPAT_43
#include <machine/reg.h>
#include <machine/psl.h>
#endif
#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
#include <vm/vm_kern.h>
-__dead void cpu_exit __P((struct proc *));
-__dead void exit1 __P((struct proc *, int));
+static int wait1 __P((struct proc *, struct wait_args *, int [], int));
+
+/*
+ * callout list for things to do at exit time
+ */
+typedef struct exit_list_element {
+ struct exit_list_element *next;
+ exitlist_fn function;
+} *ele_p;
+
+static ele_p exit_list;
/*
* exit --
* Death of process.
*/
-struct rexit_args {
- int rval;
-};
-__dead void
+void
exit(p, uap, retval)
struct proc *p;
- struct rexit_args *uap;
+ struct rexit_args /* {
+ int rval;
+ } */ *uap;
int *retval;
{
@@ -91,21 +111,33 @@ exit(p, uap, retval)
* to zombie, and unlink proc from allproc and parent's lists. Save exit
* status and rusage for wait(). Check for child processes and orphan them.
*/
-__dead void
+void
exit1(p, rv)
register struct proc *p;
int rv;
{
register struct proc *q, *nq;
- register struct proc **pp;
register struct vmspace *vm;
+ ele_p ep = exit_list;
- if (p->p_pid == 1)
- panic("init died (signal %d, exit %d)",
+ if (p->p_pid == 1) {
+ printf("init died (signal %d, exit %d)\n",
WTERMSIG(rv), WEXITSTATUS(rv));
+ panic("Going nowhere without my init!");
+ }
#ifdef PGINPROF
vmsizmon();
#endif
+ /*
+ * Check if any LKMs need anything done at process exit.
+ * e.g. SYSV IPC stuff
+ * XXX what if one of these generates an error?
+ */
+ while (ep) {
+ (*ep->function)(p);
+ ep = ep->next;
+ }
+
if (p->p_flag & P_PROFIL)
stopprofclock(p);
MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage),
@@ -126,12 +158,21 @@ exit1(p, rv)
*/
fdfree(p);
+ /*
+ * Delete select() buffers
+ */
+ if (p->p_selbits)
+ free (p->p_selbits, M_SELECT);
+
+ /*
+ * XXX Shutdown SYSV semaphores
+ */
+ semexit(p);
+
/* The next two chunks should probably be moved to vmspace_exit. */
vm = p->p_vmspace;
-#ifdef SYSVSHM
if (vm->vm_shm)
shmexit(p);
-#endif
/*
* Release user portion of address space.
* This releases references to vnodes,
@@ -140,9 +181,12 @@ exit1(p, rv)
* Can't free the entire vmspace as the kernel stack
* may be mapped within that space also.
*/
- if (vm->vm_refcnt == 1)
+ if (vm->vm_refcnt == 1) {
+ pmap_remove_pages(&vm->vm_pmap, VM_MIN_ADDRESS,
+ VM_MAXUSER_ADDRESS);
(void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS,
VM_MAXUSER_ADDRESS);
+ }
if (SESS_LEADER(p)) {
register struct session *sp = p->p_session;
@@ -154,7 +198,7 @@ exit1(p, rv)
* drain controlling terminal
* and revoke access to controlling terminal.
*/
- if (sp->s_ttyp->t_session == sp) {
+ if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) {
if (sp->s_ttyp->t_pgrp)
pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1);
(void) ttywait(sp->s_ttyp);
@@ -177,10 +221,15 @@ exit1(p, rv)
sp->s_leader = NULL;
}
fixjobc(p, p->p_pgrp, 0);
+ if (p->p_limit->p_refcnt > 1 &&
+ (p->p_limit->p_lflags & PL_SHAREMOD) == 0) {
+ p->p_limit->p_refcnt--;
+ p->p_limit = limcopy(p->p_limit);
+ }
p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY;
(void)acct_process(p);
#ifdef KTRACE
- /*
+ /*
* release trace file
*/
p->p_traceflag = 0; /* don't trace the vrele() */
@@ -244,8 +293,10 @@ exit1(p, rv)
* Other substructures are freed from wait().
*/
curproc = NULL;
- if (--p->p_limit->p_refcnt == 0)
+ if (--p->p_limit->p_refcnt == 0) {
FREE(p->p_limit, M_SUBPROC);
+ p->p_limit = NULL;
+ }
/*
* Finally, call machine-dependent code to release the remaining
@@ -253,22 +304,12 @@ exit1(p, rv)
* The address space is released by "vmspace_free(p->p_vmspace)";
* This is machine-dependent, as we may have to change stacks
* or ensure that the current one isn't reallocated before we
- * finish. cpu_exit will end with a call to cpu_swtch(), finishing
+ * finish. cpu_exit will end with a call to cpu_switch(), finishing
* our execution (pun intended).
*/
cpu_exit(p);
}
-struct wait_args {
- int pid;
- int *status;
- int options;
- struct rusage *rusage;
-#ifdef COMPAT_43
- int compat; /* pseudo */
-#endif
-};
-
#ifdef COMPAT_43
#if defined(hp300) || defined(luna68k)
#include <machine/frame.h>
@@ -277,48 +318,55 @@ struct wait_args {
#define GETPS(rp) (rp)[PS]
#endif
-compat_43_wait(p, uap, retval)
+int
+owait(p, uap, retval)
struct proc *p;
- register struct wait_args *uap;
+ register struct owait_args /* {
+ int dummy;
+ } */ *uap;
int *retval;
{
+ struct wait_args w;
#ifdef PSL_ALLCC
if ((GETPS(p->p_md.md_regs) & PSL_ALLCC) != PSL_ALLCC) {
- uap->options = 0;
- uap->rusage = NULL;
+ w.options = 0;
+ w.rusage = NULL;
} else {
- uap->options = p->p_md.md_regs[R0];
- uap->rusage = (struct rusage *)p->p_md.md_regs[R1];
+ w.options = p->p_md.md_regs[R0];
+ w.rusage = (struct rusage *)p->p_md.md_regs[R1];
}
#else
- uap->options = 0;
- uap->rusage = NULL;
+ w.options = 0;
+ w.rusage = NULL;
#endif
- uap->pid = WAIT_ANY;
- uap->status = NULL;
- uap->compat = 1;
- return (wait1(p, uap, retval));
+ w.pid = WAIT_ANY;
+ w.status = NULL;
+ return (wait1(p, &w, retval, 1));
}
+#endif /* COMPAT_43 */
+int
wait4(p, uap, retval)
struct proc *p;
struct wait_args *uap;
int *retval;
{
- uap->compat = 0;
- return (wait1(p, uap, retval));
+ return (wait1(p, uap, retval, 0));
}
-#else
-#define wait1 wait4
-#endif
-int
-wait1(q, uap, retval)
+static int
+wait1(q, uap, retval, compat)
register struct proc *q;
- register struct wait_args *uap;
+ register struct wait_args /* {
+ int pid;
+ int *status;
+ int options;
+ struct rusage *rusage;
+ } */ *uap;
int retval[];
+ int compat;
{
register int nfound;
register struct proc *p, *t;
@@ -338,16 +386,22 @@ loop:
continue;
nfound++;
if (p->p_stat == SZOMB) {
+ /* charge child's scheduling cpu usage to parent */
+ if (curproc->p_pid != 1) {
+ curproc->p_estcpu = min(curproc->p_estcpu +
+ p->p_estcpu, UCHAR_MAX);
+ }
+
retval[0] = p->p_pid;
#ifdef COMPAT_43
- if (uap->compat)
+ if (compat)
retval[1] = p->p_xstat;
else
#endif
if (uap->status) {
status = p->p_xstat; /* convert to int */
- if (error = copyout((caddr_t)&status,
- (caddr_t)uap->status, sizeof(status)))
+ if ((error = copyout((caddr_t)&status,
+ (caddr_t)uap->status, sizeof(status))))
return (error);
}
if (uap->rusage && (error = copyout((caddr_t)p->p_ru,
@@ -367,6 +421,7 @@ loop:
p->p_xstat = 0;
ruadd(&q->p_stats->p_cru, p->p_ru);
FREE(p->p_ru, M_ZOMBIE);
+ p->p_ru = NULL;
/*
* Decrement the count of procs running with this uid.
@@ -374,20 +429,21 @@ loop:
(void)chgproccnt(p->p_cred->p_ruid, -1);
/*
+ * Release reference to text vnode
+ */
+ if (p->p_textvp)
+ vrele(p->p_textvp);
+
+ /*
* Free up credentials.
*/
if (--p->p_cred->p_refcnt == 0) {
crfree(p->p_cred->pc_ucred);
FREE(p->p_cred, M_SUBPROC);
+ p->p_cred = NULL;
}
/*
- * Release reference to text vnode
- */
- if (p->p_textvp)
- vrele(p->p_textvp);
-
- /*
* Finally finished with old proc entry.
* Unlink it from its process group and free it.
*/
@@ -410,7 +466,7 @@ loop:
p->p_flag |= P_WAITED;
retval[0] = p->p_pid;
#ifdef COMPAT_43
- if (uap->compat) {
+ if (compat) {
retval[1] = W_STOPCODE(p->p_xstat);
error = 0;
} else
@@ -430,7 +486,7 @@ loop:
retval[0] = 0;
return (0);
}
- if (error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0))
+ if ((error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0)))
return (error);
goto loop;
}
@@ -451,3 +507,57 @@ proc_reparent(child, parent)
LIST_INSERT_HEAD(&parent->p_children, child, p_sibling);
child->p_pptr = parent;
}
+
+/*
+ * The next two functions are to handle adding/deleting items on the
+ * exit callout list
+ *
+ * at_exit():
+ * Take the given function and put it onto the exit callout list;
+ * first, however, make sure that it is not already there.
+ * Returns 0 on success.
+ */
+int
+at_exit(exitlist_fn function)
+{
+ ele_p ep;
+
+ /* Be noisy if the programmer has lost track of things */
+ if (rm_at_exit(function))
+ printf("exit callout entry already present\n");
+ ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT);
+ if (ep == NULL)
+ return (ENOMEM);
+ ep->next = exit_list;
+ ep->function = function;
+ exit_list = ep;
+ return (0);
+}
+/*
+ * Scan the exit callout list for the given items and remove them.
+ * Returns the number of items removed.
+ * Logically this can only be 0 or 1.
+ */
+int
+rm_at_exit(exitlist_fn function)
+{
+ ele_p *epp, ep;
+ int count;
+
+ count = 0;
+ epp = &exit_list;
+ ep = *epp;
+ while (ep) {
+ if (ep->function == function) {
+ *epp = ep->next;
+ free(ep, M_TEMP);
+ count++;
+ } else {
+ epp = &ep->next;
+ }
+ ep = *epp;
+ }
+ return (count);
+}
+
+
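
Usage sketch for the new exit callout list (the module name is
illustrative): a subsystem that keeps per-process state, an LKM say,
registers a cleanup hook when it loads and removes it when it unloads;
exit1() then invokes the hook for every exiting process.

static void
foo_exit_hook(struct proc *p)
{
        /* release whatever per-process state the module kept for p */
}

int
foo_load(void)
{
        /* at_exit() returns 0 on success, ENOMEM if it cannot allocate */
        return (at_exit(foo_exit_hook));
}

void
foo_unload(void)
{
        if (rm_at_exit(foo_exit_hook) == 0)
                printf("foo: exit hook was not registered\n");
}
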
diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c
index 6c5f22f..8327b81 100644
--- a/sys/kern/kern_fork.c
+++ b/sys/kern/kern_fork.c
@@ -35,55 +35,104 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)kern_fork.c 8.8 (Berkeley) 2/14/95
+ * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94
+ * $Id$
*/
+#include "opt_ktrace.h"
+
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/map.h>
+#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
#include <sys/proc.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
-#include <sys/file.h>
#include <sys/acct.h>
#include <sys/ktrace.h>
+#include <sys/unistd.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_inherit.h>
+
+static int fork1 __P((struct proc *p, int flags, int *retval));
+
+/*
+ * These are the structures used to create a callout list for things to
+ * do when forking a process.
+ */
+typedef struct fork_list_element {
+ struct fork_list_element *next;
+ forklist_fn function;
+} *fle_p;
+
+static fle_p fork_list;
+
+#ifndef _SYS_SYSPROTO_H_
+struct fork_args {
+ int dummy;
+};
+#endif
/* ARGSUSED */
+int
fork(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct fork_args *uap;
+ int retval[];
{
-
- return (fork1(p, 0, retval));
+ return (fork1(p, (RFFDG|RFPROC), retval));
}
/* ARGSUSED */
+int
vfork(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct vfork_args *uap;
+ int retval[];
{
+ return (fork1(p, (RFFDG|RFPROC|RFPPWAIT), retval));
+}
- return (fork1(p, 1, retval));
+/* ARGSUSED */
+int
+rfork(p, uap, retval)
+ struct proc *p;
+ struct rfork_args *uap;
+ int retval[];
+{
+ return (fork1(p, uap->flags, retval));
}
+
int nprocs = 1; /* process 0 */
+static int nextpid = 0;
-fork1(p1, isvfork, retval)
+static int
+fork1(p1, flags, retval)
register struct proc *p1;
- int isvfork;
- register_t *retval;
+ int flags;
+ int retval[];
{
- register struct proc *p2;
+ register struct proc *p2, *pptr;
register uid_t uid;
struct proc *newproc;
- struct proc **hash;
int count;
- static int nextpid, pidchecked = 0;
+ static int pidchecked = 0;
+ fle_p ep;
+
+ ep = fork_list;
+ if ((flags & RFPROC) == 0)
+ return (EINVAL);
+ if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG))
+ return (EINVAL);
/*
* Although process entries are dynamically created, we still keep
@@ -97,6 +146,11 @@ fork1(p1, isvfork, retval)
tablefull("proc");
return (EAGAIN);
}
+ /*
+ * Increment the nprocs resource before blocking can occur. There
+ * are hard limits as to the number of processes that can run.
+ */
+ nprocs++;
/*
* Increment the count of procs running with this uid. Don't allow
@@ -105,6 +159,10 @@ fork1(p1, isvfork, retval)
count = chgproccnt(uid, 1);
if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) {
(void)chgproccnt(uid, -1);
+ /*
+ * Back out the process count
+ */
+ nprocs--;
return (EAGAIN);
}
@@ -146,7 +204,7 @@ again:
}
if (p2->p_pid > nextpid && pidchecked > p2->p_pid)
pidchecked = p2->p_pid;
- if (p2->p_pgrp->pg_id > nextpid &&
+ if (p2->p_pgrp->pg_id > nextpid &&
pidchecked > p2->p_pgrp->pg_id)
pidchecked = p2->p_pgrp->pg_id;
}
@@ -157,12 +215,10 @@ again:
}
}
- nprocs++;
p2 = newproc;
p2->p_stat = SIDL; /* protect against others */
p2->p_pid = nextpid;
LIST_INSERT_HEAD(&allproc, p2, p_list);
- p2->p_forw = p2->p_back = NULL; /* shouldn't be necessary */
LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash);
/*
@@ -176,6 +232,11 @@ again:
(unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy));
/*
+ * XXX: this should be done as part of the startzero above
+ */
+ p2->p_vmspace = 0; /* XXX */
+
+ /*
* Duplicate sub-structures as needed.
* Increase reference counts on shared objects.
* The p_stats and p_sigacts substructs are set in vm_fork.
@@ -194,7 +255,13 @@ again:
if (p2->p_textvp)
VREF(p2->p_textvp);
- p2->p_fd = fdcopy(p1);
+ if (flags & RFCFDG)
+ p2->p_fd = fdinit(p1);
+ else if (flags & RFFDG)
+ p2->p_fd = fdcopy(p1);
+ else
+ p2->p_fd = fdshare(p1);
+
/*
* If p_limit is still copy-on-write, bump refcnt,
* otherwise get a copy that won't be modified.
@@ -208,13 +275,29 @@ again:
p2->p_limit->p_refcnt++;
}
+ /*
+ * Preserve some flags in subprocess.
+ */
+ p2->p_flag |= p1->p_flag & P_SUGID;
if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT)
p2->p_flag |= P_CONTROLT;
- if (isvfork)
+ if (flags & RFPPWAIT)
p2->p_flag |= P_PPWAIT;
LIST_INSERT_AFTER(p1, p2, p_pglist);
- p2->p_pptr = p1;
- LIST_INSERT_HEAD(&p1->p_children, p2, p_sibling);
+
+ /*
+ * Attach the new process to its parent.
+ *
+ * If RFNOWAIT is set, the newly created process becomes a child
+ * of init. This effectively disassociates the child from the
+ * parent.
+ */
+ if (flags & RFNOWAIT)
+ pptr = initproc;
+ else
+ pptr = p1;
+ p2->p_pptr = pptr;
+ LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling);
LIST_INIT(&p2->p_children);
#ifdef KTRACE
@@ -230,10 +313,25 @@ again:
#endif
/*
+ * set priority of child to be that of parent
+ */
+ p2->p_estcpu = p1->p_estcpu;
+
+ /*
* This begins the section where we must prevent the parent
* from being swapped.
*/
p1->p_flag |= P_NOSWAP;
+
+ /*
+ * share as much address space as possible
+ * XXX this should probably go in vm_fork()
+ */
+ if (flags & RFMEM)
+ (void) vm_map_inherit(&p1->p_vmspace->vm_map,
+ VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS - MAXSSIZ,
+ VM_INHERIT_SHARE);
+
/*
* Set return values for child before vm_fork,
* so they can be copied to child stack.
@@ -244,18 +342,28 @@ again:
*/
retval[0] = p1->p_pid;
retval[1] = 1;
- if (vm_fork(p1, p2, isvfork)) {
+ if (vm_fork(p1, p2)) {
/*
* Child process. Set start time and get to work.
*/
- (void) splclock();
- p2->p_stats->p_start = time;
+ microtime(&runtime);
(void) spl0();
+ p2->p_stats->p_start = runtime;
p2->p_acflag = AFORK;
return (0);
}
/*
+ * Both processes are set up, now check if any LKMs want
+ * to adjust anything.
+ * What if they have an error? XXX
+ */
+ while (ep) {
+ (*ep->function)(p1, p2, flags);
+ ep = ep->next;
+ }
+
+ /*
* Make child runnable and add to run queue.
*/
(void) splhigh();
@@ -273,9 +381,8 @@ again:
* child to exec or exit, set P_PPWAIT on child, and sleep on our
* proc (in case of exit).
*/
- if (isvfork)
- while (p2->p_flag & P_PPWAIT)
- tsleep(p1, PWAIT, "ppwait", 0);
+ while (p2->p_flag & P_PPWAIT)
+ tsleep(p1, PWAIT, "ppwait", 0);
/*
* Return child pid to parent process,
@@ -285,3 +392,58 @@ again:
retval[1] = 0;
return (0);
}
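+
+/*
+ * Note (illustrative recap): fork1() effectively returns twice.  The
+ * child path inside vm_fork() comes back with retval[0] set to the
+ * parent's pid and retval[1] set to 1, while the parent falls through
+ * to return the child's pid with retval[1] set to 0.  The user-level
+ * fork stub is assumed to use retval[1] to tell the two apart.
+ */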
+
+/*
+ * The next two functions are general routines to handle adding/deleting
+ * items on the fork callout list.
+ *
+ * at_fork():
+ * Take the function given and put it onto the fork callout list;
+ * first, however, make sure that it's not already there.
+ * Returns 0 on success or a standard error number.
+ */
+int
+at_fork(forklist_fn function)
+{
+ fle_p ep;
+
+ /* let the programmer know if he's been stupid */
+ if (rm_at_fork(function))
+ printf("fork callout entry already present\n");
+ ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT);
+ if (ep == NULL)
+ return (ENOMEM);
+ ep->next = fork_list;
+ ep->function = function;
+ fork_list = ep;
+ return (0);
+}
+
+/*
+ * Scan the fork callout list for the given item and remove it.
+ * Returns the number of items removed.
+ * Theoretically this value can only be 0 or 1.
+ */
+int
+rm_at_fork(forklist_fn function)
+{
+ fle_p *epp, ep;
+ int count;
+
+ count = 0;
+ epp = &fork_list;
+ ep = *epp;
+ while (ep) {
+ if (ep->function == function) {
+ *epp = ep->next;
+ free(ep, M_TEMP);
+ count++;
+ } else {
+ epp = &ep->next;
+ }
+ ep = *epp;
+ }
+ return (count);
+}
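+
+/*
+ * Usage sketch (illustrative; the hook below is hypothetical).  As the
+ * call site in fork1() shows, a forklist_fn receives the parent, the
+ * child and the fork flags:
+ *
+ *	static void
+ *	mymod_fork_hook(p1, p2, flags)
+ *		struct proc *p1, *p2;
+ *		int flags;
+ *	{
+ *		... set up per-process module state for p2 ...
+ *	}
+ *
+ *	at_fork(mymod_fork_hook);	(on module load)
+ *	rm_at_fork(mymod_fork_hook);	(on module unload)
+ */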
+
+
diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c
index b841754..f8e4e25 100644
--- a/sys/kern/kern_ktrace.c
+++ b/sys/kern/kern_ktrace.c
@@ -30,33 +30,40 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)kern_ktrace.c 8.5 (Berkeley) 5/14/95
+ * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93
+ * $Id: kern_ktrace.c,v 1.17 1997/02/22 09:39:05 peter Exp $
*/
-#ifdef KTRACE
+#include "opt_ktrace.h"
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/sysproto.h>
#include <sys/proc.h>
-#include <sys/file.h>
+#include <sys/fcntl.h>
#include <sys/namei.h>
#include <sys/vnode.h>
#include <sys/ktrace.h>
#include <sys/malloc.h>
#include <sys/syslog.h>
-#include <sys/mount.h>
-#include <sys/syscallargs.h>
+#ifdef KTRACE
+static struct ktr_header *ktrgetheader __P((int type));
+static void ktrwrite __P((struct vnode *, struct ktr_header *));
+static int ktrcanset __P((struct proc *,struct proc *));
+static int ktrsetchildren __P((struct proc *,struct proc *,int,int,struct vnode *));
+static int ktrops __P((struct proc *,struct proc *,int,int,struct vnode *));
+
-struct ktr_header *
+static struct ktr_header *
ktrgetheader(type)
int type;
{
register struct ktr_header *kth;
struct proc *p = curproc; /* XXX */
- MALLOC(kth, struct ktr_header *, sizeof (struct ktr_header),
- M_TEMP, M_WAITOK);
+ MALLOC(kth, struct ktr_header *, sizeof (struct ktr_header),
+ M_KTRACE, M_WAITOK);
kth->ktr_type = type;
microtime(&kth->ktr_time);
kth->ktr_pid = p->p_pid;
@@ -65,31 +72,29 @@ ktrgetheader(type)
}
void
-ktrsyscall(vp, code, argsize, args)
+ktrsyscall(vp, code, narg, args)
struct vnode *vp;
- int code, argsize;
- register_t args[];
+ int code, narg, args[];
{
struct ktr_header *kth;
struct ktr_syscall *ktp;
- register len = sizeof(struct ktr_syscall) + argsize;
+ register len = sizeof(struct ktr_syscall) + (narg * sizeof(int));
struct proc *p = curproc; /* XXX */
- register_t *argp;
- int i;
+ int *argp, i;
p->p_traceflag |= KTRFAC_ACTIVE;
kth = ktrgetheader(KTR_SYSCALL);
- MALLOC(ktp, struct ktr_syscall *, len, M_TEMP, M_WAITOK);
+ MALLOC(ktp, struct ktr_syscall *, len, M_KTRACE, M_WAITOK);
ktp->ktr_code = code;
- ktp->ktr_argsize = argsize;
- argp = (register_t *)((char *)ktp + sizeof(struct ktr_syscall));
- for (i = 0; i < (argsize / sizeof *argp); i++)
+ ktp->ktr_narg = narg;
+ argp = (int *)((char *)ktp + sizeof(struct ktr_syscall));
+ for (i = 0; i < narg; i++)
*argp++ = args[i];
kth->ktr_buf = (caddr_t)ktp;
kth->ktr_len = len;
ktrwrite(vp, kth);
- FREE(ktp, M_TEMP);
- FREE(kth, M_TEMP);
+ FREE(ktp, M_KTRACE);
+ FREE(kth, M_KTRACE);
p->p_traceflag &= ~KTRFAC_ACTIVE;
}
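+
+/*
+ * Record layout (illustrative recap): ktrsyscall() emits a ktr_header
+ * followed by a ktr_syscall carrying ktr_code and ktr_narg, followed
+ * by narg ints holding the argument words, i.e. a payload of
+ * sizeof(struct ktr_syscall) + narg * sizeof(int) bytes.
+ */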
@@ -112,7 +117,7 @@ ktrsysret(vp, code, error, retval)
kth->ktr_len = sizeof(struct ktr_sysret);
ktrwrite(vp, kth);
- FREE(kth, M_TEMP);
+ FREE(kth, M_KTRACE);
p->p_traceflag &= ~KTRFAC_ACTIVE;
}
@@ -130,7 +135,7 @@ ktrnamei(vp, path)
kth->ktr_buf = path;
ktrwrite(vp, kth);
- FREE(kth, M_TEMP);
+ FREE(kth, M_KTRACE);
p->p_traceflag &= ~KTRFAC_ACTIVE;
}
@@ -147,13 +152,13 @@ ktrgenio(vp, fd, rw, iov, len, error)
register caddr_t cp;
register int resid = len, cnt;
struct proc *p = curproc; /* XXX */
-
+
if (error)
return;
p->p_traceflag |= KTRFAC_ACTIVE;
kth = ktrgetheader(KTR_GENIO);
MALLOC(ktp, struct ktr_genio *, sizeof(struct ktr_genio) + len,
- M_TEMP, M_WAITOK);
+ M_KTRACE, M_WAITOK);
ktp->ktr_fd = fd;
ktp->ktr_rw = rw;
cp = (caddr_t)((char *)ktp + sizeof (struct ktr_genio));
@@ -171,8 +176,8 @@ ktrgenio(vp, fd, rw, iov, len, error)
ktrwrite(vp, kth);
done:
- FREE(kth, M_TEMP);
- FREE(ktp, M_TEMP);
+ FREE(kth, M_KTRACE);
+ FREE(ktp, M_KTRACE);
p->p_traceflag &= ~KTRFAC_ACTIVE;
}
@@ -197,7 +202,7 @@ ktrpsig(vp, sig, action, mask, code)
kth->ktr_len = sizeof (struct ktr_psig);
ktrwrite(vp, kth);
- FREE(kth, M_TEMP);
+ FREE(kth, M_KTRACE);
p->p_traceflag &= ~KTRFAC_ACTIVE;
}
@@ -218,33 +223,38 @@ ktrcsw(vp, out, user)
kth->ktr_len = sizeof (struct ktr_csw);
ktrwrite(vp, kth);
- FREE(kth, M_TEMP);
+ FREE(kth, M_KTRACE);
p->p_traceflag &= ~KTRFAC_ACTIVE;
}
+#endif
/* Interface and common routines */
/*
* ktrace system call
*/
+#ifndef _SYS_SYSPROTO_H_
+struct ktrace_args {
+ char *fname;
+ int ops;
+ int facs;
+ int pid;
+};
+#endif
/* ARGSUSED */
int
ktrace(curp, uap, retval)
struct proc *curp;
- register struct ktrace_args /* {
- syscallarg(char *) fname;
- syscallarg(int) ops;
- syscallarg(int) facs;
- syscallarg(int) pid;
- } */ *uap;
- register_t *retval;
+ register struct ktrace_args *uap;
+ int *retval;
{
+#ifdef KTRACE
register struct vnode *vp = NULL;
register struct proc *p;
struct pgrp *pg;
- int facs = SCARG(uap, facs) & ~KTRFAC_ROOT;
- int ops = KTROP(SCARG(uap, ops));
- int descend = SCARG(uap, ops) & KTRFLAG_DESCEND;
+ int facs = uap->facs & ~KTRFAC_ROOT;
+ int ops = KTROP(uap->ops);
+ int descend = uap->ops & KTRFLAG_DESCEND;
int ret = 0;
int error = 0;
struct nameidata nd;
@@ -254,14 +264,14 @@ ktrace(curp, uap, retval)
/*
* an operation which requires a file argument.
*/
- NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, fname),
- curp);
- if (error = vn_open(&nd, FREAD|FWRITE, 0)) {
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->fname, curp);
+ error = vn_open(&nd, FREAD|FWRITE, 0);
+ if (error) {
curp->p_traceflag &= ~KTRFAC_ACTIVE;
return (error);
}
vp = nd.ni_vp;
- VOP_UNLOCK(vp, 0, p);
+ VOP_UNLOCK(vp, 0, curp);
if (vp->v_type != VREG) {
(void) vn_close(vp, FREAD|FWRITE, curp->p_ucred, curp);
curp->p_traceflag &= ~KTRFAC_ACTIVE;
@@ -292,14 +302,14 @@ ktrace(curp, uap, retval)
error = EINVAL;
goto done;
}
- /*
+ /*
* do it
*/
- if (SCARG(uap, pid) < 0) {
+ if (uap->pid < 0) {
/*
* by process group
*/
- pg = pgfind(-SCARG(uap, pid));
+ pg = pgfind(-uap->pid);
if (pg == NULL) {
error = ESRCH;
goto done;
@@ -307,14 +317,14 @@ ktrace(curp, uap, retval)
for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next)
if (descend)
ret |= ktrsetchildren(curp, p, ops, facs, vp);
- else
+ else
ret |= ktrops(curp, p, ops, facs, vp);
-
+
} else {
/*
* by pid
*/
- p = pfind(SCARG(uap, pid));
+ p = pfind(uap->pid);
if (p == NULL) {
error = ESRCH;
goto done;
@@ -331,9 +341,48 @@ done:
(void) vn_close(vp, FWRITE, curp->p_ucred, curp);
curp->p_traceflag &= ~KTRFAC_ACTIVE;
return (error);
+#else
+ return ENOSYS;
+#endif
}
+/*
+ * utrace system call
+ */
+/* ARGSUSED */
int
+utrace(curp, uap, retval)
+ struct proc *curp;
+ register struct utrace_args *uap;
+ int *retval;
+{
+#ifdef KTRACE
+ struct ktr_header *kth;
+ struct proc *p = curproc; /* XXX */
+ register caddr_t cp;
+
+ if (!KTRPOINT(p, KTR_USER))
+ return (0);
+ p->p_traceflag |= KTRFAC_ACTIVE;
+ kth = ktrgetheader(KTR_USER);
+ MALLOC(cp, caddr_t, uap->len, M_KTRACE, M_WAITOK);
+ if (!copyin(uap->addr, cp, uap->len)) {
+ kth->ktr_buf = cp;
+ kth->ktr_len = uap->len;
+ ktrwrite(p->p_tracep, kth);
+ }
+ FREE(kth, M_KTRACE);
+ FREE(cp, M_KTRACE);
+ p->p_traceflag &= ~KTRFAC_ACTIVE;
+
+ return (0);
+#else
+ return (ENOSYS);
+#endif
+}
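+
+/*
+ * Usage sketch (illustrative; the userland prototype is assumed to be
+ * int utrace(void *addr, size_t len)).  A traced program can inject
+ * its own records into the ktrace stream:
+ *
+ *	char tag[] = "phase two starting";
+ *	(void) utrace(tag, sizeof(tag));
+ *
+ * The record is only written when the process is being traced with
+ * the KTR_USER facility, as the KTRPOINT() test above shows.
+ */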
+
+#ifdef KTRACE
+static int
ktrops(curp, p, ops, facs, vp)
struct proc *p, *curp;
int ops, facs;
@@ -343,7 +392,7 @@ ktrops(curp, p, ops, facs, vp)
if (!ktrcanset(curp, p))
return (0);
if (ops == KTROP_SET) {
- if (p->p_tracep != vp) {
+ if (p->p_tracep != vp) {
/*
* if trace file already in use, relinquish
*/
@@ -355,7 +404,7 @@ ktrops(curp, p, ops, facs, vp)
p->p_traceflag |= facs;
if (curp->p_ucred->cr_uid == 0)
p->p_traceflag |= KTRFAC_ROOT;
- } else {
+ } else {
/* KTROP_CLEAR */
if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) {
/* no more tracing */
@@ -370,6 +419,7 @@ ktrops(curp, p, ops, facs, vp)
return (1);
}
+static int
ktrsetchildren(curp, top, ops, facs, vp)
struct proc *curp, *top;
int ops, facs;
@@ -401,6 +451,7 @@ ktrsetchildren(curp, top, ops, facs, vp)
/*NOTREACHED*/
}
+static void
ktrwrite(vp, kth)
struct vnode *vp;
register struct ktr_header *kth;
@@ -450,11 +501,12 @@ ktrwrite(vp, kth)
* Return true if caller has permission to set the ktracing state
* of target. Essentially, the target can't possess any
* more permissions than the caller. KTRFAC_ROOT signifies that
- * root previously set the tracing status on the target process, and
+ * root previously set the tracing status on the target process, and
* so, only root may further change it.
*
* TODO: check groups. use caller effective gid.
*/
+static int
ktrcanset(callp, targetp)
struct proc *callp, *targetp;
{
@@ -472,4 +524,4 @@ ktrcanset(callp, targetp)
return (0);
}
-#endif
+#endif /* KTRACE */
diff --git a/sys/kern/kern_lkm.c b/sys/kern/kern_lkm.c
new file mode 100644
index 0000000..f371c37
--- /dev/null
+++ b/sys/kern/kern_lkm.c
@@ -0,0 +1,957 @@
+/*-
+ * Copyright (c) 1992 Terrence R. Lambert.
+ * Copyright (c) 1994 Christopher G. Demetriou
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Terrence R. Lambert.
+ * 4. The name Terrence R. Lambert may not be used to endorse or promote
+ * products derived from this software without specific prior written
+ * permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY TERRENCE R. LAMBERT ``AS IS'' AND ANY
+ * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: kern_lkm.c,v 1.38 1997/03/23 03:36:20 bde Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/tty.h>
+#include <sys/conf.h>
+#include <sys/fcntl.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#include <sys/mount.h>
+#include <sys/sysent.h>
+#include <sys/exec.h>
+#include <sys/imgact.h>
+#include <sys/lkm.h>
+#include <sys/vnode.h>
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+
+#define PAGESIZE 1024 /* kmem_alloc() allocation quantum */
+
+#define LKM_ALLOC 0x01
+#define LKM_WANT 0x02
+
+#define LKMS_IDLE 0x00
+#define LKMS_RESERVED 0x01
+#define LKMS_LOADING 0x02
+#define LKMS_LOADED 0x04
+#define LKMS_UNLOADING 0x08
+
+static int lkm_v = 0;
+static int lkm_state = LKMS_IDLE;
+
+#ifndef MAXLKMS
+#define MAXLKMS 20
+#endif
+
+static struct lkm_table lkmods[MAXLKMS]; /* table of loaded modules */
+static struct lkm_table *curp; /* global for in-progress ops */
+
+static int _lkm_dev __P((struct lkm_table *lkmtp, int cmd));
+static int _lkm_exec __P((struct lkm_table *lkmtp, int cmd));
+static int _lkm_vfs __P((struct lkm_table *lkmtp, int cmd));
+static int _lkm_syscall __P((struct lkm_table *lkmtp, int cmd));
+static void lkmunreserve __P((void));
+
+static d_open_t lkmcopen;
+static d_close_t lkmcclose;
+static d_ioctl_t lkmcioctl;
+
+#define CDEV_MAJOR 32
+static struct cdevsw lkmc_cdevsw =
+ { lkmcopen, lkmcclose, noread, nowrite, /*32*/
+ lkmcioctl, nostop, nullreset, nodevtotty,
+ noselect, nommap, NULL, "lkm", NULL, -1 };
+
+
+/*ARGSUSED*/
+static int
+lkmcopen(dev, flag, devtype, p)
+ dev_t dev;
+ int flag;
+ int devtype;
+ struct proc *p;
+{
+ int error;
+
+ if (minor(dev) != 0)
+ return(ENXIO); /* bad minor # */
+
+ /*
+ * Use of the loadable kernel module device must be exclusive; we
+ * may try to remove this restriction later, but it's really no
+ * hardship.
+ */
+ while (lkm_v & LKM_ALLOC) {
+ if (flag & FNONBLOCK) /* don't hang */
+ return(EBUSY);
+ lkm_v |= LKM_WANT;
+ /*
+ * Sleep pending unlock; we use tsleep() to allow
+ * an alarm out of the open.
+ */
+ error = tsleep((caddr_t)&lkm_v, TTIPRI|PCATCH, "lkmopn", 0);
+ if (error)
+ return(error); /* leave LKM_WANT set -- no problem */
+ }
+ lkm_v |= LKM_ALLOC;
+
+ return(0); /* pseudo-device open */
+}
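+
+/*
+ * Example (illustrative): because the device is exclusive, a
+ * non-blocking open fails instead of sleeping while another modload
+ * holds it:
+ *
+ *	fd = open("/dev/lkm", O_RDWR | O_NONBLOCK);	(EBUSY if busy)
+ */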
+
+/*
+ * Unreserve the memory associated with the current loaded module; done on
+ * a coerced close of the lkm device (close on premature exit of modload)
+ * or explicitly by modload as a result of a link failure.
+ */
+static void
+lkmunreserve()
+{
+
+ if (lkm_state == LKMS_IDLE)
+ return;
+
+ /*
+ * Actually unreserve the memory
+ */
+ if (curp && curp->area) {
+ kmem_free(kernel_map, curp->area, curp->size);/**/
+ curp->area = 0;
+ if (curp->private.lkm_any != NULL)
+ curp->private.lkm_any = NULL;
+ }
+
+ lkm_state = LKMS_IDLE;
+}
+
+static int
+lkmcclose(dev, flag, mode, p)
+ dev_t dev;
+ int flag;
+ int mode;
+ struct proc *p;
+{
+
+ if (!(lkm_v & LKM_ALLOC)) {
+#ifdef DEBUG
+ printf("LKM: close before open!\n");
+#endif /* DEBUG */
+ return(EBADF);
+ }
+
+ /* do this before waking the herd... */
+ if (curp && !curp->used) {
+ /*
+ * If we close before setting used, we have aborted
+ * by way of error or by way of close-on-exit from
+ * a premature exit of "modload".
+ */
+ lkmunreserve(); /* coerce state to LKM_IDLE */
+ }
+
+ lkm_v &= ~LKM_ALLOC;
+ wakeup((caddr_t)&lkm_v); /* thundering herd "problem" here */
+
+ return(0); /* pseudo-device closed */
+}
+
+/*ARGSUSED*/
+static int
+lkmcioctl(dev, cmd, data, flag, p)
+ dev_t dev;
+ int cmd;
+ caddr_t data;
+ int flag;
+ struct proc *p;
+{
+ int err = 0;
+ int i;
+ struct lmc_resrv *resrvp;
+ struct lmc_loadbuf *loadbufp;
+ struct lmc_unload *unloadp;
+ struct lmc_stat *statp;
+ char istr[MAXLKMNAME];
+
+ switch(cmd) {
+ case LMRESERV: /* reserve pages for a module */
+ if ((flag & FWRITE) == 0 || securelevel > 0)
+ /* only allow this if writing and insecure */
+ return EPERM;
+
+ resrvp = (struct lmc_resrv *)data;
+
+ /*
+ * Find a free slot.
+ */
+ for (i = 0; i < MAXLKMS; i++)
+ if (!lkmods[i].used)
+ break;
+ if (i == MAXLKMS) {
+ err = ENOMEM; /* no slots available */
+ break;
+ }
+ curp = &lkmods[i];
+ curp->id = i; /* self reference slot offset */
+
+ resrvp->slot = i; /* return slot */
+
+ /*
+ * Get memory for module
+ */
+ curp->size = resrvp->size;
+
+ curp->area = kmem_alloc(kernel_map, curp->size);/**/
+
+ curp->offset = 0; /* load offset */
+
+ resrvp->addr = curp->area; /* ret kernel addr */
+
+#ifdef DEBUG
+ printf("LKM: LMRESERV (actual = 0x%08x)\n", curp->area);
+ printf("LKM: LMRESERV (adjusted = 0x%08x)\n",
+ trunc_page(curp->area));
+#endif /* DEBUG */
+ lkm_state = LKMS_RESERVED;
+ break;
+
+ case LMLOADBUF: /* Copy in; stateful, follows LMRESERV */
+ if ((flag & FWRITE) == 0 || securelevel > 0)
+ /* only allow this if writing and insecure */
+ return EPERM;
+
+ loadbufp = (struct lmc_loadbuf *)data;
+ i = loadbufp->cnt;
+ if ((lkm_state != LKMS_RESERVED && lkm_state != LKMS_LOADING)
+ || i < 0
+ || i > MODIOBUF
+ || i > curp->size - curp->offset) {
+ err = ENOMEM;
+ break;
+ }
+
+ /* copy in buffer full of data */
+ err = copyin((caddr_t)loadbufp->data,
+ (caddr_t)curp->area + curp->offset, i);
+ if (err)
+ break;
+
+ if ((curp->offset + i) < curp->size) {
+ lkm_state = LKMS_LOADING;
+#ifdef DEBUG
+ printf("LKM: LMLOADBUF (loading @ %d of %d, i = %d)\n",
+ curp->offset, curp->size, i);
+#endif /* DEBUG */
+ } else {
+ lkm_state = LKMS_LOADED;
+#ifdef DEBUG
+ printf("LKM: LMLOADBUF (loaded)\n");
+#endif /* DEBUG */
+ }
+ curp->offset += i;
+ break;
+
+ case LMUNRESRV: /* discard reserved pages for a module */
+ if ((flag & FWRITE) == 0 || securelevel > 0)
+ /* only allow this if writing and insecure */
+ return EPERM;
+
+ lkmunreserve(); /* coerce state to LKM_IDLE */
+#ifdef DEBUG
+ printf("LKM: LMUNRESERV\n");
+#endif /* DEBUG */
+ break;
+
+ case LMREADY: /* module loaded: call entry */
+ if ((flag & FWRITE) == 0 || securelevel > 0)
+ /* only allow this if writing and insecure */
+ return EPERM;
+
+ switch (lkm_state) {
+ case LKMS_LOADED:
+ break;
+ case LKMS_LOADING:
+ /* The remainder must be bss, so we clear it */
+ bzero((caddr_t)curp->area + curp->offset,
+ curp->size - curp->offset);
+ break;
+ default:
+
+#ifdef DEBUG
+ printf("lkm_state is %02x\n", lkm_state);
+#endif /* DEBUG */
+ return ENXIO;
+ }
+
+ /* XXX gack */
+ curp->entry = (int (*) __P((struct lkm_table *, int, int)))
+ (*((int *)data));
+
+ /* call entry(load)... (assigns "private" portion) */
+ err = (*(curp->entry))(curp, LKM_E_LOAD, LKM_VERSION);
+ if (err) {
+ /*
+ * Module may refuse loading or may have a
+ * version mismatch...
+ */
+ lkm_state = LKMS_UNLOADING; /* for lkmunreserve */
+ lkmunreserve(); /* free memory */
+ curp->used = 0; /* free slot */
+ break;
+ }
+ /*
+ * It's possible for a user to load a module that doesn't
+ * initialize itself correctly. (You can even get away with
+ * using it for a while.) Unfortunately, we are faced with
+ * the following problems:
+ * - we can't tell a good module from a bad one until
+ * after we've run its entry function (if the private
+ * section is uninitialized after we return from the
+ * entry, then something's fishy)
+ * - now that we've called the entry function, we can't
+ * forcibly unload the module without risking a crash
+ * - since we don't know what the module's entry function
+ * did, we can't easily clean up the mess it may have
+ * made, so we can't know just how unstable the system
+ * may be
+ * So, being stuck between a rock and a hard place, we
+ * have no choice but to do this...
+ */
+ if (curp->private.lkm_any == NULL)
+ panic("loadable module initialization failed");
+
+ curp->used = 1;
+#ifdef DEBUG
+ printf("LKM: LMREADY\n");
+#endif /* DEBUG */
+ lkm_state = LKMS_IDLE;
+ break;
+
+ case LMUNLOAD: /* unload a module */
+ if ((flag & FWRITE) == 0 || securelevel > 0)
+ /* only allow this if writing and insecure */
+ return EPERM;
+
+ unloadp = (struct lmc_unload *)data;
+
+ if ((i = unloadp->id) == -1) { /* unload by name */
+ /*
+ * Copy name and lookup id from all loaded
+ * modules. May fail.
+ */
+ err = copyinstr(unloadp->name, istr, MAXLKMNAME-1, NULL);
+ if (err)
+ break;
+
+ /*
+ * look up id...
+ */
+ for (i = 0; i < MAXLKMS; i++) {
+ if (!lkmods[i].used)
+ continue;
+ if (!strcmp(istr,
+ lkmods[i].private.lkm_any->lkm_name))
+ break;
+ }
+ }
+
+ /*
+ * Range check the value; on failure, return EINVAL
+ */
+ if (i < 0 || i >= MAXLKMS) {
+ err = EINVAL;
+ break;
+ }
+
+ curp = &lkmods[i];
+
+ if (!curp->used) {
+ err = ENOENT;
+ break;
+ }
+
+ /* call entry(unload) */
+ if ((*(curp->entry))(curp, LKM_E_UNLOAD, LKM_VERSION)) {
+ err = EBUSY;
+ break;
+ }
+
+ lkm_state = LKMS_UNLOADING; /* non-idle for lkmunreserve */
+ lkmunreserve(); /* free memory */
+ curp->used = 0; /* free slot */
+ break;
+
+ case LMSTAT: /* stat a module by id/name */
+ /* allow readers and writers to stat */
+
+ statp = (struct lmc_stat *)data;
+
+ if ((i = statp->id) == -1) { /* stat by name */
+ /*
+ * Copy name and lookup id from all loaded
+ * modules.
+ */
+ copystr(statp->name, istr, MAXLKMNAME-1, NULL);
+ /*
+ * look up id...
+ */
+ for (i = 0; i < MAXLKMS; i++) {
+ if (!lkmods[i].used)
+ continue;
+ if (!strcmp(istr,
+ lkmods[i].private.lkm_any->lkm_name))
+ break;
+ }
+
+ if (i == MAXLKMS) { /* Not found */
+ err = ENOENT;
+ break;
+ }
+ }
+
+ /*
+ * Range check the value; on failure, return EINVAL
+ */
+ if (i < 0 || i >= MAXLKMS) {
+ err = EINVAL;
+ break;
+ }
+
+ curp = &lkmods[i];
+
+ if (!curp->used) { /* Not found */
+ err = ENOENT;
+ break;
+ }
+
+ /*
+ * Copy out stat information for this module...
+ */
+ statp->id = curp->id;
+ statp->offset = curp->private.lkm_any->lkm_offset;
+ statp->type = curp->private.lkm_any->lkm_type;
+ statp->area = curp->area;
+ statp->size = curp->size / PAGESIZE;
+ statp->private = (unsigned long)curp->private.lkm_any;
+ statp->ver = curp->private.lkm_any->lkm_ver;
+ copystr(curp->private.lkm_any->lkm_name,
+ statp->name,
+ MAXLKMNAME - 2,
+ NULL);
+
+ break;
+
+ default: /* bad ioctl()... */
+ err = ENOTTY;
+ break;
+ }
+
+ return (err);
+}
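+
+/*
+ * Loading protocol (illustrative sketch of the userland side; the
+ * ioctl names are the ones handled above, the exact sequence is an
+ * assumption about modload(8)):
+ *
+ *	fd = open("/dev/lkm", O_RDWR);
+ *	resrv.size = modsize;
+ *	ioctl(fd, LMRESERV, &resrv);		(reserve kernel memory)
+ *	while (data remains) {
+ *		buf.cnt = n; buf.data = next;
+ *		ioctl(fd, LMLOADBUF, &buf);	(at most MODIOBUF bytes)
+ *	}
+ *	ioctl(fd, LMREADY, &entry);		(calls entry(LKM_E_LOAD))
+ */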
+
+/*
+ * Acts like "nosys" but can be identified in sysent for dynamic call
+ * number assignment for a limited number of calls.
+ *
+ * Place holder for system call slots reserved for loadable modules.
+ */
+int
+lkmnosys(p, args, retval)
+ struct proc *p;
+ struct nosys_args *args;
+ int *retval;
+{
+
+ return(nosys(p, args, retval));
+}
+
+int
+lkmexists(lkmtp)
+ struct lkm_table *lkmtp;
+{
+ int i;
+
+ /*
+ * see if name exists...
+ */
+ for (i = 0; i < MAXLKMS; i++) {
+ /*
+ * An unused module and the one we are testing are not
+ * considered.
+ */
+ if (!lkmods[i].used || &lkmods[i] == lkmtp)
+ continue;
+ if (!strcmp(lkmtp->private.lkm_any->lkm_name,
+ lkmods[i].private.lkm_any->lkm_name))
+ return(1); /* already loaded... */
+ }
+
+ return(0); /* module not loaded... */
+}
+
+/*
+ * For the loadable system call described by the structure pointed to
+ * by lkmtp, load/unload/stat it depending on the cmd requested.
+ */
+static int
+_lkm_syscall(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+ struct lkm_syscall *args = lkmtp->private.lkm_syscall;
+ int i;
+ int err = 0;
+
+ switch(cmd) {
+ case LKM_E_LOAD:
+ /* don't load twice! */
+ if (lkmexists(lkmtp))
+ return(EEXIST);
+ if ((i = args->lkm_offset) == -1) { /* auto */
+ /*
+ * Search the table looking for a slot...
+ */
+ for (i = 0; i < aout_sysvec.sv_size; i++)
+ if (aout_sysvec.sv_table[i].sy_call ==
+ (sy_call_t *)lkmnosys)
+ break; /* found it! */
+ /* out of allocable slots? */
+ if (i == aout_sysvec.sv_size) {
+ err = ENFILE;
+ break;
+ }
+ } else { /* assign */
+ if (i < 0 || i >= aout_sysvec.sv_size) {
+ err = EINVAL;
+ break;
+ }
+ }
+
+ /* save old */
+ bcopy(&aout_sysvec.sv_table[i],
+ &(args->lkm_oldent),
+ sizeof(struct sysent));
+
+ /* replace with new */
+ bcopy(args->lkm_sysent,
+ &aout_sysvec.sv_table[i],
+ sizeof(struct sysent));
+
+ /* done! */
+ args->lkm_offset = i; /* slot in sysent[] */
+
+ break;
+
+ case LKM_E_UNLOAD:
+ /* current slot... */
+ i = args->lkm_offset;
+
+ /* replace current slot contents with old contents */
+ bcopy(&(args->lkm_oldent),
+ &aout_sysvec.sv_table[i],
+ sizeof(struct sysent));
+
+ break;
+
+ case LKM_E_STAT: /* no special handling... */
+ break;
+ }
+
+ return(err);
+}
+
+/*
+ * For the loadable virtual file system described by the structure pointed
+ * to by lkmtp, load/unload/stat it depending on the cmd requested.
+ */
+static int
+_lkm_vfs(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+ struct lkm_vfs *args = lkmtp->private.lkm_vfs;
+ struct vfsconf *vfc = args->lkm_vfsconf;
+ struct vfsconf *vfsp, *prev_vfsp;
+ int i, maxtypenum;
+ int err = 0;
+
+ switch(cmd) {
+ case LKM_E_LOAD:
+ /* don't load twice! */
+ if (lkmexists(lkmtp))
+ return(EEXIST);
+
+ for (vfsp = vfsconf; vfsp->vfc_next; vfsp = vfsp->vfc_next) {
+ if (!strcmp(vfc->vfc_name, vfsp->vfc_name)) {
+ return EEXIST;
+ }
+ }
+
+ i = args->lkm_offset = vfc->vfc_typenum;
+ if (i < 0) {
+ i = maxvfsconf;
+ }
+ args->lkm_offset = vfc->vfc_typenum = i;
+
+ if (maxvfsconf <= i)
+ maxvfsconf = i + 1;
+
+ vfsp->vfc_next = vfc;
+ vfc->vfc_next = NULL;
+
+ /* like in vfs_op_init */
+ for(i = 0; args->lkm_vnodeops->ls_items[i]; i++) {
+ const struct vnodeopv_desc *opv =
+ args->lkm_vnodeops->ls_items[i];
+ *(opv->opv_desc_vector_p) = NULL;
+ }
+ vfs_opv_init((struct vnodeopv_desc **)args->lkm_vnodeops->ls_items);
+
+ /*
+ * Call init function for this VFS...
+ */
+ (*(vfc->vfc_vfsops->vfs_init))(vfc);
+
+ /* done! */
+ break;
+
+ case LKM_E_UNLOAD:
+ /* current slot... */
+ i = args->lkm_offset;
+
+ prev_vfsp = NULL;
+ for (vfsp = vfsconf; vfsp;
+ prev_vfsp = vfsp, vfsp = vfsp->vfc_next) {
+ if (!strcmp(vfc->vfc_name, vfsp->vfc_name))
+ break;
+ }
+ if (vfsp == NULL) {
+ return EINVAL;
+ }
+
+ if (vfsp->vfc_refcount) {
+ return EBUSY;
+ }
+
+ /* Unlink the entry before freeing it. */
+ prev_vfsp->vfc_next = vfsp->vfc_next;
+
+ FREE(vfsp, M_VFSCONF);
+
+ /*
+ * Maintain maxvfsconf.
+ */
+ maxtypenum = 0;
+ for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next)
+ if (maxtypenum < vfsp->vfc_typenum)
+ maxtypenum = vfsp->vfc_typenum;
+ maxvfsconf = maxtypenum + 1;
+
+ break;
+
+ case LKM_E_STAT: /* no special handling... */
+ break;
+ }
+ return(err);
+}
+
+/*
+ * For the loadable device driver described by the structure pointed to
+ * by lkmtp, load/unload/stat it depending on the cmd requested.
+ */
+static int
+_lkm_dev(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+ struct lkm_dev *args = lkmtp->private.lkm_dev;
+ int i;
+ dev_t descrip;
+ int err = 0;
+
+ switch(cmd) {
+ case LKM_E_LOAD:
+ /* don't load twice! */
+ if (lkmexists(lkmtp))
+ return(EEXIST);
+ switch(args->lkm_devtype) {
+ case LM_DT_BLOCK:
+ if ((i = args->lkm_offset) == -1)
+ descrip = (dev_t) -1;
+ else
+ descrip = makedev(args->lkm_offset,0);
+ if ((err = bdevsw_add(&descrip, args->lkm_dev.bdev,
+ &(args->lkm_olddev.bdev))) != 0) {
+ break;
+ }
+ args->lkm_offset = major(descrip);
+ break;
+
+ case LM_DT_CHAR:
+ if ((i = args->lkm_offset) == -1)
+ descrip = (dev_t) -1;
+ else
+ descrip = makedev(args->lkm_offset,0);
+ if ((err = cdevsw_add(&descrip, args->lkm_dev.cdev,
+ &(args->lkm_olddev.cdev))) != 0) {
+ break;
+ }
+ args->lkm_offset = major(descrip);
+ break;
+
+ default:
+ err = ENODEV;
+ break;
+ }
+ break;
+
+ case LKM_E_UNLOAD:
+ /* current slot... */
+ i = args->lkm_offset;
+ descrip = makedev(i,0);
+
+ switch(args->lkm_devtype) {
+ case LM_DT_BLOCK:
+ /* replace current slot contents with old contents */
+ bdevsw_add(&descrip, args->lkm_olddev.bdev,NULL);
+ break;
+
+ case LM_DT_CHAR:
+ /* replace current slot contents with old contents */
+ cdevsw_add(&descrip, args->lkm_olddev.cdev,NULL);
+ break;
+
+ default:
+ err = ENODEV;
+ break;
+ }
+ break;
+
+ case LKM_E_STAT: /* no special handling... */
+ break;
+ }
+
+ return(err);
+}
+
+#ifdef STREAMS
+/*
+ * For the loadable streams module described by the structure pointed to
+ * by lkmtp, load/unload/stat it depending on the cmd requested.
+ */
+static int
+_lkm_strmod(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+ struct lkm_strmod *args = lkmtp->private.lkm_strmod;
+ int i;
+ int err = 0;
+
+ switch(cmd) {
+ case LKM_E_LOAD:
+ /* don't load twice! */
+ if (lkmexists(lkmtp))
+ return(EEXIST);
+ break;
+
+ case LKM_E_UNLOAD:
+ break;
+
+ case LKM_E_STAT: /* no special handling... */
+ break;
+ }
+
+ return(err);
+}
+#endif /* STREAMS */
+
+/*
+ * For the loadable execution class described by the structure pointed to
+ * by lkmtp, load/unload/stat it depending on the cmd requested.
+ */
+static int
+_lkm_exec(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+ struct lkm_exec *args = lkmtp->private.lkm_exec;
+ int i;
+ int err = 0;
+ const struct execsw **execsw =
+ (const struct execsw **)&execsw_set.ls_items[0];
+
+ switch(cmd) {
+ case LKM_E_LOAD:
+ /* don't load twice! */
+ if (lkmexists(lkmtp))
+ return(EEXIST);
+ if ((i = args->lkm_offset) == -1) { /* auto */
+ /*
+ * Search the table looking for a slot...
+ */
+ for (i = 0; execsw[i] != NULL; i++)
+ if (execsw[i]->ex_imgact == NULL)
+ break; /* found it! */
+ /* out of allocable slots? */
+ if (execsw[i] == NULL) {
+ err = ENFILE;
+ break;
+ }
+ } else { /* assign */
+ err = EINVAL;
+ break;
+ }
+
+ /* save old */
+ bcopy(&execsw[i], &(args->lkm_oldexec), sizeof(struct execsw*));
+
+ /* replace with new */
+ bcopy(&(args->lkm_exec), &execsw[i], sizeof(struct execsw*));
+
+ /* done! */
+ args->lkm_offset = i; /* slot in execsw[] */
+
+ break;
+
+ case LKM_E_UNLOAD:
+ /* current slot... */
+ i = args->lkm_offset;
+
+ /* replace current slot contents with old contents */
+ bcopy(&(args->lkm_oldexec), &execsw[i], sizeof(struct execsw*));
+
+ break;
+
+ case LKM_E_STAT: /* no special handling... */
+ break;
+ }
+ return(err);
+}
+
+/* XXX: This is bogus. we should find a better method RSN! */
+static const struct execsw lkm_exec_dummy1 = { NULL, "lkm" };
+static const struct execsw lkm_exec_dummy2 = { NULL, "lkm" };
+static const struct execsw lkm_exec_dummy3 = { NULL, "lkm" };
+static const struct execsw lkm_exec_dummy4 = { NULL, "lkm" };
+TEXT_SET(execsw_set, lkm_exec_dummy1);
+TEXT_SET(execsw_set, lkm_exec_dummy2);
+TEXT_SET(execsw_set, lkm_exec_dummy3);
+TEXT_SET(execsw_set, lkm_exec_dummy4);
+
+/*
+ * This code handles the per-module type "wiring-in" of loadable modules
+ * into existing kernel tables. For "LM_MISC" modules, wiring and unwiring
+ * is assumed to be done in their entry routines internal to the module
+ * itself.
+ */
+int
+lkmdispatch(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+ int err = 0; /* default = success */
+
+ switch(lkmtp->private.lkm_any->lkm_type) {
+ case LM_SYSCALL:
+ err = _lkm_syscall(lkmtp, cmd);
+ break;
+
+ case LM_VFS:
+ err = _lkm_vfs(lkmtp, cmd);
+ break;
+
+ case LM_DEV:
+ err = _lkm_dev(lkmtp, cmd);
+ break;
+
+#ifdef STREAMS
+ case LM_STRMOD:
+ err = _lkm_strmod(lkmtp, cmd);
+ break;
+
+#endif /* STREAMS */
+
+ case LM_EXEC:
+ err = _lkm_exec(lkmtp, cmd);
+ break;
+
+ case LM_MISC: /* ignore content -- no "misc-specific" procedure */
+ if (lkmexists(lkmtp))
+ err = EEXIST;
+ break;
+
+ default:
+ err = ENXIO; /* unknown type */
+ break;
+ }
+
+ return(err);
+}
+
+int
+lkm_nullcmd(lkmtp, cmd)
+ struct lkm_table *lkmtp;
+ int cmd;
+{
+
+ return (0);
+}
+
+static int lkm_devsw_installed = 0;
+#ifdef DEVFS
+static void *lkmc_devfs_token;
+#endif
+
+static void lkm_drvinit(void *unused)
+{
+ dev_t dev;
+
+ if (!lkm_devsw_installed) {
+ dev = makedev(CDEV_MAJOR, 0);
+ cdevsw_add(&dev, &lkmc_cdevsw, NULL);
+ lkm_devsw_installed = 1;
+#ifdef DEVFS
+ lkmc_devfs_token = devfs_add_devswf(&lkmc_cdevsw, 0, DV_CHR,
+ UID_ROOT, GID_WHEEL, 0644,
+ "lkm");
+#endif
+ }
+}
+
+SYSINIT(lkmdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,lkm_drvinit,NULL)
+
+
diff --git a/sys/kern/kern_lockf.c b/sys/kern/kern_lockf.c
new file mode 100644
index 0000000..fb1a8a0
--- /dev/null
+++ b/sys/kern/kern_lockf.c
@@ -0,0 +1,796 @@
+/*
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Scooter Morris at Genentech Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94
+ * $Id$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/malloc.h>
+#include <sys/fcntl.h>
+
+#include <sys/lockf.h>
+
+/*
+ * This variable controls the maximum number of processes that will
+ * be checked in doing deadlock detection.
+ */
+static int maxlockdepth = MAXDEPTH;
+
+#ifdef LOCKF_DEBUG
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
+
+#include <vm/vm.h>
+
+#include <ufs/ufs/quota.h>
+#include <ufs/ufs/inode.h>
+
+int lockf_debug = 0;
+SYSCTL_INT(_debug, 4, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, "");
+#endif
+
+#define NOLOCKF (struct lockf *)0
+#define SELF 0x1
+#define OTHERS 0x2
+static int lf_clearlock __P((struct lockf *));
+static int lf_findoverlap __P((struct lockf *,
+ struct lockf *, int, struct lockf ***, struct lockf **));
+static struct lockf *
+ lf_getblock __P((struct lockf *));
+static int lf_getlock __P((struct lockf *, struct flock *));
+static int lf_setlock __P((struct lockf *));
+static void lf_split __P((struct lockf *, struct lockf *));
+static void lf_wakelock __P((struct lockf *));
+
+/*
+ * Advisory record locking support
+ */
+int
+lf_advlock(ap, head, size)
+ struct vop_advlock_args /* {
+ struct vnode *a_vp;
+ caddr_t a_id;
+ int a_op;
+ struct flock *a_fl;
+ int a_flags;
+ } */ *ap;
+ struct lockf **head;
+ u_quad_t size;
+{
+ register struct flock *fl = ap->a_fl;
+ register struct lockf *lock;
+ off_t start, end;
+ int error;
+
+ /*
+ * Convert the flock structure into a start and end.
+ */
+ switch (fl->l_whence) {
+
+ case SEEK_SET:
+ case SEEK_CUR:
+ /*
+ * Caller is responsible for adding any necessary offset
+ * when SEEK_CUR is used.
+ */
+ start = fl->l_start;
+ break;
+
+ case SEEK_END:
+ start = size + fl->l_start;
+ break;
+
+ default:
+ return (EINVAL);
+ }
+ if (start < 0)
+ return (EINVAL);
+ if (fl->l_len == 0)
+ end = -1;
+ else {
+ end = start + fl->l_len - 1;
+ if (end < start)
+ return (EINVAL);
+ }
+ /*
+ * Avoid the common case of unlocking when inode has no locks.
+ */
+ if (*head == (struct lockf *)0) {
+ if (ap->a_op != F_SETLK) {
+ fl->l_type = F_UNLCK;
+ return (0);
+ }
+ }
+ /*
+ * Create the lockf structure
+ */
+ MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK);
+ lock->lf_start = start;
+ lock->lf_end = end;
+ lock->lf_id = ap->a_id;
+/* lock->lf_inode = ip; */ /* XXX JH */
+ lock->lf_type = fl->l_type;
+ lock->lf_head = head;
+ lock->lf_next = (struct lockf *)0;
+ TAILQ_INIT(&lock->lf_blkhd);
+ lock->lf_flags = ap->a_flags;
+ /*
+ * Do the requested operation.
+ */
+ switch(ap->a_op) {
+ case F_SETLK:
+ return (lf_setlock(lock));
+
+ case F_UNLCK:
+ error = lf_clearlock(lock);
+ FREE(lock, M_LOCKF);
+ return (error);
+
+ case F_GETLK:
+ error = lf_getlock(lock, fl);
+ FREE(lock, M_LOCKF);
+ return (error);
+
+ default:
+ free(lock, M_LOCKF);
+ return (EINVAL);
+ }
+ /* NOTREACHED */
+}
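+
+/*
+ * Worked example (illustrative) of the flock conversion above:
+ * l_whence = SEEK_SET, l_start = 100, l_len = 50 describes the byte
+ * range [100, 149]; l_len = 0 means "to end of file" and is encoded
+ * as end = -1.
+ */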
+
+/*
+ * Set a byte-range lock.
+ */
+static int
+lf_setlock(lock)
+ register struct lockf *lock;
+{
+ register struct lockf *block;
+ struct lockf **head = lock->lf_head;
+ struct lockf **prev, *overlap, *ltmp;
+ static char lockstr[] = "lockf";
+ int ovcase, priority, needtolink, error;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_setlock", lock);
+#endif /* LOCKF_DEBUG */
+
+ /*
+ * Set the priority
+ */
+ priority = PLOCK;
+ if (lock->lf_type == F_WRLCK)
+ priority += 4;
+ priority |= PCATCH;
+ /*
+ * Scan lock list for this file looking for locks that would block us.
+ */
+ while ((block = lf_getblock(lock))) {
+ /*
+ * Free the structure and return if nonblocking.
+ */
+ if ((lock->lf_flags & F_WAIT) == 0) {
+ FREE(lock, M_LOCKF);
+ return (EAGAIN);
+ }
+ /*
+ * We are blocked. Since flock style locks cover
+ * the whole file, there is no chance for deadlock.
+ * For byte-range locks we must check for deadlock.
+ *
+ * Deadlock detection is done by looking through the
+ * wait channels to see if there are any cycles that
+ * involve us. MAXDEPTH is set just to make sure we
+ * do not go off into neverland.
+ */
+ if ((lock->lf_flags & F_POSIX) &&
+ (block->lf_flags & F_POSIX)) {
+ register struct proc *wproc;
+ register struct lockf *waitblock;
+ int i = 0;
+
+ /* The block is waiting on something */
+ wproc = (struct proc *)block->lf_id;
+ while (wproc->p_wchan &&
+ (wproc->p_wmesg == lockstr) &&
+ (i++ < maxlockdepth)) {
+ waitblock = (struct lockf *)wproc->p_wchan;
+ /* Get the owner of the blocking lock */
+ waitblock = waitblock->lf_next;
+ if ((waitblock->lf_flags & F_POSIX) == 0)
+ break;
+ wproc = (struct proc *)waitblock->lf_id;
+ if (wproc == (struct proc *)lock->lf_id) {
+ free(lock, M_LOCKF);
+ return (EDEADLK);
+ }
+ }
+ }
+ /*
+ * For flock type locks, we must first remove
+ * any shared locks that we hold before we sleep
+ * waiting for an exclusive lock.
+ */
+ if ((lock->lf_flags & F_FLOCK) &&
+ lock->lf_type == F_WRLCK) {
+ lock->lf_type = F_UNLCK;
+ (void) lf_clearlock(lock);
+ lock->lf_type = F_WRLCK;
+ }
+ /*
+ * Add our lock to the blocked list and sleep until we're free.
+ * Remember who blocked us (for deadlock detection).
+ */
+ lock->lf_next = block;
+ TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ lf_print("lf_setlock: blocking on", block);
+ lf_printlist("lf_setlock", block);
+ }
+#endif /* LOCKF_DEBUG */
+ if ((error = tsleep((caddr_t)lock, priority, lockstr, 0))) {
+ /*
+ * We may have been awakened by a signal (in
+ * which case we must remove ourselves from the
+ * blocked list) and/or by another process
+ * releasing a lock (in which case we have already
+ * been removed from the blocked list and our
+ * lf_next field set to NOLOCKF).
+ */
+ if (lock->lf_next)
+ TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock,
+ lf_block);
+ free(lock, M_LOCKF);
+ return (error);
+ }
+ }
+ /*
+ * No blocks!! Add the lock. Note that we will
+ * downgrade or upgrade any overlapping locks this
+ * process already owns.
+ *
+ * Skip over locks owned by other processes.
+ * Handle any locks that overlap and are owned by ourselves.
+ */
+ prev = head;
+ block = *head;
+ needtolink = 1;
+ for (;;) {
+ ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap);
+ if (ovcase)
+ block = overlap->lf_next;
+ /*
+ * Six cases:
+ * 0) no overlap
+ * 1) overlap == lock
+ * 2) overlap contains lock
+ * 3) lock contains overlap
+ * 4) overlap starts before lock
+ * 5) overlap ends after lock
+ */
+ switch (ovcase) {
+ case 0: /* no overlap */
+ if (needtolink) {
+ *prev = lock;
+ lock->lf_next = overlap;
+ }
+ break;
+
+ case 1: /* overlap == lock */
+ /*
+ * If downgrading lock, others may be
+ * able to acquire it.
+ */
+ if (lock->lf_type == F_RDLCK &&
+ overlap->lf_type == F_WRLCK)
+ lf_wakelock(overlap);
+ overlap->lf_type = lock->lf_type;
+ FREE(lock, M_LOCKF);
+ lock = overlap; /* for debug output below */
+ break;
+
+ case 2: /* overlap contains lock */
+ /*
+ * Check for common starting point and different types.
+ */
+ if (overlap->lf_type == lock->lf_type) {
+ free(lock, M_LOCKF);
+ lock = overlap; /* for debug output below */
+ break;
+ }
+ if (overlap->lf_start == lock->lf_start) {
+ *prev = lock;
+ lock->lf_next = overlap;
+ overlap->lf_start = lock->lf_end + 1;
+ } else
+ lf_split(overlap, lock);
+ lf_wakelock(overlap);
+ break;
+
+ case 3: /* lock contains overlap */
+ /*
+ * If downgrading lock, others may be able to
+ * acquire it, otherwise take the list.
+ */
+ if (lock->lf_type == F_RDLCK &&
+ overlap->lf_type == F_WRLCK) {
+ lf_wakelock(overlap);
+ } else {
+ while ((ltmp = overlap->lf_blkhd.tqh_first) != NULL) {
+ TAILQ_REMOVE(&overlap->lf_blkhd, ltmp,
+ lf_block);
+ TAILQ_INSERT_TAIL(&lock->lf_blkhd,
+ ltmp, lf_block);
+ }
+ }
+ /*
+ * Add the new lock if necessary and delete the overlap.
+ */
+ if (needtolink) {
+ *prev = lock;
+ lock->lf_next = overlap->lf_next;
+ prev = &lock->lf_next;
+ needtolink = 0;
+ } else
+ *prev = overlap->lf_next;
+ free(overlap, M_LOCKF);
+ continue;
+
+ case 4: /* overlap starts before lock */
+ /*
+ * Add lock after overlap on the list.
+ */
+ lock->lf_next = overlap->lf_next;
+ overlap->lf_next = lock;
+ overlap->lf_end = lock->lf_start - 1;
+ prev = &lock->lf_next;
+ lf_wakelock(overlap);
+ needtolink = 0;
+ continue;
+
+ case 5: /* overlap ends after lock */
+ /*
+ * Add the new lock before overlap.
+ */
+ if (needtolink) {
+ *prev = lock;
+ lock->lf_next = overlap;
+ }
+ overlap->lf_start = lock->lf_end + 1;
+ lf_wakelock(overlap);
+ break;
+ }
+ break;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1) {
+ lf_print("lf_setlock: got the lock", lock);
+ lf_printlist("lf_setlock", lock);
+ }
+#endif /* LOCKF_DEBUG */
+ return (0);
+}
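+
+/*
+ * Deadlock example (illustrative) for the detection loop above:
+ * process A holds [0, 9] and sleeps waiting for [10, 19]; process B
+ * holds [10, 19] and now requests [0, 9].  Walking B's blocker chain
+ * (B -> A's held lock -> A -> the lock blocking A -> B) reaches B
+ * itself, so lf_setlock() returns EDEADLK instead of sleeping forever.
+ */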
+
+/*
+ * Remove a byte-range lock on an inode.
+ *
+ * Generally, find the lock (or an overlap to that lock)
+ * and remove it (or shrink it), then wakeup anyone we can.
+ */
+static int
+lf_clearlock(unlock)
+ register struct lockf *unlock;
+{
+ struct lockf **head = unlock->lf_head;
+ register struct lockf *lf = *head;
+ struct lockf *overlap, **prev;
+ int ovcase;
+
+ if (lf == NOLOCKF)
+ return (0);
+#ifdef LOCKF_DEBUG
+ if (unlock->lf_type != F_UNLCK)
+ panic("lf_clearlock: bad type");
+ if (lockf_debug & 1)
+ lf_print("lf_clearlock", unlock);
+#endif /* LOCKF_DEBUG */
+ prev = head;
+ while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap))) {
+ /*
+ * Wakeup the list of locks to be retried.
+ */
+ lf_wakelock(overlap);
+
+ switch (ovcase) {
+
+ case 1: /* overlap == lock */
+ *prev = overlap->lf_next;
+ FREE(overlap, M_LOCKF);
+ break;
+
+ case 2: /* overlap contains lock: split it */
+ if (overlap->lf_start == unlock->lf_start) {
+ overlap->lf_start = unlock->lf_end + 1;
+ break;
+ }
+ lf_split(overlap, unlock);
+ overlap->lf_next = unlock->lf_next;
+ break;
+
+ case 3: /* lock contains overlap */
+ *prev = overlap->lf_next;
+ lf = overlap->lf_next;
+ free(overlap, M_LOCKF);
+ continue;
+
+ case 4: /* overlap starts before lock */
+ overlap->lf_end = unlock->lf_start - 1;
+ prev = &overlap->lf_next;
+ lf = overlap->lf_next;
+ continue;
+
+ case 5: /* overlap ends after lock */
+ overlap->lf_start = unlock->lf_end + 1;
+ break;
+ }
+ break;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_printlist("lf_clearlock", unlock);
+#endif /* LOCKF_DEBUG */
+ return (0);
+}
+
+/*
+ * Check whether there is a blocking lock,
+ * and if so return its process identifier.
+ */
+static int
+lf_getlock(lock, fl)
+ register struct lockf *lock;
+ register struct flock *fl;
+{
+ register struct lockf *block;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 1)
+ lf_print("lf_getlock", lock);
+#endif /* LOCKF_DEBUG */
+
+ if ((block = lf_getblock(lock))) {
+ fl->l_type = block->lf_type;
+ fl->l_whence = SEEK_SET;
+ fl->l_start = block->lf_start;
+ if (block->lf_end == -1)
+ fl->l_len = 0;
+ else
+ fl->l_len = block->lf_end - block->lf_start + 1;
+ if (block->lf_flags & F_POSIX)
+ fl->l_pid = ((struct proc *)(block->lf_id))->p_pid;
+ else
+ fl->l_pid = -1;
+ } else {
+ fl->l_type = F_UNLCK;
+ }
+ return (0);
+}
+
+/*
+ * Walk the list of locks for an inode and
+ * return the first blocking lock.
+ */
+static struct lockf *
+lf_getblock(lock)
+ register struct lockf *lock;
+{
+ struct lockf **prev, *overlap, *lf = *(lock->lf_head);
+ int ovcase;
+
+ prev = lock->lf_head;
+ while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap))) {
+ /*
+ * We've found an overlap, see if it blocks us
+ */
+ if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK))
+ return (overlap);
+ /*
+ * Nope, point to the next one on the list and
+ * see if it blocks us
+ */
+ lf = overlap->lf_next;
+ }
+ return (NOLOCKF);
+}
+
+/*
+ * Walk the list of locks for an inode to
+ * find an overlapping lock (if any).
+ *
+ * NOTE: this returns only the FIRST overlapping lock. There
+ * may be more than one.
+ */
+static int
+lf_findoverlap(lf, lock, type, prev, overlap)
+ register struct lockf *lf;
+ struct lockf *lock;
+ int type;
+ struct lockf ***prev;
+ struct lockf **overlap;
+{
+ off_t start, end;
+
+ *overlap = lf;
+ if (lf == NOLOCKF)
+ return (0);
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ lf_print("lf_findoverlap: looking for overlap in", lock);
+#endif /* LOCKF_DEBUG */
+ start = lock->lf_start;
+ end = lock->lf_end;
+ while (lf != NOLOCKF) {
+ if (((type & SELF) && lf->lf_id != lock->lf_id) ||
+ ((type & OTHERS) && lf->lf_id == lock->lf_id)) {
+ *prev = &lf->lf_next;
+ *overlap = lf = lf->lf_next;
+ continue;
+ }
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ lf_print("\tchecking", lf);
+#endif /* LOCKF_DEBUG */
+ /*
+ * OK, check for overlap
+ *
+ * Six cases:
+ * 0) no overlap
+ * 1) overlap == lock
+ * 2) overlap contains lock
+ * 3) lock contains overlap
+ * 4) overlap starts before lock
+ * 5) overlap ends after lock
+ */
+ if ((lf->lf_end != -1 && start > lf->lf_end) ||
+ (end != -1 && lf->lf_start > end)) {
+ /* Case 0 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("no overlap\n");
+#endif /* LOCKF_DEBUG */
+ if ((type & SELF) && end != -1 && lf->lf_start > end)
+ return (0);
+ *prev = &lf->lf_next;
+ *overlap = lf = lf->lf_next;
+ continue;
+ }
+ if ((lf->lf_start == start) && (lf->lf_end == end)) {
+ /* Case 1 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap == lock\n");
+#endif /* LOCKF_DEBUG */
+ return (1);
+ }
+ if ((lf->lf_start <= start) &&
+ (end != -1) &&
+ ((lf->lf_end >= end) || (lf->lf_end == -1))) {
+ /* Case 2 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap contains lock\n");
+#endif /* LOCKF_DEBUG */
+ return (2);
+ }
+ if (start <= lf->lf_start &&
+ (end == -1 ||
+ (lf->lf_end != -1 && end >= lf->lf_end))) {
+ /* Case 3 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("lock contains overlap\n");
+#endif /* LOCKF_DEBUG */
+ return (3);
+ }
+ if ((lf->lf_start < start) &&
+ ((lf->lf_end >= start) || (lf->lf_end == -1))) {
+ /* Case 4 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap starts before lock\n");
+#endif /* LOCKF_DEBUG */
+ return (4);
+ }
+ if ((lf->lf_start > start) &&
+ (end != -1) &&
+ ((lf->lf_end > end) || (lf->lf_end == -1))) {
+ /* Case 5 */
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ printf("overlap ends after lock\n");
+#endif /* LOCKF_DEBUG */
+ return (5);
+ }
+ panic("lf_findoverlap: default");
+ }
+ return (0);
+}
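+
+/*
+ * Worked examples (illustrative) for the six cases above, with the
+ * existing lock lf = [10, 20]:
+ *	lock = [30, 40]	-> 0 (no overlap)
+ *	lock = [10, 20]	-> 1 (overlap == lock)
+ *	lock = [12, 18]	-> 2 (overlap contains lock)
+ *	lock = [ 5, 25]	-> 3 (lock contains overlap)
+ *	lock = [15, 25]	-> 4 (overlap starts before lock)
+ *	lock = [ 5, 15]	-> 5 (overlap ends after lock)
+ */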
+
+/*
+ * Split a lock and a contained region into
+ * two or three locks as necessary.
+ */
+static void
+lf_split(lock1, lock2)
+ register struct lockf *lock1;
+ register struct lockf *lock2;
+{
+ register struct lockf *splitlock;
+
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2) {
+ lf_print("lf_split", lock1);
+ lf_print("splitting from", lock2);
+ }
+#endif /* LOCKF_DEBUG */
+ /*
+ * Check to see if splitting into only two pieces.
+ */
+ if (lock1->lf_start == lock2->lf_start) {
+ lock1->lf_start = lock2->lf_end + 1;
+ lock2->lf_next = lock1;
+ return;
+ }
+ if (lock1->lf_end == lock2->lf_end) {
+ lock1->lf_end = lock2->lf_start - 1;
+ lock2->lf_next = lock1->lf_next;
+ lock1->lf_next = lock2;
+ return;
+ }
+ /*
+ * Make a new lock consisting of the last part of
+ * the encompassing lock
+ */
+ MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK);
+ bcopy((caddr_t)lock1, (caddr_t)splitlock, sizeof *splitlock);
+ splitlock->lf_start = lock2->lf_end + 1;
+ TAILQ_INIT(&splitlock->lf_blkhd);
+ lock1->lf_end = lock2->lf_start - 1;
+ /*
+ * OK, now link it in
+ */
+ splitlock->lf_next = lock1->lf_next;
+ lock2->lf_next = splitlock;
+ lock1->lf_next = lock2;
+}
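+
+/*
+ * Worked example (illustrative): splitting lock1 = [0, 99] around a
+ * contained lock2 = [40, 59] leaves three pieces on the list:
+ * lock1 = [0, 39], lock2 = [40, 59] and the new splitlock = [60, 99].
+ */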
+
+/*
+ * Wakeup a blocklist
+ */
+static void
+lf_wakelock(listhead)
+ struct lockf *listhead;
+{
+ register struct lockf *wakelock;
+
+ while ((wakelock = listhead->lf_blkhd.tqh_first) != NULL) {
+ TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block);
+ wakelock->lf_next = NOLOCKF;
+#ifdef LOCKF_DEBUG
+ if (lockf_debug & 2)
+ lf_print("lf_wakelock: awakening", wakelock);
+#endif /* LOCKF_DEBUG */
+ wakeup((caddr_t)wakelock);
+ }
+}
+
+#ifdef LOCKF_DEBUG
+/*
+ * Print out a lock.
+ */
+void
+lf_print(tag, lock)
+ char *tag;
+ register struct lockf *lock;
+{
+
+ printf("%s: lock 0x%lx for ", tag, lock);
+ if (lock->lf_flags & F_POSIX)
+ printf("proc %d", ((struct proc *)(lock->lf_id))->p_pid);
+ else
+ printf("id 0x%x", lock->lf_id);
+ printf(" in ino %d on dev <%d, %d>, %s, start %d, end %d",
+ lock->lf_inode->i_number,
+ major(lock->lf_inode->i_dev),
+ minor(lock->lf_inode->i_dev),
+ lock->lf_type == F_RDLCK ? "shared" :
+ lock->lf_type == F_WRLCK ? "exclusive" :
+ lock->lf_type == F_UNLCK ? "unlock" :
+ "unknown", lock->lf_start, lock->lf_end);
+ if (lock->lf_blkhd.tqh_first)
+ printf(" block 0x%x\n", lock->lf_blkhd.tqh_first);
+ else
+ printf("\n");
+}
+
+void
+lf_printlist(tag, lock)
+ char *tag;
+ struct lockf *lock;
+{
+ register struct lockf *lf, *blk;
+
+ printf("%s: Lock list for ino %d on dev <%d, %d>:\n",
+ tag, lock->lf_inode->i_number,
+ major(lock->lf_inode->i_dev),
+ minor(lock->lf_inode->i_dev));
+ for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) {
+ printf("\tlock 0x%lx for ", lf);
+ if (lf->lf_flags & F_POSIX)
+ printf("proc %d", ((struct proc *)(lf->lf_id))->p_pid);
+ else
+ printf("id 0x%x", lf->lf_id);
+ printf(", %s, start %d, end %d",
+ lf->lf_type == F_RDLCK ? "shared" :
+ lf->lf_type == F_WRLCK ? "exclusive" :
+ lf->lf_type == F_UNLCK ? "unlock" :
+ "unknown", lf->lf_start, lf->lf_end);
+ for (blk = lf->lf_blkhd.tqh_first; blk;
+ blk = blk->lf_block.tqe_next) {
+ printf("\n\t\tlock request 0x%lx for ", blk);
+ if (blk->lf_flags & F_POSIX)
+ printf("proc %d",
+ ((struct proc *)(blk->lf_id))->p_pid);
+ else
+ printf("id 0x%x", blk->lf_id);
+ printf(", %s, start %d, end %d",
+ blk->lf_type == F_RDLCK ? "shared" :
+ blk->lf_type == F_WRLCK ? "exclusive" :
+ blk->lf_type == F_UNLCK ? "unlock" :
+ "unknown", blk->lf_start, blk->lf_end);
+ if (blk->lf_blkhd.tqh_first)
+ panic("lf_printlist: bad list");
+ }
+ printf("\n");
+ }
+}
+#endif /* LOCKF_DEBUG */
diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c
index 363cde5..94c6b4e 100644
--- a/sys/kern/kern_malloc.c
+++ b/sys/kern/kern_malloc.c
@@ -30,19 +30,27 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)kern_malloc.c 8.4 (Berkeley) 5/20/95
+ * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94
+ * $Id$
*/
#include <sys/param.h>
+#include <sys/systm.h>
#include <sys/proc.h>
-#include <sys/map.h>
#include <sys/kernel.h>
#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/vmmeter.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
-struct kmembuckets bucket[MINBUCKET + 16];
+static void kmeminit __P((void *));
+SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL)
+
+static struct kmembuckets bucket[MINBUCKET + 16];
struct kmemstats kmemstats[M_LAST];
struct kmemusage *kmemusage;
char *kmembase, *kmemlimit;
@@ -52,7 +60,7 @@ char *memname[] = INITKMEMNAMES;
/*
* This structure provides a set of masks to catch unaligned frees.
*/
-long addrmask[] = { 0,
+static long addrmask[] = { 0,
0x00000001, 0x00000003, 0x00000007, 0x0000000f,
0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff,
0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
@@ -63,8 +71,8 @@ long addrmask[] = { 0,
* The WEIRD_ADDR is used as known text to copy into free objects so
* that modifications after frees can be detected.
*/
-#define WEIRD_ADDR 0xdeadbeef
-#define MAX_COPY 32
+#define WEIRD_ADDR 0xdeadc0de
+#define MAX_COPY 64
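+
+/*
+ * Illustrative note (assumption about usage elsewhere in this file):
+ * freed objects are filled with WEIRD_ADDR, and malloc() later checks
+ * that every word still holds it, so a stray write to freed memory
+ * shows up as the "Data modified on freelist" diagnostics below.
+ */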
/*
* Normally the first word of the structure is used to hold the list
@@ -103,9 +111,6 @@ malloc(size, type, flags)
int copysize;
char *savedtype;
#endif
-#ifdef DEBUG
- extern int simplelockrecurse;
-#endif
#ifdef KMEMSTATS
register struct kmemstats *ksp = &kmemstats[type];
@@ -114,7 +119,7 @@ malloc(size, type, flags)
#endif
indx = BUCKETINDX(size);
kbp = &bucket[indx];
- s = splimp();
+ s = splhigh();
#ifdef KMEMSTATS
while (ksp->ks_memuse >= ksp->ks_limit) {
if (flags & M_NOWAIT) {
@@ -130,25 +135,16 @@ malloc(size, type, flags)
#ifdef DIAGNOSTIC
copysize = 1 << indx < MAX_COPY ? 1 << indx : MAX_COPY;
#endif
-#ifdef DEBUG
- if (flags & M_NOWAIT)
- simplelockrecurse++;
-#endif
if (kbp->kb_next == NULL) {
kbp->kb_last = NULL;
if (size > MAXALLOCSAVE)
- allocsize = roundup(size, CLBYTES);
+ allocsize = roundup(size, PAGE_SIZE);
else
allocsize = 1 << indx;
- npg = clrnd(btoc(allocsize));
- va = (caddr_t) kmem_malloc(kmem_map, (vm_size_t)ctob(npg),
- !(flags & M_NOWAIT));
+ npg = btoc(allocsize);
+ va = (caddr_t) kmem_malloc(kmem_map, (vm_size_t)ctob(npg), flags);
if (va == NULL) {
splx(s);
-#ifdef DEBUG
- if (flags & M_NOWAIT)
- simplelockrecurse--;
-#endif
return ((void *) NULL);
}
#ifdef KMEMSTATS
@@ -175,7 +171,7 @@ malloc(size, type, flags)
* bucket, don't assume the list is still empty.
*/
savedlist = kbp->kb_next;
- kbp->kb_next = cp = va + (npg * NBPG) - allocsize;
+ kbp->kb_next = cp = va + (npg * PAGE_SIZE) - allocsize;
for (;;) {
freep = (struct freelist *)cp;
#ifdef DIAGNOSTIC
@@ -205,7 +201,7 @@ malloc(size, type, flags)
memname[freep->type] : "???";
if (kbp->kb_next &&
!kernacc(kbp->kb_next, sizeof(struct freelist), 0)) {
- printf("%s of object 0x%x size %d %s %s (invalid addr 0x%x)\n",
+ printf("%s of object %p size %ld %s %s (invalid addr %p)\n",
"Data modified on freelist: word 2.5", va, size,
"previous type", savedtype, kbp->kb_next);
kbp->kb_next = NULL;
@@ -224,7 +220,7 @@ malloc(size, type, flags)
for (lp = (long *)va; lp < end; lp++) {
if (*lp == WEIRD_ADDR)
continue;
- printf("%s %d of object 0x%x size %d %s %s (0x%x != 0x%x)\n",
+ printf("%s %d of object %p size %ld %s %s (0x%lx != 0x%x)\n",
"Data modified on freelist: word", lp - (long *)va,
va, size, "previous type", savedtype, *lp, WEIRD_ADDR);
break;
@@ -250,10 +246,6 @@ out:
out:
#endif
splx(s);
-#ifdef DEBUG
- if (flags & M_NOWAIT)
- simplelockrecurse--;
-#endif
return ((void *) va);
}
@@ -271,34 +263,42 @@ free(addr, type)
long size;
int s;
#ifdef DIAGNOSTIC
- caddr_t cp;
+ struct freelist *fp;
long *end, *lp, alloc, copysize;
#endif
#ifdef KMEMSTATS
register struct kmemstats *ksp = &kmemstats[type];
#endif
+#ifdef DIAGNOSTIC
+ if ((char *)addr < kmembase || (char *)addr >= kmemlimit) {
+ panic("free: address 0x%x out of range", addr);
+ }
+ if ((u_long)type > M_LAST) {
+ panic("free: type %d out of range", type);
+ }
+#endif
kup = btokup(addr);
size = 1 << kup->ku_indx;
kbp = &bucket[kup->ku_indx];
- s = splimp();
+ s = splhigh();
#ifdef DIAGNOSTIC
/*
* Check for returns of data that do not point to the
* beginning of the allocation.
*/
- if (size > NBPG * CLSIZE)
- alloc = addrmask[BUCKETINDX(NBPG * CLSIZE)];
+ if (size > PAGE_SIZE)
+ alloc = addrmask[BUCKETINDX(PAGE_SIZE)];
else
alloc = addrmask[kup->ku_indx];
if (((u_long)addr & alloc) != 0)
- panic("free: unaligned addr 0x%x, size %d, type %s, mask %d\n",
+ panic("free: unaligned addr 0x%x, size %d, type %s, mask %d",
addr, size, memname[type], alloc);
#endif /* DIAGNOSTIC */
if (size > MAXALLOCSAVE) {
kmem_free(kmem_map, (vm_offset_t)addr, ctob(kup->ku_pagecnt));
#ifdef KMEMSTATS
- size = kup->ku_pagecnt << PGSHIFT;
+ size = kup->ku_pagecnt << PAGE_SHIFT;
ksp->ks_memuse -= size;
kup->ku_indx = 0;
kup->ku_pagecnt = 0;
@@ -318,11 +318,16 @@ free(addr, type)
* it looks free before laboriously searching the freelist.
*/
if (freep->spare0 == WEIRD_ADDR) {
- for (cp = kbp->kb_next; cp; cp = *(caddr_t *)cp) {
- if (addr != cp)
- continue;
- printf("multiply freed item 0x%x\n", addr);
- panic("free: duplicated free");
+ fp = (struct freelist *)kbp->kb_next;
+ while (fp) {
+ if (fp->spare0 != WEIRD_ADDR) {
+ printf("trashed free item %p\n", fp);
+ panic("free: free item modified");
+ } else if (addr == (caddr_t)fp) {
+ printf("multiple freed item %p\n", addr);
+ panic("free: multiple free");
+ }
+ fp = (struct freelist *)fp->next;
}
}
/*
@@ -351,46 +356,75 @@ free(addr, type)
wakeup((caddr_t)ksp);
ksp->ks_inuse--;
#endif
+#ifdef OLD_MALLOC_MEMORY_POLICY
if (kbp->kb_next == NULL)
kbp->kb_next = addr;
else
((struct freelist *)kbp->kb_last)->next = addr;
freep->next = NULL;
kbp->kb_last = addr;
+#else
+ /*
+ * Return memory to the head of the queue for quick reuse. This
+ * can improve performance by improving the probability of the
+ * item being in the cache when it is reused.
+ */
+ if (kbp->kb_next == NULL) {
+ kbp->kb_next = addr;
+ kbp->kb_last = addr;
+ freep->next = NULL;
+ } else {
+ freep->next = kbp->kb_next;
+ kbp->kb_next = addr;
+ }
+#endif
splx(s);
}
/*
* Initialize the kernel memory allocator
*/
-kmeminit()
+/* ARGSUSED*/
+static void
+kmeminit(dummy)
+ void *dummy;
{
register long indx;
int npg;
#if ((MAXALLOCSAVE & (MAXALLOCSAVE - 1)) != 0)
- ERROR!_kmeminit:_MAXALLOCSAVE_not_power_of_2
+#error "kmeminit: MAXALLOCSAVE not power of 2"
#endif
#if (MAXALLOCSAVE > MINALLOCSIZE * 32768)
- ERROR!_kmeminit:_MAXALLOCSAVE_too_big
+#error "kmeminit: MAXALLOCSAVE too big"
#endif
-#if (MAXALLOCSAVE < CLBYTES)
- ERROR!_kmeminit:_MAXALLOCSAVE_too_small
+#if (MAXALLOCSAVE < PAGE_SIZE)
+#error "kmeminit: MAXALLOCSAVE too small"
#endif
- npg = VM_KMEM_SIZE/ NBPG;
+ npg = (nmbufs * MSIZE + nmbclusters * MCLBYTES + VM_KMEM_SIZE)
+ / PAGE_SIZE;
+
kmemusage = (struct kmemusage *) kmem_alloc(kernel_map,
(vm_size_t)(npg * sizeof(struct kmemusage)));
kmem_map = kmem_suballoc(kernel_map, (vm_offset_t *)&kmembase,
- (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * NBPG), FALSE);
+ (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE),
+ FALSE);
#ifdef KMEMSTATS
for (indx = 0; indx < MINBUCKET + 16; indx++) {
- if (1 << indx >= CLBYTES)
+ if (1 << indx >= PAGE_SIZE)
bucket[indx].kb_elmpercl = 1;
else
- bucket[indx].kb_elmpercl = CLBYTES / (1 << indx);
+ bucket[indx].kb_elmpercl = PAGE_SIZE / (1 << indx);
bucket[indx].kb_highwat = 5 * bucket[indx].kb_elmpercl;
}
- for (indx = 0; indx < M_LAST; indx++)
- kmemstats[indx].ks_limit = npg * NBPG * 6 / 10;
+ /*
+ * Limit maximum memory for each type to 60% of malloc area size or
+ * 60% of physical memory, whichever is smaller.
+ */
+ for (indx = 0; indx < M_LAST; indx++) {
+ kmemstats[indx].ks_limit = min(cnt.v_page_count * PAGE_SIZE,
+ (npg * PAGE_SIZE - nmbclusters * MCLBYTES
+ - nmbufs * MSIZE)) * 6 / 10;
+ }
#endif
}
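
Two behavioural changes above are easy to miss: free() now pushes chunks back on the head of the bucket list so recently used memory is reused while still warm in the cache, and the DIAGNOSTIC path scans typed struct freelist entries for the WEIRD_ADDR poison to catch double frees. A reduced, self-contained sketch of that poison-and-check idea (illustrative only; these are not the kernel's actual structures):

    #include <stdio.h>
    #include <stdlib.h>

    #define WEIRD_ADDR  0xdeadc0deUL    /* poison word, as in the diff */

    struct freelist {
        unsigned long spare0;           /* holds poison while on the list */
        struct freelist *next;
    };

    static struct freelist *freehead;

    static void
    demo_free(void *p)
    {
        struct freelist *fp = p, *scan;

        if (fp->spare0 == WEIRD_ADDR)   /* looks free already: scan list */
            for (scan = freehead; scan; scan = scan->next)
                if (scan == fp)
                    abort();            /* "free: multiple free" */
        fp->spare0 = WEIRD_ADDR;
        fp->next = freehead;            /* LIFO: head insert for quick reuse */
        freehead = fp;
    }

    int
    main(void)
    {
        void *p = calloc(1, 64);

        demo_free(p);
        demo_free(p);                   /* second free is caught and aborts */
        return (0);
    }
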
diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c
new file mode 100644
index 0000000..8105aa4
--- /dev/null
+++ b/sys/kern/kern_mib.c
@@ -0,0 +1,167 @@
+/*-
+ * Copyright (c) 1982, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * Mike Karels at Berkeley Software Design, Inc.
+ *
+ * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
+ * project, to make these variables more user-friendly.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
+ * $Id: kern_mib.c,v 1.7 1997/03/03 12:58:19 bde Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/unistd.h>
+
+SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0,
+ "Sysctl internal magic");
+SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0,
+ "High kernel, proc, limits &c");
+SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0,
+ "Virtual memory");
+SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0,
+ "File system");
+SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0,
+ "Network, (see socket.h)");
+SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0,
+ "Debugging");
+SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0,
+ "hardware");
+SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0,
+ "machine dependent");
+SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0,
+ "user-level");
+
+SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD, osrelease, 0, "");
+
+SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD, 0, BSD, "");
+
+SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD, version, 0, "");
+
+SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD, ostype, 0, "");
+
+extern int osreldate;
+SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD, &osreldate, 0, "");
+
+SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RW, &maxproc, 0, "");
+
+SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid,
+ CTLFLAG_RW, &maxprocperuid, 0, "");
+
+SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD, 0, ARG_MAX, "");
+
+SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD, 0, _POSIX_VERSION, "");
+
+SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD, 0, NGROUPS_MAX, "");
+
+SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD, 0, 1, "");
+
+#ifdef _POSIX_SAVED_IDS
+SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 1, "");
+#else
+SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 0, "");
+#endif
+
+char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */
+
+SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile,
+ CTLFLAG_RW, kernelname, sizeof kernelname, "");
+
+SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, 0, 1, "");
+
+SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD, 0, BYTE_ORDER, "");
+
+SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD, 0, PAGE_SIZE, "");
+
+char hostname[MAXHOSTNAMELEN];
+
+SYSCTL_STRING(_kern, KERN_HOSTNAME, hostname, CTLFLAG_RW,
+ hostname, sizeof(hostname), "");
+
+int securelevel = -1;
+
+static int
+sysctl_kern_securelvl SYSCTL_HANDLER_ARGS
+{
+ int error, level;
+
+ level = securelevel;
+ error = sysctl_handle_int(oidp, &level, 0, req);
+ if (error || !req->newptr)
+ return (error);
+ if (level < securelevel && req->p->p_pid != 1)
+ return (EPERM);
+ securelevel = level;
+ return (error);
+}
+
+SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, CTLTYPE_INT|CTLFLAG_RW,
+ 0, 0, sysctl_kern_securelvl, "I", "");
+
+char domainname[MAXHOSTNAMELEN];
+SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW,
+ &domainname, sizeof(domainname), "");
+
+long hostid;
+/* Some trouble here, if sizeof (int) != sizeof (long) */
+SYSCTL_INT(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, "");
+
+/*
+ * This is really cheating. These actually live in the libc, something
+ * which I'm not quite sure is a good idea anyway, but in order for
+ * getnext and friends to actually work, we define dummies here.
+ */
+SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, "", 0, "");
+SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, 0, 0, "");
+SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, 0, 0, "");
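
Each variable now sits next to its SYSCTL_* registration instead of being decoded in one central switch. Userland reaches the same MIB entries through sysctl(3); a small sketch reading two of the knobs declared above:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
        int mib[2] = { CTL_KERN, KERN_OSRELEASE };
        char buf[64];
        size_t len = sizeof(buf);
        int maxproc;

        if (sysctl(mib, 2, buf, &len, NULL, 0) == 0)
            printf("kern.osrelease: %s\n", buf);
        mib[1] = KERN_MAXPROC;
        len = sizeof(maxproc);
        if (sysctl(mib, 2, &maxproc, &len, NULL, 0) == 0)
            printf("kern.maxproc: %d\n", maxproc);
        return (0);
    }
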
diff --git a/sys/kern/kern_ntptime.c b/sys/kern/kern_ntptime.c
new file mode 100644
index 0000000..88ba077b
--- /dev/null
+++ b/sys/kern/kern_ntptime.c
@@ -0,0 +1,269 @@
+/******************************************************************************
+ * *
+ * Copyright (c) David L. Mills 1993, 1994 *
+ * *
+ * Permission to use, copy, modify, and distribute this software and its *
+ * documentation for any purpose and without fee is hereby granted, provided *
+ * that the above copyright notice appears in all copies and that both the *
+ * copyright notice and this permission notice appear in supporting *
+ * documentation, and that the name University of Delaware not be used in *
+ * advertising or publicity pertaining to distribution of the software *
+ * without specific, written prior permission. The University of Delaware *
+ * makes no representations about the suitability of this software for any *
+ * purpose. It is provided "as is" without express or implied warranty. *
+ * *
+ ******************************************************************************/
+
+/*
+ * Modification history kern_ntptime.c
+ *
+ * 24 Sep 94 David L. Mills
+ * Tightened code at exits.
+ *
+ * 24 Mar 94 David L. Mills
+ * Revised syscall interface to include new variables for PPS
+ * time discipline.
+ *
+ * 14 Feb 94 David L. Mills
+ * Added code for external clock
+ *
+ * 28 Nov 93 David L. Mills
+ * Revised frequency scaling to conform with adjusted parameters
+ *
+ * 17 Sep 93 David L. Mills
+ * Created file
+ */
+/*
+ * ntp_gettime(), ntp_adjtime() - precision time interface for SunOS
+ * V4.1.1 and V4.1.3
+ *
+ * These routines constitute the Network Time Protocol (NTP) interfaces
+ * for user and daemon application programs. The ntp_gettime() routine
+ * provides the time, maximum error (synch distance) and estimated error
+ * (dispersion) to client user application programs. The ntp_adjtime()
+ * routine is used by the NTP daemon to adjust the system clock to an
+ * externally derived time. The time offset and related variables set by
+ * this routine are used by hardclock() to adjust the phase and
+ * frequency of the phase-lock loop which controls the system clock.
+ */
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/timex.h>
+#include <sys/sysctl.h>
+
+/*
+ * The following variables are used by the hardclock() routine in the
+ * kern_clock.c module and are described in that module.
+ */
+extern int time_state; /* clock state */
+extern int time_status; /* clock status bits */
+extern long time_offset; /* time adjustment (us) */
+extern long time_freq; /* frequency offset (scaled ppm) */
+extern long time_maxerror; /* maximum error (us) */
+extern long time_esterror; /* estimated error (us) */
+extern long time_constant; /* pll time constant */
+extern long time_precision; /* clock precision (us) */
+extern long time_tolerance; /* frequency tolerance (scaled ppm) */
+
+#ifdef PPS_SYNC
+/*
+ * The following variables are used only if the PPS signal discipline
+ * is configured in the kernel.
+ */
+extern int pps_shift; /* interval duration (s) (shift) */
+extern long pps_freq; /* pps frequency offset (scaled ppm) */
+extern long pps_jitter; /* pps jitter (us) */
+extern long pps_stabil; /* pps stability (scaled ppm) */
+extern long pps_jitcnt; /* jitter limit exceeded */
+extern long pps_calcnt; /* calibration intervals */
+extern long pps_errcnt; /* calibration errors */
+extern long pps_stbcnt; /* stability limit exceeded */
+#endif /* PPS_SYNC */
+
+static int
+ntp_sysctl SYSCTL_HANDLER_ARGS
+{
+ struct timeval atv;
+ struct ntptimeval ntv;
+ int s;
+
+ s = splclock();
+#ifdef EXT_CLOCK
+ /*
+ * The microtime() external clock routine returns a
+ * status code. If less than zero, we declare an error
+ * in the clock status word and return the kernel
+ * (software) time variable. While there are other
+ * places that call microtime(), this is the only place
+ * that matters from an application point of view.
+ */
+ if (microtime(&atv) < 0) {
+ time_status |= STA_CLOCKERR;
+ ntv.time = time;
+ } else {
+ time_status &= ~STA_CLOCKERR;
+ }
+#else /* EXT_CLOCK */
+ microtime(&atv);
+#endif /* EXT_CLOCK */
+ ntv.time = atv;
+ ntv.maxerror = time_maxerror;
+ ntv.esterror = time_esterror;
+ splx(s);
+
+ ntv.time_state = time_state;
+
+ /*
+ * Status word error decode. If any of these conditions
+ * occur, an error is returned, instead of the status
+ * word. Most applications will care only about the fact
+ * the system clock may not be trusted, not about the
+ * details.
+ *
+ * Hardware or software error
+ */
+ if (time_status & (STA_UNSYNC | STA_CLOCKERR)) {
+ ntv.time_state = TIME_ERROR;
+ }
+
+ /*
+ * PPS signal lost when either time or frequency
+ * synchronization requested
+ */
+ if (time_status & (STA_PPSFREQ | STA_PPSTIME) &&
+ !(time_status & STA_PPSSIGNAL)) {
+ ntv.time_state = TIME_ERROR;
+ }
+
+ /*
+ * PPS jitter exceeded when time synchronization
+ * requested
+ */
+ if (time_status & STA_PPSTIME &&
+ time_status & STA_PPSJITTER) {
+ ntv.time_state = TIME_ERROR;
+ }
+
+ /*
+ * PPS wander exceeded or calibration error when
+ * frequency synchronization requested
+ */
+ if (time_status & STA_PPSFREQ &&
+ time_status & (STA_PPSWANDER | STA_PPSERROR)) {
+ ntv.time_state = TIME_ERROR;
+ }
+ return (sysctl_handle_opaque(oidp, &ntv, sizeof ntv, req));
+}
+
+SYSCTL_NODE(_kern, KERN_NTP_PLL, ntp_pll, CTLFLAG_RW, 0,
+ "NTP kernel PLL related stuff");
+SYSCTL_PROC(_kern_ntp_pll, NTP_PLL_GETTIME, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, sizeof(struct ntptimeval) , ntp_sysctl, "S,ntptimeval", "");
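
The gettime side is thus exported as an opaque sysctl rather than a dedicated syscall, so a library ntp_gettime() can be built on top of it. A sketch of such a caller, assuming the KERN_NTP_PLL and NTP_PLL_GETTIME constants from this era's <sys/sysctl.h>:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <sys/timex.h>
    #include <stdio.h>

    int
    main(void)
    {
        int mib[3] = { CTL_KERN, KERN_NTP_PLL, NTP_PLL_GETTIME };
        struct ntptimeval ntv;
        size_t len = sizeof(ntv);

        if (sysctl(mib, 3, &ntv, &len, NULL, 0) == -1) {
            perror("sysctl");
            return (1);
        }
        printf("time %ld.%06ld maxerror %ld esterror %ld\n",
            (long)ntv.time.tv_sec, (long)ntv.time.tv_usec,
            ntv.maxerror, ntv.esterror);
        return (0);
    }
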
+
+/*
+ * ntp_adjtime() - NTP daemon application interface
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ntp_adjtime_args {
+ struct timex *tp;
+};
+#endif
+
+int
+ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, int *retval)
+{
+ struct timex ntv;
+ int modes;
+ int s;
+ int error;
+
+ error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv));
+ if (error)
+ return error;
+
+ /*
+ * Update selected clock variables - only the superuser can
+ * change anything. Note that there is no error checking here on
+ * the assumption the superuser should know what it is doing.
+ */
+ modes = ntv.modes;
+ if ((modes != 0)
+ && (error = suser(p->p_cred->pc_ucred, &p->p_acflag)))
+ return error;
+
+ s = splclock();
+ if (modes & MOD_FREQUENCY)
+#ifdef PPS_SYNC
+ time_freq = ntv.freq - pps_freq;
+#else /* PPS_SYNC */
+ time_freq = ntv.freq;
+#endif /* PPS_SYNC */
+ if (modes & MOD_MAXERROR)
+ time_maxerror = ntv.maxerror;
+ if (modes & MOD_ESTERROR)
+ time_esterror = ntv.esterror;
+ if (modes & MOD_STATUS) {
+ time_status &= STA_RONLY;
+ time_status |= ntv.status & ~STA_RONLY;
+ }
+ if (modes & MOD_TIMECONST)
+ time_constant = ntv.constant;
+ if (modes & MOD_OFFSET)
+ hardupdate(ntv.offset);
+
+ /*
+ * Retrieve all clock variables
+ */
+ if (time_offset < 0)
+ ntv.offset = -(-time_offset >> SHIFT_UPDATE);
+ else
+ ntv.offset = time_offset >> SHIFT_UPDATE;
+#ifdef PPS_SYNC
+ ntv.freq = time_freq + pps_freq;
+#else /* PPS_SYNC */
+ ntv.freq = time_freq;
+#endif /* PPS_SYNC */
+ ntv.maxerror = time_maxerror;
+ ntv.esterror = time_esterror;
+ ntv.status = time_status;
+ ntv.constant = time_constant;
+ ntv.precision = time_precision;
+ ntv.tolerance = time_tolerance;
+#ifdef PPS_SYNC
+ ntv.shift = pps_shift;
+ ntv.ppsfreq = pps_freq;
+ ntv.jitter = pps_jitter >> PPS_AVG;
+ ntv.stabil = pps_stabil;
+ ntv.calcnt = pps_calcnt;
+ ntv.errcnt = pps_errcnt;
+ ntv.jitcnt = pps_jitcnt;
+ ntv.stbcnt = pps_stbcnt;
+#endif /* PPS_SYNC */
+ (void)splx(s);
+
+ error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv));
+ if (!error) {
+ /*
+ * Status word error decode. See comments in
+ * ntp_gettime() routine.
+ */
+ retval[0] = time_state;
+ if (time_status & (STA_UNSYNC | STA_CLOCKERR))
+ retval[0] = TIME_ERROR;
+ if (time_status & (STA_PPSFREQ | STA_PPSTIME) &&
+ !(time_status & STA_PPSSIGNAL))
+ retval[0] = TIME_ERROR;
+ if (time_status & STA_PPSTIME &&
+ time_status & STA_PPSJITTER)
+ retval[0] = TIME_ERROR;
+ if (time_status & STA_PPSFREQ &&
+ time_status & (STA_PPSWANDER | STA_PPSERROR))
+ retval[0] = TIME_ERROR;
+ }
+ return error;
+}
+
+
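
ntp_adjtime() itself stays a syscall: it copies in a struct timex, applies only the fields selected by modes (superuser-only when modes is nonzero), then returns the full clock state, with retval carrying TIME_ERROR whenever the status decode trips. A minimal read-only caller, assuming the standard <sys/timex.h> interface:

    #include <sys/timex.h>
    #include <stdio.h>

    int
    main(void)
    {
        struct timex tx;
        int state;

        tx.modes = 0;                   /* no MOD_* bits: query only */
        state = ntp_adjtime(&tx);
        if (state < 0) {
            perror("ntp_adjtime");
            return (1);
        }
        printf("state %d offset %ld maxerror %ld esterror %ld\n",
            state, tx.offset, tx.maxerror, tx.esterror);
        return (state == TIME_ERROR);
    }
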
diff --git a/sys/kern/kern_opt.c b/sys/kern/kern_opt.c
new file mode 100644
index 0000000..08b04b2
--- /dev/null
+++ b/sys/kern/kern_opt.c
@@ -0,0 +1,49 @@
+/*-
+ * Copyright (c) 1997 Bruce D. Evans
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id$
+ */
+
+#include "opt_defunct.h"
+
+#ifdef ARP_PROXYALL
+#warning "obsolete option ARP_PROXYALL - use `sysctl -w net.link.ether.inet.proxyall=1'"
+#endif
+
+#ifdef CHILD_MAX
+#warning "obsolete option CHILD_MAX - use /etc/login.conf"
+#endif
+
+#ifdef EXTRAVNODES
+#warning "obsolete option EXTRAVNODES - use `sysctl -w kern.maxvnodes=value'"
+#endif
+
+#ifdef GATEWAY
+#warning "obsolete option GATEWAY - use `sysctl -w net.inet.ip.forwarding=1'"
+#endif
+
+#ifdef OPEN_MAX
+#warning "obsolete option OPEN_MAX - use /etc/login.conf"
+#endif
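
Every option listed here became a runtime knob, and the #warning text doubles as migration advice. The same setting can be changed programmatically with sysctlbyname(3); for example, the GATEWAY replacement (root required):

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
        int on = 1;

        /* Equivalent of `sysctl -w net.inet.ip.forwarding=1'. */
        if (sysctlbyname("net.inet.ip.forwarding", NULL, NULL,
            &on, sizeof(on)) == -1) {
            perror("sysctlbyname");
            return (1);
        }
        return (0);
    }
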
diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c
index 1eaae35..42d1d21 100644
--- a/sys/kern/kern_physio.c
+++ b/sys/kern/kern_physio.c
@@ -1,41 +1,22 @@
-/*-
- * Copyright (c) 1982, 1986, 1990, 1993
- * The Regents of the University of California. All rights reserved.
- * (c) UNIX System Laboratories, Inc.
- * All or some portions of this file are derived from material licensed
- * to the University of California by American Telephone and Telegraph
- * Co. or Unix System Laboratories, Inc. and are reproduced herein with
- * the permission of UNIX System Laboratories, Inc.
+/*
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
+ * notice immediately at the beginning of the file, without modification,
+ * this list of conditions, and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * 3. Absolutely no warranty of function or purpose is made by the author
+ * John S. Dyson.
+ * 4. Modifications may be freely made to this file if the above conditions
+ * are met.
*
- * from: @(#)kern_physio.c 8.1 (Berkeley) 6/10/93
+ * $Id$
*/
#include <sys/param.h>
@@ -43,51 +24,176 @@
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/proc.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_extern.h>
+
+static void physwakeup __P((struct buf *bp));
-physio(a1, a2, a3, a4, a5, a6)
- int (*a1)();
- struct buf *a2;
- dev_t a3;
- int a4;
- u_int (*a5)();
- struct uio *a6;
+int
+physio(strategy, bp, dev, rw, minp, uio)
+ d_strategy_t *strategy;
+ struct buf *bp;
+ dev_t dev;
+ int rw;
+ u_int (*minp) __P((struct buf *bp));
+ struct uio *uio;
{
+ int i;
+ int bufflags = rw?B_READ:0;
+ int error;
+ int spl;
+ caddr_t sa;
+ int bp_alloc = (bp == 0);
+ struct buf *bpa;
+
+/*
+ * keep the process from being swapped
+ */
+ curproc->p_flag |= P_PHYSIO;
+
+ /* create and build a buffer header for a transfer */
+ bpa = (struct buf *)getpbuf();
+ if (!bp_alloc) {
+ spl = splbio();
+ while (bp->b_flags & B_BUSY) {
+ bp->b_flags |= B_WANTED;
+ tsleep((caddr_t)bp, PRIBIO, "physbw", 0);
+ }
+ bp->b_flags |= B_BUSY;
+ splx(spl);
+ } else {
+ bp = bpa;
+ }
/*
- * Body deleted.
+ * get a copy of the kva from the physical buffer
*/
- return (EIO);
+ sa = bpa->b_data;
+ bp->b_proc = curproc;
+ bp->b_dev = dev;
+ error = bp->b_error = 0;
+
+ for(i=0;i<uio->uio_iovcnt;i++) {
+ while( uio->uio_iov[i].iov_len) {
+
+ bp->b_bcount = uio->uio_iov[i].iov_len;
+ bp->b_flags = B_BUSY | B_PHYS | B_CALL | bufflags;
+ bp->b_iodone = physwakeup;
+ bp->b_data = uio->uio_iov[i].iov_base;
+ bp->b_bcount = minp( bp);
+ if( minp != minphys)
+ bp->b_bcount = minphys( bp);
+ bp->b_bufsize = bp->b_bcount;
+ /*
+ * pass in the kva from the physical buffer
+ * for the temporary kernel mapping.
+ */
+ bp->b_saveaddr = sa;
+ bp->b_blkno = btodb(uio->uio_offset);
+
+
+ if (uio->uio_segflg == UIO_USERSPACE) {
+ if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) {
+ error = EFAULT;
+ goto doerror;
+ }
+ if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) {
+ error = EFAULT;
+ goto doerror;
+ }
+
+ /* bring buffer into kernel space */
+ vmapbuf(bp);
+ }
+
+ /* perform transfer */
+ (*strategy)(bp);
+
+ spl = splbio();
+ while ((bp->b_flags & B_DONE) == 0)
+ tsleep((caddr_t)bp, PRIBIO, "physstr", 0);
+ splx(spl);
+
+ /* release mapping into kernel space */
+ if (uio->uio_segflg == UIO_USERSPACE)
+ vunmapbuf(bp);
+
+ /*
+ * update the uio data
+ */
+ {
+ int iolen = bp->b_bcount - bp->b_resid;
+
+ if (iolen == 0 && !(bp->b_flags & B_ERROR))
+ goto doerror; /* EOF */
+ uio->uio_iov[i].iov_len -= iolen;
+ uio->uio_iov[i].iov_base += iolen;
+ uio->uio_resid -= iolen;
+ uio->uio_offset += iolen;
+ }
+
+ /*
+ * check for an error
+ */
+ if( bp->b_flags & B_ERROR) {
+ error = bp->b_error;
+ goto doerror;
+ }
+ }
+ }
+
+
+doerror:
+ relpbuf(bpa);
+ if (!bp_alloc) {
+ bp->b_flags &= ~(B_BUSY|B_PHYS);
+ if( bp->b_flags & B_WANTED) {
+ bp->b_flags &= ~B_WANTED;
+ wakeup((caddr_t)bp);
+ }
+ }
+/*
+ * allow the process to be swapped
+ */
+ curproc->p_flag &= ~P_PHYSIO;
+
+ return (error);
}
u_int
-minphys(a1)
- struct buf *a1;
+minphys(struct buf *bp)
{
+ u_int maxphys = MAXPHYS;
- /*
- * Body deleted.
- */
- return (0);
+ if( ((vm_offset_t) bp->b_data) & PAGE_MASK) {
+ maxphys = MAXPHYS - PAGE_SIZE;
+ }
+
+ if( bp->b_bcount > maxphys) {
+ bp->b_bcount = maxphys;
+ }
+ return bp->b_bcount;
}
-/*
- * Do a read on a device for a user process.
- */
-rawread(dev, uio)
- dev_t dev;
- struct uio *uio;
+int
+rawread(dev_t dev, struct uio *uio, int ioflag)
{
- return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL,
- dev, B_READ, minphys, uio));
+ return (physio(cdevsw[major(dev)]->d_strategy, (struct buf *)NULL,
+ dev, 1, minphys, uio));
}
-/*
- * Do a write on a device for a user process.
- */
-rawwrite(dev, uio)
- dev_t dev;
- struct uio *uio;
+int
+rawwrite(dev_t dev, struct uio *uio, int ioflag)
+{
+ return (physio(cdevsw[major(dev)]->d_strategy, (struct buf *)NULL,
+ dev, 0, minphys, uio));
+}
+
+static void
+physwakeup(bp)
+ struct buf *bp;
{
- return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL,
- dev, B_WRITE, minphys, uio));
+ wakeup((caddr_t) bp);
+ bp->b_flags &= ~B_CALL;
}
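
The rewritten physio() walks each iovec, maps the user pages into kernel space with vmapbuf(), fires the driver's strategy routine, and sleeps until B_DONE, which is why rawread() and rawwrite() shrink to one-line wrappers. A hypothetical character driver's read entry follows the same pattern (fragment only; mydev_strategy is an assumed name):

    /*
     * Sketch of a d_read entry mirroring rawread() above: hand the
     * whole uio to physio() with this driver's strategy routine and
     * let it handle the user-space mapping and waiting.
     */
    static void mydev_strategy(struct buf *bp);     /* hypothetical */

    static int
    mydev_read(dev_t dev, struct uio *uio, int ioflag)
    {
        return (physio(mydev_strategy, (struct buf *)NULL, dev,
            1 /* read */, minphys, uio));
    }
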
diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c
index 6701793..cecf89f 100644
--- a/sys/kern/kern_proc.c
+++ b/sys/kern/kern_proc.c
@@ -31,12 +31,13 @@
* SUCH DAMAGE.
*
* @(#)kern_proc.c 8.7 (Berkeley) 2/14/95
+ * $Id: kern_proc.c,v 1.25 1997/02/22 09:39:08 peter Exp $
*/
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/map.h>
#include <sys/kernel.h>
+#include <sys/sysctl.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/acct.h>
@@ -46,8 +47,21 @@
#include <sys/uio.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
-#include <sys/ioctl.h>
#include <sys/tty.h>
+#include <sys/signalvar.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/user.h>
+
+struct prochd qs[NQS]; /* as good a place as any... */
+struct prochd rtqs[NQS]; /* Space for REALTIME queues too */
+struct prochd idqs[NQS]; /* Space for IDLE queues too */
+
+static void pgdelete __P((struct pgrp *));
/*
 * Structure associated with user caching.
@@ -59,7 +73,9 @@ struct uidinfo {
};
#define UIHASH(uid) (&uihashtbl[(uid) & uihash])
LIST_HEAD(uihashhead, uidinfo) *uihashtbl;
-u_long uihash; /* size of hash table - 1 */
+static u_long uihash; /* size of hash table - 1 */
+
+static void orphanpg __P((struct pgrp *pg));
/*
* Other process lists
@@ -126,6 +142,7 @@ chgproccnt(uid, diff)
/*
* Is p an inferior of the current process?
*/
+int
inferior(p)
register struct proc *p;
{
@@ -263,12 +280,12 @@ leavepgrp(p)
/*
* delete a process group
*/
-void
+static void
pgdelete(pgrp)
register struct pgrp *pgrp;
{
- if (pgrp->pg_session->s_ttyp != NULL &&
+ if (pgrp->pg_session->s_ttyp != NULL &&
pgrp->pg_session->s_ttyp->t_pgrp == pgrp)
pgrp->pg_session->s_ttyp->t_pgrp = NULL;
LIST_REMOVE(pgrp, pg_hash);
@@ -277,8 +294,6 @@ pgdelete(pgrp)
FREE(pgrp, M_PGRP);
}
-static void orphanpg();
-
/*
* Adjust pgrp jobc counters when specified process changes process group.
* We count the number of processes in each process group that "qualify"
@@ -324,7 +339,7 @@ fixjobc(p, pgrp, entering)
orphanpg(hispgrp);
}
-/*
+/*
* A process group has become orphaned;
* if there are any stopped processes in the group,
* hang-up all process in that group.
@@ -347,8 +362,11 @@ orphanpg(pg)
}
}
-#ifdef DEBUG
-pgrpdump()
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(pgrpdump, pgrpdump)
{
register struct pgrp *pgrp;
register struct proc *p;
@@ -371,4 +389,204 @@ pgrpdump()
}
}
}
-#endif /* DEBUG */
+#endif /* DDB */
+
+/*
+ * Fill in an eproc structure for the specified process.
+ */
+void
+fill_eproc(p, ep)
+ register struct proc *p;
+ register struct eproc *ep;
+{
+ register struct tty *tp;
+
+ bzero(ep, sizeof(*ep));
+
+ ep->e_paddr = p;
+ if (p->p_cred) {
+ ep->e_pcred = *p->p_cred;
+ if (p->p_ucred)
+ ep->e_ucred = *p->p_ucred;
+ }
+ if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) {
+ register struct vmspace *vm = p->p_vmspace;
+
+#ifdef pmap_resident_count
+ ep->e_vm.vm_rssize = pmap_resident_count(&vm->vm_pmap); /*XXX*/
+#else
+ ep->e_vm.vm_rssize = vm->vm_rssize;
+#endif
+ ep->e_vm.vm_tsize = vm->vm_tsize;
+ ep->e_vm.vm_dsize = vm->vm_dsize;
+ ep->e_vm.vm_ssize = vm->vm_ssize;
+#ifndef sparc
+ ep->e_vm.vm_pmap = vm->vm_pmap;
+#endif
+ }
+ if (p->p_pptr)
+ ep->e_ppid = p->p_pptr->p_pid;
+ if (p->p_pgrp) {
+ ep->e_pgid = p->p_pgrp->pg_id;
+ ep->e_jobc = p->p_pgrp->pg_jobc;
+ ep->e_sess = p->p_pgrp->pg_session;
+
+ if (ep->e_sess) {
+ bcopy(ep->e_sess->s_login, ep->e_login, sizeof(ep->e_login));
+ if (ep->e_sess->s_ttyvp)
+ ep->e_flag = EPROC_CTTY;
+ if (p->p_session && SESS_LEADER(p))
+ ep->e_flag |= EPROC_SLEADER;
+ }
+ }
+ if ((p->p_flag & P_CONTROLT) &&
+ (ep->e_sess != NULL) &&
+ ((tp = ep->e_sess->s_ttyp) != NULL)) {
+ ep->e_tdev = tp->t_dev;
+ ep->e_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
+ ep->e_tsess = tp->t_session;
+ } else
+ ep->e_tdev = NODEV;
+ if (p->p_wmesg) {
+ strncpy(ep->e_wmesg, p->p_wmesg, WMESGLEN);
+ ep->e_wmesg[WMESGLEN] = 0;
+ }
+}
+
+static struct proc *
+zpfind(pid_t pid)
+{
+ struct proc *p;
+
+ for (p = zombproc.lh_first; p != 0; p = p->p_list.le_next)
+ if (p->p_pid == pid)
+ return (p);
+ return (NULL);
+}
+
+
+static int
+sysctl_out_proc(struct proc *p, struct sysctl_req *req, int doingzomb)
+{
+ struct eproc eproc;
+ int error;
+ pid_t pid = p->p_pid;
+
+ fill_eproc(p, &eproc);
+ error = SYSCTL_OUT(req,(caddr_t)p, sizeof(struct proc));
+ if (error)
+ return (error);
+ error = SYSCTL_OUT(req,(caddr_t)&eproc, sizeof(eproc));
+ if (error)
+ return (error);
+ if (!doingzomb && pid && (pfind(pid) != p))
+ return EAGAIN;
+ if (doingzomb && zpfind(pid) != p)
+ return EAGAIN;
+ return (0);
+}
+
+static int
+sysctl_kern_proc SYSCTL_HANDLER_ARGS
+{
+ int *name = (int*) arg1;
+ u_int namelen = arg2;
+ struct proc *p;
+ int doingzomb;
+ int error = 0;
+
+ if (oidp->oid_number == KERN_PROC_PID) {
+ if (namelen != 1)
+ return (EINVAL);
+ p = pfind((pid_t)name[0]);
+ if (!p)
+ return (0);
+ error = sysctl_out_proc(p, req, 0);
+ return (error);
+ }
+ if (oidp->oid_number == KERN_PROC_ALL && !namelen)
+ ;
+ else if (oidp->oid_number != KERN_PROC_ALL && namelen == 1)
+ ;
+ else
+ return (EINVAL);
+
+ if (!req->oldptr) {
+ /* overestimate by 5 procs */
+ error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5);
+ if (error)
+ return (error);
+ }
+ for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) {
+ if (!doingzomb)
+ p = allproc.lh_first;
+ else
+ p = zombproc.lh_first;
+ for (; p != 0; p = p->p_list.le_next) {
+ /*
+ * Skip embryonic processes.
+ */
+ if (p->p_stat == SIDL)
+ continue;
+ /*
+ * TODO - make more efficient (see notes below).
+ * do by session.
+ */
+ switch (oidp->oid_number) {
+
+ case KERN_PROC_PGRP:
+ /* could do this by traversing pgrp */
+ if (p->p_pgrp == NULL ||
+ p->p_pgrp->pg_id != (pid_t)name[0])
+ continue;
+ break;
+
+ case KERN_PROC_TTY:
+ if ((p->p_flag & P_CONTROLT) == 0 ||
+ p->p_session == NULL ||
+ p->p_session->s_ttyp == NULL ||
+ p->p_session->s_ttyp->t_dev != (dev_t)name[0])
+ continue;
+ break;
+
+ case KERN_PROC_UID:
+ if (p->p_ucred == NULL ||
+ p->p_ucred->cr_uid != (uid_t)name[0])
+ continue;
+ break;
+
+ case KERN_PROC_RUID:
+ if (p->p_ucred == NULL ||
+ p->p_cred->p_ruid != (uid_t)name[0])
+ continue;
+ break;
+ }
+
+ error = sysctl_out_proc(p, req, doingzomb);
+ if (error)
+ return (error);
+ }
+ }
+ return (0);
+}
+
+
+SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table");
+
+SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT,
+ 0, 0, sysctl_kern_proc, "S,proc", "");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
+
+SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD,
+ sysctl_kern_proc, "Process table");
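
sysctl_kern_proc streams a struct proc followed by the filled-in eproc for each match, then re-validates with pfind()/zpfind() so a process that exited mid-copy yields EAGAIN rather than stale data. A userland sketch fetching one entry by pid, assuming the two-part kinfo_proc layout (kp_proc plus kp_eproc) from this era's <sys/sysctl.h>:

    #include <sys/param.h>
    #include <sys/sysctl.h>
    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        int mib[4] = { CTL_KERN, KERN_PROC, KERN_PROC_PID, getpid() };
        struct kinfo_proc kp;
        size_t len = sizeof(kp);

        if (sysctl(mib, 4, &kp, &len, NULL, 0) == -1) {
            perror("sysctl");
            return (1);
        }
        printf("pid %d ppid %d\n", (int)getpid(), (int)kp.kp_eproc.e_ppid);
        return (0);
    }
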
diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c
index 29e4c67..5c2ec5b 100644
--- a/sys/kern/kern_prot.c
+++ b/sys/kern/kern_prot.c
@@ -35,7 +35,8 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)kern_prot.c 8.9 (Berkeley) 2/14/95
+ * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94
+ * $Id: kern_prot.c,v 1.25 1997/03/03 22:46:16 ache Exp $
*/
/*
@@ -45,21 +46,26 @@
#include <sys/param.h>
#include <sys/acct.h>
#include <sys/systm.h>
+#include <sys/sysproto.h>
#include <sys/ucred.h>
#include <sys/proc.h>
#include <sys/timeb.h>
#include <sys/times.h>
#include <sys/malloc.h>
+#include <sys/unistd.h>
-#include <sys/mount.h>
-#include <sys/syscallargs.h>
+#ifndef _SYS_SYSPROTO_H_
+struct getpid_args {
+ int dummy;
+};
+#endif
/* ARGSUSED */
int
getpid(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct getpid_args *uap;
+ int *retval;
{
*retval = p->p_pid;
@@ -69,12 +75,17 @@ getpid(p, uap, retval)
return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct getppid_args {
+ int dummy;
+};
+#endif
/* ARGSUSED */
int
getppid(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct getppid_args *uap;
+ int *retval;
{
*retval = p->p_pptr->p_pid;
@@ -82,23 +93,35 @@ getppid(p, uap, retval)
}
/* Get process group ID; note that POSIX getpgrp takes no parameter */
+#ifndef _SYS_SYSPROTO_H_
+struct getpgrp_args {
+ int dummy;
+};
+#endif
+
int
getpgrp(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct getpgrp_args *uap;
+ int *retval;
{
*retval = p->p_pgrp->pg_id;
return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct getuid_args {
+ int dummy;
+};
+#endif
+
/* ARGSUSED */
int
getuid(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct getuid_args *uap;
+ int *retval;
{
*retval = p->p_cred->p_ruid;
@@ -108,24 +131,36 @@ getuid(p, uap, retval)
return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct geteuid_args {
+ int dummy;
+};
+#endif
+
/* ARGSUSED */
int
geteuid(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct geteuid_args *uap;
+ int *retval;
{
*retval = p->p_ucred->cr_uid;
return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct getgid_args {
+ int dummy;
+};
+#endif
+
/* ARGSUSED */
int
getgid(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct getgid_args *uap;
+ int *retval;
{
*retval = p->p_cred->p_rgid;
@@ -140,51 +175,66 @@ getgid(p, uap, retval)
* via getgroups. This syscall exists because it is somewhat painful to do
* correctly in a library function.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct getegid_args {
+ int dummy;
+};
+#endif
+
/* ARGSUSED */
int
getegid(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct getegid_args *uap;
+ int *retval;
{
*retval = p->p_ucred->cr_groups[0];
return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct getgroups_args {
+ u_int gidsetsize;
+ gid_t *gidset;
+};
+#endif
int
getgroups(p, uap, retval)
struct proc *p;
- register struct getgroups_args /* {
- syscallarg(u_int) gidsetsize;
- syscallarg(gid_t *) gidset;
- } */ *uap;
- register_t *retval;
+ register struct getgroups_args *uap;
+ int *retval;
{
register struct pcred *pc = p->p_cred;
register u_int ngrp;
int error;
- if ((ngrp = SCARG(uap, gidsetsize)) == 0) {
+ if ((ngrp = uap->gidsetsize) == 0) {
*retval = pc->pc_ucred->cr_ngroups;
return (0);
}
if (ngrp < pc->pc_ucred->cr_ngroups)
return (EINVAL);
ngrp = pc->pc_ucred->cr_ngroups;
- if (error = copyout((caddr_t)pc->pc_ucred->cr_groups,
- (caddr_t)SCARG(uap, gidset), ngrp * sizeof(gid_t)))
+ if ((error = copyout((caddr_t)pc->pc_ucred->cr_groups,
+ (caddr_t)uap->gidset, ngrp * sizeof(gid_t))))
return (error);
*retval = ngrp;
return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct setsid_args {
+ int dummy;
+};
+#endif
+
/* ARGSUSED */
int
setsid(p, uap, retval)
register struct proc *p;
- void *uap;
- register_t *retval;
+ struct setsid_args *uap;
+ int *retval;
{
if (p->p_pgid == p->p_pid || pgfind(p->p_pid)) {
@@ -209,23 +259,28 @@ setsid(p, uap, retval)
* there must exist some pid in same session having pgid (EPERM)
* pid must not be session leader (EPERM)
*/
+#ifndef _SYS_SYSPROTO_H_
+struct setpgid_args {
+ int pid; /* target process id */
+ int pgid; /* target pgrp id */
+};
+#endif
/* ARGSUSED */
int
setpgid(curp, uap, retval)
struct proc *curp;
- register struct setpgid_args /* {
- syscallarg(int) pid;
- syscallarg(int) pgid;
- } */ *uap;
- register_t *retval;
+ register struct setpgid_args *uap;
+ int *retval;
{
register struct proc *targp; /* target process */
register struct pgrp *pgrp; /* target pgrp */
- if (SCARG(uap, pid) != 0 && SCARG(uap, pid) != curp->p_pid) {
- if ((targp = pfind(SCARG(uap, pid))) == 0 || !inferior(targp))
+ if (uap->pgid < 0)
+ return (EINVAL);
+ if (uap->pid != 0 && uap->pid != curp->p_pid) {
+ if ((targp = pfind(uap->pid)) == 0 || !inferior(targp))
return (ESRCH);
- if (targp->p_session != curp->p_session)
+ if (targp->p_pgrp == NULL || targp->p_session != curp->p_session)
return (EPERM);
if (targp->p_flag & P_EXEC)
return (EACCES);
@@ -233,30 +288,36 @@ setpgid(curp, uap, retval)
targp = curp;
if (SESS_LEADER(targp))
return (EPERM);
- if (SCARG(uap, pgid) == 0)
- SCARG(uap, pgid) = targp->p_pid;
- else if (SCARG(uap, pgid) != targp->p_pid)
- if ((pgrp = pgfind(SCARG(uap, pgid))) == 0 ||
+ if (uap->pgid == 0)
+ uap->pgid = targp->p_pid;
+ else if (uap->pgid != targp->p_pid)
+ if ((pgrp = pgfind(uap->pgid)) == 0 ||
pgrp->pg_session != curp->p_session)
return (EPERM);
- return (enterpgrp(targp, SCARG(uap, pgid), 0));
+ return (enterpgrp(targp, uap->pgid, 0));
}
+#ifndef _SYS_SYSPROTO_H_
+struct setuid_args {
+ uid_t uid;
+};
+#endif
/* ARGSUSED */
int
setuid(p, uap, retval)
struct proc *p;
- struct setuid_args /* {
- syscallarg(uid_t) uid;
- } */ *uap;
- register_t *retval;
+ struct setuid_args *uap;
+ int *retval;
{
register struct pcred *pc = p->p_cred;
register uid_t uid;
int error;
- uid = SCARG(uap, uid);
+ uid = uap->uid;
if (uid != pc->p_ruid &&
+#ifdef _POSIX_SAVED_IDS
+ uid != pc->p_svuid &&
+#endif
(error = suser(pc->pc_ucred, &p->p_acflag)))
return (error);
/*
@@ -264,30 +325,45 @@ setuid(p, uap, retval)
* Transfer proc count to new user.
* Copy credentials so other references do not see our changes.
*/
- (void)chgproccnt(pc->p_ruid, -1);
- (void)chgproccnt(uid, 1);
+ if (
+#ifdef _POSIX_SAVED_IDS
+ pc->pc_ucred->cr_uid == 0 &&
+#endif
+ uid != pc->p_ruid) {
+ (void)chgproccnt(pc->p_ruid, -1);
+ (void)chgproccnt(uid, 1);
+ }
pc->pc_ucred = crcopy(pc->pc_ucred);
+#ifdef _POSIX_SAVED_IDS
+ if (pc->pc_ucred->cr_uid == 0) {
+#endif
+ pc->p_ruid = uid;
+ pc->p_svuid = uid;
+#ifdef _POSIX_SAVED_IDS
+ }
+#endif
pc->pc_ucred->cr_uid = uid;
- pc->p_ruid = uid;
- pc->p_svuid = uid;
p->p_flag |= P_SUGID;
return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct seteuid_args {
+ uid_t euid;
+};
+#endif
/* ARGSUSED */
int
seteuid(p, uap, retval)
struct proc *p;
- struct seteuid_args /* {
- syscallarg(uid_t) euid;
- } */ *uap;
- register_t *retval;
+ struct seteuid_args *uap;
+ int *retval;
{
register struct pcred *pc = p->p_cred;
register uid_t euid;
int error;
- euid = SCARG(uap, euid);
+ euid = uap->euid;
if (euid != pc->p_ruid && euid != pc->p_svuid &&
(error = suser(pc->pc_ucred, &p->p_acflag)))
return (error);
@@ -301,44 +377,60 @@ seteuid(p, uap, retval)
return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct setgid_args {
+ gid_t gid;
+};
+#endif
/* ARGSUSED */
int
setgid(p, uap, retval)
struct proc *p;
- struct setgid_args /* {
- syscallarg(gid_t) gid;
- } */ *uap;
- register_t *retval;
+ struct setgid_args *uap;
+ int *retval;
{
register struct pcred *pc = p->p_cred;
register gid_t gid;
int error;
- gid = SCARG(uap, gid);
- if (gid != pc->p_rgid && (error = suser(pc->pc_ucred, &p->p_acflag)))
+ gid = uap->gid;
+ if (gid != pc->p_rgid &&
+#ifdef _POSIX_SAVED_IDS
+ gid != pc->p_svgid &&
+#endif
+ (error = suser(pc->pc_ucred, &p->p_acflag)))
return (error);
pc->pc_ucred = crcopy(pc->pc_ucred);
pc->pc_ucred->cr_groups[0] = gid;
- pc->p_rgid = gid;
- pc->p_svgid = gid; /* ??? */
+#ifdef _POSIX_SAVED_IDS
+ if (pc->pc_ucred->cr_uid == 0) {
+#endif
+ pc->p_rgid = gid;
+ pc->p_svgid = gid;
+#ifdef _POSIX_SAVED_IDS
+ }
+#endif
p->p_flag |= P_SUGID;
return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct setegid_args {
+ gid_t egid;
+};
+#endif
/* ARGSUSED */
int
setegid(p, uap, retval)
struct proc *p;
- struct setegid_args /* {
- syscallarg(gid_t) egid;
- } */ *uap;
- register_t *retval;
+ struct setegid_args *uap;
+ int *retval;
{
register struct pcred *pc = p->p_cred;
register gid_t egid;
int error;
- egid = SCARG(uap, egid);
+ egid = uap->egid;
if (egid != pc->p_rgid && egid != pc->p_svgid &&
(error = suser(pc->pc_ucred, &p->p_acflag)))
return (error);
@@ -348,113 +440,109 @@ setegid(p, uap, retval)
return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct setgroups_args {
+ u_int gidsetsize;
+ gid_t *gidset;
+};
+#endif
/* ARGSUSED */
int
setgroups(p, uap, retval)
struct proc *p;
- struct setgroups_args /* {
- syscallarg(u_int) gidsetsize;
- syscallarg(gid_t *) gidset;
- } */ *uap;
- register_t *retval;
+ struct setgroups_args *uap;
+ int *retval;
{
register struct pcred *pc = p->p_cred;
register u_int ngrp;
int error;
- if (error = suser(pc->pc_ucred, &p->p_acflag))
+ if ((error = suser(pc->pc_ucred, &p->p_acflag)))
return (error);
- ngrp = SCARG(uap, gidsetsize);
+ ngrp = uap->gidsetsize;
if (ngrp < 1 || ngrp > NGROUPS)
return (EINVAL);
pc->pc_ucred = crcopy(pc->pc_ucred);
- if (error = copyin((caddr_t)SCARG(uap, gidset),
- (caddr_t)pc->pc_ucred->cr_groups, ngrp * sizeof(gid_t)))
+ if ((error = copyin((caddr_t)uap->gidset,
+ (caddr_t)pc->pc_ucred->cr_groups, ngrp * sizeof(gid_t))))
return (error);
pc->pc_ucred->cr_ngroups = ngrp;
p->p_flag |= P_SUGID;
return (0);
}
-#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#ifndef _SYS_SYSPROTO_H_
+struct setreuid_args {
+ uid_t ruid;
+ uid_t euid;
+};
+#endif
/* ARGSUSED */
int
-compat_43_setreuid(p, uap, retval)
+setreuid(p, uap, retval)
register struct proc *p;
- struct compat_43_setreuid_args /* {
- syscallarg(int) ruid;
- syscallarg(int) euid;
- } */ *uap;
- register_t *retval;
+ struct setreuid_args *uap;
+ int *retval;
{
register struct pcred *pc = p->p_cred;
- union {
- struct setuid_args sa;
- struct seteuid_args ea;
- } args;
+ register uid_t ruid, euid;
+ int error;
- /*
- * If ruid == euid then setreuid is being used to emulate setuid,
- * just do it.
- */
- if (SCARG(uap, ruid) != -1 && SCARG(uap, ruid) == SCARG(uap, euid)) {
- SCARG(&args.sa, uid) = SCARG(uap, ruid);
- return (setuid(p, &args.sa, retval));
+ ruid = uap->ruid;
+ euid = uap->euid;
+ if ((ruid != (uid_t)-1 && ruid != pc->p_ruid && ruid != pc->p_svuid ||
+ euid != (uid_t)-1 && euid != pc->p_ruid && euid != pc->p_svuid) &&
+ (error = suser(pc->pc_ucred, &p->p_acflag)))
+ return (error);
+
+ pc->pc_ucred = crcopy(pc->pc_ucred);
+ if (euid != (uid_t)-1)
+ pc->pc_ucred->cr_uid = euid;
+ if (ruid != (uid_t)-1 && ruid != pc->p_ruid) {
+ (void)chgproccnt(pc->p_ruid, -1);
+ (void)chgproccnt(ruid, 1);
+ pc->p_ruid = ruid;
}
- /*
- * Otherwise we assume that the intent of setting ruid is to be
- * able to get back ruid priviledge (i.e. swapping ruid and euid).
- * So we make sure that we will be able to do so, but do not
- * actually set the ruid.
- */
- if (SCARG(uap, ruid) != (uid_t)-1 && SCARG(uap, ruid) != pc->p_ruid &&
- SCARG(uap, ruid) != pc->p_svuid)
- return (EPERM);
- if (SCARG(uap, euid) == (uid_t)-1)
- return (0);
- SCARG(&args.ea, euid) = SCARG(uap, euid);
- return (seteuid(p, &args.ea, retval));
+ if (ruid != (uid_t)-1 || pc->pc_ucred->cr_uid != pc->p_ruid)
+ pc->p_svuid = pc->pc_ucred->cr_uid;
+ p->p_flag |= P_SUGID;
+ return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct setregid_args {
+ gid_t rgid;
+ gid_t egid;
+};
+#endif
/* ARGSUSED */
int
-compat_43_setregid(p, uap, retval)
+setregid(p, uap, retval)
register struct proc *p;
- struct compat_43_setregid_args /* {
- syscallarg(int) rgid;
- syscallarg(int) egid;
- } */ *uap;
- register_t *retval;
+ struct setregid_args *uap;
+ int *retval;
{
register struct pcred *pc = p->p_cred;
- union {
- struct setgid_args sa;
- struct setegid_args ea;
- } args;
+ register gid_t rgid, egid;
+ int error;
- /*
- * If rgid == egid then setreuid is being used to emulate setgid,
- * just do it.
- */
- if (SCARG(uap, rgid) != -1 && SCARG(uap, rgid) == SCARG(uap, egid)) {
- SCARG(&args.sa, gid) = SCARG(uap, rgid);
- return (setgid(p, &args.sa, retval));
- }
- /*
- * Otherwise we assume that the intent of setting rgid is to be
- * able to get back rgid priviledge (i.e. swapping rgid and egid).
- * So we make sure that we will be able to do so, but do not
- * actually set the rgid.
- */
- if (SCARG(uap, rgid) != (gid_t)-1 && SCARG(uap, rgid) != pc->p_rgid &&
- SCARG(uap, rgid) != pc->p_svgid)
- return (EPERM);
- if (SCARG(uap, egid) == (gid_t)-1)
- return (0);
- SCARG(&args.ea, egid) = SCARG(uap, egid);
- return (setegid(p, &args.ea, retval));
+ rgid = uap->rgid;
+ egid = uap->egid;
+ if ((rgid != (gid_t)-1 && rgid != pc->p_rgid && rgid != pc->p_svgid ||
+ egid != (gid_t)-1 && egid != pc->p_rgid && egid != pc->p_svgid) &&
+ (error = suser(pc->pc_ucred, &p->p_acflag)))
+ return (error);
+
+ pc->pc_ucred = crcopy(pc->pc_ucred);
+ if (egid != (gid_t)-1)
+ pc->pc_ucred->cr_groups[0] = egid;
+ if (rgid != (gid_t)-1)
+ pc->p_rgid = rgid;
+ if (rgid != (gid_t)-1 || pc->pc_ucred->cr_groups[0] != pc->p_rgid)
+ pc->p_svgid = pc->pc_ucred->cr_groups[0];
+ p->p_flag |= P_SUGID;
+ return (0);
}
-#endif /* defined(COMPAT_43) || defined(COMPAT_SUNOS) */
/*
* Check if gid is a member of the group set.
@@ -559,43 +647,52 @@ crdup(cr)
/*
* Get login name, if available.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct getlogin_args {
+ char *namebuf;
+ u_int namelen;
+};
+#endif
/* ARGSUSED */
int
getlogin(p, uap, retval)
struct proc *p;
- struct getlogin_args /* {
- syscallarg(char *) namebuf;
- syscallarg(u_int) namelen;
- } */ *uap;
- register_t *retval;
+ struct getlogin_args *uap;
+ int *retval;
{
- if (SCARG(uap, namelen) > sizeof (p->p_pgrp->pg_session->s_login))
- SCARG(uap, namelen) = sizeof (p->p_pgrp->pg_session->s_login);
+ if (uap->namelen > MAXLOGNAME)
+ uap->namelen = MAXLOGNAME;
return (copyout((caddr_t) p->p_pgrp->pg_session->s_login,
- (caddr_t) SCARG(uap, namebuf), SCARG(uap, namelen)));
+ (caddr_t) uap->namebuf, uap->namelen));
}
/*
* Set login name.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct setlogin_args {
+ char *namebuf;
+};
+#endif
/* ARGSUSED */
int
setlogin(p, uap, retval)
struct proc *p;
- struct setlogin_args /* {
- syscallarg(char *) namebuf;
- } */ *uap;
- register_t *retval;
+ struct setlogin_args *uap;
+ int *retval;
{
int error;
+ char logintmp[MAXLOGNAME];
- if (error = suser(p->p_ucred, &p->p_acflag))
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
return (error);
- error = copyinstr((caddr_t) SCARG(uap, namebuf),
- (caddr_t) p->p_pgrp->pg_session->s_login,
- sizeof (p->p_pgrp->pg_session->s_login) - 1, (u_int *)0);
+ error = copyinstr((caddr_t) uap->namebuf, (caddr_t) logintmp,
+ sizeof(logintmp), (u_int *)0);
if (error == ENAMETOOLONG)
error = EINVAL;
+ else if (!error)
+ (void) memcpy(p->p_pgrp->pg_session->s_login, logintmp,
+ sizeof(logintmp));
return (error);
}
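
The new setreuid() really sets both IDs, updating the saved uid whenever the real or effective one diverges, instead of emulating setuid()/seteuid() the way the removed COMPAT_43 shim did. That saved-ID bookkeeping is what lets a setuid program drop privilege temporarily and regain it; the classic sketch:

    #include <stdio.h>
    #include <unistd.h>

    int
    main(void)
    {
        uid_t euid = geteuid(), ruid = getuid();

        /* Drop: effective id becomes the real (invoking) user. */
        if (seteuid(ruid) == -1)
            perror("seteuid(ruid)");

        /* ... do unprivileged work ... */

        /* Regain: allowed because euid was stashed as the saved uid. */
        if (seteuid(euid) == -1)
            perror("seteuid(euid)");
        return (0);
    }
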
diff --git a/sys/kern/kern_random.c b/sys/kern/kern_random.c
new file mode 100644
index 0000000..64c215c
--- /dev/null
+++ b/sys/kern/kern_random.c
@@ -0,0 +1,515 @@
+/*
+ * random_machdep.c -- A strong random number generator
+ *
+ * $Id$
+ *
+ * Version 0.95, last modified 18-Oct-95
+ *
+ * Copyright Theodore Ts'o, 1994, 1995. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, and the entire permission notice in its entirety,
+ * including the disclaimer of warranties.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. The name of the author may not be used to endorse or promote
+ * products derived from this software without specific prior
+ * written permission.
+ *
+ * ALTERNATIVELY, this product may be distributed under the terms of
+ * the GNU Public License, in which case the provisions of the GPL are
+ * required INSTEAD OF the above restrictions. (This clause is
+ * necessary due to a potential bad interaction between the GPL and
+ * the restrictions contained in a BSD-style copyright.)
+ *
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED
+ * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_cpu.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/select.h>
+#include <sys/fcntl.h>
+
+#include <machine/clock.h>
+#include <machine/random.h>
+
+#include <i386/isa/icu.h>
+#ifdef PC98
+#include <pc98/pc98/pc98.h>
+#else
+#include <i386/isa/isa.h>
+#endif
+#include <i386/isa/timerreg.h>
+
+#define MAX_BLKDEV 4
+
+/*
+ * The pool is stirred with a primitive polynomial of degree 128
+ * over GF(2), namely x^128 + x^99 + x^59 + x^31 + x^9 + x^7 + 1.
+ * For a pool of size 64, try x^64+x^62+x^38+x^10+x^6+x+1.
+ */
+#define POOLWORDS 128 /* Power of 2 - note that this is 32-bit words */
+#define POOLBITS (POOLWORDS*32)
+
+#if POOLWORDS == 128
+#define TAP1 99 /* The polynomial taps */
+#define TAP2 59
+#define TAP3 31
+#define TAP4 9
+#define TAP5 7
+#elif POOLWORDS == 64
+#define TAP1 62 /* The polynomial taps */
+#define TAP2 38
+#define TAP3 10
+#define TAP4 6
+#define TAP5 1
+#else
+#error No primitive polynomial available for chosen POOLWORDS
+#endif
+
+#define WRITEBUFFER 512 /* size in bytes */
+
+/* There is actually only one of these, globally. */
+struct random_bucket {
+ u_int add_ptr;
+ u_int entropy_count;
+ int input_rotate;
+ u_int32_t *pool;
+ struct selinfo rsel;
+};
+
+/* There is one of these per entropy source */
+struct timer_rand_state {
+ u_long last_time;
+ int last_delta;
+ int nbits;
+};
+
+static struct random_bucket random_state;
+static u_int32_t random_pool[POOLWORDS];
+static struct timer_rand_state keyboard_timer_state;
+static struct timer_rand_state extract_timer_state;
+static struct timer_rand_state irq_timer_state[ICU_LEN];
+#ifdef notyet
+static struct timer_rand_state blkdev_timer_state[MAX_BLKDEV];
+#endif
+static struct wait_queue *random_wait;
+
+inthand2_t *sec_intr_handler[ICU_LEN];
+int sec_intr_unit[ICU_LEN];
+
+#ifndef MIN
+#define MIN(a,b) (((a) < (b)) ? (a) : (b))
+#endif
+
+void
+rand_initialize(void)
+{
+ random_state.add_ptr = 0;
+ random_state.entropy_count = 0;
+ random_state.pool = random_pool;
+ random_wait = NULL;
+ random_state.rsel.si_flags = 0;
+ random_state.rsel.si_pid = 0;
+}
+
+/*
+ * This function adds a 32-bit word into the entropy "pool".  It does not
+ * update the entropy estimate. The caller must do this if appropriate.
+ *
+ * The pool is stirred with a primitive polynomial of degree 128
+ * over GF(2), namely x^128 + x^99 + x^59 + x^31 + x^9 + x^7 + 1.
+ * For a pool of size 64, try x^64+x^62+x^38+x^10+x^6+x+1.
+ *
+ * We rotate the input word by a changing number of bits, to help
+ * assure that all bits in the entropy get toggled. Otherwise, if we
+ * consistently feed the entropy pool small numbers (like ticks and
+ * scancodes, for example), the upper bits of the entropy pool don't
+ * get affected. --- TYT, 10/11/95
+ */
+static inline void
+add_entropy_word(struct random_bucket *r, const u_int32_t input)
+{
+ u_int i;
+ u_int32_t w;
+
+ w = (input << r->input_rotate) | (input >> (32 - r->input_rotate));
+ i = r->add_ptr = (r->add_ptr - 1) & (POOLWORDS-1);
+ if (i)
+ r->input_rotate = (r->input_rotate + 7) & 31;
+ else
+ /*
+ * At the beginning of the pool, add an extra 7 bits
+ * rotation, so that successive passes spread the
+ * input bits across the pool evenly.
+ */
+ r->input_rotate = (r->input_rotate + 14) & 31;
+
+ /* XOR in the various taps */
+ w ^= r->pool[(i+TAP1)&(POOLWORDS-1)];
+ w ^= r->pool[(i+TAP2)&(POOLWORDS-1)];
+ w ^= r->pool[(i+TAP3)&(POOLWORDS-1)];
+ w ^= r->pool[(i+TAP4)&(POOLWORDS-1)];
+ w ^= r->pool[(i+TAP5)&(POOLWORDS-1)];
+ w ^= r->pool[i];
+ /* Rotate w left 1 bit (stolen from SHA) and store */
+ r->pool[i] = (w << 1) | (w >> 31);
+}
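
To make the stirring step concrete, here is a minimal user-space sketch of the same rotate/XOR/store structure on a toy eight-word pool. The tap offsets are invented for illustration (they are not a primitive polynomial), and the shift count is masked so that a rotate of zero does not turn into an undefined 32-bit shift.

#include <stdint.h>
#include <stdio.h>

#define WORDS	8	/* toy pool; the taps below are illustrative only */

static uint32_t pool[WORDS];
static unsigned add_ptr, rot;

/*
 * Same shape as add_entropy_word(): rotate the input, XOR in the taps,
 * rotate the result left one bit, and store it back into the pool.
 */
static void
stir(uint32_t input)
{
	uint32_t w;
	unsigned i;

	/* Mask the shift count so rot == 0 does not shift by 32 (UB). */
	w = (input << rot) | (input >> ((32 - rot) & 31));
	i = add_ptr = (add_ptr - 1) & (WORDS - 1);
	rot = (rot + (i ? 7 : 14)) & 31;
	w ^= pool[(i + 5) & (WORDS - 1)];	/* made-up taps */
	w ^= pool[(i + 3) & (WORDS - 1)];
	w ^= pool[i];
	pool[i] = (w << 1) | (w >> 31);
}

int
main(void)
{
	unsigned i;

	for (i = 0; i < 64; i++)
		stir(i);		/* small inputs still reach high bits */
	for (i = 0; i < WORDS; i++)
		printf("pool[%u] = %08x\n", i, (unsigned)pool[i]);
	return (0);
}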
+
+/*
+ * This function adds entropy to the entropy "pool" by using timing
+ * delays. It uses the timer_rand_state structure to make an estimate
+ * of how many bits of entropy this call has added to the pool.
+ *
+ * The number "num" is also added to the pool - it should somehow describe
+ * the type of event which just happened. This is currently 0-255 for
+ * keyboard scan codes, and 256 upwards for interrupts.
+ * On the i386, this is assumed to be at most 16 bits, and the high bits
+ * are used for a high-resolution timer.
+ */
+static void
+add_timer_randomness(struct random_bucket *r, struct timer_rand_state *state,
+ u_int num)
+{
+ int delta, delta2;
+ u_int nbits;
+ u_int32_t time;
+
+#if defined(I586_CPU) || defined(I686_CPU)
+ if (i586_ctr_freq != 0) {
+ num ^= (u_int32_t) rdtsc() << 16;
+ r->entropy_count += 2;
+ } else {
+#endif
+ disable_intr();
+ outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH);
+ num ^= inb(TIMER_CNTR0) << 16;
+ num ^= inb(TIMER_CNTR0) << 24;
+ enable_intr();
+ r->entropy_count += 2;
+#if defined(I586_CPU) || defined(I686_CPU)
+ }
+#endif
+
+ time = ticks;
+
+ add_entropy_word(r, (u_int32_t) num);
+ add_entropy_word(r, time);
+
+ /*
+ * Calculate number of bits of randomness we probably
+ * added. We take into account the first and second order
+ * deltas in order to make our estimate.
+ */
+ delta = time - state->last_time;
+ state->last_time = time;
+
+ delta2 = delta - state->last_delta;
+ state->last_delta = delta;
+
+ if (delta < 0) delta = -delta;
+ if (delta2 < 0) delta2 = -delta2;
+ delta = MIN(delta, delta2) >> 1;
+ for (nbits = 0; delta; nbits++)
+ delta >>= 1;
+
+ r->entropy_count += nbits;
+
+ /* Prevent overflow */
+ if (r->entropy_count > POOLBITS)
+ r->entropy_count = POOLBITS;
+
+ if (r->entropy_count >= 8)
+ selwakeup(&random_state.rsel);
+}
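
The credit computation above amounts to counting the significant bits of half the smaller delta, roughly floor(log2(min(|delta|, |delta2|) / 2)) + 1. A standalone sketch of just that arithmetic:

#include <stdio.h>

/*
 * Standalone copy of the credit computation in add_timer_randomness():
 * take the smaller magnitude of the first- and second-order deltas,
 * halve it, and count its significant bits.
 */
static unsigned
credit_bits(int delta, int delta2)
{
	unsigned nbits;

	if (delta < 0)
		delta = -delta;
	if (delta2 < 0)
		delta2 = -delta2;
	delta = (delta < delta2 ? delta : delta2) >> 1;
	for (nbits = 0; delta; nbits++)
		delta >>= 1;
	return (nbits);
}

int
main(void)
{
	/* A metronomic source (second-order delta 0) earns nothing. */
	printf("%u\n", credit_bits(100, 0));	/* 0 */
	/* A jittery one earns a few bits: min(100, 37) >> 1 = 18. */
	printf("%u\n", credit_bits(100, 37));	/* 5 */
	return (0);
}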
+
+void
+add_keyboard_randomness(u_char scancode)
+{
+ add_timer_randomness(&random_state, &keyboard_timer_state, scancode);
+}
+
+void
+add_interrupt_randomness(int irq)
+{
+ (sec_intr_handler[irq])(sec_intr_unit[irq]);
+ add_timer_randomness(&random_state, &irq_timer_state[irq], irq);
+}
+
+#ifdef notused
+void
+add_blkdev_randomness(int major)
+{
+ if (major >= MAX_BLKDEV)
+ return;
+
+ add_timer_randomness(&random_state, &blkdev_timer_state[major],
+ 0x200+major);
+}
+#endif /* notused */
+
+/*
+ * MD5 transform algorithm, taken from code written by Colin Plumb,
+ * and put into the public domain
+ *
+ * QUESTION: Replace this with SHA, which has generally received better
+ * reviews from the cryptographic community?
+ */
+
+/* The four core functions - F1 is optimized somewhat */
+
+/* #define F1(x, y, z) (x & y | ~x & z) */
+#define F1(x, y, z) (z ^ (x & (y ^ z)))
+#define F2(x, y, z) F1(z, x, y)
+#define F3(x, y, z) (x ^ y ^ z)
+#define F4(x, y, z) (y ^ (x | ~z))
+
+/* This is the central step in the MD5 algorithm. */
+#define MD5STEP(f, w, x, y, z, data, s) \
+ ( w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x )
+
+/*
+ * The core of the MD5 algorithm, this alters an existing MD5 hash to
+ * reflect the addition of 16 longwords of new data. MD5Update blocks
+ * the data and converts bytes into longwords for this routine.
+ */
+static void
+MD5Transform(u_int32_t buf[4],
+ u_int32_t const in[16])
+{
+ u_int32_t a, b, c, d;
+
+ a = buf[0];
+ b = buf[1];
+ c = buf[2];
+ d = buf[3];
+
+ MD5STEP(F1, a, b, c, d, in[ 0]+0xd76aa478, 7);
+ MD5STEP(F1, d, a, b, c, in[ 1]+0xe8c7b756, 12);
+ MD5STEP(F1, c, d, a, b, in[ 2]+0x242070db, 17);
+ MD5STEP(F1, b, c, d, a, in[ 3]+0xc1bdceee, 22);
+ MD5STEP(F1, a, b, c, d, in[ 4]+0xf57c0faf, 7);
+ MD5STEP(F1, d, a, b, c, in[ 5]+0x4787c62a, 12);
+ MD5STEP(F1, c, d, a, b, in[ 6]+0xa8304613, 17);
+ MD5STEP(F1, b, c, d, a, in[ 7]+0xfd469501, 22);
+ MD5STEP(F1, a, b, c, d, in[ 8]+0x698098d8, 7);
+ MD5STEP(F1, d, a, b, c, in[ 9]+0x8b44f7af, 12);
+ MD5STEP(F1, c, d, a, b, in[10]+0xffff5bb1, 17);
+ MD5STEP(F1, b, c, d, a, in[11]+0x895cd7be, 22);
+ MD5STEP(F1, a, b, c, d, in[12]+0x6b901122, 7);
+ MD5STEP(F1, d, a, b, c, in[13]+0xfd987193, 12);
+ MD5STEP(F1, c, d, a, b, in[14]+0xa679438e, 17);
+ MD5STEP(F1, b, c, d, a, in[15]+0x49b40821, 22);
+
+ MD5STEP(F2, a, b, c, d, in[ 1]+0xf61e2562, 5);
+ MD5STEP(F2, d, a, b, c, in[ 6]+0xc040b340, 9);
+ MD5STEP(F2, c, d, a, b, in[11]+0x265e5a51, 14);
+ MD5STEP(F2, b, c, d, a, in[ 0]+0xe9b6c7aa, 20);
+ MD5STEP(F2, a, b, c, d, in[ 5]+0xd62f105d, 5);
+ MD5STEP(F2, d, a, b, c, in[10]+0x02441453, 9);
+ MD5STEP(F2, c, d, a, b, in[15]+0xd8a1e681, 14);
+ MD5STEP(F2, b, c, d, a, in[ 4]+0xe7d3fbc8, 20);
+ MD5STEP(F2, a, b, c, d, in[ 9]+0x21e1cde6, 5);
+ MD5STEP(F2, d, a, b, c, in[14]+0xc33707d6, 9);
+ MD5STEP(F2, c, d, a, b, in[ 3]+0xf4d50d87, 14);
+ MD5STEP(F2, b, c, d, a, in[ 8]+0x455a14ed, 20);
+ MD5STEP(F2, a, b, c, d, in[13]+0xa9e3e905, 5);
+ MD5STEP(F2, d, a, b, c, in[ 2]+0xfcefa3f8, 9);
+ MD5STEP(F2, c, d, a, b, in[ 7]+0x676f02d9, 14);
+ MD5STEP(F2, b, c, d, a, in[12]+0x8d2a4c8a, 20);
+
+ MD5STEP(F3, a, b, c, d, in[ 5]+0xfffa3942, 4);
+ MD5STEP(F3, d, a, b, c, in[ 8]+0x8771f681, 11);
+ MD5STEP(F3, c, d, a, b, in[11]+0x6d9d6122, 16);
+ MD5STEP(F3, b, c, d, a, in[14]+0xfde5380c, 23);
+ MD5STEP(F3, a, b, c, d, in[ 1]+0xa4beea44, 4);
+ MD5STEP(F3, d, a, b, c, in[ 4]+0x4bdecfa9, 11);
+ MD5STEP(F3, c, d, a, b, in[ 7]+0xf6bb4b60, 16);
+ MD5STEP(F3, b, c, d, a, in[10]+0xbebfbc70, 23);
+ MD5STEP(F3, a, b, c, d, in[13]+0x289b7ec6, 4);
+ MD5STEP(F3, d, a, b, c, in[ 0]+0xeaa127fa, 11);
+ MD5STEP(F3, c, d, a, b, in[ 3]+0xd4ef3085, 16);
+ MD5STEP(F3, b, c, d, a, in[ 6]+0x04881d05, 23);
+ MD5STEP(F3, a, b, c, d, in[ 9]+0xd9d4d039, 4);
+ MD5STEP(F3, d, a, b, c, in[12]+0xe6db99e5, 11);
+ MD5STEP(F3, c, d, a, b, in[15]+0x1fa27cf8, 16);
+ MD5STEP(F3, b, c, d, a, in[ 2]+0xc4ac5665, 23);
+
+ MD5STEP(F4, a, b, c, d, in[ 0]+0xf4292244, 6);
+ MD5STEP(F4, d, a, b, c, in[ 7]+0x432aff97, 10);
+ MD5STEP(F4, c, d, a, b, in[14]+0xab9423a7, 15);
+ MD5STEP(F4, b, c, d, a, in[ 5]+0xfc93a039, 21);
+ MD5STEP(F4, a, b, c, d, in[12]+0x655b59c3, 6);
+ MD5STEP(F4, d, a, b, c, in[ 3]+0x8f0ccc92, 10);
+ MD5STEP(F4, c, d, a, b, in[10]+0xffeff47d, 15);
+ MD5STEP(F4, b, c, d, a, in[ 1]+0x85845dd1, 21);
+ MD5STEP(F4, a, b, c, d, in[ 8]+0x6fa87e4f, 6);
+ MD5STEP(F4, d, a, b, c, in[15]+0xfe2ce6e0, 10);
+ MD5STEP(F4, c, d, a, b, in[ 6]+0xa3014314, 15);
+ MD5STEP(F4, b, c, d, a, in[13]+0x4e0811a1, 21);
+ MD5STEP(F4, a, b, c, d, in[ 4]+0xf7537e82, 6);
+ MD5STEP(F4, d, a, b, c, in[11]+0xbd3af235, 10);
+ MD5STEP(F4, c, d, a, b, in[ 2]+0x2ad7d2bb, 15);
+ MD5STEP(F4, b, c, d, a, in[ 9]+0xeb86d391, 21);
+
+ buf[0] += a;
+ buf[1] += b;
+ buf[2] += c;
+ buf[3] += d;
+}
+
+#undef F1
+#undef F2
+#undef F3
+#undef F4
+#undef MD5STEP
+
+
+#if POOLWORDS % 16
+#error extract_entropy() assumes that POOLWORDS is a multiple of 16 words.
+#endif
+/*
+ * This function extracts randomness from the "entropy pool", and
+ * returns it in a buffer.  It decreases the estimate of how many bits
+ * of entropy remain in the pool accordingly, but it does not restrict
+ * the number of bytes that are actually obtained.
+ */
+static inline int
+extract_entropy(struct random_bucket *r, char *buf, int nbytes)
+{
+ int ret, i;
+ u_int32_t tmp[4];
+
+ add_timer_randomness(r, &extract_timer_state, nbytes);
+
+ /* Redundant, but just in case... */
+ if (r->entropy_count > POOLBITS)
+ r->entropy_count = POOLBITS;
+ /* Why is this here? Left in from Ted Ts'o. Perhaps to limit time. */
+ if (nbytes > 32768)
+ nbytes = 32768;
+
+ ret = nbytes;
+ if (r->entropy_count / 8 >= nbytes)
+ r->entropy_count -= nbytes*8;
+ else
+ r->entropy_count = 0;
+
+ while (nbytes) {
+ /* Hash the pool to get the output */
+ tmp[0] = 0x67452301;
+ tmp[1] = 0xefcdab89;
+ tmp[2] = 0x98badcfe;
+ tmp[3] = 0x10325476;
+ for (i = 0; i < POOLWORDS; i += 16)
+ MD5Transform(tmp, r->pool+i);
+ /* Modify pool so next hash will produce different results */
+ add_entropy_word(r, tmp[0]);
+ add_entropy_word(r, tmp[1]);
+ add_entropy_word(r, tmp[2]);
+ add_entropy_word(r, tmp[3]);
+ /*
+ * Run the MD5 Transform one more time, since we want
+ * to add at least minimal obscuring of the inputs to
+ * add_entropy_word(). --- TYT
+ */
+ MD5Transform(tmp, r->pool);
+
+ /* Copy data to destination buffer */
+ i = MIN(nbytes, 16);
+ bcopy(tmp, buf, i);
+ nbytes -= i;
+ buf += i;
+ }
+
+ /* Wipe data from memory */
+ bzero(tmp, sizeof(tmp));
+
+ return ret;
+}
+
+#ifdef notused /* XXX NOT the exported kernel interface */
+/*
+ * This function is the exported kernel interface. It returns some
+ * number of good random numbers, suitable for seeding TCP sequence
+ * numbers, etc.
+ */
+void
+get_random_bytes(void *buf, u_int nbytes)
+{
+ extract_entropy(&random_state, (char *) buf, nbytes);
+}
+#endif /* notused */
+
+u_int
+read_random(char *buf, u_int nbytes)
+{
+ if ((nbytes * 8) > random_state.entropy_count)
+ nbytes = random_state.entropy_count / 8;
+
+ return extract_entropy(&random_state, buf, nbytes);
+}
+
+u_int
+read_random_unlimited(char *buf, u_int nbytes)
+{
+ return extract_entropy(&random_state, buf, nbytes);
+}
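
A hypothetical in-kernel caller of the interface above might look like the following sketch; the fallback mixing with ticks is purely an invented example, not something this file prescribes.

/*
 * Hypothetical in-kernel consumer: try to seed a 32-bit value from the
 * pool and fall back to a weaker source when the entropy estimate has
 * run dry.  The ticks fallback is an assumption for illustration only.
 */
static u_int32_t
pick_seed(void)
{
	u_int32_t seed = 0;

	if (read_random((char *)&seed, sizeof(seed)) < sizeof(seed))
		seed ^= (u_int32_t)ticks;
	return (seed);
}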
+
+#ifdef notused
+u_int
+write_random(const char *buf, u_int nbytes)
+{
+ u_int i;
+ u_int32_t word, *p;
+
+ for (i = nbytes, p = (u_int32_t *)buf;
+ i >= sizeof(u_int32_t);
+ i-= sizeof(u_int32_t), p++)
+ add_entropy_word(&random_state, *p);
+ if (i) {
+ word = 0;
+ bcopy(p, &word, i);
+ add_entropy_word(&random_state, word);
+ }
+ return nbytes;
+}
+#endif /* notused */
+
+int
+random_select(dev_t dev, int rw, struct proc *p)
+{
+ int s, ret;
+
+ if (rw == FWRITE)
+ return 1; /* heh. */
+
+ s = splhigh();
+ if (random_state.entropy_count >= 8)
+ ret = 1;
+ else {
+ selrecord(p, &random_state.rsel);
+ ret = 0;
+ }
+ splx(s);
+
+ return ret;
+}
+
diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c
index 569b9d9..fe50cf9 100644
--- a/sys/kern/kern_resource.c
+++ b/sys/kern/kern_resource.c
@@ -35,21 +35,27 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)kern_resource.c 8.8 (Berkeley) 2/14/95
+ * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94
+ * $Id$
*/
+#include "opt_rlimit.h"
+
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/file.h>
#include <sys/resourcevar.h>
#include <sys/malloc.h>
#include <sys/proc.h>
-#include <sys/mount.h>
-#include <sys/syscallargs.h>
-
#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
int donice __P((struct proc *curp, struct proc *chgp, int n));
int dosetrlimit __P((struct proc *p, u_int which, struct rlimit *limp));
@@ -58,25 +64,28 @@ int dosetrlimit __P((struct proc *p, u_int which, struct rlimit *limp));
* Resource controls and accounting.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct getpriority_args {
+ int which;
+ int who;
+};
+#endif
int
getpriority(curp, uap, retval)
struct proc *curp;
- register struct getpriority_args /* {
- syscallarg(int) which;
- syscallarg(int) who;
- } */ *uap;
- register_t *retval;
+ register struct getpriority_args *uap;
+ int *retval;
{
register struct proc *p;
register int low = PRIO_MAX + 1;
- switch (SCARG(uap, which)) {
+ switch (uap->which) {
case PRIO_PROCESS:
- if (SCARG(uap, who) == 0)
+ if (uap->who == 0)
p = curp;
else
- p = pfind(SCARG(uap, who));
+ p = pfind(uap->who);
if (p == 0)
break;
low = p->p_nice;
@@ -85,9 +94,9 @@ getpriority(curp, uap, retval)
case PRIO_PGRP: {
register struct pgrp *pg;
- if (SCARG(uap, who) == 0)
+ if (uap->who == 0)
pg = curp->p_pgrp;
- else if ((pg = pgfind(SCARG(uap, who))) == NULL)
+ else if ((pg = pgfind(uap->who)) == NULL)
break;
for (p = pg->pg_members.lh_first; p != 0;
p = p->p_pglist.le_next) {
@@ -98,10 +107,10 @@ getpriority(curp, uap, retval)
}
case PRIO_USER:
- if (SCARG(uap, who) == 0)
- SCARG(uap, who) = curp->p_ucred->cr_uid;
+ if (uap->who == 0)
+ uap->who = curp->p_ucred->cr_uid;
for (p = allproc.lh_first; p != 0; p = p->p_list.le_next)
- if (p->p_ucred->cr_uid == SCARG(uap, who) &&
+ if (p->p_ucred->cr_uid == uap->who &&
p->p_nice < low)
low = p->p_nice;
break;
@@ -115,54 +124,57 @@ getpriority(curp, uap, retval)
return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct setpriority_args {
+ int which;
+ int who;
+ int prio;
+};
+#endif
/* ARGSUSED */
int
setpriority(curp, uap, retval)
struct proc *curp;
- register struct setpriority_args /* {
- syscallarg(int) which;
- syscallarg(int) who;
- syscallarg(int) prio;
- } */ *uap;
- register_t *retval;
+ register struct setpriority_args *uap;
+ int *retval;
{
register struct proc *p;
int found = 0, error = 0;
- switch (SCARG(uap, which)) {
+ switch (uap->which) {
case PRIO_PROCESS:
- if (SCARG(uap, who) == 0)
+ if (uap->who == 0)
p = curp;
else
- p = pfind(SCARG(uap, who));
+ p = pfind(uap->who);
if (p == 0)
break;
- error = donice(curp, p, SCARG(uap, prio));
+ error = donice(curp, p, uap->prio);
found++;
break;
case PRIO_PGRP: {
register struct pgrp *pg;
-
- if (SCARG(uap, who) == 0)
+
+ if (uap->who == 0)
pg = curp->p_pgrp;
- else if ((pg = pgfind(SCARG(uap, who))) == NULL)
+ else if ((pg = pgfind(uap->who)) == NULL)
break;
for (p = pg->pg_members.lh_first; p != 0;
p = p->p_pglist.le_next) {
- error = donice(curp, p, SCARG(uap, prio));
+ error = donice(curp, p, uap->prio);
found++;
}
break;
}
case PRIO_USER:
- if (SCARG(uap, who) == 0)
- SCARG(uap, who) = curp->p_ucred->cr_uid;
+ if (uap->who == 0)
+ uap->who = curp->p_ucred->cr_uid;
for (p = allproc.lh_first; p != 0; p = p->p_list.le_next)
- if (p->p_ucred->cr_uid == SCARG(uap, who)) {
- error = donice(curp, p, SCARG(uap, prio));
+ if (p->p_ucred->cr_uid == uap->who) {
+ error = donice(curp, p, uap->prio);
found++;
}
break;
@@ -197,71 +209,150 @@ donice(curp, chgp, n)
return (0);
}
+/* rtprio system call */
+#ifndef _SYS_SYSPROTO_H_
+struct rtprio_args {
+ int function;
+ pid_t pid;
+ struct rtprio *rtp;
+};
+#endif
+
+/*
+ * Set or look up realtime priority.
+ */
+
+/* ARGSUSED */
+int
+rtprio(curp, uap, retval)
+ struct proc *curp;
+ register struct rtprio_args *uap;
+ int *retval;
+{
+ register struct proc *p;
+ register struct pcred *pcred = curp->p_cred;
+ struct rtprio rtp;
+ int error;
+
+ error = copyin(uap->rtp, &rtp, sizeof(struct rtprio));
+ if (error)
+ return (error);
+
+ if (uap->pid == 0)
+ p = curp;
+ else
+ p = pfind(uap->pid);
+
+ if (p == 0)
+ return (ESRCH);
+
+ switch (uap->function) {
+ case RTP_LOOKUP:
+ return (copyout(&p->p_rtprio, uap->rtp, sizeof(struct rtprio)));
+ case RTP_SET:
+ if (pcred->pc_ucred->cr_uid && pcred->p_ruid &&
+ pcred->pc_ucred->cr_uid != p->p_ucred->cr_uid &&
+ pcred->p_ruid != p->p_ucred->cr_uid)
+ return (EPERM);
+ /* disallow setting rtprio in most cases if not superuser */
+ if (suser(pcred->pc_ucred, &curp->p_acflag)) {
+ /* can't set someone else's */
+ if (uap->pid)
+ return (EPERM);
+ /* can't set realtime priority */
+ if (rtp.type == RTP_PRIO_REALTIME)
+ return (EPERM);
+ }
+ switch (rtp.type) {
+ case RTP_PRIO_REALTIME:
+ case RTP_PRIO_NORMAL:
+ case RTP_PRIO_IDLE:
+ if (rtp.prio > RTP_PRIO_MAX)
+ return (EINVAL);
+ p->p_rtprio = rtp;
+ return (0);
+ default:
+ return (EINVAL);
+ }
+
+ default:
+ return (EINVAL);
+ }
+}
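
From userland this is reached through the rtprio(2) stub; a minimal usage sketch, with error handling trimmed:

#include <sys/types.h>
#include <sys/rtprio.h>
#include <stdio.h>

int
main(void)
{
	struct rtprio rtp;

	/* pid 0 means the calling process. */
	if (rtprio(RTP_LOOKUP, 0, &rtp) == -1) {
		perror("rtprio");
		return (1);
	}
	printf("type=%u prio=%u\n", rtp.type, rtp.prio);

	/*
	 * Any user may demote himself to the idle class; only the
	 * superuser may enter RTP_PRIO_REALTIME.
	 */
	rtp.type = RTP_PRIO_IDLE;
	rtp.prio = RTP_PRIO_MAX;
	if (rtprio(RTP_SET, 0, &rtp) == -1)
		perror("rtprio set");
	return (0);
}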
+
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#ifndef _SYS_SYSPROTO_H_
+struct osetrlimit_args {
+ u_int which;
+ struct orlimit *rlp;
+};
+#endif
/* ARGSUSED */
int
-compat_43_setrlimit(p, uap, retval)
+osetrlimit(p, uap, retval)
struct proc *p;
- struct compat_43_setrlimit_args /* {
- syscallarg(u_int) which;
- syscallarg(struct ogetrlimit *) rlp;
- } */ *uap;
- register_t *retval;
+ register struct osetrlimit_args *uap;
+ int *retval;
{
struct orlimit olim;
struct rlimit lim;
int error;
- if (error = copyin((caddr_t)SCARG(uap, rlp), (caddr_t)&olim,
- sizeof (struct orlimit)))
+ if ((error =
+ copyin((caddr_t)uap->rlp, (caddr_t)&olim, sizeof(struct orlimit))))
return (error);
lim.rlim_cur = olim.rlim_cur;
lim.rlim_max = olim.rlim_max;
- return (dosetrlimit(p, SCARG(uap, which), &lim));
+ return (dosetrlimit(p, uap->which, &lim));
}
+#ifndef _SYS_SYSPROTO_H_
+struct ogetrlimit_args {
+ u_int which;
+ struct orlimit *rlp;
+};
+#endif
/* ARGSUSED */
int
-compat_43_getrlimit(p, uap, retval)
+ogetrlimit(p, uap, retval)
struct proc *p;
- register struct compat_43_getrlimit_args /* {
- syscallarg(u_int) which;
- syscallarg(struct ogetrlimit *) rlp;
- } */ *uap;
- register_t *retval;
+ register struct ogetrlimit_args *uap;
+ int *retval;
{
struct orlimit olim;
- if (SCARG(uap, which) >= RLIM_NLIMITS)
+ if (uap->which >= RLIM_NLIMITS)
return (EINVAL);
- olim.rlim_cur = p->p_rlimit[SCARG(uap, which)].rlim_cur;
+ olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur;
if (olim.rlim_cur == -1)
olim.rlim_cur = 0x7fffffff;
- olim.rlim_max = p->p_rlimit[SCARG(uap, which)].rlim_max;
+ olim.rlim_max = p->p_rlimit[uap->which].rlim_max;
if (olim.rlim_max == -1)
olim.rlim_max = 0x7fffffff;
- return (copyout((caddr_t)&olim, (caddr_t)SCARG(uap, rlp),
- sizeof(olim)));
+ return (copyout((caddr_t)&olim, (caddr_t)uap->rlp, sizeof(olim)));
}
#endif /* COMPAT_43 || COMPAT_SUNOS */
+#ifndef _SYS_SYSPROTO_H_
+struct __setrlimit_args {
+ u_int which;
+ struct rlimit *rlp;
+};
+#endif
/* ARGSUSED */
int
setrlimit(p, uap, retval)
struct proc *p;
- register struct setrlimit_args /* {
- syscallarg(u_int) which;
- syscallarg(struct rlimit *) rlp;
- } */ *uap;
- register_t *retval;
+ register struct __setrlimit_args *uap;
+ int *retval;
{
struct rlimit alim;
int error;
- if (error = copyin((caddr_t)SCARG(uap, rlp), (caddr_t)&alim,
- sizeof (struct rlimit)))
+ if ((error =
+ copyin((caddr_t)uap->rlp, (caddr_t)&alim, sizeof (struct rlimit))))
return (error);
- return (dosetrlimit(p, SCARG(uap, which), &alim));
+ return (dosetrlimit(p, uap->which, &alim));
}
int
@@ -271,15 +362,23 @@ dosetrlimit(p, which, limp)
struct rlimit *limp;
{
register struct rlimit *alimp;
- extern unsigned maxdmap;
int error;
if (which >= RLIM_NLIMITS)
return (EINVAL);
alimp = &p->p_rlimit[which];
- if (limp->rlim_cur > alimp->rlim_max ||
+
+ /*
+ * Preserve historical bugs by treating negative limits as unsigned.
+ */
+ if (limp->rlim_cur < 0)
+ limp->rlim_cur = RLIM_INFINITY;
+ if (limp->rlim_max < 0)
+ limp->rlim_max = RLIM_INFINITY;
+
+ if (limp->rlim_cur > alimp->rlim_max ||
limp->rlim_max > alimp->rlim_max)
- if (error = suser(p->p_ucred, &p->p_acflag))
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
return (error);
if (limp->rlim_cur > limp->rlim_max)
limp->rlim_cur = limp->rlim_max;
@@ -293,17 +392,17 @@ dosetrlimit(p, which, limp)
switch (which) {
case RLIMIT_DATA:
- if (limp->rlim_cur > maxdmap)
- limp->rlim_cur = maxdmap;
- if (limp->rlim_max > maxdmap)
- limp->rlim_max = maxdmap;
+ if (limp->rlim_cur > MAXDSIZ)
+ limp->rlim_cur = MAXDSIZ;
+ if (limp->rlim_max > MAXDSIZ)
+ limp->rlim_max = MAXDSIZ;
break;
case RLIMIT_STACK:
- if (limp->rlim_cur > maxdmap)
- limp->rlim_cur = maxdmap;
- if (limp->rlim_max > maxdmap)
- limp->rlim_max = maxdmap;
+ if (limp->rlim_cur > MAXSSIZ)
+ limp->rlim_cur = MAXSSIZ;
+ if (limp->rlim_max > MAXSSIZ)
+ limp->rlim_max = MAXSSIZ;
/*
* Stack is allocated to the max at exec time with only
* "rlim_cur" bytes accessible. If stack limit is going
@@ -331,38 +430,41 @@ dosetrlimit(p, which, limp)
break;
case RLIMIT_NOFILE:
- if (limp->rlim_cur > maxfiles)
- limp->rlim_cur = maxfiles;
- if (limp->rlim_max > maxfiles)
- limp->rlim_max = maxfiles;
+ if (limp->rlim_cur > maxfilesperproc)
+ limp->rlim_cur = maxfilesperproc;
+ if (limp->rlim_max > maxfilesperproc)
+ limp->rlim_max = maxfilesperproc;
break;
case RLIMIT_NPROC:
- if (limp->rlim_cur > maxproc)
- limp->rlim_cur = maxproc;
- if (limp->rlim_max > maxproc)
- limp->rlim_max = maxproc;
+ if (limp->rlim_cur > maxprocperuid)
+ limp->rlim_cur = maxprocperuid;
+ if (limp->rlim_max > maxprocperuid)
+ limp->rlim_max = maxprocperuid;
break;
}
*alimp = *limp;
return (0);
}
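
Seen from userland, the clamping behaviour looks like the sketch below. Raising the hard limit still requires superuser privilege, so the example assumes it runs as root; the kernel then clamps silently rather than failing.

#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <stdio.h>

int
main(void)
{
	struct rlimit rl;

	/*
	 * Ask for an unbounded descriptor limit.  Run as root this
	 * succeeds, and dosetrlimit() silently clamps both fields to
	 * maxfilesperproc instead of returning an error.
	 */
	rl.rlim_cur = rl.rlim_max = RLIM_INFINITY;
	if (setrlimit(RLIMIT_NOFILE, &rl) == -1)
		perror("setrlimit");

	getrlimit(RLIMIT_NOFILE, &rl);
	printf("cur=%ld max=%ld\n", (long)rl.rlim_cur, (long)rl.rlim_max);
	return (0);
}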
+#ifndef _SYS_SYSPROTO_H_
+struct __getrlimit_args {
+ u_int which;
+ struct rlimit *rlp;
+};
+#endif
/* ARGSUSED */
int
getrlimit(p, uap, retval)
struct proc *p;
- register struct getrlimit_args /* {
- syscallarg(u_int) which;
- syscallarg(struct rlimit *) rlp;
- } */ *uap;
- register_t *retval;
+ register struct __getrlimit_args *uap;
+ int *retval;
{
- if (SCARG(uap, which) >= RLIM_NLIMITS)
+ if (uap->which >= RLIM_NLIMITS)
return (EINVAL);
- return (copyout((caddr_t)&p->p_rlimit[SCARG(uap, which)],
- (caddr_t)SCARG(uap, rlp), sizeof (struct rlimit)));
+ return (copyout((caddr_t)&p->p_rlimit[uap->which], (caddr_t)uap->rlp,
+ sizeof (struct rlimit)));
}
/*
@@ -371,14 +473,15 @@ getrlimit(p, uap, retval)
*/
void
calcru(p, up, sp, ip)
- register struct proc *p;
- register struct timeval *up;
- register struct timeval *sp;
- register struct timeval *ip;
+ struct proc *p;
+ struct timeval *up;
+ struct timeval *sp;
+ struct timeval *ip;
{
- register u_quad_t u, st, ut, it, tot;
- register u_long sec, usec;
- register int s;
+ quad_t totusec;
+ u_quad_t u, st, ut, it, tot;
+ long sec, usec;
+ int s;
struct timeval tv;
s = splstatclock();
@@ -389,11 +492,8 @@ calcru(p, up, sp, ip)
tot = st + ut + it;
if (tot == 0) {
- up->tv_sec = up->tv_usec = 0;
- sp->tv_sec = sp->tv_usec = 0;
- if (ip != NULL)
- ip->tv_sec = ip->tv_usec = 0;
- return;
+ st = 1;
+ tot = 1;
}
sec = p->p_rtime.tv_sec;
@@ -408,7 +508,13 @@ calcru(p, up, sp, ip)
sec += tv.tv_sec - runtime.tv_sec;
usec += tv.tv_usec - runtime.tv_usec;
}
- u = sec * 1000000 + usec;
+ totusec = (quad_t)sec * 1000000 + usec;
+ if (totusec < 0) {
+ /* XXX no %qd in kernel. Truncate. */
+ printf("calcru: negative time: %ld usec\n", (long)totusec);
+ totusec = 0;
+ }
+ u = totusec;
st = (u * st) / tot;
sp->tv_sec = st / 1000000;
sp->tv_usec = st % 1000000;
@@ -422,19 +528,22 @@ calcru(p, up, sp, ip)
}
}
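
The apportioning above splits the total elapsed microseconds in proportion to the statclock tick counts. A toy computation with made-up numbers shows the shape of it:

#include <stdio.h>

int
main(void)
{
	/*
	 * Made-up statclock tick counts: 300 user, 100 system, 100
	 * interrupt, against 5 seconds of accumulated run time.
	 */
	unsigned long long ut = 300, st = 100, it = 100;
	unsigned long long tot = ut + st + it;
	unsigned long long u = 5000000;		/* microseconds */

	printf("user %llu usec\n", u * ut / tot);	/* 3000000 */
	printf("sys  %llu usec\n", u * st / tot);	/* 1000000 */
	printf("intr %llu usec\n", u * it / tot);	/* 1000000 */
	return (0);
}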
+#ifndef _SYS_SYSPROTO_H_
+struct getrusage_args {
+ int who;
+ struct rusage *rusage;
+};
+#endif
/* ARGSUSED */
int
getrusage(p, uap, retval)
register struct proc *p;
- register struct getrusage_args /* {
- syscallarg(int) who;
- syscallarg(struct rusage *) rusage;
- } */ *uap;
- register_t *retval;
+ register struct getrusage_args *uap;
+ int *retval;
{
register struct rusage *rup;
- switch (SCARG(uap, who)) {
+ switch (uap->who) {
case RUSAGE_SELF:
rup = &p->p_stats->p_ru;
@@ -448,7 +557,7 @@ getrusage(p, uap, retval)
default:
return (EINVAL);
}
- return (copyout((caddr_t)rup, (caddr_t)SCARG(uap, rusage),
+ return (copyout((caddr_t)rup, (caddr_t)uap->rusage,
sizeof (struct rusage)));
}
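
For reference, the userland view of this call is the familiar getrusage(2):

#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <stdio.h>

int
main(void)
{
	struct rusage ru;

	if (getrusage(RUSAGE_SELF, &ru) == -1) {
		perror("getrusage");
		return (1);
	}
	printf("user %ld.%06lds, sys %ld.%06lds, signals %ld\n",
	    (long)ru.ru_utime.tv_sec, (long)ru.ru_utime.tv_usec,
	    (long)ru.ru_stime.tv_sec, (long)ru.ru_stime.tv_usec,
	    ru.ru_nsignals);
	return (0);
}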
diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c
new file mode 100644
index 0000000..c4922d0
--- /dev/null
+++ b/sys/kern/kern_shutdown.c
@@ -0,0 +1,445 @@
+/*-
+ * Copyright (c) 1986, 1988, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94
+ * $Id$
+ */
+
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/reboot.h>
+#include <sys/msgbuf.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/tty.h>
+#include <sys/tprintf.h>
+#include <sys/syslog.h>
+#include <sys/malloc.h>
+#include <sys/kernel.h>
+#include <sys/mount.h>
+#include <sys/sysctl.h>
+#include <sys/conf.h>
+#include <sys/sysproto.h>
+
+#include <machine/pcb.h>
+#include <machine/clock.h>
+#include <machine/cons.h>
+#include <machine/md_var.h>
+
+#include <sys/utsname.h>
+#include <sys/signalvar.h>
+
+#ifndef PANIC_REBOOT_WAIT_TIME
+#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */
+#endif
+
+/*
+ * Note that stdarg.h and the ANSI style va_start macro are used for both
+ * ANSI and traditional C compilers.
+ */
+#include <machine/stdarg.h>
+
+#if defined(DDB)
+#ifdef DDB_UNATTENDED
+ static int debugger_on_panic = 0;
+#else
+ static int debugger_on_panic = 1;
+#endif
+
+SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW,
+ &debugger_on_panic, 0, "");
+#endif
+
+
+/*
+ * Variable panicstr contains argument to first call to panic; used as flag
+ * to indicate that the kernel has already called panic.
+ */
+const char *panicstr;
+
+/*
+ * callout list for things to do a shutdown
+ */
+typedef struct shutdown_list_element {
+ struct shutdown_list_element *next;
+ bootlist_fn function;
+ void *arg;
+} *sle_p;
+
+/*
+ * There are two shutdown lists; some things need to be shut down
+ * earlier than others.
+ */
+static sle_p shutdown_list1;
+static sle_p shutdown_list2;
+
+
+static void dumpsys(void);
+
+#ifndef _SYS_SYSPROTO_H_
+struct reboot_args {
+ int opt;
+};
+#endif
+/* ARGSUSED */
+
+/*
+ * The system call that results in a reboot
+ */
+int
+reboot(p, uap, retval)
+ struct proc *p;
+ struct reboot_args *uap;
+ int *retval;
+{
+ int error;
+
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
+ return (error);
+
+ boot(uap->opt);
+ return (0);
+}
+
+/*
+ * Called by events that want to shut down, e.g. <CTL><ALT><DEL> on a PC.
+ */
+void
+shutdown_nice(void)
+{
+	/* Send a signal to init(8) and have it shut down the world */
+ if (initproc != NULL) {
+ psignal(initproc, SIGINT);
+ } else {
+ /* No init(8) running, so simply reboot */
+ boot(RB_NOSYNC);
+ }
+ return;
+}
+static int waittime = -1;
+static struct pcb dumppcb;
+
+/*
+ * Go through the rigmarole of shutting down...
+ * This used to be in machdep.c, but I'll be damned if I could see
+ * anything machine dependent in it.
+ */
+void
+boot(howto)
+ int howto;
+{
+ sle_p ep;
+
+ ep = shutdown_list1;
+ while (ep) {
+ shutdown_list1 = ep->next;
+ (*ep->function)(howto, ep->arg);
+ ep = ep->next;
+ }
+ if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) {
+ register struct buf *bp;
+ int iter, nbusy;
+
+ waittime = 0;
+ printf("\nsyncing disks... ");
+
+ sync(&proc0, NULL, NULL);
+
+ for (iter = 0; iter < 20; iter++) {
+ nbusy = 0;
+ for (bp = &buf[nbuf]; --bp >= buf; ) {
+ if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) {
+ nbusy++;
+ }
+ }
+ if (nbusy == 0)
+ break;
+ printf("%d ", nbusy);
+ DELAY(40000 * iter);
+ }
+ if (nbusy) {
+ /*
+ * Failed to sync all blocks. Indicate this and don't
+ * unmount filesystems (thus forcing an fsck on reboot).
+ */
+ printf("giving up\n");
+#ifdef SHOW_BUSYBUFS
+ nbusy = 0;
+ for (bp = &buf[nbuf]; --bp >= buf; ) {
+ if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) {
+ nbusy++;
+ printf("%d: dev:%08x, flags:%08x, blkno:%d, lblkno:%d\n", nbusy, bp->b_dev, bp->b_flags, bp->b_blkno, bp->b_lblkno);
+ }
+ }
+ DELAY(5000000); /* 5 seconds */
+#endif
+ } else {
+ printf("done\n");
+ /*
+ * Unmount filesystems
+ */
+ if (panicstr == 0)
+ vfs_unmountall();
+ }
+ DELAY(100000); /* wait for console output to finish */
+ }
+ ep = shutdown_list2;
+ while (ep) {
+ shutdown_list2 = ep->next;
+ (*ep->function)(howto, ep->arg);
+ ep = ep->next;
+ }
+ splhigh();
+ if (howto & RB_HALT) {
+ printf("\n");
+ printf("The operating system has halted.\n");
+ printf("Please press any key to reboot.\n\n");
+ switch (cngetc()) {
+ case -1: /* No console, just die */
+ cpu_halt();
+ /* NOTREACHED */
+ default:
+ break;
+ }
+ } else {
+ if (howto & RB_DUMP) {
+ if (!cold) {
+ savectx(&dumppcb);
+ dumppcb.pcb_cr3 = rcr3();
+ dumpsys();
+ }
+
+ if (PANIC_REBOOT_WAIT_TIME != 0) {
+ if (PANIC_REBOOT_WAIT_TIME != -1) {
+ int loop;
+ printf("Automatic reboot in %d seconds - press a key on the console to abort\n",
+ PANIC_REBOOT_WAIT_TIME);
+ for (loop = PANIC_REBOOT_WAIT_TIME * 10; loop > 0; --loop) {
+ DELAY(1000 * 100); /* 1/10th second */
+ /* Did user type a key? */
+ if (cncheckc() != -1)
+ break;
+ }
+ if (!loop)
+ goto die;
+ }
+ } else { /* zero time specified - reboot NOW */
+ goto die;
+ }
+ printf("--> Press a key on the console to reboot <--\n");
+ cngetc();
+ }
+ }
+die:
+ printf("Rebooting...\n");
+ DELAY(1000000); /* wait 1 sec for printf's to complete and be read */
+ /* cpu_boot(howto); */ /* doesn't do anything at the moment */
+ cpu_reset();
+ for(;;) ;
+ /* NOTREACHED */
+}
+
+/*
+ * Magic number for savecore
+ *
+ * exported (symorder) and used at least by savecore(8)
+ *
+ */
+static u_long const dumpmag = 0x8fca0101UL;
+
+static int dumpsize = 0; /* also for savecore */
+
+static int dodump = 1;
+SYSCTL_INT(_machdep, OID_AUTO, do_dump, CTLFLAG_RW, &dodump, 0, "");
+
+/*
+ * Doadump comes here after turning off memory management and
+ * getting on the dump stack, either when called above, or by
+ * the auto-restart code.
+ */
+static void
+dumpsys(void)
+{
+
+ if (!dodump)
+ return;
+ if (dumpdev == NODEV)
+ return;
+ if ((minor(dumpdev)&07) != 1)
+ return;
+ if (!(bdevsw[major(dumpdev)]))
+ return;
+ if (!(bdevsw[major(dumpdev)]->d_dump))
+ return;
+ dumpsize = Maxmem;
+ printf("\ndumping to dev %lx, offset %ld\n", dumpdev, dumplo);
+ printf("dump ");
+ switch ((*bdevsw[major(dumpdev)]->d_dump)(dumpdev)) {
+
+ case ENXIO:
+ printf("device bad\n");
+ break;
+
+ case EFAULT:
+ printf("device not ready\n");
+ break;
+
+ case EINVAL:
+ printf("area improper\n");
+ break;
+
+ case EIO:
+ printf("i/o error\n");
+ break;
+
+ case EINTR:
+ printf("aborted from console\n");
+ break;
+
+ default:
+ printf("succeeded\n");
+ break;
+ }
+}
+
+/*
+ * Panic is called on unresolvable fatal errors. It prints "panic: mesg",
+ * and then reboots. If we are called twice, then we avoid trying to sync
+ * the disks as this often leads to recursive panics.
+ */
+void
+panic(const char *fmt, ...)
+{
+ int bootopt;
+ va_list ap;
+
+ bootopt = RB_AUTOBOOT | RB_DUMP;
+ if (panicstr)
+ bootopt |= RB_NOSYNC;
+ else
+ panicstr = fmt;
+
+ printf("panic: ");
+ va_start(ap, fmt);
+ vprintf(fmt, ap);
+ va_end(ap);
+ printf("\n");
+
+#if defined(DDB)
+ if (debugger_on_panic)
+ Debugger ("panic");
+#endif
+ boot(bootopt);
+}
+
+/*
+ * Two routines to handle adding/deleting items on the
+ * shutdown callout lists
+ *
+ * at_shutdown():
+ * Take the arguments given and put them onto the shutdown callout list.
+ *	However, first make sure that it's not already there.
+ *	Returns 0 on success.
+ */
+int
+at_shutdown(bootlist_fn function, void *arg, int position)
+{
+ sle_p ep, *epp;
+
+ switch(position) {
+ case SHUTDOWN_PRE_SYNC:
+ epp = &shutdown_list1;
+ break;
+ case SHUTDOWN_POST_SYNC:
+ epp = &shutdown_list2;
+ break;
+ default:
+ printf("bad exit callout list specified\n");
+ return (EINVAL);
+ }
+ if (rm_at_shutdown(function, arg))
+ printf("exit callout entry already present\n");
+ ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT);
+ if (ep == NULL)
+ return (ENOMEM);
+ ep->next = *epp;
+ ep->function = function;
+ ep->arg = arg;
+ *epp = ep;
+ return (0);
+}
+
+/*
+ * Scan the exit callout lists for the given items and remove them.
+ * Returns the number of items removed.
+ */
+int
+rm_at_shutdown(bootlist_fn function, void *arg)
+{
+ sle_p *epp, ep;
+ int count;
+
+ count = 0;
+ epp = &shutdown_list1;
+ ep = *epp;
+ while (ep) {
+ if ((ep->function == function) && (ep->arg == arg)) {
+ *epp = ep->next;
+ free(ep, M_TEMP);
+ count++;
+ } else {
+ epp = &ep->next;
+ }
+ ep = *epp;
+ }
+ epp = &shutdown_list2;
+ ep = *epp;
+ while (ep) {
+ if ((ep->function == function) && (ep->arg == arg)) {
+ *epp = ep->next;
+ free(ep, M_TEMP);
+ count++;
+ } else {
+ epp = &ep->next;
+ }
+ ep = *epp;
+ }
+ return (count);
+}
+
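A driver would register with the new lists roughly as follows; every foo_* name here is invented for illustration.

/*
 * Hypothetical consumer of the new callout lists; all foo_* names
 * are made up.
 */
struct foo_softc {
	int	foo_dirty;
};

static void
foo_flush(int howto, void *arg)
{
	struct foo_softc *sc = arg;

	/* Runs before the disks are synced (SHUTDOWN_PRE_SYNC list). */
	if (sc->foo_dirty && (howto & RB_NOSYNC) == 0)
		sc->foo_dirty = 0;	/* stand-in for a real write-back */
}

static int
foo_attach(struct foo_softc *sc)
{
	/* Returns 0 on success, ENOMEM if the list node can't be had. */
	return (at_shutdown(foo_flush, sc, SHUTDOWN_PRE_SYNC));
}
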
diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c
index 5683b9c..e0b28e0 100644
--- a/sys/kern/kern_sig.c
+++ b/sys/kern/kern_sig.c
@@ -35,11 +35,15 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)kern_sig.c 8.14 (Berkeley) 5/14/95
+ * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94
+ * $Id: kern_sig.c,v 1.30 1997/02/22 09:39:11 peter Exp $
*/
+#include "opt_ktrace.h"
+
#define SIGPROP /* include signal properties table */
#include <sys/param.h>
+#include <sys/sysproto.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
#include <sys/namei.h>
@@ -50,22 +54,27 @@
#include <sys/times.h>
#include <sys/buf.h>
#include <sys/acct.h>
-#include <sys/file.h>
+#include <sys/fcntl.h>
#include <sys/kernel.h>
#include <sys/wait.h>
#include <sys/ktrace.h>
#include <sys/syslog.h>
#include <sys/stat.h>
-
-#include <sys/mount.h>
-#include <sys/syscallargs.h>
+#include <sys/sysent.h>
#include <machine/cpu.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
#include <sys/user.h> /* for coredump */
-void stop __P((struct proc *p));
+static int coredump __P((struct proc *p));
+static int killpg1 __P((struct proc *cp, int signum, int pgid, int all));
+static void stop __P((struct proc *));
/*
* Can process p, with pcred pc, send the signal signum to process q?
@@ -78,16 +87,19 @@ void stop __P((struct proc *p));
(pc)->pc_ucred->cr_uid == (q)->p_ucred->cr_uid || \
((signum) == SIGCONT && (q)->p_session == (p)->p_session))
+#ifndef _SYS_SYSPROTO_H_
+struct sigaction_args {
+ int signum;
+ struct sigaction *nsa;
+ struct sigaction *osa;
+};
+#endif
/* ARGSUSED */
int
sigaction(p, uap, retval)
struct proc *p;
- register struct sigaction_args /* {
- syscallarg(int) signum;
- syscallarg(struct sigaction *) nsa;
- syscallarg(struct sigaction *) osa;
- } */ *uap;
- register_t *retval;
+ register struct sigaction_args *uap;
+ int *retval;
{
struct sigaction vec;
register struct sigaction *sa;
@@ -95,12 +107,11 @@ sigaction(p, uap, retval)
register int signum;
int bit, error;
- signum = SCARG(uap, signum);
- if (signum <= 0 || signum >= NSIG ||
- signum == SIGKILL || signum == SIGSTOP)
+ signum = uap->signum;
+ if (signum <= 0 || signum >= NSIG)
return (EINVAL);
sa = &vec;
- if (SCARG(uap, osa)) {
+ if (uap->osa) {
sa->sa_handler = ps->ps_sigact[signum];
sa->sa_mask = ps->ps_catchmask[signum];
bit = sigmask(signum);
@@ -109,16 +120,23 @@ sigaction(p, uap, retval)
sa->sa_flags |= SA_ONSTACK;
if ((ps->ps_sigintr & bit) == 0)
sa->sa_flags |= SA_RESTART;
- if (p->p_flag & P_NOCLDSTOP)
+ if ((ps->ps_sigreset & bit) != 0)
+ sa->sa_flags |= SA_RESETHAND;
+ if ((ps->ps_signodefer & bit) != 0)
+ sa->sa_flags |= SA_NODEFER;
+ if (signum == SIGCHLD && p->p_flag & P_NOCLDSTOP)
sa->sa_flags |= SA_NOCLDSTOP;
- if (error = copyout((caddr_t)sa, (caddr_t)SCARG(uap, osa),
- sizeof (vec)))
+ if ((error = copyout((caddr_t)sa, (caddr_t)uap->osa,
+ sizeof (vec))))
return (error);
}
- if (SCARG(uap, nsa)) {
- if (error = copyin((caddr_t)SCARG(uap, nsa), (caddr_t)sa,
- sizeof (vec)))
+ if (uap->nsa) {
+ if ((error = copyin((caddr_t)uap->nsa, (caddr_t)sa,
+ sizeof (vec))))
return (error);
+ if ((signum == SIGKILL || signum == SIGSTOP) &&
+ sa->sa_handler != SIG_DFL)
+ return (EINVAL);
setsigvec(p, signum, sa);
}
return (0);
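
From userland, the new SA_RESETHAND/SA_NODEFER handling and the up-front SIGKILL/SIGSTOP rejection behave as in this sketch:

#include <signal.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

static void
once(int sig)
{
	/*
	 * With SA_RESETHAND the disposition is already back to SIG_DFL
	 * by the time we get here, so a second SIGINT kills the process.
	 */
}

int
main(void)
{
	struct sigaction sa;

	memset(&sa, 0, sizeof(sa));
	sa.sa_handler = once;
	sa.sa_flags = SA_RESETHAND | SA_NODEFER;
	sigaction(SIGINT, &sa, NULL);

	/* Trying to catch SIGKILL is now rejected up front with EINVAL. */
	if (sigaction(SIGKILL, &sa, NULL) == -1)
		perror("sigaction(SIGKILL)");

	pause();
	return (0);
}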
@@ -148,6 +166,14 @@ setsigvec(p, signum, sa)
ps->ps_sigonstack |= bit;
else
ps->ps_sigonstack &= ~bit;
+ if (sa->sa_flags & SA_RESETHAND)
+ ps->ps_sigreset |= bit;
+ else
+ ps->ps_sigreset &= ~bit;
+ if (sa->sa_flags & SA_NODEFER)
+ ps->ps_signodefer |= bit;
+ else
+ ps->ps_signodefer &= ~bit;
#ifdef COMPAT_SUNOS
if (sa->sa_flags & SA_USERTRAMP)
ps->ps_usertramp |= bit;
@@ -227,9 +253,9 @@ execsigs(p)
* Reset stack state to the user stack.
* Clear set of signals caught on the signal stack.
*/
- ps->ps_sigstk.ss_flags = SA_DISABLE;
+ ps->ps_sigstk.ss_flags = SS_DISABLE;
ps->ps_sigstk.ss_size = 0;
- ps->ps_sigstk.ss_base = 0;
+ ps->ps_sigstk.ss_sp = 0;
ps->ps_flags = 0;
}
@@ -239,33 +265,36 @@ execsigs(p)
* and return old mask as return value;
* the library stub does the rest.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct sigprocmask_args {
+ int how;
+ sigset_t mask;
+};
+#endif
int
sigprocmask(p, uap, retval)
register struct proc *p;
- struct sigprocmask_args /* {
- syscallarg(int) how;
- syscallarg(sigset_t) mask;
- } */ *uap;
- register_t *retval;
+ struct sigprocmask_args *uap;
+ int *retval;
{
int error = 0;
*retval = p->p_sigmask;
(void) splhigh();
- switch (SCARG(uap, how)) {
+ switch (uap->how) {
case SIG_BLOCK:
- p->p_sigmask |= SCARG(uap, mask) &~ sigcantmask;
+ p->p_sigmask |= uap->mask &~ sigcantmask;
break;
case SIG_UNBLOCK:
- p->p_sigmask &= ~SCARG(uap, mask);
+ p->p_sigmask &= ~uap->mask;
break;
case SIG_SETMASK:
- p->p_sigmask = SCARG(uap, mask) &~ sigcantmask;
+ p->p_sigmask = uap->mask &~ sigcantmask;
break;
-
+
default:
error = EINVAL;
break;
@@ -274,12 +303,17 @@ sigprocmask(p, uap, retval)
return (error);
}
+#ifndef _SYS_SYSPROTO_H_
+struct sigpending_args {
+ int dummy;
+};
+#endif
/* ARGSUSED */
int
sigpending(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct sigpending_args *uap;
+ int *retval;
{
*retval = p->p_siglist;
@@ -290,16 +324,19 @@ sigpending(p, uap, retval)
/*
* Generalized interface signal handler, 4.3-compatible.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct osigvec_args {
+ int signum;
+ struct sigvec *nsv;
+ struct sigvec *osv;
+};
+#endif
/* ARGSUSED */
int
-compat_43_sigvec(p, uap, retval)
+osigvec(p, uap, retval)
struct proc *p;
- register struct compat_43_sigvec_args /* {
- syscallarg(int) signum;
- syscallarg(struct sigvec *) nsv;
- syscallarg(struct sigvec *) osv;
- } */ *uap;
- register_t *retval;
+ register struct osigvec_args *uap;
+ int *retval;
{
struct sigvec vec;
register struct sigacts *ps = p->p_sigacts;
@@ -307,12 +344,11 @@ compat_43_sigvec(p, uap, retval)
register int signum;
int bit, error;
- signum = SCARG(uap, signum);
- if (signum <= 0 || signum >= NSIG ||
- signum == SIGKILL || signum == SIGSTOP)
+ signum = uap->signum;
+ if (signum <= 0 || signum >= NSIG)
return (EINVAL);
sv = &vec;
- if (SCARG(uap, osv)) {
+ if (uap->osv) {
*(sig_t *)&sv->sv_handler = ps->ps_sigact[signum];
sv->sv_mask = ps->ps_catchmask[signum];
bit = sigmask(signum);
@@ -321,26 +357,26 @@ compat_43_sigvec(p, uap, retval)
sv->sv_flags |= SV_ONSTACK;
if ((ps->ps_sigintr & bit) != 0)
sv->sv_flags |= SV_INTERRUPT;
+ if ((ps->ps_sigreset & bit) != 0)
+ sv->sv_flags |= SV_RESETHAND;
+ if ((ps->ps_signodefer & bit) != 0)
+ sv->sv_flags |= SV_NODEFER;
#ifndef COMPAT_SUNOS
- if (p->p_flag & P_NOCLDSTOP)
- sv->sv_flags |= SA_NOCLDSTOP;
+ if (signum == SIGCHLD && p->p_flag & P_NOCLDSTOP)
+ sv->sv_flags |= SV_NOCLDSTOP;
#endif
- if (error = copyout((caddr_t)sv, (caddr_t)SCARG(uap, osv),
- sizeof (vec)))
+ if ((error = copyout((caddr_t)sv, (caddr_t)uap->osv,
+ sizeof (vec))))
return (error);
}
- if (SCARG(uap, nsv)) {
- if (error = copyin((caddr_t)SCARG(uap, nsv), (caddr_t)sv,
- sizeof (vec)))
+ if (uap->nsv) {
+ if ((error = copyin((caddr_t)uap->nsv, (caddr_t)sv,
+ sizeof (vec))))
return (error);
-#ifdef COMPAT_SUNOS
- /*
- * SunOS uses this bit (4, aka SA_DISABLE) as SV_RESETHAND,
- * `reset to SIG_DFL on delivery'. We have no such option
- * now or ever!
- */
- if (sv->sv_flags & SA_DISABLE)
+ if ((signum == SIGKILL || signum == SIGSTOP) &&
+ sv->sv_handler != SIG_DFL)
return (EINVAL);
+#ifdef COMPAT_SUNOS
sv->sv_flags |= SA_USERTRAMP;
#endif
sv->sv_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */
@@ -349,34 +385,40 @@ compat_43_sigvec(p, uap, retval)
return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct osigblock_args {
+ int mask;
+};
+#endif
int
-compat_43_sigblock(p, uap, retval)
+osigblock(p, uap, retval)
register struct proc *p;
- struct compat_43_sigblock_args /* {
- syscallarg(int) mask;
- } */ *uap;
- register_t *retval;
+ struct osigblock_args *uap;
+ int *retval;
{
(void) splhigh();
*retval = p->p_sigmask;
- p->p_sigmask |= SCARG(uap, mask) &~ sigcantmask;
+ p->p_sigmask |= uap->mask &~ sigcantmask;
(void) spl0();
return (0);
}
+#ifndef _SYS_SYSPROTO_H_
+struct osigsetmask_args {
+ int mask;
+};
+#endif
int
-compat_43_sigsetmask(p, uap, retval)
+osigsetmask(p, uap, retval)
struct proc *p;
- struct compat_43_sigsetmask_args /* {
- syscallarg(int) mask;
- } */ *uap;
- register_t *retval;
+ struct osigsetmask_args *uap;
+ int *retval;
{
(void) splhigh();
*retval = p->p_sigmask;
- p->p_sigmask = SCARG(uap, mask) &~ sigcantmask;
+ p->p_sigmask = uap->mask &~ sigcantmask;
(void) spl0();
return (0);
}
@@ -387,14 +429,17 @@ compat_43_sigsetmask(p, uap, retval)
* in the meantime. Note nonstandard calling convention:
* libc stub passes mask, not pointer, to save a copyin.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct sigsuspend_args {
+ sigset_t mask;
+};
+#endif
/* ARGSUSED */
int
sigsuspend(p, uap, retval)
register struct proc *p;
- struct sigsuspend_args /* {
- syscallarg(int) mask;
- } */ *uap;
- register_t *retval;
+ struct sigsuspend_args *uap;
+ int *retval;
{
register struct sigacts *ps = p->p_sigacts;
@@ -407,7 +452,7 @@ sigsuspend(p, uap, retval)
*/
ps->ps_oldmask = p->p_sigmask;
ps->ps_flags |= SAS_OLDMASK;
- p->p_sigmask = SCARG(uap, mask) &~ sigcantmask;
+ p->p_sigmask = uap->mask &~ sigcantmask;
while (tsleep((caddr_t) ps, PPAUSE|PCATCH, "pause", 0) == 0)
/* void */;
/* always return EINTR rather than ERESTART... */
@@ -415,46 +460,52 @@ sigsuspend(p, uap, retval)
}
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#ifndef _SYS_SYSPROTO_H_
+struct osigstack_args {
+ struct sigstack *nss;
+ struct sigstack *oss;
+};
+#endif
/* ARGSUSED */
int
-compat_43_sigstack(p, uap, retval)
+osigstack(p, uap, retval)
struct proc *p;
- register struct compat_43_sigstack_args /* {
- syscallarg(struct sigstack *) nss;
- syscallarg(struct sigstack *) oss;
- } */ *uap;
- register_t *retval;
+ register struct osigstack_args *uap;
+ int *retval;
{
struct sigstack ss;
struct sigacts *psp;
int error = 0;
psp = p->p_sigacts;
- ss.ss_sp = psp->ps_sigstk.ss_base;
- ss.ss_onstack = psp->ps_sigstk.ss_flags & SA_ONSTACK;
- if (SCARG(uap, oss) && (error = copyout((caddr_t)&ss,
- (caddr_t)SCARG(uap, oss), sizeof (struct sigstack))))
+ ss.ss_sp = psp->ps_sigstk.ss_sp;
+ ss.ss_onstack = psp->ps_sigstk.ss_flags & SS_ONSTACK;
+ if (uap->oss && (error = copyout((caddr_t)&ss, (caddr_t)uap->oss,
+ sizeof (struct sigstack))))
return (error);
- if (SCARG(uap, nss) && (error = copyin((caddr_t)SCARG(uap, nss),
- (caddr_t)&ss, sizeof (ss))) == 0) {
- psp->ps_sigstk.ss_base = ss.ss_sp;
+ if (uap->nss && (error = copyin((caddr_t)uap->nss, (caddr_t)&ss,
+ sizeof (ss))) == 0) {
+ psp->ps_sigstk.ss_sp = ss.ss_sp;
psp->ps_sigstk.ss_size = 0;
- psp->ps_sigstk.ss_flags |= ss.ss_onstack & SA_ONSTACK;
+ psp->ps_sigstk.ss_flags |= ss.ss_onstack & SS_ONSTACK;
psp->ps_flags |= SAS_ALTSTACK;
}
return (error);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */
+#ifndef _SYS_SYSPROTO_H_
+struct sigaltstack_args {
+ struct sigaltstack *nss;
+ struct sigaltstack *oss;
+};
+#endif
/* ARGSUSED */
int
sigaltstack(p, uap, retval)
struct proc *p;
- register struct sigaltstack_args /* {
- syscallarg(struct sigaltstack *) nss;
- syscallarg(struct sigaltstack *) oss;
- } */ *uap;
- register_t *retval;
+ register struct sigaltstack_args *uap;
+ int *retval;
{
struct sigacts *psp;
struct sigaltstack ss;
@@ -462,17 +513,16 @@ sigaltstack(p, uap, retval)
psp = p->p_sigacts;
if ((psp->ps_flags & SAS_ALTSTACK) == 0)
- psp->ps_sigstk.ss_flags |= SA_DISABLE;
- if (SCARG(uap, oss) && (error = copyout((caddr_t)&psp->ps_sigstk,
- (caddr_t)SCARG(uap, oss), sizeof (struct sigaltstack))))
+ psp->ps_sigstk.ss_flags |= SS_DISABLE;
+ if (uap->oss && (error = copyout((caddr_t)&psp->ps_sigstk,
+ (caddr_t)uap->oss, sizeof (struct sigaltstack))))
return (error);
- if (SCARG(uap, nss) == 0)
+ if (uap->nss == 0)
return (0);
- if (error = copyin((caddr_t)SCARG(uap, nss), (caddr_t)&ss,
- sizeof (ss)))
+ if ((error = copyin((caddr_t)uap->nss, (caddr_t)&ss, sizeof (ss))))
return (error);
- if (ss.ss_flags & SA_DISABLE) {
- if (psp->ps_sigstk.ss_flags & SA_ONSTACK)
+ if (ss.ss_flags & SS_DISABLE) {
+ if (psp->ps_sigstk.ss_flags & SS_ONSTACK)
return (EINVAL);
psp->ps_flags &= ~SAS_ALTSTACK;
psp->ps_sigstk.ss_flags = ss.ss_flags;
@@ -485,60 +535,6 @@ sigaltstack(p, uap, retval)
return (0);
}
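
The SA_* to SS_* renaming shows up in the userland interface as well; a minimal sketch:

#include <signal.h>
#include <stdio.h>
#include <stdlib.h>

int
main(void)
{
	struct sigaltstack ss;

	ss.ss_sp = malloc(SIGSTKSZ);
	ss.ss_size = SIGSTKSZ;
	ss.ss_flags = 0;
	if (sigaltstack(&ss, NULL) == -1)
		perror("sigaltstack");

	/*
	 * SS_DISABLE tears the alternate stack down again; the kernel
	 * refuses with EINVAL while a handler is running on it.
	 */
	ss.ss_flags = SS_DISABLE;
	sigaltstack(&ss, NULL);
	return (0);
}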
-/* ARGSUSED */
-int
-kill(cp, uap, retval)
- register struct proc *cp;
- register struct kill_args /* {
- syscallarg(int) pid;
- syscallarg(int) signum;
- } */ *uap;
- register_t *retval;
-{
- register struct proc *p;
- register struct pcred *pc = cp->p_cred;
-
- if ((u_int)SCARG(uap, signum) >= NSIG)
- return (EINVAL);
- if (SCARG(uap, pid) > 0) {
- /* kill single process */
- if ((p = pfind(SCARG(uap, pid))) == NULL)
- return (ESRCH);
- if (!CANSIGNAL(cp, pc, p, SCARG(uap, signum)))
- return (EPERM);
- if (SCARG(uap, signum))
- psignal(p, SCARG(uap, signum));
- return (0);
- }
- switch (SCARG(uap, pid)) {
- case -1: /* broadcast signal */
- return (killpg1(cp, SCARG(uap, signum), 0, 1));
- case 0: /* signal own process group */
- return (killpg1(cp, SCARG(uap, signum), 0, 0));
- default: /* negative explicit process group */
- return (killpg1(cp, SCARG(uap, signum), -SCARG(uap, pid), 0));
- }
- /* NOTREACHED */
-}
-
-#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
-/* ARGSUSED */
-int
-compat_43_killpg(p, uap, retval)
- struct proc *p;
- register struct compat_43_killpg_args /* {
- syscallarg(int) pgid;
- syscallarg(int) signum;
- } */ *uap;
- register_t *retval;
-{
-
- if ((u_int)SCARG(uap, signum) >= NSIG)
- return (EINVAL);
- return (killpg1(p, SCARG(uap, signum), SCARG(uap, pgid), 0));
-}
-#endif /* COMPAT_43 || COMPAT_SUNOS */
-
/*
* Common code for kill process group/broadcast kill.
* cp is calling process.
@@ -552,13 +548,13 @@ killpg1(cp, signum, pgid, all)
register struct pcred *pc = cp->p_cred;
struct pgrp *pgrp;
int nfound = 0;
-
- if (all)
- /*
- * broadcast
+
+ if (all)
+ /*
+ * broadcast
*/
for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
- if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
+ if (p->p_pid <= 1 || p->p_flag & P_SYSTEM ||
p == cp || !CANSIGNAL(cp, pc, p, signum))
continue;
nfound++;
@@ -566,8 +562,8 @@ killpg1(cp, signum, pgid, all)
psignal(p, signum);
}
else {
- if (pgid == 0)
- /*
+ if (pgid == 0)
+ /*
* zero pgid means send to my process group.
*/
pgrp = cp->p_pgrp;
@@ -590,6 +586,66 @@ killpg1(cp, signum, pgid, all)
return (nfound ? 0 : ESRCH);
}
+#ifndef _SYS_SYSPROTO_H_
+struct kill_args {
+ int pid;
+ int signum;
+};
+#endif
+/* ARGSUSED */
+int
+kill(cp, uap, retval)
+ register struct proc *cp;
+ register struct kill_args *uap;
+ int *retval;
+{
+ register struct proc *p;
+ register struct pcred *pc = cp->p_cred;
+
+ if ((u_int)uap->signum >= NSIG)
+ return (EINVAL);
+ if (uap->pid > 0) {
+ /* kill single process */
+ if ((p = pfind(uap->pid)) == NULL)
+ return (ESRCH);
+ if (!CANSIGNAL(cp, pc, p, uap->signum))
+ return (EPERM);
+ if (uap->signum)
+ psignal(p, uap->signum);
+ return (0);
+ }
+ switch (uap->pid) {
+ case -1: /* broadcast signal */
+ return (killpg1(cp, uap->signum, 0, 1));
+ case 0: /* signal own process group */
+ return (killpg1(cp, uap->signum, 0, 0));
+ default: /* negative explicit process group */
+ return (killpg1(cp, uap->signum, -uap->pid, 0));
+ }
+ /* NOTREACHED */
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#ifndef _SYS_SYSPROTO_H_
+struct okillpg_args {
+ int pgid;
+ int signum;
+};
+#endif
+/* ARGSUSED */
+int
+okillpg(p, uap, retval)
+ struct proc *p;
+ register struct okillpg_args *uap;
+ int *retval;
+{
+
+ if ((u_int)uap->signum >= NSIG)
+ return (EINVAL);
+ return (killpg1(p, uap->signum, uap->pgid, 0));
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
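The four-way pid dispatch in kill() above looks like this from userland; signal 0 probes permissions without delivering anything, and the pgid is made up.

#include <sys/types.h>
#include <signal.h>
#include <unistd.h>

int
main(void)
{
	kill(getpid(), 0);	/* pid > 0: one process */
	kill(0, 0);		/* pid == 0: caller's own process group */
	kill(-77, 0);		/* pid < -1: process group 77 (made up) */
	kill(-1, 0);		/* pid == -1: broadcast */
	return (0);
}
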
/*
* Send a signal to a process group.
*/
@@ -641,11 +697,22 @@ trapsignal(p, signum, code)
p->p_stats->p_ru.ru_nsignals++;
#ifdef KTRACE
if (KTRPOINT(p, KTR_PSIG))
- ktrpsig(p->p_tracep, signum, ps->ps_sigact[signum],
+ ktrpsig(p->p_tracep, signum, ps->ps_sigact[signum],
p->p_sigmask, code);
#endif
- sendsig(ps->ps_sigact[signum], signum, p->p_sigmask, code);
- p->p_sigmask |= ps->ps_catchmask[signum] | mask;
+ (*p->p_sysent->sv_sendsig)(ps->ps_sigact[signum], signum,
+ p->p_sigmask, code);
+ p->p_sigmask |= ps->ps_catchmask[signum] |
+ (mask & ~ps->ps_signodefer);
+ if ((ps->ps_sigreset & mask) != 0) {
+ /*
+ * See setsigvec() for origin of this code.
+ */
+ p->p_sigcatch &= ~mask;
+ if (signum != SIGCONT && sigprop[signum] & SA_IGNORE)
+ p->p_sigignore |= mask;
+ ps->ps_sigact[signum] = SIG_DFL;
+ }
} else {
ps->ps_code = code; /* XXX for core dump/debugger */
ps->ps_sig = signum; /* XXX to verify code */
@@ -719,7 +786,7 @@ psignal(p, signum)
*/
if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 &&
action == SIG_DFL)
- return;
+ return;
p->p_siglist &= ~contsigmask;
}
p->p_siglist |= mask;
@@ -900,17 +967,8 @@ issignal(p)
/*
* If traced, always stop, and stay
* stopped until released by the parent.
- *
- * Note that we must clear the pending signal
- * before we call trace_req since that routine
- * might cause a fault, calling tsleep and
- * leading us back here again with the same signal.
- * Then we would be deadlocked because the tracer
- * would still be blocked on the ipc struct from
- * the initial request.
*/
p->p_xstat = signum;
- p->p_siglist &= ~mask;
psignal(p->p_pptr, SIGCHLD);
do {
stop(p);
@@ -918,10 +976,19 @@ issignal(p)
} while (!trace_req(p) && p->p_flag & P_TRACED);
/*
+ * If the traced bit got turned off, go back up
+ * to the top to rescan signals. This ensures
+ * that p_sig* and ps_sigact are consistent.
+ */
+ if ((p->p_flag & P_TRACED) == 0)
+ continue;
+
+ /*
* If parent wants us to take the signal,
* then it will leave it in p->p_xstat;
* otherwise we just look for signals again.
*/
+ p->p_siglist &= ~mask; /* clear the old signal */
signum = p->p_xstat;
if (signum == 0)
continue;
@@ -934,14 +1001,6 @@ issignal(p)
p->p_siglist |= mask;
if (p->p_sigmask & mask)
continue;
-
- /*
- * If the traced bit got turned off, go back up
- * to the top to rescan signals. This ensures
- * that p_sig* and ps_sigact are consistent.
- */
- if ((p->p_flag & P_TRACED) == 0)
- continue;
}
/*
@@ -949,9 +1008,9 @@ issignal(p)
* Return the signal's number, or fall through
* to clear it from the pending mask.
*/
- switch ((long)p->p_sigacts->ps_sigact[signum]) {
+ switch ((int)p->p_sigacts->ps_sigact[signum]) {
- case (long)SIG_DFL:
+ case (int)SIG_DFL:
/*
* Don't take default actions on system processes.
*/
@@ -961,8 +1020,8 @@ issignal(p)
* Are you sure you want to ignore SIGSEGV
* in init? XXX
*/
- printf("Process (pid %d) got signal %d\n",
- p->p_pid, signum);
+ printf("Process (pid %lu) got signal %d\n",
+ (u_long)p->p_pid, signum);
#endif
break; /* == ignore */
}
@@ -994,7 +1053,7 @@ issignal(p)
return (signum);
/*NOTREACHED*/
- case (long)SIG_IGN:
+ case (int)SIG_IGN:
/*
* Masking above should prevent us ever trying
* to take action on an ignored signal other
@@ -1043,8 +1102,7 @@ postsig(signum)
register struct proc *p = curproc;
register struct sigacts *ps = p->p_sigacts;
register sig_t action;
- u_long code;
- int mask, returnmask;
+ int code, mask, returnmask;
#ifdef DIAGNOSTIC
if (signum == 0)
@@ -1089,7 +1147,17 @@ postsig(signum)
ps->ps_flags &= ~SAS_OLDMASK;
} else
returnmask = p->p_sigmask;
- p->p_sigmask |= ps->ps_catchmask[signum] | mask;
+ p->p_sigmask |= ps->ps_catchmask[signum] |
+ (mask & ~ps->ps_signodefer);
+ if ((ps->ps_sigreset & mask) != 0) {
+ /*
+ * See setsigvec() for origin of this code.
+ */
+ p->p_sigcatch &= ~mask;
+ if (signum != SIGCONT && sigprop[signum] & SA_IGNORE)
+ p->p_sigignore |= mask;
+ ps->ps_sigact[signum] = SIG_DFL;
+ }
(void) spl0();
p->p_stats->p_ru.ru_nsignals++;
if (ps->ps_sig != signum) {
@@ -1099,7 +1167,7 @@ postsig(signum)
ps->ps_code = 0;
ps->ps_sig = 0;
}
- sendsig(action, signum, returnmask, code);
+ (*p->p_sysent->sv_sendsig)(action, signum, returnmask, code);
}
}
@@ -1111,9 +1179,8 @@ killproc(p, why)
struct proc *p;
char *why;
{
-
- log(LOG_ERR, "pid %d was killed: %s\n", p->p_pid, why);
- uprintf("sorry, pid %d was killed: %s\n", p->p_pid, why);
+ log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm,
+ p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1, why);
psignal(p, SIGKILL);
}
@@ -1134,8 +1201,19 @@ sigexit(p, signum)
p->p_acflag |= AXSIG;
if (sigprop[signum] & SA_CORE) {
p->p_sigacts->ps_sig = signum;
+ /*
+ * Log signals which would cause core dumps
+ * (Log as LOG_INFO to appease those who don't want
+ * these messages.)
+	 * XXX: TODO: write out the ruid as well as the euid.
+ */
if (coredump(p) == 0)
signum |= WCOREFLAG;
+ log(LOG_INFO, "pid %d (%s), uid %d: exited on signal %d%s\n",
+ p->p_pid, p->p_comm,
+ p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1,
+ signum &~ WCOREFLAG,
+ signum & WCOREFLAG ? " (core dumped)" : "");
}
exit1(p, W_EXITCODE(0, signum));
/* NOTREACHED */
@@ -1145,28 +1223,27 @@ sigexit(p, signum)
* Dump core, into a file named "progname.core", unless the process was
* setuid/setgid.
*/
-int
+static int
coredump(p)
register struct proc *p;
{
register struct vnode *vp;
- register struct pcred *pcred = p->p_cred;
- register struct ucred *cred = pcred->pc_ucred;
+ register struct ucred *cred = p->p_cred->pc_ucred;
register struct vmspace *vm = p->p_vmspace;
struct nameidata nd;
struct vattr vattr;
int error, error1;
char name[MAXCOMLEN+6]; /* progname.core */
- if (pcred->p_svuid != pcred->p_ruid || pcred->p_svgid != pcred->p_rgid)
+ if (p->p_flag & P_SUGID)
return (EFAULT);
if (ctob(UPAGES + vm->vm_dsize + vm->vm_ssize) >=
p->p_rlimit[RLIMIT_CORE].rlim_cur)
return (EFAULT);
sprintf(name, "%s.core", p->p_comm);
NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, name, p);
- if (error = vn_open(&nd,
- O_CREAT | FWRITE, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH))
+ if ((error = vn_open(&nd,
+ O_CREAT | FWRITE, S_IRUSR | S_IWUSR)))
return (error);
vp = nd.ni_vp;
@@ -1206,14 +1283,19 @@ out:
* Nonexistent system call-- signal process (may want to handle it).
* Flag error in case process won't see signal immediately (blocked or ignored).
*/
+#ifndef _SYS_SYSPROTO_H_
+struct nosys_args {
+ int dummy;
+};
+#endif
/* ARGSUSED */
int
nosys(p, args, retval)
struct proc *p;
- void *args;
- register_t *retval;
+ struct nosys_args *args;
+ int *retval;
{
psignal(p, SIGSYS);
- return (ENOSYS);
+ return (EINVAL);
}
diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c
index df83710..d0097df 100644
--- a/sys/kern/kern_subr.c
+++ b/sys/kern/kern_subr.c
@@ -35,7 +35,8 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)kern_subr.c 8.4 (Berkeley) 2/14/95
+ * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94
+ * $Id$
*/
#include <sys/param.h>
@@ -52,7 +53,7 @@ uiomove(cp, n, uio)
{
register struct iovec *iov;
u_int cnt;
- int error = 0;
+ int error;
#ifdef DIAGNOSTIC
if (uio->uio_rw != UIO_READ && uio->uio_rw != UIO_WRITE)
@@ -70,6 +71,7 @@ uiomove(cp, n, uio)
}
if (cnt > n)
cnt = n;
+
switch (uio->uio_segflg) {
case UIO_USERSPACE:
@@ -88,6 +90,8 @@ uiomove(cp, n, uio)
else
bcopy(iov->iov_base, (caddr_t)cp, cnt);
break;
+ case UIO_NOCOPY:
+ break;
}
iov->iov_base += cnt;
iov->iov_len -= cnt;
@@ -96,7 +100,7 @@ uiomove(cp, n, uio)
cp += cnt;
n -= cnt;
}
- return (error);
+ return (0);
}
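
A minimal sketch of how callers typically drive uiomove() (the helper name and
buffers are hypothetical; both sides sit in kernel space, so no copyin/copyout
is involved):

	/* hypothetical helper: move 'len' bytes from 'src' to 'dst' */
	static int
	kcopy_via_uio(caddr_t src, caddr_t dst, int len, struct proc *p)
	{
		struct iovec iov;
		struct uio auio;

		iov.iov_base = dst;
		iov.iov_len = len;
		auio.uio_iov = &iov;
		auio.uio_iovcnt = 1;
		auio.uio_offset = 0;
		auio.uio_resid = len;
		auio.uio_segflg = UIO_SYSSPACE;	/* kernel addresses */
		auio.uio_rw = UIO_READ;		/* data flows out of 'src' */
		auio.uio_procp = p;
		return (uiomove(src, len, &auio));
	}
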
/*
@@ -109,13 +113,11 @@ ureadc(c, uio)
{
register struct iovec *iov;
- if (uio->uio_resid <= 0)
- panic("ureadc: non-positive resid");
again:
- if (uio->uio_iovcnt <= 0)
- panic("ureadc: non-positive iovcnt");
+ if (uio->uio_iovcnt == 0 || uio->uio_resid == 0)
+ panic("ureadc");
iov = uio->uio_iov;
- if (iov->iov_len <= 0) {
+ if (iov->iov_len == 0) {
uio->uio_iovcnt--;
uio->uio_iov++;
goto again;
@@ -135,6 +137,8 @@ again:
if (suibyte(iov->iov_base, c) < 0)
return (EFAULT);
break;
+ case UIO_NOCOPY:
+ break;
}
iov->iov_base++;
iov->iov_len--;
@@ -158,7 +162,7 @@ uwritec(uio)
return (-1);
again:
if (uio->uio_iovcnt <= 0)
- panic("uwritec: non-positive iovcnt");
+ panic("uwritec");
iov = uio->uio_iov;
if (iov->iov_len == 0) {
uio->uio_iov++;
@@ -203,7 +207,7 @@ hashinit(elements, type, hashmask)
int i;
if (elements <= 0)
- panic("hashinit: bad cnt");
+ panic("hashinit: bad elements");
for (hashsize = 1; hashsize <= elements; hashsize <<= 1)
continue;
hashsize >>= 1;
@@ -213,3 +217,36 @@ hashinit(elements, type, hashmask)
*hashmask = hashsize - 1;
return (hashtbl);
}
+
+#define NPRIMES 27
+static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039,
+ 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653,
+ 7159, 7673, 8191, 12281, 16381, 24571, 32749 };
+
+/*
+ * General routine to allocate a prime number sized hash table.
+ */
+void *
+phashinit(elements, type, nentries)
+ int elements, type;
+ u_long *nentries;
+{
+ long hashsize;
+ LIST_HEAD(generic, generic) *hashtbl;
+ int i;
+
+ if (elements <= 0)
+ panic("phashinit: bad elements");
+ for (i = 1, hashsize = primes[1]; hashsize <= elements;) {
+ i++;
+ if (i == NPRIMES)
+ break;
+ hashsize = primes[i];
+ }
+ hashsize = primes[i - 1];
+ hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK);
+ for (i = 0; i < hashsize; i++)
+ LIST_INIT(&hashtbl[i]);
+ *nentries = hashsize;
+ return (hashtbl);
+}
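
A hedged usage sketch for the new phashinit() (the element type, malloc type,
and key are hypothetical). Because the table size is a prime rather than a
power of two, callers index with a modulus instead of a mask:

	u_long nbuckets;
	LIST_HEAD(myhead, myelem) *tbl;

	/* picks 61, the largest prime in the table <= 100 */
	tbl = phashinit(100, M_TEMP, &nbuckets);
	bucket = &tbl[key % nbuckets];
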
diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c
index 6c82027..04339cd 100644
--- a/sys/kern/kern_synch.c
+++ b/sys/kern/kern_synch.c
@@ -36,8 +36,11 @@
* SUCH DAMAGE.
*
* @(#)kern_synch.c 8.9 (Berkeley) 5/19/95
+ * $Id: kern_synch.c,v 1.29 1997/02/22 09:39:12 peter Exp $
*/
+#include "opt_ktrace.h"
+
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
@@ -45,16 +48,26 @@
#include <sys/buf.h>
#include <sys/signalvar.h>
#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
#include <sys/vmmeter.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_extern.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
#include <machine/cpu.h>
+static void rqinit __P((void *));
+SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL)
+
u_char curpriority; /* usrpri of curproc */
int lbolt; /* once a second sleep address */
+extern void endtsleep __P((void *));
+extern void updatepri __P((struct proc *p));
+
/*
* Force switch among equal priority processes every 100ms.
*/
@@ -75,7 +88,7 @@ roundrobin(arg)
* Note that, as ps(1) mentions, this can let percentages
* total over 100% (I've seen 137.9% for 3 processes).
*
- * Note that hardclock updates p_estcpu and p_cpticks independently.
+ * Note that statclock updates p_estcpu and p_cpticks independently.
*
* We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds.
* That is, the system wants to compute a value of decay such
@@ -104,7 +117,7 @@ roundrobin(arg)
* We now need to prove two things:
* 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1)
* 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg)
- *
+ *
* Facts:
* For x close to zero, exp(x) =~ 1 + x, since
* exp(x) = 0! + x**1/1! + x**2/2! + ... .
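
A worked instance of the decay argument above, as a floating-point sketch (the
kernel itself uses fixed-point arithmetic):

	#include <math.h>
	#include <stdio.h>

	int
	main(void)
	{
		/* with loadavg == 2: factor = 4/5, and after 5 * loadavg == 10
		 * seconds p_estcpu retains (4/5)^10 =~ 0.107, i.e. about 90%
		 * of it has decayed away, as the comment requires */
		double loadavg = 2.0;
		double factor = (2.0 * loadavg) / (2.0 * loadavg + 1.0);

		printf("%f\n", pow(factor, 5.0 * loadavg));
		return (0);
	}
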
@@ -183,7 +196,7 @@ schedcpu(arg)
*/
if (p->p_slptime > 1)
continue;
- s = splstatclock(); /* prevent state changes */
+ s = splhigh(); /* prevent state changes and protect run queue */
/*
* p_pctcpu is only for ps.
*/
@@ -215,8 +228,6 @@ schedcpu(arg)
splx(s);
}
vmmeter();
- if (bclnlist != NULL)
- wakeup((caddr_t)pageproc);
timeout(schedcpu, (void *)0, hz);
}
@@ -249,11 +260,8 @@ updatepri(p)
* of 2. Shift right by 8, i.e. drop the bottom 256 worth.
*/
#define TABLESIZE 128
+TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE];
#define LOOKUP(x) (((long)(x) >> 8) & (TABLESIZE - 1))
-struct slpque {
- struct proc *sq_head;
- struct proc **sq_tailp;
-} slpque[TABLESIZE];
/*
* During autoconfiguration or after a panic, a sleep will simply
@@ -266,6 +274,15 @@ struct slpque {
*/
int safepri;
+void
+sleepinit()
+{
+ int i;
+
+ for (i = 0; i < TABLESIZE; i++)
+ TAILQ_INIT(&slpque[i]);
+}
+
/*
* General sleep call. Suspends the current process until a wakeup is
* performed on the specified identifier. The process will then be made
@@ -283,12 +300,8 @@ tsleep(ident, priority, wmesg, timo)
int priority, timo;
char *wmesg;
{
- register struct proc *p = curproc;
- register struct slpque *qp;
- register s;
- int sig, catch = priority & PCATCH;
- extern int cold;
- void endtsleep __P((void *));
+ struct proc *p = curproc;
+ int s, sig, catch = priority & PCATCH;
#ifdef KTRACE
if (KTRPOINT(p, KTR_CSW))
@@ -307,19 +320,14 @@ tsleep(ident, priority, wmesg, timo)
return (0);
}
#ifdef DIAGNOSTIC
- if (ident == NULL || p->p_stat != SRUN || p->p_back)
+ if (ident == NULL || p->p_stat != SRUN)
panic("tsleep");
#endif
p->p_wchan = ident;
p->p_wmesg = wmesg;
p->p_slptime = 0;
p->p_priority = priority & PRIMASK;
- qp = &slpque[LOOKUP(ident)];
- if (qp->sq_head == 0)
- qp->sq_head = p;
- else
- *qp->sq_tailp = p;
- *(qp->sq_tailp = &p->p_forw) = 0;
+ TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq);
if (timo)
timeout(endtsleep, (void *)p, timo);
/*
@@ -333,7 +341,7 @@ tsleep(ident, priority, wmesg, timo)
*/
if (catch) {
p->p_flag |= P_SINTR;
- if (sig = CURSIG(p)) {
+ if ((sig = CURSIG(p))) {
if (p->p_wchan)
unsleep(p);
p->p_stat = SRUN;
@@ -405,85 +413,17 @@ endtsleep(arg)
}
/*
- * Short-term, non-interruptable sleep.
- */
-void
-sleep(ident, priority)
- void *ident;
- int priority;
-{
- register struct proc *p = curproc;
- register struct slpque *qp;
- register s;
- extern int cold;
-
-#ifdef DIAGNOSTIC
- if (priority > PZERO) {
- printf("sleep called with priority %d > PZERO, wchan: %x\n",
- priority, ident);
- panic("old sleep");
- }
-#endif
- s = splhigh();
- if (cold || panicstr) {
- /*
- * After a panic, or during autoconfiguration,
- * just give interrupts a chance, then just return;
- * don't run any other procs or panic below,
- * in case this is the idle process and already asleep.
- */
- splx(safepri);
- splx(s);
- return;
- }
-#ifdef DIAGNOSTIC
- if (ident == NULL || p->p_stat != SRUN || p->p_back)
- panic("sleep");
-#endif
- p->p_wchan = ident;
- p->p_wmesg = NULL;
- p->p_slptime = 0;
- p->p_priority = priority;
- qp = &slpque[LOOKUP(ident)];
- if (qp->sq_head == 0)
- qp->sq_head = p;
- else
- *qp->sq_tailp = p;
- *(qp->sq_tailp = &p->p_forw) = 0;
- p->p_stat = SSLEEP;
- p->p_stats->p_ru.ru_nvcsw++;
-#ifdef KTRACE
- if (KTRPOINT(p, KTR_CSW))
- ktrcsw(p->p_tracep, 1, 0);
-#endif
- mi_switch();
-#ifdef KTRACE
- if (KTRPOINT(p, KTR_CSW))
- ktrcsw(p->p_tracep, 0, 0);
-#endif
- curpriority = p->p_usrpri;
- splx(s);
-}
-
-/*
* Remove a process from its wait queue
*/
void
unsleep(p)
register struct proc *p;
{
- register struct slpque *qp;
- register struct proc **hp;
int s;
s = splhigh();
if (p->p_wchan) {
- hp = &(qp = &slpque[LOOKUP(p->p_wchan)])->sq_head;
- while (*hp != p)
- hp = &(*hp)->p_forw;
- *hp = p->p_forw;
- if (qp->sq_tailp == &p->p_forw)
- qp->sq_tailp = hp;
+ TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq);
p->p_wchan = 0;
}
splx(s);
@@ -496,45 +436,83 @@ void
wakeup(ident)
register void *ident;
{
- register struct slpque *qp;
- register struct proc *p, **q;
+ register struct slpquehead *qp;
+ register struct proc *p;
int s;
s = splhigh();
qp = &slpque[LOOKUP(ident)];
restart:
- for (q = &qp->sq_head; p = *q; ) {
+ for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) {
#ifdef DIAGNOSTIC
- if (p->p_back || p->p_stat != SSLEEP && p->p_stat != SSTOP)
+ if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
panic("wakeup");
#endif
if (p->p_wchan == ident) {
+ TAILQ_REMOVE(qp, p, p_procq);
p->p_wchan = 0;
- *q = p->p_forw;
- if (qp->sq_tailp == &p->p_forw)
- qp->sq_tailp = q;
if (p->p_stat == SSLEEP) {
/* OPTIMIZED EXPANSION OF setrunnable(p); */
if (p->p_slptime > 1)
updatepri(p);
p->p_slptime = 0;
p->p_stat = SRUN;
- if (p->p_flag & P_INMEM)
+ if (p->p_flag & P_INMEM) {
setrunqueue(p);
- /*
- * Since curpriority is a user priority,
- * p->p_priority is always better than
- * curpriority.
- */
- if ((p->p_flag & P_INMEM) == 0)
- wakeup((caddr_t)&proc0);
- else
need_resched();
+ } else {
+ p->p_flag |= P_SWAPINREQ;
+ wakeup((caddr_t)&proc0);
+ }
/* END INLINE EXPANSION */
goto restart;
}
- } else
- q = &p->p_forw;
+ }
+ }
+ splx(s);
+}
+
+/*
+ * Make a process sleeping on the specified identifier runnable.
+ * May wake more than one process if a target process is currently
+ * swapped out.
+ */
+void
+wakeup_one(ident)
+ register void *ident;
+{
+ register struct slpquehead *qp;
+ register struct proc *p;
+ int s;
+
+ s = splhigh();
+ qp = &slpque[LOOKUP(ident)];
+
+ for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) {
+#ifdef DIAGNOSTIC
+ if (p->p_stat != SSLEEP && p->p_stat != SSTOP)
+ panic("wakeup_one");
+#endif
+ if (p->p_wchan == ident) {
+ TAILQ_REMOVE(qp, p, p_procq);
+ p->p_wchan = 0;
+ if (p->p_stat == SSLEEP) {
+ /* OPTIMIZED EXPANSION OF setrunnable(p); */
+ if (p->p_slptime > 1)
+ updatepri(p);
+ p->p_slptime = 0;
+ p->p_stat = SRUN;
+ if (p->p_flag & P_INMEM) {
+ setrunqueue(p);
+ need_resched();
+ break;
+ } else {
+ p->p_flag |= P_SWAPINREQ;
+ wakeup((caddr_t)&proc0);
+ }
+ /* END INLINE EXPANSION */
+ }
+ }
}
splx(s);
}
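
A hedged sketch of the intended pairing between tsleep() and the new
wakeup_one() (the softc and its count field are hypothetical):

	/* consumer: block until work arrives */
	while (sc->sc_count == 0) {
		error = tsleep(&sc->sc_count, PZERO | PCATCH, "scwait", 0);
		if (error)
			return (error);		/* interrupted by a signal */
	}
	sc->sc_count--;

	/* producer: wake exactly one waiter rather than the whole chain */
	sc->sc_count++;
	wakeup_one(&sc->sc_count);
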
@@ -549,11 +527,31 @@ mi_switch()
register struct proc *p = curproc; /* XXX */
register struct rlimit *rlim;
register long s, u;
+ int x;
struct timeval tv;
-#ifdef DEBUG
+ /*
+ * XXX this spl is almost unnecessary. It is partly to allow for
+ * sloppy callers that don't do it (issignal() via CURSIG() is the
+ * main offender). It is partly to work around a bug in the i386
+ * cpu_switch() (the ipl is not preserved). We ran for years
+	 * without it.  I think there was only an interrupt latency problem.
+ * The main caller, tsleep(), does an splx() a couple of instructions
+ * after calling here. The buggy caller, issignal(), usually calls
+ * here at spl0() and sometimes returns at splhigh(). The process
+ * then runs for a little too long at splhigh(). The ipl gets fixed
+ * when the process returns to user mode (or earlier).
+ *
+ * It would probably be better to always call here at spl0(). Callers
+ * are prepared to give up control to another process, so they must
+ * be prepared to be interrupted. The clock stuff here may not
+ * actually need splstatclock().
+ */
+ x = splstatclock();
+
+#ifdef SIMPLELOCK_DEBUG
if (p->p_simple_locks)
- panic("sleep: holding simple lock");
+ printf("sleep: holding simple lock");
#endif
/*
* Compute the amount of time during which the current
@@ -574,23 +572,20 @@ mi_switch()
/*
* Check if the process exceeds its cpu resource allocation.
- * If over max, kill it. In any case, if it has run for more
- * than 10 minutes, reduce priority to give others a chance.
+ * If over max, kill it.
*/
- rlim = &p->p_rlimit[RLIMIT_CPU];
- if (s >= rlim->rlim_cur) {
- if (s >= rlim->rlim_max)
- psignal(p, SIGKILL);
- else {
- psignal(p, SIGXCPU);
- if (rlim->rlim_cur < rlim->rlim_max)
- rlim->rlim_cur += 5;
+ if (p->p_stat != SZOMB) {
+ rlim = &p->p_rlimit[RLIMIT_CPU];
+ if (s >= rlim->rlim_cur) {
+ if (s >= rlim->rlim_max)
+ killproc(p, "exceeded maximum CPU limit");
+ else {
+ psignal(p, SIGXCPU);
+ if (rlim->rlim_cur < rlim->rlim_max)
+ rlim->rlim_cur += 5;
+ }
}
}
- if (s > 10 * 60 && p->p_ucred->cr_uid && p->p_nice == NZERO) {
- p->p_nice = NZERO + 4;
- resetpriority(p);
- }
/*
* Pick a new current process and record its start time.
@@ -598,19 +593,25 @@ mi_switch()
cnt.v_swtch++;
cpu_switch(p);
microtime(&runtime);
+ splx(x);
}
/*
* Initialize the (doubly-linked) run queues
* to be empty.
*/
-void
-rqinit()
+/* ARGSUSED*/
+static void
+rqinit(dummy)
+ void *dummy;
{
register int i;
- for (i = 0; i < NQS; i++)
+ for (i = 0; i < NQS; i++) {
qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i];
+ rtqs[i].ph_link = rtqs[i].ph_rlink = (struct proc *)&rtqs[i];
+ idqs[i].ph_link = idqs[i].ph_rlink = (struct proc *)&idqs[i];
+ }
}
/*
@@ -646,8 +647,10 @@ setrunnable(p)
if (p->p_slptime > 1)
updatepri(p);
p->p_slptime = 0;
- if ((p->p_flag & P_INMEM) == 0)
+ if ((p->p_flag & P_INMEM) == 0) {
+ p->p_flag |= P_SWAPINREQ;
wakeup((caddr_t)&proc0);
+ }
else if (p->p_priority < curpriority)
need_resched();
}
@@ -663,9 +666,13 @@ resetpriority(p)
{
register unsigned int newpriority;
- newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice;
- newpriority = min(newpriority, MAXPRI);
- p->p_usrpri = newpriority;
- if (newpriority < curpriority)
+ if (p->p_rtprio.type == RTP_PRIO_NORMAL) {
+ newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice;
+ newpriority = min(newpriority, MAXPRI);
+ p->p_usrpri = newpriority;
+ if (newpriority < curpriority)
+ need_resched();
+ } else {
need_resched();
+ }
}
diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c
index b178da3..fb07f18 100644
--- a/sys/kern/kern_sysctl.c
+++ b/sys/kern/kern_sysctl.c
@@ -5,6 +5,9 @@
* This code is derived from software contributed to Berkeley by
* Mike Karels at Berkeley Software Design, Inc.
*
+ * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD
+ * project, to make these variables more user-friendly.
+ *
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
@@ -33,39 +36,20 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)kern_sysctl.c 8.9 (Berkeley) 5/20/95
- */
-
-/*
- * sysctl system call.
+ * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94
+ * $Id$
*/
#include <sys/param.h>
-#include <sys/systm.h>
#include <sys/kernel.h>
+#include <sys/sysctl.h>
#include <sys/malloc.h>
#include <sys/proc.h>
-#include <sys/file.h>
-#include <sys/vnode.h>
-#include <sys/unistd.h>
-#include <sys/buf.h>
-#include <sys/ioctl.h>
-#include <sys/tty.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
#include <vm/vm.h>
-#include <sys/sysctl.h>
-
-#include <sys/mount.h>
-#include <sys/syscallargs.h>
-
-sysctlfn kern_sysctl;
-sysctlfn hw_sysctl;
-#ifdef DEBUG
-sysctlfn debug_sysctl;
-#endif
-extern sysctlfn vm_sysctl;
-extern sysctlfn vfs_sysctl;
-extern sysctlfn net_sysctl;
-extern sysctlfn cpu_sysctl;
+#include <vm/vm_extern.h>
+#include <sys/vnode.h>
/*
* Locking and stats
@@ -76,634 +60,818 @@ static struct sysctl_lock {
int sl_locked;
} memlock;
-int
-__sysctl(p, uap, retval)
- struct proc *p;
- register struct __sysctl_args /* {
- syscallarg(int *) name;
- syscallarg(u_int) namelen;
- syscallarg(void *) old;
- syscallarg(size_t *) oldlenp;
- syscallarg(void *) new;
- syscallarg(size_t) newlen;
- } */ *uap;
- register_t *retval;
+static int sysctl_root SYSCTL_HANDLER_ARGS;
+
+extern struct linker_set sysctl_;
+
+/*
+ * Initialization of the MIB tree.
+ *
+ * Order by number in each linker_set.
+ */
+
+static int
+sysctl_order_cmp(const void *a, const void *b)
{
- int error, dolock = 1;
- size_t savelen, oldlen = 0;
- sysctlfn *fn;
- int name[CTL_MAXNAME];
+ struct sysctl_oid const * const *pa;
+ struct sysctl_oid const * const *pb;
- if (SCARG(uap, new) != NULL &&
- (error = suser(p->p_ucred, &p->p_acflag)))
- return (error);
- /*
- * all top-level sysctl names are non-terminal
- */
- if (SCARG(uap, namelen) > CTL_MAXNAME || SCARG(uap, namelen) < 2)
- return (EINVAL);
- if (error =
- copyin(SCARG(uap, name), &name, SCARG(uap, namelen) * sizeof(int)))
- return (error);
+ pa = (struct sysctl_oid const * const *)a;
+ pb = (struct sysctl_oid const * const *)b;
+ if (*pa == NULL)
+ return (1);
+ if (*pb == NULL)
+ return (-1);
+ return ((*pa)->oid_number - (*pb)->oid_number);
+}
- switch (name[0]) {
- case CTL_KERN:
- fn = kern_sysctl;
- if (name[2] == KERN_VNODE) /* XXX */
- dolock = 0;
- break;
- case CTL_HW:
- fn = hw_sysctl;
- break;
- case CTL_VM:
- fn = vm_sysctl;
- break;
- case CTL_NET:
- fn = net_sysctl;
- break;
- case CTL_VFS:
- fn = vfs_sysctl;
- break;
- case CTL_MACHDEP:
- fn = cpu_sysctl;
- break;
-#ifdef DEBUG
- case CTL_DEBUG:
- fn = debug_sysctl;
- break;
-#endif
- default:
- return (EOPNOTSUPP);
- }
+static void
+sysctl_order(void *arg)
+{
+ int j, k;
+ struct linker_set *l = (struct linker_set *) arg;
+ struct sysctl_oid **oidpp;
- if (SCARG(uap, oldlenp) &&
- (error = copyin(SCARG(uap, oldlenp), &oldlen, sizeof(oldlen))))
- return (error);
- if (SCARG(uap, old) != NULL) {
- if (!useracc(SCARG(uap, old), oldlen, B_WRITE))
- return (EFAULT);
- while (memlock.sl_lock) {
- memlock.sl_want = 1;
- sleep((caddr_t)&memlock, PRIBIO+1);
- memlock.sl_locked++;
+ /* First, find the highest oid we have */
+ j = l->ls_length;
+ oidpp = (struct sysctl_oid **) l->ls_items;
+ for (k = 0; j--; oidpp++) {
+ if ((*oidpp)->oid_arg1 == arg) {
+ *oidpp = 0;
+ continue;
}
- memlock.sl_lock = 1;
- if (dolock)
- vslock(SCARG(uap, old), oldlen);
- savelen = oldlen;
+ if (*oidpp && (*oidpp)->oid_number > k)
+ k = (*oidpp)->oid_number;
}
- error = (*fn)(name + 1, SCARG(uap, namelen) - 1, SCARG(uap, old),
- &oldlen, SCARG(uap, new), SCARG(uap, newlen), p);
- if (SCARG(uap, old) != NULL) {
- if (dolock)
- vsunlock(SCARG(uap, old), savelen, B_WRITE);
- memlock.sl_lock = 0;
- if (memlock.sl_want) {
- memlock.sl_want = 0;
- wakeup((caddr_t)&memlock);
- }
+
+ /* Next, replace all OID_AUTO oids with new numbers */
+ j = l->ls_length;
+ oidpp = (struct sysctl_oid **) l->ls_items;
+ k += 100;
+ for (; j--; oidpp++)
+ if (*oidpp && (*oidpp)->oid_number == OID_AUTO)
+ (*oidpp)->oid_number = k++;
+
+ /* Finally: sort by oid */
+ j = l->ls_length;
+ oidpp = (struct sysctl_oid **) l->ls_items;
+ for (; j--; oidpp++) {
+ if (!*oidpp)
+ continue;
+ if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE)
+ if (!(*oidpp)->oid_handler)
+ sysctl_order((*oidpp)->oid_arg1);
}
- if (error)
- return (error);
- if (SCARG(uap, oldlenp))
- error = copyout(&oldlen, SCARG(uap, oldlenp), sizeof(oldlen));
- *retval = oldlen;
- return (0);
+ qsort(l->ls_items, l->ls_length, sizeof l->ls_items[0],
+ sysctl_order_cmp);
}
-/*
- * Attributes stored in the kernel.
- */
-char hostname[MAXHOSTNAMELEN];
-int hostnamelen;
-long hostid;
-int securelevel;
+SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_order, &sysctl_);
/*
- * kernel related system variables.
+ * "Staff-functions"
+ *
+ * These functions implement a presently undocumented interface
+ * used by the sysctl program to walk the tree, and get the type
+ * so it can print the value.
+ * This interface is under work and consideration, and should probably
+ * be killed with a big axe by the first person who can find the time.
+ * (Be aware, though, that the proper interface isn't as obvious as it
+ * may seem; there are various conflicting requirements.)
+ *
+ * {0,0} printf the entire MIB-tree.
+ * {0,1,...} return the name of the "..." OID.
+ * {0,2,...} return the next OID.
+ * {0,3} return the OID of the name in "new"
+ * {0,4,...} return the kind & format info for the "..." OID.
*/
-kern_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
- int *name;
- u_int namelen;
- void *oldp;
- size_t *oldlenp;
- void *newp;
- size_t newlen;
- struct proc *p;
+
+static void
+sysctl_sysctl_debug_dump_node(struct linker_set *l, int i)
{
- int error, level, inthostid;
- extern char ostype[], osrelease[], version[];
-
- /* all sysctl names at this level are terminal */
- if (namelen != 1 && !(name[0] == KERN_PROC || name[0] == KERN_PROF))
- return (ENOTDIR); /* overloaded */
-
- switch (name[0]) {
- case KERN_OSTYPE:
- return (sysctl_rdstring(oldp, oldlenp, newp, ostype));
- case KERN_OSRELEASE:
- return (sysctl_rdstring(oldp, oldlenp, newp, osrelease));
- case KERN_OSREV:
- return (sysctl_rdint(oldp, oldlenp, newp, BSD));
- case KERN_VERSION:
- return (sysctl_rdstring(oldp, oldlenp, newp, version));
- case KERN_MAXVNODES:
- return(sysctl_int(oldp, oldlenp, newp, newlen, &desiredvnodes));
- case KERN_MAXPROC:
- return (sysctl_int(oldp, oldlenp, newp, newlen, &maxproc));
- case KERN_MAXFILES:
- return (sysctl_int(oldp, oldlenp, newp, newlen, &maxfiles));
- case KERN_ARGMAX:
- return (sysctl_rdint(oldp, oldlenp, newp, ARG_MAX));
- case KERN_SECURELVL:
- level = securelevel;
- if ((error = sysctl_int(oldp, oldlenp, newp, newlen, &level)) ||
- newp == NULL)
- return (error);
- if (level < securelevel && p->p_pid != 1)
- return (EPERM);
- securelevel = level;
- return (0);
- case KERN_HOSTNAME:
- error = sysctl_string(oldp, oldlenp, newp, newlen,
- hostname, sizeof(hostname));
- if (newp && !error)
- hostnamelen = newlen;
- return (error);
- case KERN_HOSTID:
- inthostid = hostid; /* XXX assumes sizeof long <= sizeof int */
- error = sysctl_int(oldp, oldlenp, newp, newlen, &inthostid);
- hostid = inthostid;
- return (error);
- case KERN_CLOCKRATE:
- return (sysctl_clockrate(oldp, oldlenp));
- case KERN_BOOTTIME:
- return (sysctl_rdstruct(oldp, oldlenp, newp, &boottime,
- sizeof(struct timeval)));
- case KERN_VNODE:
- return (sysctl_vnode(oldp, oldlenp, p));
- case KERN_PROC:
- return (sysctl_doproc(name + 1, namelen - 1, oldp, oldlenp));
- case KERN_FILE:
- return (sysctl_file(oldp, oldlenp));
-#ifdef GPROF
- case KERN_PROF:
- return (sysctl_doprof(name + 1, namelen - 1, oldp, oldlenp,
- newp, newlen));
-#endif
- case KERN_POSIX1:
- return (sysctl_rdint(oldp, oldlenp, newp, _POSIX_VERSION));
- case KERN_NGROUPS:
- return (sysctl_rdint(oldp, oldlenp, newp, NGROUPS_MAX));
- case KERN_JOB_CONTROL:
- return (sysctl_rdint(oldp, oldlenp, newp, 1));
- case KERN_SAVED_IDS:
-#ifdef _POSIX_SAVED_IDS
- return (sysctl_rdint(oldp, oldlenp, newp, 1));
-#else
- return (sysctl_rdint(oldp, oldlenp, newp, 0));
-#endif
- default:
- return (EOPNOTSUPP);
+ int j, k;
+ struct sysctl_oid **oidpp;
+
+ j = l->ls_length;
+ oidpp = (struct sysctl_oid **) l->ls_items;
+ for (; j--; oidpp++) {
+
+ if (!*oidpp)
+ continue;
+
+ for (k=0; k<i; k++)
+ printf(" ");
+
+ printf("%d %s ", (*oidpp)->oid_number, (*oidpp)->oid_name);
+
+ printf("%c%c",
+ (*oidpp)->oid_kind & CTLFLAG_RD ? 'R':' ',
+ (*oidpp)->oid_kind & CTLFLAG_WR ? 'W':' ');
+
+ if ((*oidpp)->oid_handler)
+ printf(" *Handler");
+
+ switch ((*oidpp)->oid_kind & CTLTYPE) {
+ case CTLTYPE_NODE:
+ printf(" Node\n");
+ if (!(*oidpp)->oid_handler) {
+ sysctl_sysctl_debug_dump_node(
+ (*oidpp)->oid_arg1, i+2);
+ }
+ break;
+ case CTLTYPE_INT: printf(" Int\n"); break;
+ case CTLTYPE_STRING: printf(" String\n"); break;
+ case CTLTYPE_QUAD: printf(" Quad\n"); break;
+ case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break;
+ default: printf("\n");
+ }
+
}
- /* NOTREACHED */
}
-/*
- * hardware related system variables.
- */
-hw_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
- int *name;
- u_int namelen;
- void *oldp;
- size_t *oldlenp;
- void *newp;
- size_t newlen;
- struct proc *p;
+static int
+sysctl_sysctl_debug SYSCTL_HANDLER_ARGS
{
- extern char machine[], cpu_model[];
-
- /* all sysctl names at this level are terminal */
- if (namelen != 1)
- return (ENOTDIR); /* overloaded */
-
- switch (name[0]) {
- case HW_MACHINE:
- return (sysctl_rdstring(oldp, oldlenp, newp, machine));
- case HW_MODEL:
- return (sysctl_rdstring(oldp, oldlenp, newp, cpu_model));
- case HW_NCPU:
- return (sysctl_rdint(oldp, oldlenp, newp, 1)); /* XXX */
- case HW_BYTEORDER:
- return (sysctl_rdint(oldp, oldlenp, newp, BYTE_ORDER));
- case HW_PHYSMEM:
- return (sysctl_rdint(oldp, oldlenp, newp, ctob(physmem)));
- case HW_USERMEM:
- return (sysctl_rdint(oldp, oldlenp, newp,
- ctob(physmem - cnt.v_wire_count)));
- case HW_PAGESIZE:
- return (sysctl_rdint(oldp, oldlenp, newp, PAGE_SIZE));
- default:
- return (EOPNOTSUPP);
+ sysctl_sysctl_debug_dump_node(&sysctl_, 0);
+ return ENOENT;
+}
+
+SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD,
+ 0, 0, sysctl_sysctl_debug, "-", "");
+
+static int
+sysctl_sysctl_name SYSCTL_HANDLER_ARGS
+{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ int i, j, error = 0;
+ struct sysctl_oid **oidpp;
+ struct linker_set *lsp = &sysctl_;
+ char buf[10];
+
+ while (namelen) {
+ if (!lsp) {
+ sprintf(buf,"%d",*name);
+ if (req->oldidx)
+ error = SYSCTL_OUT(req, ".", 1);
+ if (!error)
+ error = SYSCTL_OUT(req, buf, strlen(buf));
+ if (error)
+ return (error);
+ namelen--;
+ name++;
+ continue;
+ }
+ oidpp = (struct sysctl_oid **) lsp->ls_items;
+ j = lsp->ls_length;
+ lsp = 0;
+ for (i = 0; i < j; i++, oidpp++) {
+ if (*oidpp && ((*oidpp)->oid_number != *name))
+ continue;
+
+ if (req->oldidx)
+ error = SYSCTL_OUT(req, ".", 1);
+ if (!error)
+ error = SYSCTL_OUT(req, (*oidpp)->oid_name,
+ strlen((*oidpp)->oid_name));
+ if (error)
+ return (error);
+
+ namelen--;
+ name++;
+
+ if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ break;
+
+ if ((*oidpp)->oid_handler)
+ break;
+
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ break;
+ }
}
- /* NOTREACHED */
+ return (SYSCTL_OUT(req, "", 1));
}
-#ifdef DEBUG
-/*
- * Debugging related system variables.
- */
-struct ctldebug debug0, debug1, debug2, debug3, debug4;
-struct ctldebug debug5, debug6, debug7, debug8, debug9;
-struct ctldebug debug10, debug11, debug12, debug13, debug14;
-struct ctldebug debug15, debug16, debug17, debug18, debug19;
-static struct ctldebug *debugvars[CTL_DEBUG_MAXID] = {
- &debug0, &debug1, &debug2, &debug3, &debug4,
- &debug5, &debug6, &debug7, &debug8, &debug9,
- &debug10, &debug11, &debug12, &debug13, &debug14,
- &debug15, &debug16, &debug17, &debug18, &debug19,
-};
-int
-debug_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
- int *name;
- u_int namelen;
- void *oldp;
- size_t *oldlenp;
- void *newp;
- size_t newlen;
- struct proc *p;
+SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, "");
+
+static int
+sysctl_sysctl_next_ls (struct linker_set *lsp, int *name, u_int namelen,
+ int *next, int *len, int level, struct sysctl_oid **oidp)
{
- struct ctldebug *cdp;
+ int i, j;
+ struct sysctl_oid **oidpp;
- /* all sysctl names at this level are name and field */
- if (namelen != 2)
- return (ENOTDIR); /* overloaded */
- cdp = debugvars[name[0]];
- if (name[0] >= CTL_DEBUG_MAXID || cdp->debugname == 0)
- return (EOPNOTSUPP);
- switch (name[1]) {
- case CTL_DEBUG_NAME:
- return (sysctl_rdstring(oldp, oldlenp, newp, cdp->debugname));
- case CTL_DEBUG_VALUE:
- return (sysctl_int(oldp, oldlenp, newp, newlen, cdp->debugvar));
- default:
- return (EOPNOTSUPP);
+ oidpp = (struct sysctl_oid **) lsp->ls_items;
+ j = lsp->ls_length;
+ *len = level;
+ for (i = 0; i < j; i++, oidpp++) {
+ if (!*oidpp)
+ continue;
+
+ *next = (*oidpp)->oid_number;
+ *oidp = *oidpp;
+
+ if (!namelen) {
+ if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ return 0;
+ if ((*oidpp)->oid_handler)
+ /* We really should call the handler here...*/
+ return 0;
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ if (!sysctl_sysctl_next_ls (lsp, 0, 0, next+1,
+ len, level+1, oidp))
+ return 0;
+ goto next;
+ }
+
+ if ((*oidpp)->oid_number < *name)
+ continue;
+
+ if ((*oidpp)->oid_number > *name) {
+ if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ return 0;
+ if ((*oidpp)->oid_handler)
+ return 0;
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ if (!sysctl_sysctl_next_ls (lsp, name+1, namelen-1,
+ next+1, len, level+1, oidp))
+ return (0);
+ goto next;
+ }
+ if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ continue;
+
+ if ((*oidpp)->oid_handler)
+ continue;
+
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ if (!sysctl_sysctl_next_ls (lsp, name+1, namelen-1, next+1,
+ len, level+1, oidp))
+ return (0);
+ next:
+ namelen = 1;
+ *len = level;
}
- /* NOTREACHED */
+ return 1;
}
-#endif /* DEBUG */
-/*
- * Validate parameters and get old / set new parameters
- * for an integer-valued sysctl function.
- */
-sysctl_int(oldp, oldlenp, newp, newlen, valp)
- void *oldp;
- size_t *oldlenp;
- void *newp;
- size_t newlen;
- int *valp;
+static int
+sysctl_sysctl_next SYSCTL_HANDLER_ARGS
{
- int error = 0;
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ int i, j, error;
+ struct sysctl_oid *oid;
+ struct linker_set *lsp = &sysctl_;
+ int newoid[CTL_MAXNAME];
- if (oldp && *oldlenp < sizeof(int))
- return (ENOMEM);
- if (newp && newlen != sizeof(int))
- return (EINVAL);
- *oldlenp = sizeof(int);
- if (oldp)
- error = copyout(valp, oldp, sizeof(int));
- if (error == 0 && newp)
- error = copyin(newp, valp, sizeof(int));
+ i = sysctl_sysctl_next_ls (lsp, name, namelen, newoid, &j, 1, &oid);
+ if (i)
+ return ENOENT;
+ error = SYSCTL_OUT(req, newoid, j * sizeof (int));
return (error);
}
+SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, "");
+
+static int
+name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidp)
+{
+ int i, j;
+ struct sysctl_oid **oidpp;
+ struct linker_set *lsp = &sysctl_;
+ char *p;
+
+ if (!*name)
+ return ENOENT;
+
+ p = name + strlen(name) - 1 ;
+ if (*p == '.')
+ *p = '\0';
+
+ *len = 0;
+
+ for (p = name; *p && *p != '.'; p++)
+ ;
+ i = *p;
+ if (i == '.')
+ *p = '\0';
+
+ j = lsp->ls_length;
+ oidpp = (struct sysctl_oid **) lsp->ls_items;
+
+ while (j-- && *len < CTL_MAXNAME) {
+ if (!*oidpp)
+ continue;
+ if (strcmp(name, (*oidpp)->oid_name)) {
+ oidpp++;
+ continue;
+ }
+ *oid++ = (*oidpp)->oid_number;
+ (*len)++;
+
+ if (!i) {
+ if (oidp)
+ *oidp = *oidpp;
+ return (0);
+ }
+
+ if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE)
+ break;
+
+ if ((*oidpp)->oid_handler)
+ break;
+
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ j = lsp->ls_length;
+ oidpp = (struct sysctl_oid **)lsp->ls_items;
+ name = p+1;
+ for (p = name; *p && *p != '.'; p++)
+ ;
+ i = *p;
+ if (i == '.')
+ *p = '\0';
+ }
+ return ENOENT;
+}
+
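For example (assuming the conventional numbering in <sys/sysctl.h>, where
CTL_KERN is 1 and KERN_MAXPROC is 6), a successful lookup behaves roughly like
this sketch; name2oid() is static here, so this is illustrative only:

	int oid[CTL_MAXNAME], len;
	struct sysctl_oid *op;
	char buf[] = "kern.maxproc";	/* name2oid may scribble on it */

	if (name2oid(buf, oid, &len, &op) == 0) {
		/* oid[0] == 1 (CTL_KERN), oid[1] == 6 (KERN_MAXPROC),
		 * len == 2, and op points at the terminal oid */
	}
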
+static int
+sysctl_sysctl_name2oid SYSCTL_HANDLER_ARGS
+{
+ char *p;
+ int error, oid[CTL_MAXNAME], len;
+ struct sysctl_oid *op = 0;
+
+ if (!req->newlen)
+ return ENOENT;
+
+ p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK);
+
+ error = SYSCTL_IN(req, p, req->newlen);
+ if (error) {
+ free(p, M_SYSCTL);
+ return (error);
+ }
+
+ p [req->newlen] = '\0';
+
+ error = name2oid(p, oid, &len, &op);
+
+ free(p, M_SYSCTL);
+
+ if (error)
+ return (error);
+
+ error = SYSCTL_OUT(req, oid, len * sizeof *oid);
+ return (error);
+}
+
+SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0,
+ sysctl_sysctl_name2oid, "I", "");
+
+static int
+sysctl_sysctl_oidfmt SYSCTL_HANDLER_ARGS
+{
+ int *name = (int *) arg1, error;
+ u_int namelen = arg2;
+ int indx, j;
+ struct sysctl_oid **oidpp;
+ struct linker_set *lsp = &sysctl_;
+
+ j = lsp->ls_length;
+ oidpp = (struct sysctl_oid **) lsp->ls_items;
+
+ indx = 0;
+ while (j-- && indx < CTL_MAXNAME) {
+ if (*oidpp && ((*oidpp)->oid_number == name[indx])) {
+ indx++;
+ if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ if ((*oidpp)->oid_handler)
+ goto found;
+ if (indx == namelen)
+ goto found;
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ j = lsp->ls_length;
+ oidpp = (struct sysctl_oid **)lsp->ls_items;
+ } else {
+ if (indx != namelen)
+ return EISDIR;
+ goto found;
+ }
+ } else {
+ oidpp++;
+ }
+ }
+ return ENOENT;
+found:
+ if (!(*oidpp)->oid_fmt)
+ return ENOENT;
+ error = SYSCTL_OUT(req,
+ &(*oidpp)->oid_kind, sizeof((*oidpp)->oid_kind));
+ if (!error)
+ error = SYSCTL_OUT(req, (*oidpp)->oid_fmt,
+ strlen((*oidpp)->oid_fmt)+1);
+ return (error);
+}
+
+
+SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, "");
+
+/*
+ * Default "handler" functions.
+ */
+
/*
- * As above, but read-only.
+ * Handle an integer, signed or unsigned.
+ * Two cases:
+ * a variable: point arg1 at it.
+ * a constant: pass it in arg2.
*/
-sysctl_rdint(oldp, oldlenp, newp, val)
- void *oldp;
- size_t *oldlenp;
- void *newp;
- int val;
+
+int
+sysctl_handle_int SYSCTL_HANDLER_ARGS
{
int error = 0;
- if (oldp && *oldlenp < sizeof(int))
- return (ENOMEM);
- if (newp)
- return (EPERM);
- *oldlenp = sizeof(int);
- if (oldp)
- error = copyout((caddr_t)&val, oldp, sizeof(int));
+ if (arg1)
+ error = SYSCTL_OUT(req, arg1, sizeof(int));
+ else
+ error = SYSCTL_OUT(req, &arg2, sizeof(int));
+
+ if (error || !req->newptr)
+ return (error);
+
+ if (!arg1)
+ error = EPERM;
+ else
+ error = SYSCTL_IN(req, arg1, sizeof(int));
return (error);
}
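
sysctl_handle_int is the handler that the SYSCTL_INT convenience macro is
expected to wire in; a hedged declaration sketch (the variable is hypothetical,
and the macro shape assumes this file's companion <sys/sysctl.h>):

	static int my_knob = 0;		/* hypothetical read-write tunable */
	SYSCTL_INT(_kern, OID_AUTO, my_knob, CTLFLAG_RW, &my_knob, 0, "");
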
/*
- * Validate parameters and get old / set new parameters
- * for a string-valued sysctl function.
+ * Handle our generic '\0' terminated 'C' string.
+ * Two cases:
+ * a variable string: point arg1 at it, arg2 is max length.
+ * a constant string: point arg1 at it, arg2 is zero.
*/
-sysctl_string(oldp, oldlenp, newp, newlen, str, maxlen)
- void *oldp;
- size_t *oldlenp;
- void *newp;
- size_t newlen;
- char *str;
- int maxlen;
+
+int
+sysctl_handle_string SYSCTL_HANDLER_ARGS
{
- int len, error = 0;
+ int error=0;
- len = strlen(str) + 1;
- if (oldp && *oldlenp < len)
- return (ENOMEM);
- if (newp && newlen >= maxlen)
- return (EINVAL);
- if (oldp) {
- *oldlenp = len;
- error = copyout(str, oldp, len);
- }
- if (error == 0 && newp) {
- error = copyin(newp, str, newlen);
- str[newlen] = 0;
+ error = SYSCTL_OUT(req, arg1, strlen((char *)arg1)+1);
+
+ if (error || !req->newptr || !arg2)
+ return (error);
+
+ if ((req->newlen - req->newidx) > arg2) {
+ error = E2BIG;
+ } else {
+ arg2 = (req->newlen - req->newidx);
+ error = SYSCTL_IN(req, arg1, arg2);
+ ((char *)arg1)[arg2] = '\0';
}
+
return (error);
}
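
Similarly, a hedged sketch of declaring a writable string through this handler
(the buffer and name are hypothetical; arg2 carries the buffer size so
SYSCTL_IN can bound the write):

	static char my_ident[32] = "example";	/* hypothetical buffer */
	SYSCTL_STRING(_kern, OID_AUTO, my_ident, CTLFLAG_RW,
	    my_ident, sizeof(my_ident), "");
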
/*
- * As above, but read-only.
+ * Handle any kind of opaque data.
+ * arg1 points to it, arg2 is the size.
*/
-sysctl_rdstring(oldp, oldlenp, newp, str)
- void *oldp;
- size_t *oldlenp;
- void *newp;
- char *str;
+
+int
+sysctl_handle_opaque SYSCTL_HANDLER_ARGS
{
- int len, error = 0;
+ int error;
+
+ error = SYSCTL_OUT(req, arg1, arg2);
+
+ if (error || !req->newptr)
+ return (error);
+
+ error = SYSCTL_IN(req, arg1, arg2);
- len = strlen(str) + 1;
- if (oldp && *oldlenp < len)
- return (ENOMEM);
- if (newp)
- return (EPERM);
- *oldlenp = len;
- if (oldp)
- error = copyout(str, oldp, len);
return (error);
}
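
A hedged sketch of exporting a struct through sysctl_handle_opaque, mirroring
the SYSCTL_PROC usage earlier in this file (the variable is hypothetical):

	static struct timeval my_tv;		/* hypothetical struct */
	SYSCTL_PROC(_kern, OID_AUTO, my_tv, CTLTYPE_OPAQUE | CTLFLAG_RD,
	    &my_tv, sizeof(my_tv), sysctl_handle_opaque, "S,timeval", "");
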
/*
- * Validate parameters and get old / set new parameters
- * for a structure oriented sysctl function.
+ * Transfer functions to/from kernel space.
+ * XXX: rather untested at this point
*/
-sysctl_struct(oldp, oldlenp, newp, newlen, sp, len)
- void *oldp;
- size_t *oldlenp;
- void *newp;
- size_t newlen;
- void *sp;
- int len;
+static int
+sysctl_old_kernel(struct sysctl_req *req, const void *p, int l)
{
- int error = 0;
+ int i = 0;
- if (oldp && *oldlenp < len)
+ if (req->oldptr) {
+ i = min(req->oldlen - req->oldidx, l);
+ if (i > 0)
+ bcopy(p, (char *)req->oldptr + req->oldidx, i);
+ }
+ req->oldidx += l;
+ if (req->oldptr && i != l)
return (ENOMEM);
- if (newp && newlen > len)
+ return (0);
+}
+
+static int
+sysctl_new_kernel(struct sysctl_req *req, void *p, int l)
+{
+ if (!req->newptr)
+ return 0;
+ if (req->newlen - req->newidx < l)
return (EINVAL);
- if (oldp) {
- *oldlenp = len;
- error = copyout(sp, oldp, len);
+ bcopy((char *)req->newptr + req->newidx, p, l);
+ req->newidx += l;
+ return (0);
+}
+
+int
+kernel_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, int *retval)
+{
+ int error = 0;
+ struct sysctl_req req;
+
+ bzero(&req, sizeof req);
+
+ req.p = p;
+
+ if (oldlenp) {
+ req.oldlen = *oldlenp;
+ }
+
+ if (old) {
+ req.oldptr= old;
+ }
+
+ if (newlen) {
+ req.newlen = newlen;
+ req.newptr = new;
+ }
+
+ req.oldfunc = sysctl_old_kernel;
+ req.newfunc = sysctl_new_kernel;
+ req.lock = 1;
+
+ /* XXX this should probably be done in a general way */
+ while (memlock.sl_lock) {
+ memlock.sl_want = 1;
+ (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0);
+ memlock.sl_locked++;
+ }
+ memlock.sl_lock = 1;
+
+ error = sysctl_root(0, name, namelen, &req);
+
+ if (req.lock == 2)
+ vsunlock(req.oldptr, req.oldlen, B_WRITE);
+
+ memlock.sl_lock = 0;
+
+ if (memlock.sl_want) {
+ memlock.sl_want = 0;
+ wakeup((caddr_t)&memlock);
+ }
+
+ if (error && error != ENOMEM)
+ return (error);
+
+ if (retval) {
+ if (req.oldptr && req.oldidx > req.oldlen)
+ *retval = req.oldlen;
+ else
+ *retval = req.oldidx;
}
- if (error == 0 && newp)
- error = copyin(newp, sp, len);
return (error);
}
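
A hedged in-kernel caller sketch (assuming CTL_KERN and KERN_MAXPROC from
<sys/sysctl.h>):

	/* read kern.maxproc from inside the kernel */
	int name[2] = { CTL_KERN, KERN_MAXPROC };
	int val, rv, error;
	size_t len = sizeof(val);

	error = kernel_sysctl(p, name, 2, &val, &len, NULL, 0, &rv);
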
/*
- * Validate parameters and get old parameters
- * for a structure oriented sysctl function.
+ * Transfer function to/from user space.
*/
-sysctl_rdstruct(oldp, oldlenp, newp, sp, len)
- void *oldp;
- size_t *oldlenp;
- void *newp, *sp;
- int len;
+static int
+sysctl_old_user(struct sysctl_req *req, const void *p, int l)
{
- int error = 0;
+ int error = 0, i = 0;
- if (oldp && *oldlenp < len)
+ if (req->lock == 1 && req->oldptr) {
+ vslock(req->oldptr, req->oldlen);
+ req->lock = 2;
+ }
+ if (req->oldptr) {
+ i = min(req->oldlen - req->oldidx, l);
+ if (i > 0)
+ error = copyout(p, (char *)req->oldptr + req->oldidx,
+ i);
+ }
+ req->oldidx += l;
+ if (error)
+ return (error);
+ if (req->oldptr && i < l)
return (ENOMEM);
- if (newp)
- return (EPERM);
- *oldlenp = len;
- if (oldp)
- error = copyout(sp, oldp, len);
+ return (0);
+}
+
+static int
+sysctl_new_user(struct sysctl_req *req, void *p, int l)
+{
+ int error;
+
+ if (!req->newptr)
+ return 0;
+ if (req->newlen - req->newidx < l)
+ return (EINVAL);
+ error = copyin((char *)req->newptr + req->newidx, p, l);
+ req->newidx += l;
return (error);
}
/*
- * Get file structures.
+ * Traverse our tree, and find the right node, execute whatever it points
+ * at, and return the resulting error code.
*/
-sysctl_file(where, sizep)
- char *where;
- size_t *sizep;
+
+int
+sysctl_root SYSCTL_HANDLER_ARGS
{
- int buflen, error;
- struct file *fp;
- char *start = where;
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
+ int indx, i, j;
+ struct sysctl_oid **oidpp;
+ struct linker_set *lsp = &sysctl_;
- buflen = *sizep;
- if (where == NULL) {
- /*
- * overestimate by 10 files
- */
- *sizep = sizeof(filehead) + (nfiles + 10) * sizeof(struct file);
- return (0);
- }
+ j = lsp->ls_length;
+ oidpp = (struct sysctl_oid **) lsp->ls_items;
- /*
- * first copyout filehead
- */
- if (buflen < sizeof(filehead)) {
- *sizep = 0;
- return (0);
- }
- if (error = copyout((caddr_t)&filehead, where, sizeof(filehead)))
- return (error);
- buflen -= sizeof(filehead);
- where += sizeof(filehead);
-
- /*
- * followed by an array of file structures
- */
- for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) {
- if (buflen < sizeof(struct file)) {
- *sizep = where - start;
- return (ENOMEM);
+ indx = 0;
+ while (j-- && indx < CTL_MAXNAME) {
+ if (*oidpp && ((*oidpp)->oid_number == name[indx])) {
+ indx++;
+ if ((*oidpp)->oid_kind & CTLFLAG_NOLOCK)
+ req->lock = 0;
+ if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ if ((*oidpp)->oid_handler)
+ goto found;
+ if (indx == namelen)
+ return ENOENT;
+ lsp = (struct linker_set*)(*oidpp)->oid_arg1;
+ j = lsp->ls_length;
+ oidpp = (struct sysctl_oid **)lsp->ls_items;
+ } else {
+ if (indx != namelen)
+ return EISDIR;
+ goto found;
+ }
+ } else {
+ oidpp++;
}
- if (error = copyout((caddr_t)fp, where, sizeof (struct file)))
- return (error);
- buflen -= sizeof(struct file);
- where += sizeof(struct file);
}
- *sizep = where - start;
- return (0);
+ return ENOENT;
+found:
+ /* If writing isn't allowed */
+ if (req->newptr && !((*oidpp)->oid_kind & CTLFLAG_WR))
+ return (EPERM);
+
+ /* Most likely only root can write */
+ if (!((*oidpp)->oid_kind & CTLFLAG_ANYBODY) &&
+ req->newptr && req->p &&
+ (i = suser(req->p->p_ucred, &req->p->p_acflag)))
+ return (i);
+
+ if (!(*oidpp)->oid_handler)
+ return EINVAL;
+
+ if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) {
+ i = ((*oidpp)->oid_handler) (*oidpp,
+ name + indx, namelen - indx,
+ req);
+ } else {
+ i = ((*oidpp)->oid_handler) (*oidpp,
+ (*oidpp)->oid_arg1, (*oidpp)->oid_arg2,
+ req);
+ }
+ return (i);
}
-/*
- * try over estimating by 5 procs
- */
-#define KERN_PROCSLOP (5 * sizeof (struct kinfo_proc))
+#ifndef _SYS_SYSPROTO_H_
+struct sysctl_args {
+ int *name;
+ u_int namelen;
+ void *old;
+ size_t *oldlenp;
+ void *new;
+ size_t newlen;
+};
+#endif
-sysctl_doproc(name, namelen, where, sizep)
- int *name;
- u_int namelen;
- char *where;
- size_t *sizep;
+int
+__sysctl(struct proc *p, struct sysctl_args *uap, int *retval)
{
- register struct proc *p;
- register struct kinfo_proc *dp = (struct kinfo_proc *)where;
- register int needed = 0;
- int buflen = where != NULL ? *sizep : 0;
- int doingzomb;
- struct eproc eproc;
- int error = 0;
+ int error, i, j, name[CTL_MAXNAME];
- if (namelen != 2 && !(namelen == 1 && name[0] == KERN_PROC_ALL))
+ if (uap->namelen > CTL_MAXNAME || uap->namelen < 2)
return (EINVAL);
- p = allproc.lh_first;
- doingzomb = 0;
-again:
- for (; p != 0; p = p->p_list.le_next) {
- /*
- * Skip embryonic processes.
- */
- if (p->p_stat == SIDL)
- continue;
- /*
- * TODO - make more efficient (see notes below).
- * do by session.
- */
- switch (name[0]) {
- case KERN_PROC_PID:
- /* could do this with just a lookup */
- if (p->p_pid != (pid_t)name[1])
- continue;
- break;
+ error = copyin(uap->name, &name, uap->namelen * sizeof(int));
+ if (error)
+ return (error);
- case KERN_PROC_PGRP:
- /* could do this by traversing pgrp */
- if (p->p_pgrp->pg_id != (pid_t)name[1])
- continue;
- break;
+ error = userland_sysctl(p, name, uap->namelen,
+ uap->old, uap->oldlenp, 0,
+ uap->new, uap->newlen, &j);
+ if (error && error != ENOMEM)
+ return (error);
+ if (uap->oldlenp) {
+ i = copyout(&j, uap->oldlenp, sizeof(j));
+ if (i)
+ return (i);
+ }
+ return (error);
+}
- case KERN_PROC_TTY:
- if ((p->p_flag & P_CONTROLT) == 0 ||
- p->p_session->s_ttyp == NULL ||
- p->p_session->s_ttyp->t_dev != (dev_t)name[1])
- continue;
- break;
+/*
+ * This is used from various compatibility syscalls too. That's why name
+ * must be in kernel space.
+ */
+int
+userland_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, void *new, size_t newlen, int *retval)
+{
+ int error = 0;
+ struct sysctl_req req, req2;
- case KERN_PROC_UID:
- if (p->p_ucred->cr_uid != (uid_t)name[1])
- continue;
- break;
+ bzero(&req, sizeof req);
- case KERN_PROC_RUID:
- if (p->p_cred->p_ruid != (uid_t)name[1])
- continue;
- break;
- }
- if (buflen >= sizeof(struct kinfo_proc)) {
- fill_eproc(p, &eproc);
- if (error = copyout((caddr_t)p, &dp->kp_proc,
- sizeof(struct proc)))
- return (error);
- if (error = copyout((caddr_t)&eproc, &dp->kp_eproc,
- sizeof(eproc)))
+ req.p = p;
+
+ if (oldlenp) {
+ if (inkernel) {
+ req.oldlen = *oldlenp;
+ } else {
+ error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp));
+ if (error)
return (error);
- dp++;
- buflen -= sizeof(struct kinfo_proc);
}
- needed += sizeof(struct kinfo_proc);
}
- if (doingzomb == 0) {
- p = zombproc.lh_first;
- doingzomb++;
- goto again;
+
+ if (old) {
+ if (!useracc(old, req.oldlen, B_WRITE))
+ return (EFAULT);
+ req.oldptr= old;
+ }
+
+ if (newlen) {
+ if (!useracc(new, req.newlen, B_READ))
+ return (EFAULT);
+ req.newlen = newlen;
+ req.newptr = new;
}
- if (where != NULL) {
- *sizep = (caddr_t)dp - where;
- if (needed > *sizep)
- return (ENOMEM);
- } else {
- needed += KERN_PROCSLOP;
- *sizep = needed;
+
+ req.oldfunc = sysctl_old_user;
+ req.newfunc = sysctl_new_user;
+ req.lock = 1;
+
+ /* XXX this should probably be done in a general way */
+ while (memlock.sl_lock) {
+ memlock.sl_want = 1;
+ (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0);
+ memlock.sl_locked++;
}
- return (0);
-}
+ memlock.sl_lock = 1;
-/*
- * Fill in an eproc structure for the specified process.
- */
-void
-fill_eproc(p, ep)
- register struct proc *p;
- register struct eproc *ep;
-{
- register struct tty *tp;
-
- ep->e_paddr = p;
- ep->e_sess = p->p_pgrp->pg_session;
- ep->e_pcred = *p->p_cred;
- ep->e_ucred = *p->p_ucred;
- if (p->p_stat == SIDL || p->p_stat == SZOMB) {
- ep->e_vm.vm_rssize = 0;
- ep->e_vm.vm_tsize = 0;
- ep->e_vm.vm_dsize = 0;
- ep->e_vm.vm_ssize = 0;
-#ifndef sparc
- /* ep->e_vm.vm_pmap = XXX; */
-#endif
- } else {
- register struct vmspace *vm = p->p_vmspace;
+ do {
+ req2 = req;
+ error = sysctl_root(0, name, namelen, &req2);
+ } while (error == EAGAIN);
-#ifdef pmap_resident_count
- ep->e_vm.vm_rssize = pmap_resident_count(&vm->vm_pmap); /*XXX*/
-#else
- ep->e_vm.vm_rssize = vm->vm_rssize;
-#endif
- ep->e_vm.vm_tsize = vm->vm_tsize;
- ep->e_vm.vm_dsize = vm->vm_dsize;
- ep->e_vm.vm_ssize = vm->vm_ssize;
-#ifndef sparc
- ep->e_vm.vm_pmap = vm->vm_pmap;
-#endif
+ req = req2;
+ if (req.lock == 2)
+ vsunlock(req.oldptr, req.oldlen, B_WRITE);
+
+ memlock.sl_lock = 0;
+
+ if (memlock.sl_want) {
+ memlock.sl_want = 0;
+ wakeup((caddr_t)&memlock);
}
- if (p->p_pptr)
- ep->e_ppid = p->p_pptr->p_pid;
- else
- ep->e_ppid = 0;
- ep->e_pgid = p->p_pgrp->pg_id;
- ep->e_jobc = p->p_pgrp->pg_jobc;
- if ((p->p_flag & P_CONTROLT) &&
- (tp = ep->e_sess->s_ttyp)) {
- ep->e_tdev = tp->t_dev;
- ep->e_tpgid = tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID;
- ep->e_tsess = tp->t_session;
- } else
- ep->e_tdev = NODEV;
- ep->e_flag = ep->e_sess->s_ttyvp ? EPROC_CTTY : 0;
- if (SESS_LEADER(p))
- ep->e_flag |= EPROC_SLEADER;
- if (p->p_wmesg)
- strncpy(ep->e_wmesg, p->p_wmesg, WMESGLEN);
- ep->e_xsize = ep->e_xrssize = 0;
- ep->e_xccount = ep->e_xswrss = 0;
+
+ if (error && error != ENOMEM)
+ return (error);
+
+ if (retval) {
+ if (req.oldptr && req.oldidx > req.oldlen)
+ *retval = req.oldlen;
+ else
+ *retval = req.oldidx;
+ }
+ return (error);
}
#ifdef COMPAT_43
#include <sys/socket.h>
+#include <vm/vm_param.h>
+
#define KINFO_PROC (0<<8)
#define KINFO_RT (1<<8)
#define KINFO_VNODE (2<<8)
@@ -712,81 +880,197 @@ fill_eproc(p, ep)
#define KINFO_LOADAVG (5<<8)
#define KINFO_CLOCKRATE (6<<8)
-compat_43_getkerninfo(p, uap, retval)
- struct proc *p;
- register struct compat_43_getkerninfo_args /* {
- syscallarg(int) op;
- syscallarg(char *) where;
- syscallarg(int *) size;
- syscallarg(int) arg;
- } */ *uap;
- register_t *retval;
-{
- int error, name[5];
- size_t size;
+/* Non-standard BSDI extension - only present on their 4.3 net-2 releases */
+#define KINFO_BSDI_SYSINFO (101<<8)
- if (SCARG(uap, size) && (error = copyin((caddr_t)SCARG(uap, size),
- (caddr_t)&size, sizeof(size))))
- return (error);
+/*
+ * XXX this is bloat, but I hope it's better here than on the potentially
+ * limited kernel stack... -Peter
+ */
+
+static struct {
+ int bsdi_machine; /* "i386" on BSD/386 */
+/* ^^^ this is an offset to the string, relative to the struct start */
+ char *pad0;
+ long pad1;
+ long pad2;
+ long pad3;
+ u_long pad4;
+ u_long pad5;
+ u_long pad6;
+
+ int bsdi_ostype; /* "BSD/386" on BSD/386 */
+ int bsdi_osrelease; /* "1.1" on BSD/386 */
+ long pad7;
+ long pad8;
+ char *pad9;
+
+ long pad10;
+ long pad11;
+ int pad12;
+ long pad13;
+ quad_t pad14;
+ long pad15;
+
+ struct timeval pad16;
+	/* we don't set this, because BSDI's uname used gethostname() instead */
+ int bsdi_hostname; /* hostname on BSD/386 */
- switch (SCARG(uap, op) & 0xff00) {
+ /* the actual string data is appended here */
+
+} bsdi_si;
+/*
+ * this data is appended to the end of the bsdi_si structure during copyout.
+ * The "char *" offsets are relative to the base of the bsdi_si struct.
+ * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings
+ * should not exceed the length of the buffer here... (or else!! :-)
+ */
+static char bsdi_strings[80]; /* It had better be less than this! */
+
+#ifndef _SYS_SYSPROTO_H_
+struct getkerninfo_args {
+ int op;
+ char *where;
+ int *size;
+ int arg;
+};
+#endif
+
+int
+ogetkerninfo(struct proc *p, struct getkerninfo_args *uap, int *retval)
+{
+ int error, name[6];
+ u_int size;
+
+ switch (uap->op & 0xff00) {
case KINFO_RT:
- name[0] = PF_ROUTE;
- name[1] = 0;
- name[2] = (SCARG(uap, op) & 0xff0000) >> 16;
- name[3] = SCARG(uap, op) & 0xff;
- name[4] = SCARG(uap, arg);
- error =
- net_sysctl(name, 5, SCARG(uap, where), &size, NULL, 0, p);
+ name[0] = CTL_NET;
+ name[1] = PF_ROUTE;
+ name[2] = 0;
+ name[3] = (uap->op & 0xff0000) >> 16;
+ name[4] = uap->op & 0xff;
+ name[5] = uap->arg;
+ error = userland_sysctl(p, name, 6, uap->where, uap->size,
+ 0, 0, 0, &size);
break;
case KINFO_VNODE:
- name[0] = KERN_VNODE;
- error =
- kern_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p);
+ name[0] = CTL_KERN;
+ name[1] = KERN_VNODE;
+ error = userland_sysctl(p, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
break;
case KINFO_PROC:
- name[0] = KERN_PROC;
- name[1] = SCARG(uap, op) & 0xff;
- name[2] = SCARG(uap, arg);
- error =
- kern_sysctl(name, 3, SCARG(uap, where), &size, NULL, 0, p);
+ name[0] = CTL_KERN;
+ name[1] = KERN_PROC;
+ name[2] = uap->op & 0xff;
+ name[3] = uap->arg;
+ error = userland_sysctl(p, name, 4, uap->where, uap->size,
+ 0, 0, 0, &size);
break;
case KINFO_FILE:
- name[0] = KERN_FILE;
- error =
- kern_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p);
+ name[0] = CTL_KERN;
+ name[1] = KERN_FILE;
+ error = userland_sysctl(p, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
break;
case KINFO_METER:
- name[0] = VM_METER;
- error =
- vm_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p);
+ name[0] = CTL_VM;
+ name[1] = VM_METER;
+ error = userland_sysctl(p, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
break;
case KINFO_LOADAVG:
- name[0] = VM_LOADAVG;
- error =
- vm_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p);
+ name[0] = CTL_VM;
+ name[1] = VM_LOADAVG;
+ error = userland_sysctl(p, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
break;
case KINFO_CLOCKRATE:
- name[0] = KERN_CLOCKRATE;
- error =
- kern_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p);
+ name[0] = CTL_KERN;
+ name[1] = KERN_CLOCKRATE;
+ error = userland_sysctl(p, name, 2, uap->where, uap->size,
+ 0, 0, 0, &size);
break;
+ case KINFO_BSDI_SYSINFO: {
+ /*
+ * this is pretty crude, but it's just enough for uname()
+ * from BSDI's 1.x libc to work.
+ *
+ * In particular, it doesn't return the same results when
+ * the supplied buffer is too small. BSDI's version apparently
+ * will return the amount copied, and set the *size to how
+ * much was needed. The emulation framework here isn't capable
+ * of that, so we just set both to the amount copied.
+ * BSDI's 2.x product apparently fails with ENOMEM in this
+ * scenario.
+ */
+
+ u_int needed;
+ u_int left;
+ char *s;
+
+ bzero((char *)&bsdi_si, sizeof(bsdi_si));
+ bzero(bsdi_strings, sizeof(bsdi_strings));
+
+ s = bsdi_strings;
+
+ bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, ostype);
+ s += strlen(s) + 1;
+
+ bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, osrelease);
+ s += strlen(s) + 1;
+
+ bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si);
+ strcpy(s, machine);
+ s += strlen(s) + 1;
+
+ needed = sizeof(bsdi_si) + (s - bsdi_strings);
+
+ if (uap->where == NULL) {
+ /* process is asking how much buffer to supply.. */
+ size = needed;
+ error = 0;
+ break;
+ }
+
+
+ /* if too much buffer supplied, trim it down */
+ if (size > needed)
+ size = needed;
+
+ /* how much of the buffer is remaining */
+ left = size;
+
+ if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0)
+ break;
+
+ /* is there any point in continuing? */
+ if (left > sizeof(bsdi_si)) {
+ left -= sizeof(bsdi_si);
+ error = copyout(&bsdi_strings,
+ uap->where + sizeof(bsdi_si), left);
+ }
+ break;
+ }
+
default:
return (EOPNOTSUPP);
}
if (error)
return (error);
*retval = size;
- if (SCARG(uap, size))
- error = copyout((caddr_t)&size, (caddr_t)SCARG(uap, size),
+ if (uap->size)
+ error = copyout((caddr_t)&size, (caddr_t)uap->size,
sizeof(size));
return (error);
}
diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c
new file mode 100644
index 0000000..171ed0e
--- /dev/null
+++ b/sys/kern/kern_tc.c
@@ -0,0 +1,1303 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
+ * $Id: kern_clock.c,v 1.34 1997/03/22 16:52:19 mpp Exp $
+ */
+
+/* Portions of this software are covered by the following: */
+/******************************************************************************
+ * *
+ * Copyright (c) David L. Mills 1993, 1994 *
+ * *
+ * Permission to use, copy, modify, and distribute this software and its *
+ * documentation for any purpose and without fee is hereby granted, provided *
+ * that the above copyright notice appears in all copies and that both the *
+ * copyright notice and this permission notice appear in supporting *
+ * documentation, and that the name University of Delaware not be used in *
+ * advertising or publicity pertaining to distribution of the software *
+ * without specific, written prior permission. The University of Delaware *
+ * makes no representations about the suitability this software for any *
+ * purpose. It is provided "as is" without express or implied warranty. *
+ * *
+ *****************************************************************************/
+
+#include "opt_cpu.h" /* XXX */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/dkstat.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/timex.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+#define CLOCK_HAIR /* XXX */
+#include <machine/clock.h>
+
+#ifdef GPROF
+#include <sys/gmon.h>
+#endif
+
+static void initclocks __P((void *dummy));
+SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
+
+/* Exported to machdep.c. */
+struct callout *callfree, *callout;
+
+static struct callout calltodo;
+
+/* Some of these don't belong here, but it's easiest to concentrate them. */
+static long cp_time[CPUSTATES];
+long dk_seek[DK_NDRIVE];
+static long dk_time[DK_NDRIVE];
+long dk_wds[DK_NDRIVE];
+long dk_wpms[DK_NDRIVE];
+long dk_xfer[DK_NDRIVE];
+
+int dk_busy;
+int dk_ndrive = 0;
+char dk_names[DK_NDRIVE][DK_NAMELEN];
+
+long tk_cancc;
+long tk_nin;
+long tk_nout;
+long tk_rawcc;
+
+/*
+ * Clock handling routines.
+ *
+ * This code is written to operate with two timers that run independently of
+ * each other. The main clock, running hz times per second, is used to keep
+ * track of real time. The second timer handles kernel and user profiling,
+ * and does resource use estimation. If the second timer is programmable,
+ * it is randomized to avoid aliasing between the two clocks. For example,
+ * the randomization prevents an adversary from always giving up the cpu
+ * just before its quantum expires. Otherwise, it would never accumulate
+ * cpu ticks. The mean frequency of the second timer is stathz.
+ *
+ * If no second timer exists, stathz will be zero; in this case we drive
+ * profiling and statistics off the main clock. This WILL NOT be accurate;
+ * do not do it unless absolutely necessary.
+ *
+ * The statistics clock may (or may not) be run at a higher rate while
+ * profiling. This profile clock runs at profhz. We require that profhz
+ * be an integral multiple of stathz.
+ *
+ * If the statistics clock is running fast, it must be divided by the ratio
+ * profhz/stathz for statistics. (For profiling, every tick counts.)
+ */
+
+/*
+ * TODO:
+ * allocate more timeout table slots when table overflows.
+ */
+
+/*
+ * Bump a timeval by a small number of usec's.
+ */
+#define BUMPTIME(t, usec) { \
+ register volatile struct timeval *tp = (t); \
+ register long us; \
+ \
+ tp->tv_usec = us = tp->tv_usec + (usec); \
+ if (us >= 1000000) { \
+ tp->tv_usec = us - 1000000; \
+ tp->tv_sec++; \
+ } \
+}
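+
+/*
+ * Editorial sketch, not part of the original sources: how BUMPTIME
+ * carries a microsecond overflow into the seconds field. Assuming
+ * hz = 100, one tick is 10000 us. The function name is hypothetical
+ * and the block is compiled out.
+ */
+#if 0
+static void
+bumptime_example(void)
+{
+ volatile struct timeval tv;
+
+ tv.tv_sec = 5;
+ tv.tv_usec = 995000;
+ BUMPTIME(&tv, 10000); /* 995000 + 10000 crosses 1000000 ... */
+ /* ... so tv is now { tv_sec = 6, tv_usec = 5000 }. */
+}
+#endif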
+
+int stathz;
+int profhz;
+static int profprocs;
+int ticks;
+static int psdiv, pscnt; /* prof => stat divider */
+int psratio; /* ratio: prof / stat */
+
+volatile struct timeval time;
+volatile struct timeval mono_time;
+
+/*
+ * Phase/frequency-lock loop (PLL/FLL) definitions
+ *
+ * The following variables are read and set by the ntp_adjtime() system
+ * call.
+ *
+ * time_state shows the state of the system clock, with values defined
+ * in the timex.h header file.
+ *
+ * time_status shows the status of the system clock, with bits defined
+ * in the timex.h header file.
+ *
+ * time_offset is used by the PLL/FLL to adjust the system time in small
+ * increments.
+ *
+ * time_constant determines the bandwidth or "stiffness" of the PLL.
+ *
+ * time_tolerance determines maximum frequency error or tolerance of the
+ * CPU clock oscillator and is a property of the architecture; however,
+ * in principle it could change as a result of the presence of external
+ * discipline signals, for instance.
+ *
+ * time_precision is usually equal to the kernel tick variable; however,
+ * in cases where a precision clock counter or external clock is
+ * available, the resolution can be much less than this and depend on
+ * whether the external clock is working or not.
+ *
+ * time_maxerror is initialized by an ntp_adjtime() call and increased by
+ * the kernel once each second to reflect the maximum error
+ * bound growth.
+ *
+ * time_esterror is set and read by the ntp_adjtime() call, but
+ * otherwise not used by the kernel.
+ */
+int time_status = STA_UNSYNC; /* clock status bits */
+int time_state = TIME_OK; /* clock state */
+long time_offset = 0; /* time offset (us) */
+long time_constant = 0; /* pll time constant */
+long time_tolerance = MAXFREQ; /* frequency tolerance (scaled ppm) */
+long time_precision = 1; /* clock precision (us) */
+long time_maxerror = MAXPHASE; /* maximum error (us) */
+long time_esterror = MAXPHASE; /* estimated error (us) */
+
+/*
+ * The following variables establish the state of the PLL/FLL and the
+ * residual time and frequency offset of the local clock. The scale
+ * factors are defined in the timex.h header file.
+ *
+ * time_phase and time_freq are the phase increment and the frequency
+ * increment, respectively, of the kernel time variable at each tick of
+ * the clock.
+ *
+ * time_freq is set via ntp_adjtime() from a value stored in a file when
+ * the synchronization daemon is first started. Its value is retrieved
+ * via ntp_adjtime() and written to the file about once per hour by the
+ * daemon.
+ *
+ * time_adj is the adjustment added to the value of tick at each timer
+ * interrupt and is recomputed from time_phase and time_freq at each
+ * seconds rollover.
+ *
+ * time_reftime is the second's portion of the system time on the last
+ * call to ntp_adjtime(). It is used to adjust the time_freq variable
+ * and to increase the time_maxerror as the time since last update
+ * increases.
+ */
+static long time_phase = 0; /* phase offset (scaled us) */
+long time_freq = 0; /* frequency offset (scaled ppm) */
+static long time_adj = 0; /* tick adjust (scaled 1 / hz) */
+static long time_reftime = 0; /* time at last adjustment (s) */
+
+#ifdef PPS_SYNC
+/*
+ * The following variables are used only if the kernel PPS discipline
+ * code is configured (PPS_SYNC). The scale factors are defined in the
+ * timex.h header file.
+ *
+ * pps_time contains the time at each calibration interval, as read by
+ * microtime(). pps_count counts the seconds of the calibration
+ * interval, the duration of which is nominally pps_shift in powers of
+ * two.
+ *
+ * pps_offset is the time offset produced by the time median filter
+ * pps_tf[], while pps_jitter is the dispersion (jitter) measured by
+ * this filter.
+ *
+ * pps_freq is the frequency offset produced by the frequency median
+ * filter pps_ff[], while pps_stabil is the dispersion (wander) measured
+ * by this filter.
+ *
+ * pps_usec is latched from a high resolution counter or external clock
+ * at pps_time. Here we want the hardware counter contents only, not the
+ * contents plus the time_tv.usec as usual.
+ *
+ * pps_valid counts the number of seconds since the last PPS update. It
+ * is used as a watchdog timer to disable the PPS discipline should the
+ * PPS signal be lost.
+ *
+ * pps_glitch counts the number of seconds since the beginning of an
+ * offset burst more than tick/2 from current nominal offset. It is used
+ * mainly to suppress error bursts due to priority conflicts between the
+ * PPS interrupt and timer interrupt.
+ *
+ * pps_intcnt counts the calibration intervals for use in the interval-
+ * adaptation algorithm. It's just too complicated for words.
+ */
+struct timeval pps_time; /* kernel time at last interval */
+long pps_offset = 0; /* pps time offset (us) */
+long pps_jitter = MAXTIME; /* pps time dispersion (jitter) (us) */
+long pps_tf[] = {0, 0, 0}; /* pps time offset median filter (us) */
+long pps_freq = 0; /* frequency offset (scaled ppm) */
+long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */
+long pps_ff[] = {0, 0, 0}; /* frequency offset median filter */
+long pps_usec = 0; /* microsec counter at last interval */
+long pps_valid = PPS_VALID; /* pps signal watchdog counter */
+int pps_glitch = 0; /* pps signal glitch counter */
+int pps_count = 0; /* calibration interval counter (s) */
+int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */
+int pps_intcnt = 0; /* intervals at current duration */
+
+/*
+ * PPS signal quality monitors
+ *
+ * pps_jitcnt counts the seconds that have been discarded because the
+ * jitter measured by the time median filter exceeds the limit MAXTIME
+ * (100 us).
+ *
+ * pps_calcnt counts the frequency calibration intervals, which are
+ * variable from 4 s to 256 s.
+ *
+ * pps_errcnt counts the calibration intervals which have been discarded
+ * because the wander exceeds the limit MAXFREQ (100 ppm) or where the
+ * calibration interval jitter exceeds two ticks.
+ *
+ * pps_stbcnt counts the calibration intervals that have been discarded
+ * because the frequency wander exceeds the limit MAXFREQ / 4 (25 ppm).
+ */
+long pps_jitcnt = 0; /* jitter limit exceeded */
+long pps_calcnt = 0; /* calibration intervals */
+long pps_errcnt = 0; /* calibration errors */
+long pps_stbcnt = 0; /* stability limit exceeded */
+#endif /* PPS_SYNC */
+
+/* XXX none of this stuff works under FreeBSD */
+#ifdef EXT_CLOCK
+/*
+ * External clock definitions
+ *
+ * The following definitions and declarations are used only if an
+ * external clock (HIGHBALL or TPRO) is configured on the system.
+ */
+#define CLOCK_INTERVAL 30 /* CPU clock update interval (s) */
+
+/*
+ * The clock_count variable is set to CLOCK_INTERVAL at each PPS
+ * interrupt and decremented once each second.
+ */
+int clock_count = 0; /* CPU clock counter */
+
+#ifdef HIGHBALL
+/*
+ * The clock_offset and clock_cpu variables are used by the HIGHBALL
+ * interface. The clock_offset variable defines the offset between
+ * system time and the HIGHBALL counters. The clock_cpu variable contains
+ * the offset between the system clock and the HIGHBALL clock for use in
+ * disciplining the kernel time variable.
+ */
+extern struct timeval clock_offset; /* Highball clock offset */
+long clock_cpu = 0; /* CPU clock adjust */
+#endif /* HIGHBALL */
+#endif /* EXT_CLOCK */
+
+/*
+ * hardupdate() - local clock update
+ *
+ * This routine is called by ntp_adjtime() to update the local clock
+ * phase and frequency. The implementation is of an adaptive-parameter,
+ * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
+ * time and frequency offset estimates for each call. If the kernel PPS
+ * discipline code is configured (PPS_SYNC), the PPS signal itself
+ * determines the new time offset, instead of the calling argument.
+ * Presumably, calls to ntp_adjtime() occur only when the caller
+ * believes the local clock is valid within some bound (+-128 ms with
+ * NTP). If the caller's time is far different than the PPS time, an
+ * argument will ensue, and it's not clear who will lose.
+ *
+ * For uncompensated quartz crystal oscillators and nominal update
+ * intervals less than 1024 s, operation should be in phase-lock mode
+ * (STA_FLL = 0), where the loop is disciplined to phase. For update
+ * intervals greater than this, operation should be in frequency-lock
+ * mode (STA_FLL = 1), where the loop is disciplined to frequency.
+ *
+ * Note: splclock() is in effect.
+ */
+void
+hardupdate(offset)
+ long offset;
+{
+ long ltemp, mtemp;
+
+ if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME))
+ return;
+ ltemp = offset;
+#ifdef PPS_SYNC
+ if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
+ ltemp = pps_offset;
+#endif /* PPS_SYNC */
+
+ /*
+ * Scale the phase adjustment and clamp to the operating range.
+ */
+ if (ltemp > MAXPHASE)
+ time_offset = MAXPHASE << SHIFT_UPDATE;
+ else if (ltemp < -MAXPHASE)
+ time_offset = -(MAXPHASE << SHIFT_UPDATE);
+ else
+ time_offset = ltemp << SHIFT_UPDATE;
+
+ /*
+ * Select whether the frequency is to be controlled and in which
+ * mode (PLL or FLL). Clamp to the operating range. Ugly
+ * multiply/divide should be replaced someday.
+ */
+ if (time_status & STA_FREQHOLD || time_reftime == 0)
+ time_reftime = time.tv_sec;
+ mtemp = time.tv_sec - time_reftime;
+ time_reftime = time.tv_sec;
+ if (time_status & STA_FLL) {
+ if (mtemp >= MINSEC) {
+ ltemp = ((time_offset / mtemp) << (SHIFT_USEC -
+ SHIFT_UPDATE));
+ if (ltemp < 0)
+ time_freq -= -ltemp >> SHIFT_KH;
+ else
+ time_freq += ltemp >> SHIFT_KH;
+ }
+ } else {
+ if (mtemp < MAXSEC) {
+ ltemp *= mtemp;
+ if (ltemp < 0)
+ time_freq -= -ltemp >> (time_constant +
+ time_constant + SHIFT_KF -
+ SHIFT_USEC);
+ else
+ time_freq += ltemp >> (time_constant +
+ time_constant + SHIFT_KF -
+ SHIFT_USEC);
+ }
+ }
+ if (time_freq > time_tolerance)
+ time_freq = time_tolerance;
+ else if (time_freq < -time_tolerance)
+ time_freq = -time_tolerance;
+}
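+
+/*
+ * Editorial note on the clamps above: the phase offset is limited to
+ * +-MAXPHASE before the SHIFT_UPDATE scaling, matching the +-512 ms
+ * phase bound described in hardclock() below, and the frequency
+ * estimate is likewise held within +-time_tolerance (MAXFREQ unless
+ * changed via ntp_adjtime()).
+ */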
+
+/*
+ * Initialize clock frequencies and start both clocks running.
+ */
+/* ARGSUSED*/
+static void
+initclocks(dummy)
+ void *dummy;
+{
+ register int i;
+
+ /*
+ * Set divisors to 1 (normal case) and let the machine-specific
+ * code do its bit.
+ */
+ psdiv = pscnt = 1;
+ cpu_initclocks();
+
+ /*
+ * Compute profhz/stathz, and fix profhz if needed.
+ */
+ i = stathz ? stathz : hz;
+ if (profhz == 0)
+ profhz = i;
+ psratio = profhz / i;
+}
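+
+/*
+ * Editorial example: with stathz = 128 and profhz = 1024, psratio is
+ * 8, so statistics keep one profiling tick in eight; if there is no
+ * separate statistics clock (stathz == 0), profiling and statistics
+ * both derive from hz.
+ */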
+
+/*
+ * The real-time timer, interrupting hz times per second.
+ */
+void
+hardclock(frame)
+ register struct clockframe *frame;
+{
+ register struct callout *p1;
+ register struct proc *p;
+ register int needsoft;
+
+ /*
+ * Update real-time timeout queue.
+ * At front of queue are some number of events which are ``due''.
+ * The time to these is <= 0 and if negative represents the
+ * number of ticks which have passed since it was supposed to happen.
+ * The rest of the q elements (times > 0) are events yet to happen,
+ * where the time for each is given as a delta from the previous.
+ * Decrementing just the first of these serves to decrement the time
+ * to all events.
+ */
+ needsoft = 0;
+ for (p1 = calltodo.c_next; p1 != NULL; p1 = p1->c_next) {
+ if (--p1->c_time > 0)
+ break;
+ needsoft = 1;
+ if (p1->c_time == 0)
+ break;
+ }
+
+ p = curproc;
+ if (p) {
+ register struct pstats *pstats;
+
+ /*
+ * Run current process's virtual and profile time, as needed.
+ */
+ pstats = p->p_stats;
+ if (CLKF_USERMODE(frame) &&
+ timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
+ itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
+ psignal(p, SIGVTALRM);
+ if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
+ itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
+ psignal(p, SIGPROF);
+ }
+
+ /*
+ * If no separate statistics clock is available, run it from here.
+ */
+ if (stathz == 0)
+ statclock(frame);
+
+ /*
+ * Increment the time-of-day.
+ */
+ ticks++;
+ {
+ int time_update;
+ struct timeval newtime = time;
+ long ltemp;
+
+ if (timedelta == 0) {
+ time_update = CPU_THISTICKLEN(tick);
+ } else {
+ time_update = CPU_THISTICKLEN(tick) + tickdelta;
+ timedelta -= tickdelta;
+ }
+ BUMPTIME(&mono_time, time_update);
+
+ /*
+ * Compute the phase adjustment. If the low-order bits
+ * (time_phase) of the update overflow, bump the high-order bits
+ * (time_update).
+ */
+ time_phase += time_adj;
+ if (time_phase <= -FINEUSEC) {
+ ltemp = -time_phase >> SHIFT_SCALE;
+ time_phase += ltemp << SHIFT_SCALE;
+ time_update -= ltemp;
+ }
+ else if (time_phase >= FINEUSEC) {
+ ltemp = time_phase >> SHIFT_SCALE;
+ time_phase -= ltemp << SHIFT_SCALE;
+ time_update += ltemp;
+ }
+
+ newtime.tv_usec += time_update;
+ /*
+ * On rollover of the second the phase adjustment to be used for
+ * the next second is calculated. Also, the maximum error is
+ * increased by the tolerance. If the PPS frequency discipline
+ * code is present, the phase is increased to compensate for the
+ * CPU clock oscillator frequency error.
+ *
+ * On a 32-bit machine and given parameters in the timex.h
+ * header file, the maximum phase adjustment is +-512 ms and
+ * maximum frequency offset is (a tad less than) +-512 ppm. On a
+ * 64-bit machine, you shouldn't need to ask.
+ */
+ if (newtime.tv_usec >= 1000000) {
+ newtime.tv_usec -= 1000000;
+ newtime.tv_sec++;
+ time_maxerror += time_tolerance >> SHIFT_USEC;
+
+ /*
+ * Compute the phase adjustment for the next second. In
+ * PLL mode, the offset is reduced by a fixed factor
+ * times the time constant. In FLL mode the offset is
+ * used directly. In either mode, the maximum phase
+ * adjustment for each second is clamped so as to spread
+ * the adjustment over not more than the number of
+ * seconds between updates.
+ */
+ if (time_offset < 0) {
+ ltemp = -time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) <<
+ SHIFT_UPDATE;
+ time_offset += ltemp;
+ time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ -
+ SHIFT_UPDATE);
+ } else {
+ ltemp = time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) <<
+ SHIFT_UPDATE;
+ time_offset -= ltemp;
+ time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ -
+ SHIFT_UPDATE);
+ }
+
+ /*
+ * Compute the frequency estimate and additional phase
+ * adjustment due to frequency error for the next
+ * second. When the PPS signal is engaged, gnaw on the
+ * watchdog counter and update the frequency computed by
+ * the pll and the PPS signal.
+ */
+#ifdef PPS_SYNC
+ pps_valid++;
+ if (pps_valid == PPS_VALID) {
+ pps_jitter = MAXTIME;
+ pps_stabil = MAXFREQ;
+ time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+ STA_PPSWANDER | STA_PPSERROR);
+ }
+ ltemp = time_freq + pps_freq;
+#else
+ ltemp = time_freq;
+#endif /* PPS_SYNC */
+ if (ltemp < 0)
+ time_adj -= -ltemp >>
+ (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+ else
+ time_adj += ltemp >>
+ (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+
+#if SHIFT_HZ == 7
+ /*
+ * When the CPU clock oscillator frequency is not a
+ * power of two in Hz, the SHIFT_HZ is only an
+ * approximate scale factor. In the SunOS kernel, this
+ * results in a PLL gain factor of 1/1.28 = 0.78 of what it
+ * should be. In the following code the overall gain is
+ * increased by a factor of 1.25, which results in a
+ * residual error less than 3 percent.
+ */
+ /* Same thing applies for FreeBSD --GAW */
+ if (hz == 100) {
+ if (time_adj < 0)
+ time_adj -= -time_adj >> 2;
+ else
+ time_adj += time_adj >> 2;
+ }
+#endif /* SHIFT_HZ */
+
+ /* XXX - this is really bogus, but can't be fixed until
+ xntpd's idea of the system clock is fixed to know how
+ the user wants leap seconds handled; in the mean time,
+ we assume that users of NTP are running without proper
+ leap second support (this is now the default anyway) */
+ /*
+ * Leap second processing. If in leap-insert state at
+ * the end of the day, the system clock is set back one
+ * second; if in leap-delete state, the system clock is
+ * set ahead one second. The microtime() routine or
+ * external clock driver will ensure that reported time
+ * is always monotonic. The ugly divides should be
+ * replaced.
+ */
+ switch (time_state) {
+
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+
+ case TIME_INS:
+ if (newtime.tv_sec % 86400 == 0) {
+ newtime.tv_sec--;
+ time_state = TIME_OOP;
+ }
+ break;
+
+ case TIME_DEL:
+ if ((newtime.tv_sec + 1) % 86400 == 0) {
+ newtime.tv_sec++;
+ time_state = TIME_WAIT;
+ }
+ break;
+
+ case TIME_OOP:
+ time_state = TIME_WAIT;
+ break;
+
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ }
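+ /*
+ * Editorial walk-through of the switch above: with STA_INS set,
+ * the state moves TIME_OK -> TIME_INS; at the 86400 s day
+ * rollover the clock steps back one second (repeating the last
+ * second of the day) and parks in TIME_OOP, then TIME_WAIT,
+ * returning to TIME_OK once the daemon clears STA_INS.
+ */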
+ }
+ CPU_CLOCKUPDATE(&time, &newtime);
+ }
+
+ /*
+ * Process callouts at a very low cpu priority, so we don't keep the
+ * relatively high clock interrupt priority any longer than necessary.
+ */
+ if (needsoft) {
+ if (CLKF_BASEPRI(frame)) {
+ /*
+ * Save the overhead of a software interrupt;
+ * it will happen as soon as we return, so do it now.
+ */
+ (void)splsoftclock();
+ softclock();
+ } else
+ setsoftclock();
+ }
+}
+
+/*
+ * Software (low priority) clock interrupt.
+ * Run periodic events from timeout queue.
+ */
+/*ARGSUSED*/
+void
+softclock()
+{
+ register struct callout *c;
+ register void *arg;
+ register void (*func) __P((void *));
+ register int s;
+
+ s = splhigh();
+ while ((c = calltodo.c_next) != NULL && c->c_time <= 0) {
+ func = c->c_func;
+ arg = c->c_arg;
+ calltodo.c_next = c->c_next;
+ c->c_next = callfree;
+ callfree = c;
+ splx(s);
+ (*func)(arg);
+ (void) splhigh();
+ }
+ splx(s);
+}
+
+/*
+ * timeout --
+ * Execute a function after a specified length of time.
+ *
+ * untimeout --
+ * Cancel previous timeout function call.
+ *
+ * See AT&T BCI Driver Reference Manual for specification. This
+ * implementation differs from that one in that no identification
+ * value is returned from timeout, rather, the original arguments
+ * to timeout are used to identify entries for untimeout.
+ */
+void
+timeout(ftn, arg, ticks)
+ timeout_t ftn;
+ void *arg;
+ register int ticks;
+{
+ register struct callout *new, *p, *t;
+ register int s;
+
+ if (ticks <= 0)
+ ticks = 1;
+
+ /* Lock out the clock. */
+ s = splhigh();
+
+ /* Fill in the next free callout structure. */
+ if (callfree == NULL)
+ panic("timeout table full");
+ new = callfree;
+ callfree = new->c_next;
+ new->c_arg = arg;
+ new->c_func = ftn;
+
+ /*
+ * The time for each event is stored as a difference from the time
+ * of the previous event on the queue. Walk the queue, correcting
+ * the ticks argument for queue entries passed. Correct the ticks
+ * value for the queue entry immediately after the insertion point
+ * as well. Watch out for negative c_time values; these represent
+ * overdue events.
+ */
+ for (p = &calltodo;
+ (t = p->c_next) != NULL && ticks > t->c_time; p = t)
+ if (t->c_time > 0)
+ ticks -= t->c_time;
+ new->c_time = ticks;
+ if (t != NULL)
+ t->c_time -= ticks;
+
+ /* Insert the new entry into the queue. */
+ p->c_next = new;
+ new->c_next = t;
+ splx(s);
+}
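+
+/*
+ * Editorial example with hypothetical values: events due in 2, 5 and
+ * 9 ticks are stored as the deltas 2, 3, 4. Inserting an event due
+ * in 6 ticks walks past the first two entries (6 - 2 - 3 = 1) and
+ * lands between the 5- and 9-tick events, giving deltas 2, 3, 1, 3;
+ * the entry after the insertion point is corrected from 4 to 3.
+ */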
+
+void
+untimeout(ftn, arg)
+ timeout_t ftn;
+ void *arg;
+{
+ register struct callout *p, *t;
+ register int s;
+
+ s = splhigh();
+ for (p = &calltodo; (t = p->c_next) != NULL; p = t)
+ if (t->c_func == ftn && t->c_arg == arg) {
+ /* Increment next entry's tick count. */
+ if (t->c_next && t->c_time > 0)
+ t->c_next->c_time += t->c_time;
+
+ /* Move entry from callout queue to callfree queue. */
+ p->c_next = t->c_next;
+ t->c_next = callfree;
+ callfree = t;
+ break;
+ }
+ splx(s);
+}
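+
+/*
+ * Editorial usage sketch (compiled out): arm a one-shot callback 50
+ * ticks in the future, then cancel it. untimeout() identifies the
+ * entry by its (function, argument) pair. All names below are
+ * hypothetical.
+ */
+#if 0
+static char example_tag[] = "example";
+
+static void
+example_expire(void *arg)
+{
+ printf("timeout fired: %s\n", (char *)arg);
+}
+
+static void
+example_arm_and_cancel(void)
+{
+ timeout(example_expire, example_tag, 50);
+ untimeout(example_expire, example_tag);
+}
+#endif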
+
+void
+gettime(struct timeval *tvp)
+{
+ int s;
+
+ s = splclock();
+ /* XXX should use microtime() iff tv_usec is used. */
+ *tvp = time;
+ splx(s);
+}
+
+/*
+ * Compute number of hz until specified time. Used to
+ * compute third argument to timeout() from an absolute time.
+ */
+int
+hzto(tv)
+ struct timeval *tv;
+{
+ register unsigned long ticks;
+ register long sec, usec;
+ int s;
+
+ /*
+ * If the number of usecs in the whole seconds part of the time
+ * difference fits in a long, then the total number of usecs will
+ * fit in an unsigned long. Compute the total and convert it to
+ * ticks, rounding up and adding 1 to allow for the current tick
+ * to expire. Rounding also depends on unsigned long arithmetic
+ * to avoid overflow.
+ *
+ * Otherwise, if the number of ticks in the whole seconds part of
+ * the time difference fits in a long, then convert the parts to
+ * ticks separately and add, using similar rounding methods and
+ * overflow avoidance. This method would work in the previous
+ * case but it is slightly slower and assumes that hz is integral.
+ *
+ * Otherwise, round the time difference down to the maximum
+ * representable value.
+ *
+ * If ints have 32 bits, then the maximum value for any timeout in
+ * 10ms ticks is 248 days.
+ */
+ s = splclock();
+ sec = tv->tv_sec - time.tv_sec;
+ usec = tv->tv_usec - time.tv_usec;
+ splx(s);
+ if (usec < 0) {
+ sec--;
+ usec += 1000000;
+ }
+ if (sec < 0) {
+#ifdef DIAGNOSTIC
+ printf("hzto: negative time difference %ld sec %ld usec\n",
+ sec, usec);
+#endif
+ ticks = 1;
+ } else if (sec <= LONG_MAX / 1000000)
+ ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
+ / tick + 1;
+ else if (sec <= LONG_MAX / hz)
+ ticks = sec * hz
+ + ((unsigned long)usec + (tick - 1)) / tick + 1;
+ else
+ ticks = LONG_MAX;
+ if (ticks > INT_MAX)
+ ticks = INT_MAX;
+ return (ticks);
+}
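+
+/*
+ * Editorial example, assuming hz = 100 (tick = 10000 us): a target
+ * 2.5 s in the future gives sec = 2, usec = 500000, so
+ * ticks = (2000000 + 500000 + 9999) / 10000 + 1 = 250 + 1 = 251.
+ */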
+
+/*
+ * Start profiling on a process.
+ *
+ * Kernel profiling passes proc0 which never exits and hence
+ * keeps the profile clock running constantly.
+ */
+void
+startprofclock(p)
+ register struct proc *p;
+{
+ int s;
+
+ if ((p->p_flag & P_PROFIL) == 0) {
+ p->p_flag |= P_PROFIL;
+ if (++profprocs == 1 && stathz != 0) {
+ s = splstatclock();
+ psdiv = pscnt = psratio;
+ setstatclockrate(profhz);
+ splx(s);
+ }
+ }
+}
+
+/*
+ * Stop profiling on a process.
+ */
+void
+stopprofclock(p)
+ register struct proc *p;
+{
+ int s;
+
+ if (p->p_flag & P_PROFIL) {
+ p->p_flag &= ~P_PROFIL;
+ if (--profprocs == 0 && stathz != 0) {
+ s = splstatclock();
+ psdiv = pscnt = 1;
+ setstatclockrate(stathz);
+ splx(s);
+ }
+ }
+}
+
+/*
+ * Statistics clock. Grab profile sample, and if divider reaches 0,
+ * do process and kernel statistics.
+ */
+void
+statclock(frame)
+ register struct clockframe *frame;
+{
+#ifdef GPROF
+ register struct gmonparam *g;
+#endif
+ register struct proc *p;
+ register int i;
+ struct pstats *pstats;
+ long rss;
+ struct rusage *ru;
+ struct vmspace *vm;
+
+ if (CLKF_USERMODE(frame)) {
+ p = curproc;
+ if (p->p_flag & P_PROFIL)
+ addupc_intr(p, CLKF_PC(frame), 1);
+ if (--pscnt > 0)
+ return;
+ /*
+ * Came from user mode; CPU was in user state.
+ * If this process is being profiled record the tick.
+ */
+ p->p_uticks++;
+ if (p->p_nice > NZERO)
+ cp_time[CP_NICE]++;
+ else
+ cp_time[CP_USER]++;
+ } else {
+#ifdef GPROF
+ /*
+ * Kernel statistics are just like addupc_intr, only easier.
+ */
+ g = &_gmonparam;
+ if (g->state == GMON_PROF_ON) {
+ i = CLKF_PC(frame) - g->lowpc;
+ if (i < g->textsize) {
+ i /= HISTFRACTION * sizeof(*g->kcount);
+ g->kcount[i]++;
+ }
+ }
+#endif
+ if (--pscnt > 0)
+ return;
+ /*
+ * Came from kernel mode, so we were:
+ * - handling an interrupt,
+ * - doing syscall or trap work on behalf of the current
+ * user process, or
+ * - spinning in the idle loop.
+ * Whichever it is, charge the time as appropriate.
+ * Note that we charge interrupts to the current process,
+ * regardless of whether they are ``for'' that process,
+ * so that we know how much of its real time was spent
+ * in ``non-process'' (i.e., interrupt) work.
+ */
+ p = curproc;
+ if (CLKF_INTR(frame)) {
+ if (p != NULL)
+ p->p_iticks++;
+ cp_time[CP_INTR]++;
+ } else if (p != NULL) {
+ p->p_sticks++;
+ cp_time[CP_SYS]++;
+ } else
+ cp_time[CP_IDLE]++;
+ }
+ pscnt = psdiv;
+
+ /*
+ * We maintain statistics shown by user-level statistics
+ * programs: the amount of time in each cpu state, and
+ * the amount of time each of DK_NDRIVE ``drives'' is busy.
+ *
+ * XXX should either run linked list of drives, or (better)
+ * grab timestamps in the start & done code.
+ */
+ for (i = 0; i < DK_NDRIVE; i++)
+ if (dk_busy & (1 << i))
+ dk_time[i]++;
+
+ /*
+ * We adjust the priority of the current process. The priority of
+ * a process gets worse as it accumulates CPU time. The cpu usage
+ * estimator (p_estcpu) is increased here. The formula for computing
+ * priorities (in kern_synch.c) will compute a different value each
+ * time p_estcpu increases by 4. The cpu usage estimator ramps up
+ * quite quickly when the process is running (linearly), and decays
+ * away exponentially, at a rate which is proportionally slower when
+ * the system is busy. The basic principle is that the system will
+ * 90% forget that the process used a lot of CPU time in 5 * loadav
+ * seconds. This causes the system to favor processes which haven't
+ * run much recently, and to round-robin among other processes.
+ */
+ if (p != NULL) {
+ p->p_cpticks++;
+ if (++p->p_estcpu == 0)
+ p->p_estcpu--;
+ if ((p->p_estcpu & 3) == 0) {
+ resetpriority(p);
+ if (p->p_priority >= PUSER)
+ p->p_priority = p->p_usrpri;
+ }
+
+ /* Update resource usage integrals and maximums. */
+ if ((pstats = p->p_stats) != NULL &&
+ (ru = &pstats->p_ru) != NULL &&
+ (vm = p->p_vmspace) != NULL) {
+ ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024;
+ ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024;
+ ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024;
+ rss = vm->vm_pmap.pm_stats.resident_count *
+ PAGE_SIZE / 1024;
+ if (ru->ru_maxrss < rss)
+ ru->ru_maxrss = rss;
+ }
+ }
+}
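+
+/*
+ * Editorial note: because resetpriority() runs whenever p_estcpu
+ * crosses a multiple of 4, a CPU-bound process has its priority
+ * re-evaluated about stathz / 4 times per second (e.g. 32 times at
+ * stathz = 128).
+ */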
+
+/*
+ * Return information about system clocks.
+ */
+static int
+sysctl_kern_clockrate SYSCTL_HANDLER_ARGS
+{
+ struct clockinfo clkinfo;
+ /*
+ * Construct clockinfo structure.
+ */
+ clkinfo.hz = hz;
+ clkinfo.tick = tick;
+ clkinfo.profhz = profhz;
+ clkinfo.stathz = stathz ? stathz : hz;
+ return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
+}
+
+SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
+ 0, 0, sysctl_kern_clockrate, "S,clockinfo", "");
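+
+/*
+ * Editorial note: this structure is what userland reads back via
+ * "sysctl kern.clockrate"; hz = 100, tick = 10000, profhz = 1024,
+ * stathz = 128 would be typical i386 values of the era (illustrative,
+ * machine-dependent).
+ */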
+
+#ifdef PPS_SYNC
+/*
+ * hardpps() - discipline CPU clock oscillator to external PPS signal
+ *
+ * This routine is called at each PPS interrupt in order to discipline
+ * the CPU clock oscillator to the PPS signal. It measures the PPS phase
+ * and leaves it in a handy spot for the hardclock() routine. It
+ * integrates successive PPS phase differences and calculates the
+ * frequency offset. This is used in hardclock() to discipline the CPU
+ * clock oscillator so that intrinsic frequency error is cancelled out.
+ * The code requires the caller to capture the time and hardware counter
+ * value at the on-time PPS signal transition.
+ *
+ * Note that, on some Unix systems, this routine runs at an interrupt
+ * priority level higher than the timer interrupt routine hardclock().
+ * Therefore, the variables used are distinct from the hardclock()
+ * variables, except for certain exceptions: The PPS frequency pps_freq
+ * and phase pps_offset variables are determined by this routine and
+ * updated atomically. The time_tolerance variable can be considered a
+ * constant, since it is infrequently changed, and then only when the
+ * PPS signal is disabled. The watchdog counter pps_valid is updated
+ * once per second by hardclock() and is atomically cleared in this
+ * routine.
+ */
+void
+hardpps(tvp, usec)
+ struct timeval *tvp; /* time at PPS */
+ long usec; /* hardware counter at PPS */
+{
+ long u_usec, v_usec, bigtick;
+ long cal_sec, cal_usec;
+
+ /*
+ * An occasional glitch can be produced when the PPS interrupt
+ * occurs in the hardclock() routine before the time variable is
+ * updated. Here the offset is discarded when the difference
+ * between it and the last one is greater than tick/2, but not
+ * if the interval since the first discard exceeds 30 s.
+ */
+ time_status |= STA_PPSSIGNAL;
+ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
+ pps_valid = 0;
+ u_usec = -tvp->tv_usec;
+ if (u_usec < -500000)
+ u_usec += 1000000;
+ v_usec = pps_offset - u_usec;
+ if (v_usec < 0)
+ v_usec = -v_usec;
+ if (v_usec > (tick >> 1)) {
+ if (pps_glitch > MAXGLITCH) {
+ pps_glitch = 0;
+ pps_tf[2] = u_usec;
+ pps_tf[1] = u_usec;
+ } else {
+ pps_glitch++;
+ u_usec = pps_offset;
+ }
+ } else
+ pps_glitch = 0;
+
+ /*
+ * A three-stage median filter is used to help deglitch the pps
+ * time. The median sample becomes the time offset estimate; the
+ * difference between the other two samples becomes the time
+ * dispersion (jitter) estimate.
+ */
+ pps_tf[2] = pps_tf[1];
+ pps_tf[1] = pps_tf[0];
+ pps_tf[0] = u_usec;
+ if (pps_tf[0] > pps_tf[1]) {
+ if (pps_tf[1] > pps_tf[2]) {
+ pps_offset = pps_tf[1]; /* 0 1 2 */
+ v_usec = pps_tf[0] - pps_tf[2];
+ } else if (pps_tf[2] > pps_tf[0]) {
+ pps_offset = pps_tf[0]; /* 2 0 1 */
+ v_usec = pps_tf[2] - pps_tf[1];
+ } else {
+ pps_offset = pps_tf[2]; /* 0 2 1 */
+ v_usec = pps_tf[0] - pps_tf[1];
+ }
+ } else {
+ if (pps_tf[1] < pps_tf[2]) {
+ pps_offset = pps_tf[1]; /* 2 1 0 */
+ v_usec = pps_tf[2] - pps_tf[0];
+ } else if (pps_tf[2] < pps_tf[0]) {
+ pps_offset = pps_tf[0]; /* 1 0 2 */
+ v_usec = pps_tf[1] - pps_tf[2];
+ } else {
+ pps_offset = pps_tf[2]; /* 1 2 0 */
+ v_usec = pps_tf[1] - pps_tf[0];
+ }
+ }
+ if (v_usec > MAXTIME)
+ pps_jitcnt++;
+ v_usec = (v_usec << PPS_AVG) - pps_jitter;
+ if (v_usec < 0)
+ pps_jitter -= -v_usec >> PPS_AVG;
+ else
+ pps_jitter += v_usec >> PPS_AVG;
+ if (pps_jitter > (MAXTIME >> 1))
+ time_status |= STA_PPSJITTER;
+
+ /*
+ * During the calibration interval adjust the starting time when
+ * the tick overflows. At the end of the interval compute the
+ * duration of the interval and the difference of the hardware
+ * counters at the beginning and end of the interval. This code
+ * is deliciously complicated by the fact that valid differences may
+ * exceed the value of tick when using long calibration
+ * intervals and small ticks. Note that the counter can be
+ * greater than tick if caught at just the wrong instant, but
+ * the values returned and used here are correct.
+ */
+ bigtick = (long)tick << SHIFT_USEC;
+ pps_usec -= pps_freq;
+ if (pps_usec >= bigtick)
+ pps_usec -= bigtick;
+ if (pps_usec < 0)
+ pps_usec += bigtick;
+ pps_time.tv_sec++;
+ pps_count++;
+ if (pps_count < (1 << pps_shift))
+ return;
+ pps_count = 0;
+ pps_calcnt++;
+ u_usec = usec << SHIFT_USEC;
+ v_usec = pps_usec - u_usec;
+ if (v_usec >= bigtick >> 1)
+ v_usec -= bigtick;
+ if (v_usec < -(bigtick >> 1))
+ v_usec += bigtick;
+ if (v_usec < 0)
+ v_usec = -(-v_usec >> pps_shift);
+ else
+ v_usec = v_usec >> pps_shift;
+ pps_usec = u_usec;
+ cal_sec = tvp->tv_sec;
+ cal_usec = tvp->tv_usec;
+ cal_sec -= pps_time.tv_sec;
+ cal_usec -= pps_time.tv_usec;
+ if (cal_usec < 0) {
+ cal_usec += 1000000;
+ cal_sec--;
+ }
+ pps_time = *tvp;
+
+ /*
+ * Check for lost interrupts, noise, excessive jitter and
+ * excessive frequency error. The number of timer ticks during
+ * the interval may vary +-1 tick. Add to this a margin of one
+ * tick for the PPS signal jitter and maximum frequency
+ * deviation. If the limits are exceeded, the calibration
+ * interval is reset to the minimum and we start over.
+ */
+ u_usec = (long)tick << 1;
+ if (!((cal_sec == -1 && cal_usec > (1000000 - u_usec))
+ || (cal_sec == 0 && cal_usec < u_usec))
+ || v_usec > time_tolerance || v_usec < -time_tolerance) {
+ pps_errcnt++;
+ pps_shift = PPS_SHIFT;
+ pps_intcnt = 0;
+ time_status |= STA_PPSERROR;
+ return;
+ }
+
+ /*
+ * A three-stage median filter is used to help deglitch the pps
+ * frequency. The median sample becomes the frequency offset
+ * estimate; the difference between the other two samples
+ * becomes the frequency dispersion (stability) estimate.
+ */
+ pps_ff[2] = pps_ff[1];
+ pps_ff[1] = pps_ff[0];
+ pps_ff[0] = v_usec;
+ if (pps_ff[0] > pps_ff[1]) {
+ if (pps_ff[1] > pps_ff[2]) {
+ u_usec = pps_ff[1]; /* 0 1 2 */
+ v_usec = pps_ff[0] - pps_ff[2];
+ } else if (pps_ff[2] > pps_ff[0]) {
+ u_usec = pps_ff[0]; /* 2 0 1 */
+ v_usec = pps_ff[2] - pps_ff[1];
+ } else {
+ u_usec = pps_ff[2]; /* 0 2 1 */
+ v_usec = pps_ff[0] - pps_ff[1];
+ }
+ } else {
+ if (pps_ff[1] < pps_ff[2]) {
+ u_usec = pps_ff[1]; /* 2 1 0 */
+ v_usec = pps_ff[2] - pps_ff[0];
+ } else if (pps_ff[2] < pps_ff[0]) {
+ u_usec = pps_ff[0]; /* 1 0 2 */
+ v_usec = pps_ff[1] - pps_ff[2];
+ } else {
+ u_usec = pps_ff[2]; /* 1 2 0 */
+ v_usec = pps_ff[1] - pps_ff[0];
+ }
+ }
+
+ /*
+ * Here the frequency dispersion (stability) is updated. If it
+ * is less than one-fourth the maximum (MAXFREQ), the frequency
+ * offset is updated as well, but clamped to the tolerance. It
+ * will be processed later by the hardclock() routine.
+ */
+ v_usec = (v_usec >> 1) - pps_stabil;
+ if (v_usec < 0)
+ pps_stabil -= -v_usec >> PPS_AVG;
+ else
+ pps_stabil += v_usec >> PPS_AVG;
+ if (pps_stabil > MAXFREQ >> 2) {
+ pps_stbcnt++;
+ time_status |= STA_PPSWANDER;
+ return;
+ }
+ if (time_status & STA_PPSFREQ) {
+ if (u_usec < 0) {
+ pps_freq -= -u_usec >> PPS_AVG;
+ if (pps_freq < -time_tolerance)
+ pps_freq = -time_tolerance;
+ u_usec = -u_usec;
+ } else {
+ pps_freq += u_usec >> PPS_AVG;
+ if (pps_freq > time_tolerance)
+ pps_freq = time_tolerance;
+ }
+ }
+
+ /*
+ * Here the calibration interval is adjusted. If the maximum
+ * time difference is greater than tick / 4, reduce the interval
+ * by half. If this is not the case for four consecutive
+ * intervals, double the interval.
+ */
+ if (u_usec << pps_shift > bigtick >> 2) {
+ pps_intcnt = 0;
+ if (pps_shift > PPS_SHIFT)
+ pps_shift--;
+ } else if (pps_intcnt >= 4) {
+ pps_intcnt = 0;
+ if (pps_shift < PPS_SHIFTMAX)
+ pps_shift++;
+ } else
+ pps_intcnt++;
+}
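+
+/*
+ * Editorial sketch of the three-sample median used twice above: the
+ * middle value becomes the estimate and the spread of the other two
+ * becomes the dispersion. Compiled out; the name is hypothetical.
+ */
+#if 0
+static long
+median3(long a, long b, long c, long *dispersion)
+{
+ long med;
+
+ if (a > b) {
+ if (b > c) { /* a >= b >= c */
+ med = b; *dispersion = a - c;
+ } else if (c > a) { /* c > a > b */
+ med = a; *dispersion = c - b;
+ } else { /* a >= c >= b */
+ med = c; *dispersion = a - b;
+ }
+ } else {
+ if (b < c) { /* c > b >= a */
+ med = b; *dispersion = c - a;
+ } else if (c < a) { /* b >= a > c */
+ med = a; *dispersion = b - c;
+ } else { /* b >= c >= a */
+ med = c; *dispersion = b - a;
+ }
+ }
+ return (med);
+}
+#endif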
+#endif /* PPS_SYNC */
diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c
index f4facf6..797ea2c 100644
--- a/sys/kern/kern_time.c
+++ b/sys/kern/kern_time.c
@@ -30,22 +30,22 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)kern_time.c 8.4 (Berkeley) 5/26/95
+ * @(#)kern_time.c 8.1 (Berkeley) 6/10/93
+ * $Id: kern_time.c,v 1.21 1997/02/22 09:39:13 peter Exp $
*/
#include <sys/param.h>
+#include <sys/sysproto.h>
#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/vnode.h>
-#include <sys/mount.h>
-#include <sys/syscallargs.h>
+struct timezone tz;
-#include <machine/cpu.h>
-
-/*
+/*
* Time of day and interval timer support.
*
* These routines provide the kernel entry points to get and set
@@ -55,81 +55,97 @@
* timers when they expire.
*/
+static void timevalfix __P((struct timeval *));
+
+#ifndef _SYS_SYSPROTO_H_
+struct gettimeofday_args {
+ struct timeval *tp;
+ struct timezone *tzp;
+};
+#endif
/* ARGSUSED */
int
gettimeofday(p, uap, retval)
struct proc *p;
- register struct gettimeofday_args /* {
- syscallarg(struct timeval *) tp;
- syscallarg(struct timezone *) tzp;
- } */ *uap;
- register_t *retval;
+ register struct gettimeofday_args *uap;
+ int *retval;
{
struct timeval atv;
int error = 0;
- if (SCARG(uap, tp)) {
+ if (uap->tp) {
microtime(&atv);
- if (error = copyout((caddr_t)&atv, (caddr_t)SCARG(uap, tp),
- sizeof (atv)))
+ if ((error = copyout((caddr_t)&atv, (caddr_t)uap->tp,
+ sizeof (atv))))
return (error);
}
- if (SCARG(uap, tzp))
- error = copyout((caddr_t)&tz, (caddr_t)SCARG(uap, tzp),
+ if (uap->tzp)
+ error = copyout((caddr_t)&tz, (caddr_t)uap->tzp,
sizeof (tz));
return (error);
}
+#ifndef _SYS_SYSPROTO_H_
+struct settimeofday_args {
+ struct timeval *tv;
+ struct timezone *tzp;
+};
+#endif
/* ARGSUSED */
int
settimeofday(p, uap, retval)
struct proc *p;
- struct settimeofday_args /* {
- syscallarg(struct timeval *) tv;
- syscallarg(struct timezone *) tzp;
- } */ *uap;
- register_t *retval;
+ struct settimeofday_args *uap;
+ int *retval;
{
struct timeval atv, delta;
struct timezone atz;
int error, s;
- if (error = suser(p->p_ucred, &p->p_acflag))
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
return (error);
/* Verify all parameters before changing time. */
- if (SCARG(uap, tv) && (error = copyin((caddr_t)SCARG(uap, tv),
- (caddr_t)&atv, sizeof(atv))))
+ if (uap->tv &&
+ (error = copyin((caddr_t)uap->tv, (caddr_t)&atv, sizeof(atv))))
return (error);
- if (SCARG(uap, tzp) && (error = copyin((caddr_t)SCARG(uap, tzp),
- (caddr_t)&atz, sizeof(atz))))
+ if (atv.tv_usec < 0 || atv.tv_usec >= 1000000)
+ return (EINVAL);
+ if (uap->tzp &&
+ (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz))))
return (error);
- if (SCARG(uap, tv)) {
+ if (uap->tv) {
+ s = splclock();
/*
- * If the system is secure, we do not allow the time to be
- * set to an earlier value (it may be slowed using adjtime,
- * but not set back). This feature prevent interlopers from
- * setting arbitrary time stamps on files.
+ * Calculate delta directly to minimize clock interrupt
+ * latency. Fix it after the ipl has been lowered.
*/
- if (securelevel > 0 && timercmp(&atv, &time, <))
- return (EPERM);
- /* WHAT DO WE DO ABOUT PENDING REAL-TIME TIMEOUTS??? */
- s = splclock();
- /* nb. delta.tv_usec may be < 0, but this is OK here */
delta.tv_sec = atv.tv_sec - time.tv_sec;
delta.tv_usec = atv.tv_usec - time.tv_usec;
time = atv;
+ /*
+ * XXX should arrange for microtime() to agree with atv if
+ * it is called now. As it is, it may add up to about
+ * `tick' unwanted usec.
+ * Another problem is that clock interrupts may occur at
+ * other than multiples of `tick'. It's not worth fixing
+ * this here, since the problem is also caused by tick
+ * adjustments.
+ */
(void) splsoftclock();
+ timevalfix(&delta);
timevaladd(&boottime, &delta);
- timevalfix(&boottime);
timevaladd(&runtime, &delta);
- timevalfix(&runtime);
+ /* re-use 'p' */
+ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next)
+ if (timerisset(&p->p_realtimer.it_value))
+ timevaladd(&p->p_realtimer.it_value, &delta);
# ifdef NFS
lease_updatetime(delta.tv_sec);
# endif
splx(s);
resettodr();
}
- if (SCARG(uap, tzp))
+ if (uap->tzp)
tz = atz;
return (0);
}
@@ -137,26 +153,29 @@ settimeofday(p, uap, retval)
extern int tickadj; /* "standard" clock skew, us./tick */
int tickdelta; /* current clock skew, us. per tick */
long timedelta; /* unapplied time correction, us. */
-long bigadj = 1000000; /* use 10x skew above bigadj us. */
+static long bigadj = 1000000; /* use 10x skew above bigadj us. */
+#ifndef _SYS_SYSPROTO_H_
+struct adjtime_args {
+ struct timeval *delta;
+ struct timeval *olddelta;
+};
+#endif
/* ARGSUSED */
int
adjtime(p, uap, retval)
struct proc *p;
- register struct adjtime_args /* {
- syscallarg(struct timeval *) delta;
- syscallarg(struct timeval *) olddelta;
- } */ *uap;
- register_t *retval;
+ register struct adjtime_args *uap;
+ int *retval;
{
struct timeval atv;
register long ndelta, ntickdelta, odelta;
int s, error;
- if (error = suser(p->p_ucred, &p->p_acflag))
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
return (error);
- if (error = copyin((caddr_t)SCARG(uap, delta), (caddr_t)&atv,
- sizeof(struct timeval)))
+ if ((error =
+ copyin((caddr_t)uap->delta, (caddr_t)&atv, sizeof(struct timeval))))
return (error);
/*
@@ -167,7 +186,7 @@ adjtime(p, uap, retval)
* overshoot and start taking us away from the desired final time.
*/
ndelta = atv.tv_sec * 1000000 + atv.tv_usec;
- if (ndelta > bigadj)
+ if (ndelta > bigadj || ndelta < -bigadj)
ntickdelta = 10 * tickadj;
else
ntickdelta = tickadj;
@@ -187,10 +206,10 @@ adjtime(p, uap, retval)
tickdelta = ntickdelta;
splx(s);
- if (SCARG(uap, olddelta)) {
+ if (uap->olddelta) {
atv.tv_sec = odelta / 1000000;
atv.tv_usec = odelta % 1000000;
- (void) copyout((caddr_t)&atv, (caddr_t)SCARG(uap, olddelta),
+ (void) copyout((caddr_t)&atv, (caddr_t)uap->olddelta,
sizeof(struct timeval));
}
return (0);
@@ -217,25 +236,28 @@ adjtime(p, uap, retval)
* real time timers .it_interval. Rather, we compute the next time in
* absolute time the timer should go off.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct getitimer_args {
+ u_int which;
+ struct itimerval *itv;
+};
+#endif
/* ARGSUSED */
int
getitimer(p, uap, retval)
struct proc *p;
- register struct getitimer_args /* {
- syscallarg(u_int) which;
- syscallarg(struct itimerval *) itv;
- } */ *uap;
- register_t *retval;
+ register struct getitimer_args *uap;
+ int *retval;
{
struct itimerval aitv;
int s;
- if (SCARG(uap, which) > ITIMER_PROF)
+ if (uap->which > ITIMER_PROF)
return (EINVAL);
s = splclock();
- if (SCARG(uap, which) == ITIMER_REAL) {
+ if (uap->which == ITIMER_REAL) {
/*
- * Convert from absolute to relative time in .it_value
+ * Convert from absolute to relative time in .it_value
* part of real time timer. If time for real time timer
* has passed return 0, else return difference between
* current time and time for the timer to go off.
@@ -245,53 +267,54 @@ getitimer(p, uap, retval)
if (timercmp(&aitv.it_value, &time, <))
timerclear(&aitv.it_value);
else
- timevalsub(&aitv.it_value,
- (struct timeval *)&time);
+ timevalsub(&aitv.it_value, &time);
} else
- aitv = p->p_stats->p_timer[SCARG(uap, which)];
+ aitv = p->p_stats->p_timer[uap->which];
splx(s);
- return (copyout((caddr_t)&aitv, (caddr_t)SCARG(uap, itv),
+ return (copyout((caddr_t)&aitv, (caddr_t)uap->itv,
sizeof (struct itimerval)));
}
+#ifndef _SYS_SYSPROTO_H_
+struct setitimer_args {
+ u_int which;
+ struct itimerval *itv, *oitv;
+};
+#endif
/* ARGSUSED */
int
setitimer(p, uap, retval)
struct proc *p;
- register struct setitimer_args /* {
- syscallarg(u_int) which;
- syscallarg(struct itimerval *) itv;
- syscallarg(struct itimerval *) oitv;
- } */ *uap;
- register_t *retval;
+ register struct setitimer_args *uap;
+ int *retval;
{
struct itimerval aitv;
register struct itimerval *itvp;
int s, error;
- if (SCARG(uap, which) > ITIMER_PROF)
+ if (uap->which > ITIMER_PROF)
return (EINVAL);
- itvp = SCARG(uap, itv);
+ itvp = uap->itv;
if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv,
sizeof(struct itimerval))))
return (error);
- if ((SCARG(uap, itv) = SCARG(uap, oitv)) &&
- (error = getitimer(p, uap, retval)))
+ if ((uap->itv = uap->oitv) &&
+ (error = getitimer(p, (struct getitimer_args *)uap, retval)))
return (error);
if (itvp == 0)
return (0);
if (itimerfix(&aitv.it_value) || itimerfix(&aitv.it_interval))
return (EINVAL);
s = splclock();
- if (SCARG(uap, which) == ITIMER_REAL) {
+ if (uap->which == ITIMER_REAL) {
untimeout(realitexpire, (caddr_t)p);
if (timerisset(&aitv.it_value)) {
- timevaladd(&aitv.it_value, (struct timeval *)&time);
+ timevaladd(&aitv.it_value, &time);
timeout(realitexpire, (caddr_t)p, hzto(&aitv.it_value));
}
p->p_realtimer = aitv;
} else
- p->p_stats->p_timer[SCARG(uap, which)] = aitv;
+ p->p_stats->p_timer[uap->which] = aitv;
splx(s);
return (0);
}
@@ -303,6 +326,10 @@ setitimer(p, uap, retval)
* Else compute next time timer should go off which is > current time.
* This is where delay in processing this timeout causes multiple
* SIGALRM calls to be compressed into one.
+ * hzto() always adds 1 to allow for the time until the next clock
+ * interrupt being strictly less than 1 clock tick, but we don't want
+ * that here since we want to appear to be in sync with the clock
+ * interrupt even when we're delayed.
*/
void
realitexpire(arg)
@@ -323,7 +350,7 @@ realitexpire(arg)
&p->p_realtimer.it_interval);
if (timercmp(&p->p_realtimer.it_value, &time, >)) {
timeout(realitexpire, (caddr_t)p,
- hzto(&p->p_realtimer.it_value));
+ hzto(&p->p_realtimer.it_value) - 1);
splx(s);
return;
}
@@ -400,6 +427,7 @@ expire:
* it just gets very confused in this case.
* Caveat emptor.
*/
+void
timevaladd(t1, t2)
struct timeval *t1, *t2;
{
@@ -409,6 +437,7 @@ timevaladd(t1, t2)
timevalfix(t1);
}
+void
timevalsub(t1, t2)
struct timeval *t1, *t2;
{
@@ -418,6 +447,7 @@ timevalsub(t1, t2)
timevalfix(t1);
}
+static void
timevalfix(t1)
struct timeval *t1;
{
diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c
new file mode 100644
index 0000000..171ed0e
--- /dev/null
+++ b/sys/kern/kern_timeout.c
@@ -0,0 +1,1303 @@
+/*-
+ * Copyright (c) 1982, 1986, 1991, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94
+ * $Id: kern_clock.c,v 1.34 1997/03/22 16:52:19 mpp Exp $
+ */
+
+/* Portions of this software are covered by the following: */
+/******************************************************************************
+ * *
+ * Copyright (c) David L. Mills 1993, 1994 *
+ * *
+ * Permission to use, copy, modify, and distribute this software and its *
+ * documentation for any purpose and without fee is hereby granted, provided *
+ * that the above copyright notice appears in all copies and that both the *
+ * copyright notice and this permission notice appear in supporting *
+ * documentation, and that the name University of Delaware not be used in *
+ * advertising or publicity pertaining to distribution of the software *
+ * without specific, written prior permission. The University of Delaware *
+ * makes no representations about the suitability of this software for any *
+ * purpose. It is provided "as is" without express or implied warranty. *
+ * *
+ *****************************************************************************/
+
+#include "opt_cpu.h" /* XXX */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/dkstat.h>
+#include <sys/callout.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/timex.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <sys/sysctl.h>
+
+#include <machine/cpu.h>
+#define CLOCK_HAIR /* XXX */
+#include <machine/clock.h>
+
+#ifdef GPROF
+#include <sys/gmon.h>
+#endif
+
+static void initclocks __P((void *dummy));
+SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL)
+
+/* Exported to machdep.c. */
+struct callout *callfree, *callout;
+
+static struct callout calltodo;
+
+/* Some of these don't belong here, but it's easiest to concentrate them. */
+static long cp_time[CPUSTATES];
+long dk_seek[DK_NDRIVE];
+static long dk_time[DK_NDRIVE];
+long dk_wds[DK_NDRIVE];
+long dk_wpms[DK_NDRIVE];
+long dk_xfer[DK_NDRIVE];
+
+int dk_busy;
+int dk_ndrive = 0;
+char dk_names[DK_NDRIVE][DK_NAMELEN];
+
+long tk_cancc;
+long tk_nin;
+long tk_nout;
+long tk_rawcc;
+
+/*
+ * Clock handling routines.
+ *
+ * This code is written to operate with two timers that run independently of
+ * each other. The main clock, running hz times per second, is used to keep
+ * track of real time. The second timer handles kernel and user profiling,
+ * and does resource use estimation. If the second timer is programmable,
+ * it is randomized to avoid aliasing between the two clocks. For example,
+ * the randomization prevents an adversary from always giving up the cpu
+ * just before its quantum expires. Otherwise, it would never accumulate
+ * cpu ticks. The mean frequency of the second timer is stathz.
+ *
+ * If no second timer exists, stathz will be zero; in this case we drive
+ * profiling and statistics off the main clock. This WILL NOT be accurate;
+ * do not do it unless absolutely necessary.
+ *
+ * The statistics clock may (or may not) be run at a higher rate while
+ * profiling. This profile clock runs at profhz. We require that profhz
+ * be an integral multiple of stathz.
+ *
+ * If the statistics clock is running fast, it must be divided by the ratio
+ * profhz/stathz for statistics. (For profiling, every tick counts.)
+ */
+
+/*
+ * TODO:
+ * allocate more timeout table slots when table overflows.
+ */
+
+/*
+ * Bump a timeval by a small number of usec's.
+ */
+#define BUMPTIME(t, usec) { \
+ register volatile struct timeval *tp = (t); \
+ register long us; \
+ \
+ tp->tv_usec = us = tp->tv_usec + (usec); \
+ if (us >= 1000000) { \
+ tp->tv_usec = us - 1000000; \
+ tp->tv_sec++; \
+ } \
+}
+
+int stathz;
+int profhz;
+static int profprocs;
+int ticks;
+static int psdiv, pscnt; /* prof => stat divider */
+int psratio; /* ratio: prof / stat */
+
+volatile struct timeval time;
+volatile struct timeval mono_time;
+
+/*
+ * Phase/frequency-lock loop (PLL/FLL) definitions
+ *
+ * The following variables are read and set by the ntp_adjtime() system
+ * call.
+ *
+ * time_state shows the state of the system clock, with values defined
+ * in the timex.h header file.
+ *
+ * time_status shows the status of the system clock, with bits defined
+ * in the timex.h header file.
+ *
+ * time_offset is used by the PLL/FLL to adjust the system time in small
+ * increments.
+ *
+ * time_constant determines the bandwidth or "stiffness" of the PLL.
+ *
+ * time_tolerance determines maximum frequency error or tolerance of the
+ * CPU clock oscillator and is a property of the architecture; however,
+ * in principle it could change as a result of the presence of external
+ * discipline signals, for instance.
+ *
+ * time_precision is usually equal to the kernel tick variable; however,
+ * in cases where a precision clock counter or external clock is
+ * available, the resolution can be much less than this and depend on
+ * whether the external clock is working or not.
+ *
+ * time_maxerror is initialized by an ntp_adjtime() call and increased by
+ * the kernel once each second to reflect the maximum error
+ * bound growth.
+ *
+ * time_esterror is set and read by the ntp_adjtime() call, but
+ * otherwise not used by the kernel.
+ */
+int time_status = STA_UNSYNC; /* clock status bits */
+int time_state = TIME_OK; /* clock state */
+long time_offset = 0; /* time offset (us) */
+long time_constant = 0; /* pll time constant */
+long time_tolerance = MAXFREQ; /* frequency tolerance (scaled ppm) */
+long time_precision = 1; /* clock precision (us) */
+long time_maxerror = MAXPHASE; /* maximum error (us) */
+long time_esterror = MAXPHASE; /* estimated error (us) */
+
+/*
+ * The following variables establish the state of the PLL/FLL and the
+ * residual time and frequency offset of the local clock. The scale
+ * factors are defined in the timex.h header file.
+ *
+ * time_phase and time_freq are the phase increment and the frequency
+ * increment, respectively, of the kernel time variable at each tick of
+ * the clock.
+ *
+ * time_freq is set via ntp_adjtime() from a value stored in a file when
+ * the synchronization daemon is first started. Its value is retrieved
+ * via ntp_adjtime() and written to the file about once per hour by the
+ * daemon.
+ *
+ * time_adj is the adjustment added to the value of tick at each timer
+ * interrupt and is recomputed from time_phase and time_freq at each
+ * seconds rollover.
+ *
+ * time_reftime is the second's portion of the system time on the last
+ * call to ntp_adjtime(). It is used to adjust the time_freq variable
+ * and to increase the time_maxerror as the time since last update
+ * increases.
+ */
+static long time_phase = 0; /* phase offset (scaled us) */
+long time_freq = 0; /* frequency offset (scaled ppm) */
+static long time_adj = 0; /* tick adjust (scaled 1 / hz) */
+static long time_reftime = 0; /* time at last adjustment (s) */
+
+#ifdef PPS_SYNC
+/*
+ * The following variables are used only if the kernel PPS discipline
+ * code is configured (PPS_SYNC). The scale factors are defined in the
+ * timex.h header file.
+ *
+ * pps_time contains the time at each calibration interval, as read by
+ * microtime(). pps_count counts the seconds of the calibration
+ * interval, the duration of which is nominally pps_shift in powers of
+ * two.
+ *
+ * pps_offset is the time offset produced by the time median filter
+ * pps_tf[], while pps_jitter is the dispersion (jitter) measured by
+ * this filter.
+ *
+ * pps_freq is the frequency offset produced by the frequency median
+ * filter pps_ff[], while pps_stabil is the dispersion (wander) measured
+ * by this filter.
+ *
+ * pps_usec is latched from a high resolution counter or external clock
+ * at pps_time. Here we want the hardware counter contents only, not the
+ * contents plus the time_tv.usec as usual.
+ *
+ * pps_valid counts the number of seconds since the last PPS update. It
+ * is used as a watchdog timer to disable the PPS discipline should the
+ * PPS signal be lost.
+ *
+ * pps_glitch counts the number of seconds since the beginning of an
+ * offset burst more than tick/2 from current nominal offset. It is used
+ * mainly to suppress error bursts due to priority conflicts between the
+ * PPS interrupt and timer interrupt.
+ *
+ * pps_intcnt counts the calibration intervals for use in the interval-
+ * adaptation algorithm. It's just too complicated for words.
+ */
+struct timeval pps_time; /* kernel time at last interval */
+long pps_offset = 0; /* pps time offset (us) */
+long pps_jitter = MAXTIME; /* pps time dispersion (jitter) (us) */
+long pps_tf[] = {0, 0, 0}; /* pps time offset median filter (us) */
+long pps_freq = 0; /* frequency offset (scaled ppm) */
+long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */
+long pps_ff[] = {0, 0, 0}; /* frequency offset median filter */
+long pps_usec = 0; /* microsec counter at last interval */
+long pps_valid = PPS_VALID; /* pps signal watchdog counter */
+int pps_glitch = 0; /* pps signal glitch counter */
+int pps_count = 0; /* calibration interval counter (s) */
+int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */
+int pps_intcnt = 0; /* intervals at current duration */
+
+/*
+ * PPS signal quality monitors
+ *
+ * pps_jitcnt counts the seconds that have been discarded because the
+ * jitter measured by the time median filter exceeds the limit MAXTIME
+ * (100 us).
+ *
+ * pps_calcnt counts the frequency calibration intervals, which are
+ * variable from 4 s to 256 s.
+ *
+ * pps_errcnt counts the calibration intervals which have been discarded
+ * because the wander exceeds the limit MAXFREQ (100 ppm) or where the
+ * calibration interval jitter exceeds two ticks.
+ *
+ * pps_stbcnt counts the calibration intervals that have been discarded
+ * because the frequency wander exceeds the limit MAXFREQ / 4 (25 ppm).
+ */
+long pps_jitcnt = 0; /* jitter limit exceeded */
+long pps_calcnt = 0; /* calibration intervals */
+long pps_errcnt = 0; /* calibration errors */
+long pps_stbcnt = 0; /* stability limit exceeded */
+#endif /* PPS_SYNC */
+
+/* XXX none of this stuff works under FreeBSD */
+#ifdef EXT_CLOCK
+/*
+ * External clock definitions
+ *
+ * The following definitions and declarations are used only if an
+ * external clock (HIGHBALL or TPRO) is configured on the system.
+ */
+#define CLOCK_INTERVAL 30 /* CPU clock update interval (s) */
+
+/*
+ * The clock_count variable is set to CLOCK_INTERVAL at each PPS
+ * interrupt and decremented once each second.
+ */
+int clock_count = 0; /* CPU clock counter */
+
+#ifdef HIGHBALL
+/*
+ * The clock_offset and clock_cpu variables are used by the HIGHBALL
+ * interface. The clock_offset variable defines the offset between
+ * system time and the HIGHBALL counters. The clock_cpu variable contains
+ * the offset between the system clock and the HIGHBALL clock for use in
+ * disciplining the kernel time variable.
+ */
+extern struct timeval clock_offset; /* Highball clock offset */
+long clock_cpu = 0; /* CPU clock adjust */
+#endif /* HIGHBALL */
+#endif /* EXT_CLOCK */
+
+/*
+ * hardupdate() - local clock update
+ *
+ * This routine is called by ntp_adjtime() to update the local clock
+ * phase and frequency. The implementation is of an adaptive-parameter,
+ * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
+ * time and frequency offset estimates for each call. If the kernel PPS
+ * discipline code is configured (PPS_SYNC), the PPS signal itself
+ * determines the new time offset, instead of the calling argument.
+ * Presumably, calls to ntp_adjtime() occur only when the caller
+ * believes the local clock is valid within some bound (+-128 ms with
+ * NTP). If the caller's time is far different from the PPS time, an
+ * argument will ensue, and it's not clear who will lose.
+ *
+ * For uncompensated quartz crystal oscillators and nominal update
+ * intervals less than 1024 s, operation should be in phase-lock mode
+ * (STA_FLL = 0), where the loop is disciplined to phase. For update
+ * intervals greater than this, operation should be in frequency-lock
+ * mode (STA_FLL = 1), where the loop is disciplined to frequency.
+ *
+ * Note: splclock() is in effect.
+ */
+void
+hardupdate(offset)
+ long offset;
+{
+ long ltemp, mtemp;
+
+ if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME))
+ return;
+ ltemp = offset;
+#ifdef PPS_SYNC
+ if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
+ ltemp = pps_offset;
+#endif /* PPS_SYNC */
+
+ /*
+ * Scale the phase adjustment and clamp to the operating range.
+ */
+ if (ltemp > MAXPHASE)
+ time_offset = MAXPHASE << SHIFT_UPDATE;
+ else if (ltemp < -MAXPHASE)
+ time_offset = -(MAXPHASE << SHIFT_UPDATE);
+ else
+ time_offset = ltemp << SHIFT_UPDATE;
+
+ /*
+ * Select whether the frequency is to be controlled and in which
+ * mode (PLL or FLL). Clamp to the operating range. Ugly
+ * multiply/divide should be replaced someday.
+ */
+ if (time_status & STA_FREQHOLD || time_reftime == 0)
+ time_reftime = time.tv_sec;
+ mtemp = time.tv_sec - time_reftime;
+ time_reftime = time.tv_sec;
+ if (time_status & STA_FLL) {
+ if (mtemp >= MINSEC) {
+ ltemp = ((time_offset / mtemp) << (SHIFT_USEC -
+ SHIFT_UPDATE));
+ if (ltemp < 0)
+ time_freq -= -ltemp >> SHIFT_KH;
+ else
+ time_freq += ltemp >> SHIFT_KH;
+ }
+ } else {
+ if (mtemp < MAXSEC) {
+ ltemp *= mtemp;
+ if (ltemp < 0)
+ time_freq -= -ltemp >> (time_constant +
+ time_constant + SHIFT_KF -
+ SHIFT_USEC);
+ else
+ time_freq += ltemp >> (time_constant +
+ time_constant + SHIFT_KF -
+ SHIFT_USEC);
+ }
+ }
+ if (time_freq > time_tolerance)
+ time_freq = time_tolerance;
+ else if (time_freq < -time_tolerance)
+ time_freq = -time_tolerance;
+}
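+
+/*
+ * Illustrative sketch of the PLL path above, assuming the timex.h
+ * values of this vintage (SHIFT_KG = 6, SHIFT_UPDATE = 12) and
+ * time_constant = 0: a 1000 us offset is loaded as
+ * time_offset = 1000 << SHIFT_UPDATE. Each second hardclock() peels
+ * off time_offset >> (SHIFT_KG + time_constant), i.e. 1/64 of the
+ * remainder, so the residual offset decays exponentially with a time
+ * constant of about 64 s.
+ */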
+
+/*
+ * Initialize clock frequencies and start both clocks running.
+ */
+/* ARGSUSED*/
+static void
+initclocks(dummy)
+ void *dummy;
+{
+ register int i;
+
+ /*
+ * Set divisors to 1 (normal case) and let the machine-specific
+ * code do its bit.
+ */
+ psdiv = pscnt = 1;
+ cpu_initclocks();
+
+ /*
+ * Compute profhz/stathz, and fix profhz if needed.
+ */
+ i = stathz ? stathz : hz;
+ if (profhz == 0)
+ profhz = i;
+ psratio = profhz / i;
+}
+
+/*
+ * The real-time timer, interrupting hz times per second.
+ */
+void
+hardclock(frame)
+ register struct clockframe *frame;
+{
+ register struct callout *p1;
+ register struct proc *p;
+ register int needsoft;
+
+ /*
+ * Update real-time timeout queue.
+ * At front of queue are some number of events which are ``due''.
+ * The time to these is <= 0 and if negative represents the
+ * number of ticks which have passed since it was supposed to happen.
+ * The rest of the q elements (times > 0) are events yet to happen,
+ * where the time for each is given as a delta from the previous.
+ * Decrementing just the first of these serves to decrement the time
+ * to all events.
+ */
+ needsoft = 0;
+ for (p1 = calltodo.c_next; p1 != NULL; p1 = p1->c_next) {
+ if (--p1->c_time > 0)
+ break;
+ needsoft = 1;
+ if (p1->c_time == 0)
+ break;
+ }
+
+ p = curproc;
+ if (p) {
+ register struct pstats *pstats;
+
+ /*
+ * Run current process's virtual and profile time, as needed.
+ */
+ pstats = p->p_stats;
+ if (CLKF_USERMODE(frame) &&
+ timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
+ itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
+ psignal(p, SIGVTALRM);
+ if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
+ itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
+ psignal(p, SIGPROF);
+ }
+
+ /*
+ * If no separate statistics clock is available, run it from here.
+ */
+ if (stathz == 0)
+ statclock(frame);
+
+ /*
+ * Increment the time-of-day.
+ */
+ ticks++;
+ {
+ int time_update;
+ struct timeval newtime = time;
+ long ltemp;
+
+ if (timedelta == 0) {
+ time_update = CPU_THISTICKLEN(tick);
+ } else {
+ time_update = CPU_THISTICKLEN(tick) + tickdelta;
+ timedelta -= tickdelta;
+ }
+ BUMPTIME(&mono_time, time_update);
+
+ /*
+ * Compute the phase adjustment. If the low-order bits
+ * (time_phase) of the update overflow, bump the high-order bits
+ * (time_update).
+ */
+ time_phase += time_adj;
+ if (time_phase <= -FINEUSEC) {
+ ltemp = -time_phase >> SHIFT_SCALE;
+ time_phase += ltemp << SHIFT_SCALE;
+ time_update -= ltemp;
+ }
+ else if (time_phase >= FINEUSEC) {
+ ltemp = time_phase >> SHIFT_SCALE;
+ time_phase -= ltemp << SHIFT_SCALE;
+ time_update += ltemp;
+ }
+
+ newtime.tv_usec += time_update;
+ /*
+ * On rollover of the second the phase adjustment to be used for
+ * the next second is calculated. Also, the maximum error is
+ * increased by the tolerance. If the PPS frequency discipline
+ * code is present, the phase is increased to compensate for the
+ * CPU clock oscillator frequency error.
+ *
+ * On a 32-bit machine and given parameters in the timex.h
+ * header file, the maximum phase adjustment is +-512 ms and
+		 * maximum frequency offset is a tad less than +-512 ppm. On a
+ * 64-bit machine, you shouldn't need to ask.
+ */
+ if (newtime.tv_usec >= 1000000) {
+ newtime.tv_usec -= 1000000;
+ newtime.tv_sec++;
+ time_maxerror += time_tolerance >> SHIFT_USEC;
+
+ /*
+ * Compute the phase adjustment for the next second. In
+ * PLL mode, the offset is reduced by a fixed factor
+ * times the time constant. In FLL mode the offset is
+ * used directly. In either mode, the maximum phase
+ * adjustment for each second is clamped so as to spread
+ * the adjustment over not more than the number of
+ * seconds between updates.
+ */
+ if (time_offset < 0) {
+ ltemp = -time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) <<
+ SHIFT_UPDATE;
+ time_offset += ltemp;
+ time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ -
+ SHIFT_UPDATE);
+ } else {
+ ltemp = time_offset;
+ if (!(time_status & STA_FLL))
+ ltemp >>= SHIFT_KG + time_constant;
+ if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
+ ltemp = (MAXPHASE / MINSEC) <<
+ SHIFT_UPDATE;
+ time_offset -= ltemp;
+ time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ -
+ SHIFT_UPDATE);
+ }
+
+ /*
+ * Compute the frequency estimate and additional phase
+ * adjustment due to frequency error for the next
+ * second. When the PPS signal is engaged, gnaw on the
+ * watchdog counter and update the frequency computed by
+ * the pll and the PPS signal.
+ */
+#ifdef PPS_SYNC
+ pps_valid++;
+ if (pps_valid == PPS_VALID) {
+ pps_jitter = MAXTIME;
+ pps_stabil = MAXFREQ;
+ time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
+ STA_PPSWANDER | STA_PPSERROR);
+ }
+ ltemp = time_freq + pps_freq;
+#else
+ ltemp = time_freq;
+#endif /* PPS_SYNC */
+ if (ltemp < 0)
+ time_adj -= -ltemp >>
+ (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+ else
+ time_adj += ltemp >>
+ (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
+
+#if SHIFT_HZ == 7
+ /*
+ * When the CPU clock oscillator frequency is not a
+ * power of two in Hz, the SHIFT_HZ is only an
+ * approximate scale factor. In the SunOS kernel, this
+			 * results in a PLL gain factor of 1/1.28 = 0.78 times what it
+ * should be. In the following code the overall gain is
+ * increased by a factor of 1.25, which results in a
+ * residual error less than 3 percent.
+ */
+ /* Same thing applies for FreeBSD --GAW */
+ if (hz == 100) {
+ if (time_adj < 0)
+ time_adj -= -time_adj >> 2;
+ else
+ time_adj += time_adj >> 2;
+ }
+#endif /* SHIFT_HZ */
+
+ /* XXX - this is really bogus, but can't be fixed until
+ xntpd's idea of the system clock is fixed to know how
+			   the user wants leap seconds handled; in the meantime,
+ we assume that users of NTP are running without proper
+ leap second support (this is now the default anyway) */
+ /*
+ * Leap second processing. If in leap-insert state at
+ * the end of the day, the system clock is set back one
+ * second; if in leap-delete state, the system clock is
+ * set ahead one second. The microtime() routine or
+			 * external clock driver will ensure that reported time
+ * is always monotonic. The ugly divides should be
+ * replaced.
+ */
+ switch (time_state) {
+
+ case TIME_OK:
+ if (time_status & STA_INS)
+ time_state = TIME_INS;
+ else if (time_status & STA_DEL)
+ time_state = TIME_DEL;
+ break;
+
+ case TIME_INS:
+ if (newtime.tv_sec % 86400 == 0) {
+ newtime.tv_sec--;
+ time_state = TIME_OOP;
+ }
+ break;
+
+ case TIME_DEL:
+ if ((newtime.tv_sec + 1) % 86400 == 0) {
+ newtime.tv_sec++;
+ time_state = TIME_WAIT;
+ }
+ break;
+
+ case TIME_OOP:
+ time_state = TIME_WAIT;
+ break;
+
+ case TIME_WAIT:
+ if (!(time_status & (STA_INS | STA_DEL)))
+ time_state = TIME_OK;
+ }
+ }
+ CPU_CLOCKUPDATE(&time, &newtime);
+ }
+
+ /*
+ * Process callouts at a very low cpu priority, so we don't keep the
+ * relatively high clock interrupt priority any longer than necessary.
+ */
+ if (needsoft) {
+ if (CLKF_BASEPRI(frame)) {
+ /*
+ * Save the overhead of a software interrupt;
+ * it will happen as soon as we return, so do it now.
+ */
+ (void)splsoftclock();
+ softclock();
+ } else
+ setsoftclock();
+ }
+}
+
+/*
+ * Software (low priority) clock interrupt.
+ * Run periodic events from timeout queue.
+ */
+/*ARGSUSED*/
+void
+softclock()
+{
+ register struct callout *c;
+ register void *arg;
+ register void (*func) __P((void *));
+ register int s;
+
+ s = splhigh();
+ while ((c = calltodo.c_next) != NULL && c->c_time <= 0) {
+ func = c->c_func;
+ arg = c->c_arg;
+ calltodo.c_next = c->c_next;
+ c->c_next = callfree;
+ callfree = c;
+ splx(s);
+ (*func)(arg);
+ (void) splhigh();
+ }
+ splx(s);
+}
+
+/*
+ * timeout --
+ * Execute a function after a specified length of time.
+ *
+ * untimeout --
+ * Cancel previous timeout function call.
+ *
+ * See AT&T BCI Driver Reference Manual for specification. This
+ * implementation differs from that one in that no identification
+ * value is returned from timeout, rather, the original arguments
+ * to timeout are used to identify entries for untimeout.
+ */
+void
+timeout(ftn, arg, ticks)
+ timeout_t ftn;
+ void *arg;
+ register int ticks;
+{
+ register struct callout *new, *p, *t;
+ register int s;
+
+ if (ticks <= 0)
+ ticks = 1;
+
+ /* Lock out the clock. */
+ s = splhigh();
+
+ /* Fill in the next free callout structure. */
+ if (callfree == NULL)
+ panic("timeout table full");
+ new = callfree;
+ callfree = new->c_next;
+ new->c_arg = arg;
+ new->c_func = ftn;
+
+ /*
+ * The time for each event is stored as a difference from the time
+ * of the previous event on the queue. Walk the queue, correcting
+ * the ticks argument for queue entries passed. Correct the ticks
+ * value for the queue entry immediately after the insertion point
+ * as well. Watch out for negative c_time values; these represent
+ * overdue events.
+ */
+ for (p = &calltodo;
+ (t = p->c_next) != NULL && ticks > t->c_time; p = t)
+ if (t->c_time > 0)
+ ticks -= t->c_time;
+ new->c_time = ticks;
+ if (t != NULL)
+ t->c_time -= ticks;
+
+ /* Insert the new entry into the queue. */
+ p->c_next = new;
+ new->c_next = t;
+ splx(s);
+}
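+
+/*
+ * Worked example of the delta encoding (illustrative): events due in
+ * 3, 5 and 9 ticks are stored as c_time deltas 3, 2, 4. To insert an
+ * event due in 6 ticks, the walk above subtracts 3 and then 2 from the
+ * argument, leaving 1; the new entry is linked in with c_time = 1 and
+ * the 4 behind it becomes 4 - 1 = 3, so every later expiry time is
+ * preserved.
+ */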
+
+void
+untimeout(ftn, arg)
+ timeout_t ftn;
+ void *arg;
+{
+ register struct callout *p, *t;
+ register int s;
+
+ s = splhigh();
+ for (p = &calltodo; (t = p->c_next) != NULL; p = t)
+ if (t->c_func == ftn && t->c_arg == arg) {
+ /* Increment next entry's tick count. */
+ if (t->c_next && t->c_time > 0)
+ t->c_next->c_time += t->c_time;
+
+ /* Move entry from callout queue to callfree queue. */
+ p->c_next = t->c_next;
+ t->c_next = callfree;
+ callfree = t;
+ break;
+ }
+ splx(s);
+}
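+
+/*
+ * Usage sketch (hypothetical names): a driver arms a two-second timer
+ * and later cancels it with the same function/argument pair, e.g.
+ *
+ *	timeout(mydev_watchdog, sc, 2 * hz);
+ *	...
+ *	untimeout(mydev_watchdog, sc);
+ */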
+
+void
+gettime(struct timeval *tvp)
+{
+ int s;
+
+ s = splclock();
+ /* XXX should use microtime() iff tv_usec is used. */
+ *tvp = time;
+ splx(s);
+}
+
+/*
+ * Compute number of hz until specified time. Used to
+ * compute third argument to timeout() from an absolute time.
+ */
+int
+hzto(tv)
+ struct timeval *tv;
+{
+ register unsigned long ticks;
+ register long sec, usec;
+ int s;
+
+ /*
+ * If the number of usecs in the whole seconds part of the time
+ * difference fits in a long, then the total number of usecs will
+ * fit in an unsigned long. Compute the total and convert it to
+ * ticks, rounding up and adding 1 to allow for the current tick
+ * to expire. Rounding also depends on unsigned long arithmetic
+ * to avoid overflow.
+ *
+ * Otherwise, if the number of ticks in the whole seconds part of
+ * the time difference fits in a long, then convert the parts to
+ * ticks separately and add, using similar rounding methods and
+ * overflow avoidance. This method would work in the previous
+ * case but it is slightly slower and assumes that hz is integral.
+ *
+ * Otherwise, round the time difference down to the maximum
+ * representable value.
+ *
+ * If ints have 32 bits, then the maximum value for any timeout in
+ * 10ms ticks is 248 days.
+ */
+ s = splclock();
+ sec = tv->tv_sec - time.tv_sec;
+ usec = tv->tv_usec - time.tv_usec;
+ splx(s);
+ if (usec < 0) {
+ sec--;
+ usec += 1000000;
+ }
+ if (sec < 0) {
+#ifdef DIAGNOSTIC
+ printf("hzto: negative time difference %ld sec %ld usec\n",
+ sec, usec);
+#endif
+ ticks = 1;
+ } else if (sec <= LONG_MAX / 1000000)
+ ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
+ / tick + 1;
+ else if (sec <= LONG_MAX / hz)
+ ticks = sec * hz
+ + ((unsigned long)usec + (tick - 1)) / tick + 1;
+ else
+ ticks = LONG_MAX;
+ if (ticks > INT_MAX)
+ ticks = INT_MAX;
+ return (ticks);
+}
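+
+/*
+ * Worked example (illustrative): with hz = 100 (tick = 10000 us) and a
+ * target 1.5 s in the future, the first branch computes
+ * (1500000 + 9999) / 10000 + 1 = 151 ticks, suitable as the third
+ * argument to timeout().
+ */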
+
+/*
+ * Start profiling on a process.
+ *
+ * Kernel profiling passes proc0 which never exits and hence
+ * keeps the profile clock running constantly.
+ */
+void
+startprofclock(p)
+ register struct proc *p;
+{
+ int s;
+
+ if ((p->p_flag & P_PROFIL) == 0) {
+ p->p_flag |= P_PROFIL;
+ if (++profprocs == 1 && stathz != 0) {
+ s = splstatclock();
+ psdiv = pscnt = psratio;
+ setstatclockrate(profhz);
+ splx(s);
+ }
+ }
+}
+
+/*
+ * Stop profiling on a process.
+ */
+void
+stopprofclock(p)
+ register struct proc *p;
+{
+ int s;
+
+ if (p->p_flag & P_PROFIL) {
+ p->p_flag &= ~P_PROFIL;
+ if (--profprocs == 0 && stathz != 0) {
+ s = splstatclock();
+ psdiv = pscnt = 1;
+ setstatclockrate(stathz);
+ splx(s);
+ }
+ }
+}
+
+/*
+ * Statistics clock. Grab profile sample, and if divider reaches 0,
+ * do process and kernel statistics.
+ */
+void
+statclock(frame)
+ register struct clockframe *frame;
+{
+#ifdef GPROF
+ register struct gmonparam *g;
+#endif
+ register struct proc *p;
+ register int i;
+ struct pstats *pstats;
+ long rss;
+ struct rusage *ru;
+ struct vmspace *vm;
+
+ if (CLKF_USERMODE(frame)) {
+ p = curproc;
+ if (p->p_flag & P_PROFIL)
+ addupc_intr(p, CLKF_PC(frame), 1);
+ if (--pscnt > 0)
+ return;
+ /*
+ * Came from user mode; CPU was in user state.
+ * If this process is being profiled record the tick.
+ */
+ p->p_uticks++;
+ if (p->p_nice > NZERO)
+ cp_time[CP_NICE]++;
+ else
+ cp_time[CP_USER]++;
+ } else {
+#ifdef GPROF
+ /*
+ * Kernel statistics are just like addupc_intr, only easier.
+ */
+ g = &_gmonparam;
+ if (g->state == GMON_PROF_ON) {
+ i = CLKF_PC(frame) - g->lowpc;
+ if (i < g->textsize) {
+ i /= HISTFRACTION * sizeof(*g->kcount);
+ g->kcount[i]++;
+ }
+ }
+#endif
+ if (--pscnt > 0)
+ return;
+ /*
+ * Came from kernel mode, so we were:
+ * - handling an interrupt,
+ * - doing syscall or trap work on behalf of the current
+ * user process, or
+ * - spinning in the idle loop.
+ * Whichever it is, charge the time as appropriate.
+ * Note that we charge interrupts to the current process,
+ * regardless of whether they are ``for'' that process,
+ * so that we know how much of its real time was spent
+ * in ``non-process'' (i.e., interrupt) work.
+ */
+ p = curproc;
+ if (CLKF_INTR(frame)) {
+ if (p != NULL)
+ p->p_iticks++;
+ cp_time[CP_INTR]++;
+ } else if (p != NULL) {
+ p->p_sticks++;
+ cp_time[CP_SYS]++;
+ } else
+ cp_time[CP_IDLE]++;
+ }
+ pscnt = psdiv;
+
+ /*
+ * We maintain statistics shown by user-level statistics
+ * programs: the amount of time in each cpu state, and
+ * the amount of time each of DK_NDRIVE ``drives'' is busy.
+ *
+ * XXX should either run linked list of drives, or (better)
+ * grab timestamps in the start & done code.
+ */
+ for (i = 0; i < DK_NDRIVE; i++)
+ if (dk_busy & (1 << i))
+ dk_time[i]++;
+
+ /*
+ * We adjust the priority of the current process. The priority of
+ * a process gets worse as it accumulates CPU time. The cpu usage
+ * estimator (p_estcpu) is increased here. The formula for computing
+ * priorities (in kern_synch.c) will compute a different value each
+ * time p_estcpu increases by 4. The cpu usage estimator ramps up
+ * quite quickly when the process is running (linearly), and decays
+ * away exponentially, at a rate which is proportionally slower when
+	 * the system is busy. The basic principle is that the system will
+ * 90% forget that the process used a lot of CPU time in 5 * loadav
+ * seconds. This causes the system to favor processes which haven't
+ * run much recently, and to round-robin among other processes.
+ */
+ if (p != NULL) {
+ p->p_cpticks++;
+ if (++p->p_estcpu == 0)
+ p->p_estcpu--;
+ if ((p->p_estcpu & 3) == 0) {
+ resetpriority(p);
+ if (p->p_priority >= PUSER)
+ p->p_priority = p->p_usrpri;
+ }
+
+ /* Update resource usage integrals and maximums. */
+ if ((pstats = p->p_stats) != NULL &&
+ (ru = &pstats->p_ru) != NULL &&
+ (vm = p->p_vmspace) != NULL) {
+ ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024;
+ ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024;
+ ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024;
+ rss = vm->vm_pmap.pm_stats.resident_count *
+ PAGE_SIZE / 1024;
+ if (ru->ru_maxrss < rss)
+ ru->ru_maxrss = rss;
+ }
+ }
+}
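+
+/*
+ * Illustrative rate, assuming a separate statistics clock with
+ * stathz = 128 and no profiling (psdiv = 1): a continuously running
+ * process gains about 128 p_estcpu counts per second, so the
+ * (p_estcpu & 3) == 0 test above recomputes its priority roughly 32
+ * times a second.
+ */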
+
+/*
+ * Return information about system clocks.
+ */
+static int
+sysctl_kern_clockrate SYSCTL_HANDLER_ARGS
+{
+ struct clockinfo clkinfo;
+ /*
+ * Construct clockinfo structure.
+ */
+ clkinfo.hz = hz;
+ clkinfo.tick = tick;
+ clkinfo.profhz = profhz;
+ clkinfo.stathz = stathz ? stathz : hz;
+ return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
+}
+
+SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
+ 0, 0, sysctl_kern_clockrate, "S,clockinfo","");
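+
+/*
+ * Userland usage sketch: the structure exported above can be read with
+ * sysctl(3), e.g.
+ *
+ *	int mib[2] = { CTL_KERN, KERN_CLOCKRATE };
+ *	struct clockinfo ci;
+ *	size_t len = sizeof ci;
+ *
+ *	if (sysctl(mib, 2, &ci, &len, NULL, 0) == 0)
+ *		printf("hz=%d tick=%d\n", ci.hz, ci.tick);
+ */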
+
+#ifdef PPS_SYNC
+/*
+ * hardpps() - discipline CPU clock oscillator to external PPS signal
+ *
+ * This routine is called at each PPS interrupt in order to discipline
+ * the CPU clock oscillator to the PPS signal. It measures the PPS phase
+ * and leaves it in a handy spot for the hardclock() routine. It
+ * integrates successive PPS phase differences and calculates the
+ * frequency offset. This is used in hardclock() to discipline the CPU
+ * clock oscillator so that intrinsic frequency error is cancelled out.
+ * The code requires the caller to capture the time and hardware counter
+ * value at the on-time PPS signal transition.
+ *
+ * Note that, on some Unix systems, this routine runs at an interrupt
+ * priority level higher than the timer interrupt routine hardclock().
+ * Therefore, the variables used are distinct from the hardclock()
+ * variables, with the following exceptions: the PPS frequency pps_freq
+ * and phase pps_offset variables are determined by this routine and
+ * updated atomically. The time_tolerance variable can be considered a
+ * constant, since it is infrequently changed, and then only when the
+ * PPS signal is disabled. The watchdog counter pps_valid is updated
+ * once per second by hardclock() and is atomically cleared in this
+ * routine.
+ */
+void
+hardpps(tvp, usec)
+ struct timeval *tvp; /* time at PPS */
+ long usec; /* hardware counter at PPS */
+{
+ long u_usec, v_usec, bigtick;
+ long cal_sec, cal_usec;
+
+ /*
+ * An occasional glitch can be produced when the PPS interrupt
+ * occurs in the hardclock() routine before the time variable is
+ * updated. Here the offset is discarded when the difference
+ * between it and the last one is greater than tick/2, but not
+ * if the interval since the first discard exceeds 30 s.
+ */
+ time_status |= STA_PPSSIGNAL;
+ time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR);
+ pps_valid = 0;
+ u_usec = -tvp->tv_usec;
+ if (u_usec < -500000)
+ u_usec += 1000000;
+ v_usec = pps_offset - u_usec;
+ if (v_usec < 0)
+ v_usec = -v_usec;
+ if (v_usec > (tick >> 1)) {
+ if (pps_glitch > MAXGLITCH) {
+ pps_glitch = 0;
+ pps_tf[2] = u_usec;
+ pps_tf[1] = u_usec;
+ } else {
+ pps_glitch++;
+ u_usec = pps_offset;
+ }
+ } else
+ pps_glitch = 0;
+
+ /*
+ * A three-stage median filter is used to help deglitch the pps
+ * time. The median sample becomes the time offset estimate; the
+ * difference between the other two samples becomes the time
+ * dispersion (jitter) estimate.
+ */
+ pps_tf[2] = pps_tf[1];
+ pps_tf[1] = pps_tf[0];
+ pps_tf[0] = u_usec;
+ if (pps_tf[0] > pps_tf[1]) {
+ if (pps_tf[1] > pps_tf[2]) {
+ pps_offset = pps_tf[1]; /* 0 1 2 */
+ v_usec = pps_tf[0] - pps_tf[2];
+ } else if (pps_tf[2] > pps_tf[0]) {
+ pps_offset = pps_tf[0]; /* 2 0 1 */
+ v_usec = pps_tf[2] - pps_tf[1];
+ } else {
+ pps_offset = pps_tf[2]; /* 0 2 1 */
+ v_usec = pps_tf[0] - pps_tf[1];
+ }
+ } else {
+ if (pps_tf[1] < pps_tf[2]) {
+ pps_offset = pps_tf[1]; /* 2 1 0 */
+ v_usec = pps_tf[2] - pps_tf[0];
+ } else if (pps_tf[2] < pps_tf[0]) {
+ pps_offset = pps_tf[0]; /* 1 0 2 */
+ v_usec = pps_tf[1] - pps_tf[2];
+ } else {
+ pps_offset = pps_tf[2]; /* 1 2 0 */
+ v_usec = pps_tf[1] - pps_tf[0];
+ }
+ }
+ if (v_usec > MAXTIME)
+ pps_jitcnt++;
+ v_usec = (v_usec << PPS_AVG) - pps_jitter;
+ if (v_usec < 0)
+ pps_jitter -= -v_usec >> PPS_AVG;
+ else
+ pps_jitter += v_usec >> PPS_AVG;
+ if (pps_jitter > (MAXTIME >> 1))
+ time_status |= STA_PPSJITTER;
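+
+	/*
+	 * Worked example of the median selection above (illustrative):
+	 * with samples pps_tf[] = {4, -2, 1}, newest first, the "0 2 1"
+	 * branch picks pps_offset = 1 (the median) and
+	 * v_usec = 4 - (-2) = 6, the span of the other two samples,
+	 * which feeds the jitter estimate.
+	 */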
+
+ /*
+ * During the calibration interval adjust the starting time when
+ * the tick overflows. At the end of the interval compute the
+ * duration of the interval and the difference of the hardware
+ * counters at the beginning and end of the interval. This code
+	 * is deliciously complicated by the fact that valid differences may
+ * exceed the value of tick when using long calibration
+ * intervals and small ticks. Note that the counter can be
+ * greater than tick if caught at just the wrong instant, but
+ * the values returned and used here are correct.
+ */
+ bigtick = (long)tick << SHIFT_USEC;
+ pps_usec -= pps_freq;
+ if (pps_usec >= bigtick)
+ pps_usec -= bigtick;
+ if (pps_usec < 0)
+ pps_usec += bigtick;
+ pps_time.tv_sec++;
+ pps_count++;
+ if (pps_count < (1 << pps_shift))
+ return;
+ pps_count = 0;
+ pps_calcnt++;
+ u_usec = usec << SHIFT_USEC;
+ v_usec = pps_usec - u_usec;
+ if (v_usec >= bigtick >> 1)
+ v_usec -= bigtick;
+ if (v_usec < -(bigtick >> 1))
+ v_usec += bigtick;
+ if (v_usec < 0)
+ v_usec = -(-v_usec >> pps_shift);
+ else
+ v_usec = v_usec >> pps_shift;
+ pps_usec = u_usec;
+ cal_sec = tvp->tv_sec;
+ cal_usec = tvp->tv_usec;
+ cal_sec -= pps_time.tv_sec;
+ cal_usec -= pps_time.tv_usec;
+ if (cal_usec < 0) {
+ cal_usec += 1000000;
+ cal_sec--;
+ }
+ pps_time = *tvp;
+
+ /*
+ * Check for lost interrupts, noise, excessive jitter and
+ * excessive frequency error. The number of timer ticks during
+ * the interval may vary +-1 tick. Add to this a margin of one
+ * tick for the PPS signal jitter and maximum frequency
+ * deviation. If the limits are exceeded, the calibration
+ * interval is reset to the minimum and we start over.
+ */
+ u_usec = (long)tick << 1;
+ if (!((cal_sec == -1 && cal_usec > (1000000 - u_usec))
+ || (cal_sec == 0 && cal_usec < u_usec))
+ || v_usec > time_tolerance || v_usec < -time_tolerance) {
+ pps_errcnt++;
+ pps_shift = PPS_SHIFT;
+ pps_intcnt = 0;
+ time_status |= STA_PPSERROR;
+ return;
+ }
+
+ /*
+ * A three-stage median filter is used to help deglitch the pps
+ * frequency. The median sample becomes the frequency offset
+ * estimate; the difference between the other two samples
+ * becomes the frequency dispersion (stability) estimate.
+ */
+ pps_ff[2] = pps_ff[1];
+ pps_ff[1] = pps_ff[0];
+ pps_ff[0] = v_usec;
+ if (pps_ff[0] > pps_ff[1]) {
+ if (pps_ff[1] > pps_ff[2]) {
+ u_usec = pps_ff[1]; /* 0 1 2 */
+ v_usec = pps_ff[0] - pps_ff[2];
+ } else if (pps_ff[2] > pps_ff[0]) {
+ u_usec = pps_ff[0]; /* 2 0 1 */
+ v_usec = pps_ff[2] - pps_ff[1];
+ } else {
+ u_usec = pps_ff[2]; /* 0 2 1 */
+ v_usec = pps_ff[0] - pps_ff[1];
+ }
+ } else {
+ if (pps_ff[1] < pps_ff[2]) {
+ u_usec = pps_ff[1]; /* 2 1 0 */
+ v_usec = pps_ff[2] - pps_ff[0];
+ } else if (pps_ff[2] < pps_ff[0]) {
+ u_usec = pps_ff[0]; /* 1 0 2 */
+ v_usec = pps_ff[1] - pps_ff[2];
+ } else {
+ u_usec = pps_ff[2]; /* 1 2 0 */
+ v_usec = pps_ff[1] - pps_ff[0];
+ }
+ }
+
+ /*
+ * Here the frequency dispersion (stability) is updated. If it
+ * is less than one-fourth the maximum (MAXFREQ), the frequency
+ * offset is updated as well, but clamped to the tolerance. It
+ * will be processed later by the hardclock() routine.
+ */
+ v_usec = (v_usec >> 1) - pps_stabil;
+ if (v_usec < 0)
+ pps_stabil -= -v_usec >> PPS_AVG;
+ else
+ pps_stabil += v_usec >> PPS_AVG;
+ if (pps_stabil > MAXFREQ >> 2) {
+ pps_stbcnt++;
+ time_status |= STA_PPSWANDER;
+ return;
+ }
+ if (time_status & STA_PPSFREQ) {
+ if (u_usec < 0) {
+ pps_freq -= -u_usec >> PPS_AVG;
+ if (pps_freq < -time_tolerance)
+ pps_freq = -time_tolerance;
+ u_usec = -u_usec;
+ } else {
+ pps_freq += u_usec >> PPS_AVG;
+ if (pps_freq > time_tolerance)
+ pps_freq = time_tolerance;
+ }
+ }
+
+ /*
+ * Here the calibration interval is adjusted. If the maximum
+ * time difference is greater than tick / 4, reduce the interval
+ * by half. If this is not the case for four consecutive
+ * intervals, double the interval.
+ */
+ if (u_usec << pps_shift > bigtick >> 2) {
+ pps_intcnt = 0;
+ if (pps_shift > PPS_SHIFT)
+ pps_shift--;
+ } else if (pps_intcnt >= 4) {
+ pps_intcnt = 0;
+ if (pps_shift < PPS_SHIFTMAX)
+ pps_shift++;
+ } else
+ pps_intcnt++;
+}
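+
+/*
+ * Illustrative adaptation sequence: pps_shift starts at PPS_SHIFT
+ * (nominally 2, a 4 s interval). Four consecutive quiet intervals
+ * double the interval (pps_shift++), stepping 4 s -> 8 s -> ... up to
+ * 1 << PPS_SHIFTMAX seconds (nominally 256 s), while a single noisy
+ * interval halves it again.
+ */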
+#endif /* PPS_SYNC */
diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c
index caa1cdd..17550b6 100644
--- a/sys/kern/kern_xxx.c
+++ b/sys/kern/kern_xxx.c
@@ -30,114 +30,230 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)kern_xxx.c 8.3 (Berkeley) 2/14/95
+ * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93
+ * $Id$
*/
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/proc.h>
-#include <sys/reboot.h>
-#include <vm/vm.h>
#include <sys/sysctl.h>
+#include <sys/utsname.h>
-#include <sys/mount.h>
-#include <sys/syscallargs.h>
-
-/* ARGSUSED */
-int
-reboot(p, uap, retval)
- struct proc *p;
- struct reboot_args /* {
- syscallarg(int) opt;
- } */ *uap;
- register_t *retval;
-{
- int error;
-
- if (error = suser(p->p_ucred, &p->p_acflag))
- return (error);
- boot(SCARG(uap, opt));
- return (0);
-}
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#ifndef _SYS_SYSPROTO_H_
+struct gethostname_args {
+ char *hostname;
+ u_int len;
+};
+#endif
/* ARGSUSED */
int
-compat_43_gethostname(p, uap, retval)
+ogethostname(p, uap, retval)
struct proc *p;
- struct compat_43_gethostname_args /* {
- syscallarg(char *) hostname;
- syscallarg(u_int) len;
- } */ *uap;
- register_t *retval;
+ struct gethostname_args *uap;
+ int *retval;
{
- int name;
+ int name[2];
- name = KERN_HOSTNAME;
- return (kern_sysctl(&name, 1, SCARG(uap, hostname), &SCARG(uap, len),
- 0, 0));
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTNAME;
+ return (userland_sysctl(p, name, 2, uap->hostname, &uap->len,
+ 1, 0, 0, 0));
}
+#ifndef _SYS_SYSPROTO_H_
+struct sethostname_args {
+ char *hostname;
+ u_int len;
+};
+#endif
/* ARGSUSED */
int
-compat_43_sethostname(p, uap, retval)
+osethostname(p, uap, retval)
struct proc *p;
- register struct compat_43_sethostname_args /* {
- syscallarg(char *) hostname;
- syscallarg(u_int) len;
- } */ *uap;
- register_t *retval;
+ register struct sethostname_args *uap;
+ int *retval;
{
- int name;
+ int name[2];
int error;
- if (error = suser(p->p_ucred, &p->p_acflag))
+ name[0] = CTL_KERN;
+ name[1] = KERN_HOSTNAME;
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
return (error);
- name = KERN_HOSTNAME;
- return (kern_sysctl(&name, 1, 0, 0, SCARG(uap, hostname),
- SCARG(uap, len)));
+ return (userland_sysctl(p, name, 2, 0, 0, 0,
+ uap->hostname, uap->len, 0));
}
+#ifndef _SYS_SYSPROTO_H_
+struct ogethostid_args {
+ int dummy;
+};
+#endif
/* ARGSUSED */
int
-compat_43_gethostid(p, uap, retval)
+ogethostid(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct ogethostid_args *uap;
+ int *retval;
{
- *(int32_t *)retval = hostid;
+ *(long *)retval = hostid;
return (0);
}
#endif /* COMPAT_43 || COMPAT_SUNOS */
#ifdef COMPAT_43
+#ifndef _SYS_SYSPROTO_H_
+struct osethostid_args {
+ long hostid;
+};
+#endif
/* ARGSUSED */
int
-compat_43_sethostid(p, uap, retval)
+osethostid(p, uap, retval)
struct proc *p;
- struct compat_43_sethostid_args /* {
- syscallarg(int32_t) hostid;
- } */ *uap;
- register_t *retval;
+ struct osethostid_args *uap;
+ int *retval;
{
int error;
- if (error = suser(p->p_ucred, &p->p_acflag))
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
return (error);
- hostid = SCARG(uap, hostid);
+ hostid = uap->hostid;
return (0);
}
int
-compat_43_quota(p, uap, retval)
+oquota(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct oquota_args *uap;
+ int *retval;
{
return (ENOSYS);
}
#endif /* COMPAT_43 */
+
+#ifndef _SYS_SYSPROTO_H_
+struct uname_args {
+ struct utsname *name;
+};
+#endif
+
+/* ARGSUSED */
+int
+uname(p, uap, retval)
+ struct proc *p;
+ struct uname_args *uap;
+ int *retval;
+{
+ int name[2], len, rtval;
+ char *s, *us;
+
+ name[0] = CTL_KERN;
+ name[1] = KERN_OSTYPE;
+ len = sizeof uap->name->sysname;
+ rtval = userland_sysctl(p, name, 2, uap->name->sysname, &len,
+ 1, 0, 0, 0);
+ if( rtval) return rtval;
+ subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0);
+
+ name[1] = KERN_HOSTNAME;
+ len = sizeof uap->name->nodename;
+ rtval = userland_sysctl(p, name, 2, uap->name->nodename, &len,
+ 1, 0, 0, 0);
+ if( rtval) return rtval;
+ subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0);
+
+ name[1] = KERN_OSRELEASE;
+ len = sizeof uap->name->release;
+ rtval = userland_sysctl(p, name, 2, uap->name->release, &len,
+ 1, 0, 0, 0);
+ if( rtval) return rtval;
+ subyte( uap->name->release + sizeof(uap->name->release) - 1, 0);
+
+/*
+ name = KERN_VERSION;
+ len = sizeof uap->name->version;
+ rtval = userland_sysctl(p, name, 2, uap->name->version, &len,
+ 1, 0, 0, 0);
+ if( rtval) return rtval;
+ subyte( uap->name->version + sizeof(uap->name->version) - 1, 0);
+*/
+
+/*
+ * this stupid hackery to make the version field look like FreeBSD 1.1
+ */
+ for(s = version; *s && *s != '#'; s++);
+
+ for(us = uap->name->version; *s && *s != ':'; s++) {
+ rtval = subyte( us++, *s);
+ if( rtval)
+ return rtval;
+ }
+ rtval = subyte( us++, 0);
+ if( rtval)
+ return rtval;
+
+ name[1] = HW_MACHINE;
+ len = sizeof uap->name->machine;
+ rtval = userland_sysctl(p, name, 2, uap->name->machine, &len,
+ 1, 0, 0, 0);
+ if( rtval) return rtval;
+ subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0);
+
+ return 0;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct getdomainname_args {
+ char *domainname;
+ int len;
+};
+#endif
+
+/* ARGSUSED */
+int
+getdomainname(p, uap, retval)
+ struct proc *p;
+ struct getdomainname_args *uap;
+ int *retval;
+{
+ int domainnamelen = strlen(domainname) + 1;
+ if ((u_int)uap->len > domainnamelen + 1)
+ uap->len = domainnamelen + 1;
+ return (copyout((caddr_t)domainname, (caddr_t)uap->domainname, uap->len));
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct setdomainname_args {
+ char *domainname;
+ int len;
+};
+#endif
+
+/* ARGSUSED */
+int
+setdomainname(p, uap, retval)
+ struct proc *p;
+ struct setdomainname_args *uap;
+ int *retval;
+{
+ int error, domainnamelen;
+
+ if ((error = suser(p->p_ucred, &p->p_acflag)))
+ return (error);
+ if ((u_int)uap->len > sizeof (domainname) - 1)
+ return EINVAL;
+ domainnamelen = uap->len;
+ error = copyin((caddr_t)uap->domainname, domainname, uap->len);
+ domainname[domainnamelen] = 0;
+ return (error);
+}
+
diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh
index 4e2c28c..dc78413 100644
--- a/sys/kern/makesyscalls.sh
+++ b/sys/kern/makesyscalls.sh
@@ -1,72 +1,43 @@
#! /bin/sh -
-#
-# @(#)makesyscalls.sh 8.2 (Berkeley) 2/14/95
+# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93
+# $Id$
set -e
-case $# in
- 2) ;;
- *) echo "Usage: $0 config-file input-file" 1>&2
- exit 1
- ;;
-esac
-
-# source the config file.
-. $1
+# name of compat option:
+compat=COMPAT_43
-# the config file sets the following variables:
-# sysnames the syscall names file
-# sysnumhdr the syscall numbers file
-# syssw the syscall switch file
-# sysarghdr the syscall argument struct definitions
-# compatopts those syscall types that are for 'compat' syscalls
-# switchname the name for the 'struct sysent' we define
-# namesname the name for the 'char *[]' we define
-# constprefix the prefix for the system call constants
-#
-# NOTE THAT THIS makesyscalls.sh DOES NOT SUPPORT 'LIBCOMPAT'.
+# output files:
+sysnames="syscalls.c"
+sysproto="../sys/sysproto.h"
+sysproto_h=_SYS_SYSPROTO_H_
+syshdr="../sys/syscall.h"
+syssw="init_sysent.c"
+syshide="../sys/syscall-hide.h"
+syscallprefix="SYS_"
+switchname="sysent"
+namesname="syscallnames"
# tmp files:
sysdcl="sysent.dcl"
-syscompat_pref="sysent."
+syscompat="sysent.compat"
+syscompatdcl="sysent.compatdcl"
sysent="sysent.switch"
+sysinc="sysinc.switch"
+sysarg="sysarg.switch"
-syscompat_files=""
-for file in $compatopts; do
- syscompat_files="$syscompat_files $syscompat_pref$file"
-done
+trap "rm $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg" 0
-trap "rm $sysdcl $syscompat_files $sysent" 0
-
-# Awk program (must support nawk extensions)
-# Use "awk" at Berkeley, "nawk" or "gawk" elsewhere.
-awk=${AWK:-awk}
-
-# Does this awk have a "toupper" function? (i.e. is it GNU awk)
-isgawk=`$awk 'BEGIN { print toupper("true"); exit; }' 2>/dev/null`
+case $# in
+ 0) echo "Usage: $0 input-file <config-file>" 1>&2
+ exit 1
+ ;;
+esac
-# If this awk does not define "toupper" then define our own.
-if [ "$isgawk" = TRUE ] ; then
- # GNU awk provides it.
- toupper=
-else
- # Provide our own toupper()
- toupper='
-function toupper(str) {
- _toupper_cmd = "echo "str" |tr a-z A-Z"
- _toupper_cmd | getline _toupper_str;
- close(_toupper_cmd);
- return _toupper_str;
-}'
+if [ -f $2 ]; then
+ . $2
fi
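+
+# For reference, an input line in syscalls.master has the shape the awk
+# parser below expects (illustrative entry, keywords assumed):
+#
+#	3	STD	POSIX	{ int read(int fd, char *buf, u_int nbyte); }
+#
+# i.e. syscall number, type keyword (STD, COMPAT, OBSOL, UNIMPL, ...),
+# hide keyword (NOHIDE, or a class such as POSIX that becomes
+# HIDE_POSIX), then the bracketed prototype.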
-# before handing it off to awk, make a few adjustments:
-# (1) insert spaces around {, }, (, ), *, and commas.
-# (2) get rid of any and all dollar signs (so that rcs id use safe)
-#
-# The awk script will deal with blank lines and lines that
-# start with the comment character (';').
-
sed -e '
s/\$//g
:join
@@ -79,287 +50,311 @@ s/\$//g
2,${
/^#/!s/\([{}()*,]\)/ \1 /g
}
-' < $2 | $awk "
-$toupper
-BEGIN {
- sysnames = \"$sysnames\"
- sysnumhdr = \"$sysnumhdr\"
- sysarghdr = \"$sysarghdr\"
- switchname = \"$switchname\"
- namesname = \"$namesname\"
- constprefix = \"$constprefix\"
-
- sysdcl = \"$sysdcl\"
- syscompat_pref = \"$syscompat_pref\"
- sysent = \"$sysent\"
- infile = \"$2\"
-
- compatopts = \"$compatopts\"
- "'
-
- printf "/*\n * System call switch table.\n *\n" > sysdcl
- printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysdcl
-
- ncompat = split(compatopts,compat)
- for (i = 1; i <= ncompat; i++) {
- compat_upper[i] = toupper(compat[i])
- compat_file[i] = sprintf("%s%s", syscompat_pref, compat[i])
-
- printf "\n#ifdef %s\n", compat_upper[i] > compat_file[i]
- printf "#define %s(func) __CONCAT(%s_,func)\n\n", \
- compat[i], compat[i] > compat_file[i]
+' < $1 | awk "
+ BEGIN {
+ sysdcl = \"$sysdcl\"
+ sysproto = \"$sysproto\"
+ sysproto_h = \"$sysproto_h\"
+ syscompat = \"$syscompat\"
+ syscompatdcl = \"$syscompatdcl\"
+ sysent = \"$sysent\"
+ sysinc = \"$sysinc\"
+ sysarg = \"$sysarg\"
+ sysnames = \"$sysnames\"
+ syshdr = \"$syshdr\"
+ compat = \"$compat\"
+ syshide = \"$syshide\"
+ syscallprefix = \"$syscallprefix\"
+ switchname = \"$switchname\"
+ namesname = \"$namesname\"
+ infile = \"$1\"
+ "'
+
+ printf "/*\n * System call switch table.\n *\n" > sysinc
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysinc
+
+ printf "/*\n * System call prototypes.\n *\n" > sysarg
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarg
+
+ printf "\n#ifdef %s\n\n", compat > syscompat
+
+ printf "/*\n * System call names.\n *\n" > sysnames
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames
+
+ printf "/*\n * System call numbers.\n *\n" > syshdr
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr
+ printf "/*\n * System call hiders.\n *\n" > syshide
+ printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshide
}
+ NR == 1 {
+ gsub("[$]Id: ", "", $0)
+ gsub(" [$]", "", $0)
- printf "/*\n * System call names.\n *\n" > sysnames
- printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames
+ printf " * created from%s\n */\n\n", $0 > sysinc
- printf "/*\n * System call numbers.\n *\n" > sysnumhdr
- printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnumhdr
+ printf "\n#ifdef %s\n", compat > sysent
+ printf "#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)\n" > sysent
+ printf("#else\n") > sysent
+ printf("#define compat(n, name) 0, (sy_call_t *)nosys\n") > sysent
+ printf("#endif\n\n") > sysent
+ printf("/* The casts are bogus but will do for now. */\n") > sysent
+ printf "struct sysent %s[] = {\n",switchname > sysent
- printf "/*\n * System call argument lists.\n *\n" > sysarghdr
- printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarghdr
-}
-NR == 1 {
- printf " * created from%s\n */\n\n", $0 > sysdcl
-
- printf "#define\ts(type)\tsizeof(type)\n\n" > sysent
- printf "struct sysent %s[] = {\n",switchname > sysent
+ printf " * created from%s\n */\n\n", $0 > sysarg
+ printf("#ifndef %s\n", sysproto_h) > sysarg
+ printf("#define\t%s\n\n", sysproto_h) > sysarg
+ printf "#include <sys/signal.h>\n\n", $0 > sysarg
- printf " * created from%s\n */\n\n", $0 > sysnames
- printf "char *%s[] = {\n",namesname > sysnames
+ printf " * created from%s\n */\n\n", $0 > sysnames
+ printf "char *%s[] = {\n", namesname > sysnames
- printf " * created from%s\n */\n\n", $0 > sysnumhdr
+ printf " * created from%s\n */\n\n", $0 > syshdr
- printf " * created from%s\n */\n\n", $0 > sysarghdr
- printf "#define\tsyscallarg(x)\tunion { x datum; register_t pad; }\n" \
- > sysarghdr
- next
-}
-NF == 0 || $1 ~ /^;/ {
- next
-}
-$1 ~ /^#[ ]*include/ {
- print > sysdcl
- next
-}
-$1 ~ /^#[ ]*if/ {
- print > sysent
- print > sysdcl
- for (i = 1; i <= ncompat; i++)
- print > compat_file[i]
- print > sysnames
- savesyscall = syscall
- next
-}
-$1 ~ /^#[ ]*else/ {
- print > sysent
- print > sysdcl
- for (i = 1; i <= ncompat; i++)
- print > compat_file[i]
- print > sysnames
- syscall = savesyscall
- next
-}
-$1 ~ /^#/ {
- print > sysent
- print > sysdcl
- for (i = 1; i <= ncompat; i++)
- print > compat_file[i]
- print > sysnames
- next
-}
-syscall != $1 {
- printf "%s: line %d: syscall number out of sync at %d\n", \
- infile, NR, syscall
- printf "line is:\n"
- print
- exit 1
-}
-function parserr(was, wanted) {
- printf "%s: line %d: unexpected %s (expected %s)\n", \
- infile, NR, was, wanted
- exit 1
-}
-function parseline() {
- f=3 # toss number and type
- if ($NF != "}") {
- funcalias=$NF
- end=NF-1
- } else {
- funcalias=""
- end=NF
+ printf " * created from%s\n */\n\n", $0 > syshide
+ next
}
- if ($f != "{")
- parserr($f, "{")
- f++
- if ($end != "}")
- parserr($end, "}")
- end--
- if ($end != ";")
- parserr($end, ";")
- end--
- if ($end != ")")
- parserr($end, ")")
- end--
-
- f++ # toss return type
-
- funcname=$f
- if (funcalias == "")
- funcalias=funcname
- f++
+ NF == 0 || $1 ~ /^;/ {
+ next
+ }
+ $1 ~ /^#[ ]*include/ {
+ print > sysinc
+ next
+ }
+ $1 ~ /^#[ ]*if/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > sysnames
+ print > syshide
+ savesyscall = syscall
+ next
+ }
+ $1 ~ /^#[ ]*else/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > sysnames
+ print > syshide
+ syscall = savesyscall
+ next
+ }
+ $1 ~ /^#/ {
+ print > sysent
+ print > sysdcl
+ print > sysarg
+ print > syscompat
+ print > sysnames
+ print > syshide
+ next
+ }
+ syscall != $1 {
+ printf "%s: line %d: syscall number out of sync at %d\n", \
+ infile, NR, syscall
+ printf "line is:\n"
+ print
+ exit 1
+ }
+ function parserr(was, wanted) {
+ printf "%s: line %d: unexpected %s (expected %s)\n", \
+ infile, NR, was, wanted
+ exit 1
+ }
+ function parseline() {
+ f=4 # toss number and type
+ argc= 0;
+ bigargc = 0;
+ if ($NF != "}") {
+ funcalias=$(NF-2)
+ argalias=$(NF-1)
+ rettype=$NF
+ end=NF-3
+ } else {
+ funcalias=""
+ argalias=""
+ rettype="int"
+ end=NF
+ }
+ if ($2 == "NODEF") {
+ funcname=$4
+ return
+ }
+ if ($f != "{")
+ parserr($f, "{")
+ f++
+ if ($end != "}")
+ parserr($end, "}")
+ end--
+ if ($end != ";")
+ parserr($end, ";")
+ end--
+ if ($end != ")")
+ parserr($end, ")")
+ end--
+
+ f++ #function return type
+
+ funcname=$f
+ if (funcalias == "")
+ funcalias = funcname
+ if (argalias == "") {
+ argalias = funcname "_args"
+ if ($2 == "COMPAT")
+ argalias = "o" argalias
+ }
+ f++
- if ($f != "(")
- parserr($f, ")")
- f++
+ if ($f != "(")
+ parserr($f, ")")
+ f++
- argc= 0;
- if (f == end) {
- if ($f != "void")
- parserr($f, "argument definition")
- return
- }
+ if (f == end) {
+ if ($f != "void")
+ parserr($f, "argument definition")
+ return
+ }
- while (f <= end) {
- argc++
- argtype[argc]=""
- oldf=""
- while (f < end && $(f+1) != ",") {
- if (argtype[argc] != "" && oldf != "*")
- argtype[argc] = argtype[argc]" ";
- argtype[argc] = argtype[argc]$f;
- oldf = $f;
- f++
+ while (f <= end) {
+ argc++
+ argtype[argc]=""
+ oldf=""
+ while (f < end && $(f+1) != ",") {
+ if (argtype[argc] != "" && oldf != "*")
+ argtype[argc] = argtype[argc]" ";
+ argtype[argc] = argtype[argc]$f;
+ oldf = $f;
+ f++
+ }
+ if (argtype[argc] == "")
+ parserr($f, "argument definition")
+ if (argtype[argc] == "off_t")
+ bigargc++
+ argname[argc]=$f;
+ f += 2; # skip name, and any comma
}
- if (argtype[argc] == "")
- parserr($f, "argument definition")
- argname[argc]=$f;
- f += 2; # skip name, and any comma
}
-}
-function putent(nodefs, declfile, compatwrap) {
- # output syscall declaration for switch table
- if (compatwrap == "")
- printf("int\t%s();\n", funcname) > declfile
- else
- printf("int\t%s(%s)();\n", compatwrap, funcname) > declfile
-
- # output syscall switch entry
-# printf("\t{ { %d", argc) > sysent
-# for (i = 1; i <= argc; i++) {
-# if (i == 5) # wrap the line
-# printf(",\n\t ") > sysent
-# else
-# printf(", ") > sysent
-# printf("s(%s)", argtypenospc[i]) > sysent
-# }
- printf("\t{ %d, ", argc) > sysent
- if (argc == 0)
- printf("0") > sysent
- else if (compatwrap == "")
- printf("s(struct %s_args)", funcname) > sysent
- else
- printf("s(struct %s_%s_args)", compatwrap, funcname) > sysent
- if (compatwrap == "")
- wfn = sprintf("%s", funcname);
- else
- wfn = sprintf("%s(%s)", compatwrap, funcname);
- printf(",\n\t %s },", wfn) > sysent
- for (i = 0; i < (33 - length(wfn)) / 8; i++)
- printf("\t") > sysent
- if (compatwrap == "")
+ { comment = $4
+ if (NF < 7)
+ for (i = 5; i <= NF; i++)
+ comment = comment " " $i
+ }
+ $2 == "STD" || $2 == "NODEF" || $2 == "NOARGS" || $2 == "NOPROTO" {
+ parseline()
+ if ((!nosys || funcname != "nosys") && \
+ (funcname != "lkmnosys")) {
+ if (argc != 0 && $2 != "NOARGS" && $2 != "NOPROTO") {
+ printf("struct\t%s {\n", argalias) > sysarg
+ for (i = 1; i <= argc; i++)
+ printf("\t%s %s;\n", argtype[i],
+ argname[i]) > sysarg
+ printf("};\n") > sysarg
+ }
+ else if($2 != "NOARGS" && $2 != "NOPROTO")
+ printf("struct\t%s {\n\tint dummy;\n};\n", \
+ argalias) > sysarg
+ }
+ if ($2 != "NOPROTO" && (!nosys || funcname != "nosys") && \
+ (!lkmnosys || funcname != "lkmnosys")) {
+ printf("%s\t%s __P((struct proc *, struct %s *, int []))", \
+ rettype, funcname, argalias) > sysdcl
+ if (funcname == "exit")
+ printf(" __dead2") > sysdcl
+ printf(";\n") > sysdcl
+ }
+ if (funcname == "nosys")
+ nosys = 1
+ if (funcname == "lkmnosys")
+ lkmnosys = 1
+ printf("\t{ %d, (sy_call_t *)%s },\t\t", \
+ argc+bigargc, funcname) > sysent
+ if(length(funcname) < 11)
+ printf("\t") > sysent
printf("/* %d = %s */\n", syscall, funcalias) > sysent
- else
- printf("/* %d = %s %s */\n", syscall, compatwrap,
- funcalias) > sysent
-
- # output syscall name for names table
- if (compatwrap == "")
- printf("\t\"%s\",\t\t\t/* %d = %s */\n", funcalias, syscall,
- funcalias) > sysnames
- else
- printf("\t\"%s_%s\",\t/* %d = %s %s */\n", compatwrap,
- funcalias, syscall, compatwrap, funcalias) > sysnames
-
- # output syscall number of header, if appropriate
- if (nodefs == "" || nodefs == "NOARGS")
- printf("#define\t%s%s\t%d\n", constprefix, funcalias,
- syscall) > sysnumhdr
- else if (nodefs != "NODEF")
- printf("\t\t\t\t/* %d is %s %s */\n", syscall,
- compatwrap, funcalias) > sysnumhdr
-
- # output syscall argument structure, if it has arguments
- if (argc != 0 && nodefs != "NOARGS") {
- if (compatwrap == "")
- printf("\nstruct %s_args {\n", funcname) > sysarghdr
- else
- printf("\nstruct %s_%s_args {\n", compatwrap,
- funcname) > sysarghdr
- for (i = 1; i <= argc; i++)
- printf("\tsyscallarg(%s) %s;\n", argtype[i],
- argname[i]) > sysarghdr
- printf("};\n") > sysarghdr
+ printf("\t\"%s\",\t\t\t/* %d = %s */\n", \
+ funcalias, syscall, funcalias) > sysnames
+ if ($2 != "NODEF")
+ printf("#define\t%s%s\t%d\n", syscallprefix, \
+ funcalias, syscall) > syshdr
+ if ($3 != "NOHIDE")
+ printf("HIDE_%s(%s)\n", $3, funcname) > syshide
+ syscall++
+ next
}
-}
-$2 == "STD" {
- parseline()
- putent("", sysdcl, "")
- syscall++
- next
-}
-$2 == "NODEF" || $2 == "NOARGS" {
- parseline()
- putent($2, sysdcl, "")
- syscall++
- next
-}
-$2 == "OBSOL" || $2 == "UNIMPL" {
- if ($2 == "OBSOL")
- comment="obsolete"
- else
- comment="unimplemented"
- for (i = 3; i <= NF; i++)
- comment=comment " " $i
-
- printf("\t{ 0, 0,\n\t nosys },\t\t\t\t/* %d = %s */\n", \
- syscall, comment) > sysent
- printf("\t\"#%d (%s)\",\t\t/* %d = %s */\n", \
- syscall, comment, syscall, comment) > sysnames
- if ($2 != "UNIMPL")
- printf("\t\t\t\t/* %d is %s */\n", syscall, comment) > sysnumhdr
- syscall++
- next
-}
-{
- for (i = 1; i <= ncompat; i++) {
- if ($2 == compat_upper[i]) {
- parseline();
- putent("COMMENT", compat_file[i], compat[i])
- syscall++
- next
+ $2 == "COMPAT" || $2 == "CPT_NOA" {
+ parseline()
+ if (argc != 0 && $2 != "CPT_NOA") {
+ printf("struct\t%s {\n", argalias) > syscompat
+ for (i = 1; i <= argc; i++)
+ printf("\t%s %s;\n", argtype[i],
+ argname[i]) > syscompat
+ printf("};\n") > syscompat
}
+ else if($2 != "CPT_NOA")
+ printf("struct\t%s {\n\tint dummy;\n};\n", \
+ argalias) > sysarg
+ printf("%s\to%s __P((struct proc *, struct %s *, int []));\n", \
+ rettype, funcname, argalias) > syscompatdcl
+ printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n", \
+ argc+bigargc, funcname, syscall, funcalias) > sysent
+ printf("\t\"old.%s\",\t\t/* %d = old %s */\n", \
+ funcalias, syscall, funcalias) > sysnames
+ printf("\t\t\t\t/* %d is old %s */\n", \
+ syscall, funcalias) > syshdr
+ if ($3 != "NOHIDE")
+ printf("HIDE_%s(%s)\n", $3, funcname) > syshide
+ syscall++
+ next
}
- printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2
- exit 1
-}
-END {
- printf "\n#undef\tsyscallarg\n" > sysarghdr
-
- for (i = 1; i <= ncompat; i++) {
- printf("\n#else /* %s */\n", compat_upper[i]) > compat_file[i]
- printf("#define %s(func) nosys\n", compat[i]) > \
- compat_file[i]
- printf("#endif /* %s */\n\n", compat_upper[i]) > compat_file[i]
- }
-
- printf("};\n\n") > sysent
- printf("int\tn%s= sizeof(%s) / sizeof(%s[0]);\n", switchname,
- switchname, switchname) > sysent
-
- printf("};\n") > sysnames
-} '
-
-cat $sysdcl $syscompat_files $sysent > $syssw
-
-#chmod 444 $sysnames $syshdr $syssw
+ $2 == "LIBCOMPAT" {
+ parseline()
+ printf("%s\to%s();\n", rettype, funcname) > syscompatdcl
+ printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n", \
+ argc+bigargc, funcname, syscall, funcalias) > sysent
+ printf("\t\"old.%s\",\t\t/* %d = old %s */\n", \
+ funcalias, syscall, funcalias) > sysnames
+ printf("#define\t%s%s\t%d\t/* compatibility; still used by libc */\n", \
+ syscallprefix, funcalias, syscall) > syshdr
+ if ($3 != "NOHIDE")
+ printf("HIDE_%s(%s)\n", $3, funcname) > syshide
+ syscall++
+ next
+ }
+ $2 == "OBSOL" {
+ printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = obsolete %s */\n", \
+ syscall, comment) > sysent
+ printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n", \
+ $4, syscall, comment) > sysnames
+ printf("\t\t\t\t/* %d is obsolete %s */\n", \
+ syscall, comment) > syshdr
+ if ($3 != "NOHIDE")
+ printf("HIDE_%s(%s)\n", $3, $4) > syshide
+ syscall++
+ next
+ }
+ $2 == "UNIMPL" {
+ printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = %s */\n", \
+ syscall, comment) > sysent
+ printf("\t\"#%d\",\t\t\t/* %d = %s */\n", \
+ syscall, syscall, comment) > sysnames
+ if ($3 != "NOHIDE")
+ printf("HIDE_%s(%s)\n", $3, $4) > syshide
+ syscall++
+ next
+ }
+ {
+ printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2
+ exit 1
+ }
+ END {
+ printf("\n#endif /* %s */\n", compat) > syscompatdcl
+ printf("\n#endif /* !%s */\n", sysproto_h) > syscompatdcl
+
+ printf("};\n") > sysent
+ printf("};\n") > sysnames
+ printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \
+ > syshdr
+ } '
+
+cat $sysinc $sysent >$syssw
+cat $sysarg $sysdcl $syscompat $syscompatdcl > $sysproto
diff --git a/sys/kern/md5c.c b/sys/kern/md5c.c
new file mode 100644
index 0000000..583d009
--- /dev/null
+++ b/sys/kern/md5c.c
@@ -0,0 +1,331 @@
+/*
+ * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm
+ *
+ * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All
+ * rights reserved.
+ *
+ * License to copy and use this software is granted provided that it
+ * is identified as the "RSA Data Security, Inc. MD5 Message-Digest
+ * Algorithm" in all material mentioning or referencing this software
+ * or this function.
+ *
+ * License is also granted to make and use derivative works provided
+ * that such works are identified as "derived from the RSA Data
+ * Security, Inc. MD5 Message-Digest Algorithm" in all material
+ * mentioning or referencing the derived work.
+ *
+ * RSA Data Security, Inc. makes no representations concerning either
+ * the merchantability of this software or the suitability of this
+ * software for any particular purpose. It is provided "as is"
+ * without express or implied warranty of any kind.
+ *
+ * These notices must be retained in any copies of any part of this
+ * documentation and/or software.
+ *
+ * $Id$
+ *
+ * This code is the same as the code published by RSA Inc. It has been
+ * edited for clarity and style only.
+ */
+
+#include <sys/types.h>
+
+#ifdef KERNEL
+#include <sys/param.h>
+#include <sys/systm.h>
+#else
+#include <string.h>
+#endif
+
+#include <sys/md5.h>
+
+static void MD5Transform __P((u_int32_t [4], const unsigned char [64]));
+
+#ifdef KERNEL
+#define memset(x,y,z) bzero(x,z);
+#define memcpy(x,y,z) bcopy(y, x, z)
+#endif
+
+#ifdef i386
+#define Encode memcpy
+#define Decode memcpy
+#else /* i386 */
+
+/*
+ * Encodes input (u_int32_t) into output (unsigned char). Assumes len is
+ * a multiple of 4.
+ */
+
+static void
+Encode (output, input, len)
+ unsigned char *output;
+ u_int32_t *input;
+ unsigned int len;
+{
+ unsigned int i, j;
+
+ for (i = 0, j = 0; j < len; i++, j += 4) {
+ output[j] = (unsigned char)(input[i] & 0xff);
+ output[j+1] = (unsigned char)((input[i] >> 8) & 0xff);
+ output[j+2] = (unsigned char)((input[i] >> 16) & 0xff);
+ output[j+3] = (unsigned char)((input[i] >> 24) & 0xff);
+ }
+}
+
+/*
+ * Decodes input (unsigned char) into output (u_int32_t). Assumes len is
+ * a multiple of 4.
+ */
+
+static void
+Decode (output, input, len)
+ u_int32_t *output;
+ const unsigned char *input;
+ unsigned int len;
+{
+ unsigned int i, j;
+
+ for (i = 0, j = 0; j < len; i++, j += 4)
+ output[i] = ((u_int32_t)input[j]) | (((u_int32_t)input[j+1]) << 8) |
+ (((u_int32_t)input[j+2]) << 16) | (((u_int32_t)input[j+3]) << 24);
+}
+#endif /* i386 */
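+
+/*
+ * Byte-order example (illustrative): Encode writes the least
+ * significant byte first, so the word 0x01020304 becomes the bytes
+ * 04 03 02 01; Decode reverses this. On the little-endian i386 both
+ * reduce to the memcpy definitions above.
+ */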
+
+static unsigned char PADDING[64] = {
+ 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
+};
+
+/* F, G, H and I are basic MD5 functions. */
+#define F(x, y, z) (((x) & (y)) | ((~x) & (z)))
+#define G(x, y, z) (((x) & (z)) | ((y) & (~z)))
+#define H(x, y, z) ((x) ^ (y) ^ (z))
+#define I(x, y, z) ((y) ^ ((x) | (~z)))
+
+/* ROTATE_LEFT rotates x left n bits. */
+#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n))))
+
+/*
+ * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4.
+ * Rotation is separate from addition to prevent recomputation.
+ */
+#define FF(a, b, c, d, x, s, ac) { \
+ (a) += F ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define GG(a, b, c, d, x, s, ac) { \
+ (a) += G ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define HH(a, b, c, d, x, s, ac) { \
+ (a) += H ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+#define II(a, b, c, d, x, s, ac) { \
+ (a) += I ((b), (c), (d)) + (x) + (u_int32_t)(ac); \
+ (a) = ROTATE_LEFT ((a), (s)); \
+ (a) += (b); \
+ }
+
+/* MD5 initialization. Begins an MD5 operation, writing a new context. */
+
+void
+MD5Init (context)
+ MD5_CTX *context;
+{
+
+ context->count[0] = context->count[1] = 0;
+
+ /* Load magic initialization constants. */
+ context->state[0] = 0x67452301;
+ context->state[1] = 0xefcdab89;
+ context->state[2] = 0x98badcfe;
+ context->state[3] = 0x10325476;
+}
+
+/*
+ * MD5 block update operation. Continues an MD5 message-digest
+ * operation, processing another message block, and updating the
+ * context.
+ */
+
+void
+MD5Update (context, input, inputLen)
+ MD5_CTX *context;
+ const unsigned char *input;
+ unsigned int inputLen;
+{
+ unsigned int i, index, partLen;
+
+ /* Compute number of bytes mod 64 */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3F);
+
+ /* Update number of bits */
+ if ((context->count[0] += ((u_int32_t)inputLen << 3))
+ < ((u_int32_t)inputLen << 3))
+ context->count[1]++;
+ context->count[1] += ((u_int32_t)inputLen >> 29);
+
+ partLen = 64 - index;
+
+ /* Transform as many times as possible. */
+ if (inputLen >= partLen) {
+ memcpy((void *)&context->buffer[index], (void *)input,
+ partLen);
+ MD5Transform (context->state, context->buffer);
+
+ for (i = partLen; i + 63 < inputLen; i += 64)
+ MD5Transform (context->state, &input[i]);
+
+ index = 0;
+ }
+ else
+ i = 0;
+
+ /* Buffer remaining input */
+ memcpy ((void *)&context->buffer[index], (void *)&input[i],
+ inputLen-i);
+}
+
+/*
+ * MD5 finalization. Ends an MD5 message-digest operation, writing
+ * the message digest and zeroizing the context.
+ */
+
+void
+MD5Final (digest, context)
+ unsigned char digest[16];
+ MD5_CTX *context;
+{
+ unsigned char bits[8];
+ unsigned int index, padLen;
+
+ /* Save number of bits */
+ Encode (bits, context->count, 8);
+
+ /* Pad out to 56 mod 64. */
+ index = (unsigned int)((context->count[0] >> 3) & 0x3f);
+ padLen = (index < 56) ? (56 - index) : (120 - index);
+ MD5Update (context, PADDING, padLen);
+
+ /* Append length (before padding) */
+ MD5Update (context, bits, 8);
+
+ /* Store state in digest */
+ Encode (digest, context->state, 16);
+
+ /* Zeroize sensitive information. */
+ memset ((void *)context, 0, sizeof (*context));
+}
+
+/* MD5 basic transformation. Transforms state based on block. */
+
+static void
+MD5Transform (state, block)
+ u_int32_t state[4];
+ const unsigned char block[64];
+{
+ u_int32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16];
+
+ Decode (x, block, 64);
+
+ /* Round 1 */
+#define S11 7
+#define S12 12
+#define S13 17
+#define S14 22
+ FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */
+ FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */
+ FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */
+ FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */
+ FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */
+ FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */
+ FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */
+ FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */
+ FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */
+ FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */
+ FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */
+ FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */
+ FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */
+ FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */
+ FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */
+ FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */
+
+ /* Round 2 */
+#define S21 5
+#define S22 9
+#define S23 14
+#define S24 20
+ GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */
+ GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */
+ GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */
+ GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */
+ GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */
+ GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */
+ GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */
+ GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */
+ GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */
+ GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */
+ GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */
+ GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */
+ GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */
+ GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */
+ GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */
+ GG (b, c, d, a, x[12], S24, 0x8d2a4c8a); /* 32 */
+
+ /* Round 3 */
+#define S31 4
+#define S32 11
+#define S33 16
+#define S34 23
+ HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */
+ HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */
+ HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */
+ HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */
+ HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */
+ HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */
+ HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */
+ HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */
+ HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */
+ HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */
+ HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */
+ HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */
+ HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */
+ HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */
+ HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */
+ HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */
+
+ /* Round 4 */
+#define S41 6
+#define S42 10
+#define S43 15
+#define S44 21
+ II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */
+ II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */
+ II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */
+ II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */
+ II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */
+ II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */
+ II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */
+ II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */
+ II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */
+ II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */
+ II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */
+ II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */
+ II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */
+ II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */
+ II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */
+ II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */
+
+ state[0] += a;
+ state[1] += b;
+ state[2] += c;
+ state[3] += d;
+
+ /* Zeroize sensitive information. */
+ memset ((void *)x, 0, sizeof (x));
+}
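
This file exports the usual three-call digest interface: MD5Init() seeds the four-word state, MD5Update() absorbs input while buffering partial 64-byte blocks, and MD5Final() pads to 56 mod 64, appends the 64-bit bit count, and emits the 16-byte digest. A minimal userland sketch of that call sequence, assuming the non-KERNEL build of this file (or FreeBSD's libmd, linked with -lmd); the expected output is the RFC 1321 "abc" test vector:

	#include <sys/types.h>
	#include <sys/md5.h>	/* MD5_CTX, MD5Init(), MD5Update(), MD5Final() */

	#include <stdio.h>
	#include <string.h>

	int
	main(void)
	{
		MD5_CTX ctx;
		unsigned char digest[16];
		const char *msg = "abc";
		int i;

		MD5Init(&ctx);
		MD5Update(&ctx, (const unsigned char *)msg, strlen(msg));
		MD5Final(digest, &ctx);		/* also zeroizes ctx */

		for (i = 0; i < 16; i++)
			printf("%02x", digest[i]);
		printf("\n");	/* 900150983cd24fb0d6963f7d28e17f72 */
		return (0);
	}
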
diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c
index 7281339..f48ce99 100644
--- a/sys/kern/subr_autoconf.c
+++ b/sys/kern/subr_autoconf.c
@@ -39,15 +39,14 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)subr_autoconf.c 8.3 (Berkeley) 5/17/94
+ * @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93
*
- * from: $Header: subr_autoconf.c,v 1.12 93/02/01 19:31:48 torek Exp $ (LBL)
+ * $Id$
*/
#include <sys/param.h>
#include <sys/device.h>
#include <sys/malloc.h>
-#include <libkern/libkern.h>
/*
* Autoconfiguration subroutines.
@@ -284,16 +283,15 @@ config_attach(parent, cf, aux, print)
void **nsp;
if (old == 0) {
- new = max(MINALLOCSIZE / sizeof(void *),
- dev->dv_unit + 1);
- newbytes = new * sizeof(void *);
- nsp = malloc(newbytes, M_DEVBUF, M_WAITOK); /*XXX*/
- bzero(nsp, newbytes);
+ nsp = malloc(MINALLOCSIZE, M_DEVBUF, M_WAITOK); /*XXX*/
+ bzero(nsp, MINALLOCSIZE);
+ cd->cd_ndevs = MINALLOCSIZE / sizeof(void *);
} else {
new = cd->cd_ndevs;
do {
new *= 2;
} while (new <= dev->dv_unit);
+ cd->cd_ndevs = new;
oldbytes = old * sizeof(void *);
newbytes = new * sizeof(void *);
nsp = malloc(newbytes, M_DEVBUF, M_WAITOK); /*XXX*/
@@ -301,7 +299,6 @@ config_attach(parent, cf, aux, print)
bzero(&nsp[old], newbytes - oldbytes);
free(cd->cd_devs, M_DEVBUF);
}
- cd->cd_ndevs = new;
cd->cd_devs = nsp;
}
if (cd->cd_devs[dev->dv_unit])
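
The config_attach() hunk above replaces the unit-sized initial cd_devs allocation with a fixed MINALLOCSIZE block and moves the cd_ndevs update into each branch; later growth still doubles the table until the new unit number fits. A standalone sketch of that doubling policy, with illustrative names (grow_devtbl() is not a kernel function); unlike the hunk, this sketch also doubles the very first allocation until the unit fits:

	#include <stdlib.h>
	#include <string.h>

	/*
	 * Grow *tblp (an array of *ndevsp pointers) until index `unit'
	 * is valid.  Returns 0 on success, -1 on allocation failure.
	 */
	static int
	grow_devtbl(void ***tblp, int *ndevsp, int unit)
	{
		int old = *ndevsp, new;
		void **nsp;

		new = (old == 0) ? 4 : old;	/* 4 stands in for MINALLOCSIZE / sizeof(void *) */
		while (new <= unit)
			new *= 2;
		if ((nsp = calloc(new, sizeof(void *))) == NULL)
			return (-1);		/* the kernel code waits with M_WAITOK instead */
		if (old != 0) {
			memcpy(nsp, *tblp, old * sizeof(void *));
			free(*tblp);
		}
		*tblp = nsp;
		*ndevsp = new;
		return (0);
	}
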
diff --git a/sys/kern/subr_clist.c b/sys/kern/subr_clist.c
new file mode 100644
index 0000000..d907b47
--- /dev/null
+++ b/sys/kern/subr_clist.c
@@ -0,0 +1,694 @@
+/*
+ * Copyright (c) 1994, David Greenman
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * $Id: tty_subr.c,v 1.26 1997/03/05 16:45:01 bde Exp $
+ */
+
+/*
+ * clist support routines
+ */
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
+#include <sys/tty.h>
+#include <sys/clist.h>
+#include <sys/malloc.h>
+
+static void clist_init __P((void *));
+SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL)
+
+static struct cblock *cfreelist = 0;
+int cfreecount = 0;
+static int cslushcount;
+static int ctotcount;
+
+#ifndef INITIAL_CBLOCKS
+#define INITIAL_CBLOCKS 50
+#endif
+
+static struct cblock *cblock_alloc __P((void));
+static void cblock_alloc_cblocks __P((int number));
+static void cblock_free __P((struct cblock *cblockp));
+static void cblock_free_cblocks __P((int number));
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(cbstat, cbstat)
+{
+ printf(
+ "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n",
+ ctotcount * CBSIZE, ctotcount * CBSIZE - cfreecount, cfreecount,
+ cfreecount - cslushcount * CBSIZE, cslushcount * CBSIZE);
+}
+#endif /* DDB */
+
+/*
+ * Called from init_main.c
+ */
+/* ARGSUSED*/
+static void
+clist_init(dummy)
+ void *dummy;
+{
+ /*
+ * Allocate an initial base set of cblocks as a 'slush'.
+ * We allocate non-slush cblocks with each initial ttyopen() and
+ * deallocate them with each ttyclose().
+ * We should adjust the slush allocation. This can't be done in
+ * the i/o routines because they are sometimes called from
+ * interrupt handlers when it may be unsafe to call malloc().
+ */
+ cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS);
+}
+
+/*
+ * Remove a cblock from the cfreelist queue and return a pointer
+ * to it.
+ */
+static inline struct cblock *
+cblock_alloc()
+{
+ struct cblock *cblockp;
+
+ cblockp = cfreelist;
+ if (cblockp == NULL)
+ panic("clist reservation botch");
+ cfreelist = cblockp->c_next;
+ cblockp->c_next = NULL;
+ cfreecount -= CBSIZE;
+ return (cblockp);
+}
+
+/*
+ * Add a cblock to the cfreelist queue.
+ */
+static inline void
+cblock_free(cblockp)
+ struct cblock *cblockp;
+{
+ if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1))
+ bzero(cblockp->c_quote, sizeof cblockp->c_quote);
+ cblockp->c_next = cfreelist;
+ cfreelist = cblockp;
+ cfreecount += CBSIZE;
+}
+
+/*
+ * Allocate some cblocks for the cfreelist queue.
+ */
+static void
+cblock_alloc_cblocks(number)
+ int number;
+{
+ int i;
+ struct cblock *cbp;
+
+ for (i = 0; i < number; ++i) {
+ cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT);
+ if (cbp == NULL) {
+ printf(
+"clist_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n");
+ cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK);
+ }
+ /*
+ * Freed cblocks have zero quotes and garbage elsewhere.
+ * Set the may-have-quote bit to force zeroing the quotes.
+ */
+ setbit(cbp->c_quote, CBQSIZE * NBBY - 1);
+ cblock_free(cbp);
+ }
+ ctotcount += number;
+}
+
+/*
+ * Set the cblock allocation policy for a clist.
+ * Must be called in process context at spltty().
+ */
+void
+clist_alloc_cblocks(clistp, ccmax, ccreserved)
+ struct clist *clistp;
+ int ccmax;
+ int ccreserved;
+{
+ int dcbr;
+
+ /*
+ * Allow for wasted space at the head.
+ */
+ if (ccmax != 0)
+ ccmax += CBSIZE - 1;
+ if (ccreserved != 0)
+ ccreserved += CBSIZE - 1;
+
+ clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE;
+ dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved;
+ if (dcbr >= 0)
+ cblock_alloc_cblocks(dcbr);
+ else {
+ if (clistp->c_cbreserved + dcbr < clistp->c_cbcount)
+ dcbr = clistp->c_cbcount - clistp->c_cbreserved;
+ cblock_free_cblocks(-dcbr);
+ }
+ clistp->c_cbreserved += dcbr;
+}
+
+/*
+ * Free some cblocks from the cfreelist queue back to the
+ * system malloc pool.
+ */
+static void
+cblock_free_cblocks(number)
+ int number;
+{
+ int i;
+
+ for (i = 0; i < number; ++i)
+ free(cblock_alloc(), M_TTYS);
+ ctotcount -= number;
+}
+
+/*
+ * Free the cblocks reserved for a clist.
+ * Must be called at spltty().
+ */
+void
+clist_free_cblocks(clistp)
+ struct clist *clistp;
+{
+ if (clistp->c_cbcount != 0)
+ panic("freeing active clist cblocks");
+ cblock_free_cblocks(clistp->c_cbreserved);
+ clistp->c_cbmax = 0;
+ clistp->c_cbreserved = 0;
+}
+
+/*
+ * Get a character from the head of a clist.
+ */
+int
+getc(clistp)
+ struct clist *clistp;
+{
+ int chr = -1;
+ int s;
+ struct cblock *cblockp;
+
+ s = spltty();
+
+ /* If there are characters in the list, get one */
+ if (clistp->c_cc) {
+ cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND);
+ chr = (u_char)*clistp->c_cf;
+
+ /*
+ * If this char is quoted, set the flag.
+ */
+ if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info))
+ chr |= TTY_QUOTE;
+
+ /*
+ * Advance to next character.
+ */
+ clistp->c_cf++;
+ clistp->c_cc--;
+ /*
+ * If we have advanced the 'first' character pointer
+ * past the end of this cblock, advance to the next one.
+ * If there are no more characters, set the first and
+ * last pointers to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+ return (chr);
+}
+
+/*
+ * Copy 'amount' of chars, beginning at head of clist 'clistp' to
+ * destination linear buffer 'dest'. Return number of characters
+ * actually copied.
+ */
+int
+q_to_b(clistp, dest, amount)
+ struct clist *clistp;
+ char *dest;
+ int amount;
+{
+ struct cblock *cblockp;
+ struct cblock *cblockn;
+ char *dest_orig = dest;
+ int numc;
+ int s;
+
+ s = spltty();
+
+ while (clistp && amount && (clistp->c_cc > 0)) {
+ cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND);
+ cblockn = cblockp + 1; /* pointer arithmetic! */
+ numc = min(amount, (char *)cblockn - clistp->c_cf);
+ numc = min(numc, clistp->c_cc);
+ bcopy(clistp->c_cf, dest, numc);
+ amount -= numc;
+ clistp->c_cf += numc;
+ clistp->c_cc -= numc;
+ dest += numc;
+ /*
+ * If this cblock has been emptied, advance to the next
+ * one. If there are no more characters, set the first
+ * and last pointer to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+ return (dest - dest_orig);
+}
+
+/*
+ * Flush 'amount' of chars, beginning at head of clist 'clistp'.
+ */
+void
+ndflush(clistp, amount)
+ struct clist *clistp;
+ int amount;
+{
+ struct cblock *cblockp;
+ struct cblock *cblockn;
+ int numc;
+ int s;
+
+ s = spltty();
+
+ while (amount && (clistp->c_cc > 0)) {
+ cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND);
+ cblockn = cblockp + 1; /* pointer arithmetic! */
+ numc = min(amount, (char *)cblockn - clistp->c_cf);
+ numc = min(numc, clistp->c_cc);
+ amount -= numc;
+ clistp->c_cf += numc;
+ clistp->c_cc -= numc;
+ /*
+ * If this cblock has been emptied, advance to the next
+ * one. If there are no more characters, set the first
+ * and last pointer to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+}
+
+/*
+ * Add a character to the end of a clist. Return -1 if no
+ * more cblocks are available, or 0 for success.
+ */
+int
+putc(chr, clistp)
+ int chr;
+ struct clist *clistp;
+{
+ struct cblock *cblockp;
+ int s;
+
+ s = spltty();
+
+ if (clistp->c_cl == NULL) {
+ if (clistp->c_cbreserved < 1) {
+ splx(s);
+ printf("putc to a clist with no reserved cblocks\n");
+ return (-1); /* nothing done */
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount = 1;
+ clistp->c_cf = clistp->c_cl = cblockp->c_info;
+ clistp->c_cc = 0;
+ } else {
+ cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND);
+ if (((long)clistp->c_cl & CROUND) == 0) {
+ struct cblock *prev = (cblockp - 1);
+
+ if (clistp->c_cbcount >= clistp->c_cbreserved) {
+ if (clistp->c_cbcount >= clistp->c_cbmax
+ || cslushcount <= 0) {
+ splx(s);
+ return (-1);
+ }
+ --cslushcount;
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount++;
+ prev->c_next = cblockp;
+ clistp->c_cl = cblockp->c_info;
+ }
+ }
+
+ /*
+ * If this character is quoted, set the quote bit, if not, clear it.
+ */
+ if (chr & TTY_QUOTE) {
+ setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info);
+ /*
+ * Use one of the spare quote bits to record that something
+ * may be quoted.
+ */
+ setbit(cblockp->c_quote, CBQSIZE * NBBY - 1);
+ } else
+ clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info);
+
+ *clistp->c_cl++ = chr;
+ clistp->c_cc++;
+
+ splx(s);
+ return (0);
+}
+
+/*
+ * Copy data from linear buffer to clist chain. Return the
+ * number of characters not copied.
+ */
+int
+b_to_q(src, amount, clistp)
+ char *src;
+ int amount;
+ struct clist *clistp;
+{
+ struct cblock *cblockp;
+ char *firstbyte, *lastbyte;
+ u_char startmask, endmask;
+ int startbit, endbit, num_between, numc;
+ int s;
+
+ /*
+ * Avoid allocating an initial cblock and then not using it.
+ * c_cc == 0 must imply c_cbcount == 0.
+ */
+ if (amount <= 0)
+ return (amount);
+
+ s = spltty();
+
+ /*
+ * If there are no cblocks assigned to this clist yet,
+ * then get one.
+ */
+ if (clistp->c_cl == NULL) {
+ if (clistp->c_cbreserved < 1) {
+ splx(s);
+ printf("b_to_q to a clist with no reserved cblocks.\n");
+ return (amount); /* nothing done */
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount = 1;
+ clistp->c_cf = clistp->c_cl = cblockp->c_info;
+ clistp->c_cc = 0;
+ } else {
+ cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND);
+ }
+
+ while (amount) {
+ /*
+ * Get another cblock if needed.
+ */
+ if (((long)clistp->c_cl & CROUND) == 0) {
+ struct cblock *prev = cblockp - 1;
+
+ if (clistp->c_cbcount >= clistp->c_cbreserved) {
+ if (clistp->c_cbcount >= clistp->c_cbmax
+ || cslushcount <= 0) {
+ splx(s);
+ return (amount);
+ }
+ --cslushcount;
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount++;
+ prev->c_next = cblockp;
+ clistp->c_cl = cblockp->c_info;
+ }
+
+ /*
+ * Copy a chunk of the linear buffer up to the end
+ * of this cblock.
+ */
+ numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl);
+ bcopy(src, clistp->c_cl, numc);
+
+ /*
+ * Clear quote bits if they aren't known to be clear.
+ * The following could probably be made into a separate
+ * "bitzero()" routine, but why bother?
+ */
+ if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) {
+ startbit = clistp->c_cl - (char *)cblockp->c_info;
+ endbit = startbit + numc - 1;
+
+ firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY);
+ lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY);
+
+ /*
+ * Calculate mask of bits to preserve in first and
+ * last bytes.
+ */
+ startmask = NBBY - (startbit % NBBY);
+ startmask = 0xff >> startmask;
+ endmask = (endbit % NBBY);
+ endmask = 0xff << (endmask + 1);
+
+ if (firstbyte != lastbyte) {
+ *firstbyte &= startmask;
+ *lastbyte &= endmask;
+
+ num_between = lastbyte - firstbyte - 1;
+ if (num_between)
+ bzero(firstbyte + 1, num_between);
+ } else {
+ *firstbyte &= (startmask | endmask);
+ }
+ }
+
+ /*
+ * ...and update pointer for the next chunk.
+ */
+ src += numc;
+ clistp->c_cl += numc;
+ clistp->c_cc += numc;
+ amount -= numc;
+ /*
+ * If we go through the loop again, it's always
+ * for data in the next cblock, so advancing the pointer
+ * by one cblock (to just past the end of this one)
+ * prepares for the assignment of 'prev' above.
+ */
+ cblockp += 1;
+
+ }
+
+ splx(s);
+ return (amount);
+}
+
+/*
+ * Get the next character in the clist. Store it at dst. Don't
+ * advance any clist pointers, but return a pointer to the next
+ * character position.
+ */
+char *
+nextc(clistp, cp, dst)
+ struct clist *clistp;
+ char *cp;
+ int *dst;
+{
+ struct cblock *cblockp;
+
+ ++cp;
+ /*
+ * See if the next character is beyond the end of
+ * the clist.
+ */
+ if (clistp->c_cc && (cp != clistp->c_cl)) {
+ /*
+ * If the next character is beyond the end of this
+ * cblock, advance to the next cblock.
+ */
+ if (((long)cp & CROUND) == 0)
+ cp = ((struct cblock *)cp - 1)->c_next->c_info;
+ cblockp = (struct cblock *)((long)cp & ~CROUND);
+
+ /*
+ * Get the character. Set the quote flag if this character
+ * is quoted.
+ */
+ *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0);
+
+ return (cp);
+ }
+
+ return (NULL);
+}
+
+/*
+ * "Unput" a character from a clist.
+ */
+int
+unputc(clistp)
+ struct clist *clistp;
+{
+ struct cblock *cblockp = 0, *cbp = 0;
+ int s;
+ int chr = -1;
+
+ s = spltty();
+
+ if (clistp->c_cc) {
+ --clistp->c_cc;
+ --clistp->c_cl;
+
+ chr = (u_char)*clistp->c_cl;
+
+ cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND);
+
+ /*
+ * Set quote flag if this character was quoted.
+ */
+ if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info))
+ chr |= TTY_QUOTE;
+
+ /*
+ * If all of the characters have been unput in this
+ * cblock, then find the previous one and free this
+ * one.
+ */
+ if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) {
+ cbp = (struct cblock *)((long)clistp->c_cf & ~CROUND);
+
+ while (cbp->c_next != cblockp)
+ cbp = cbp->c_next;
+
+ /*
+ * When the previous cblock is at the end, the 'last'
+ * pointer always points (invalidly) one past.
+ */
+ clistp->c_cl = (char *)(cbp+1);
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ cbp->c_next = NULL;
+ }
+ }
+
+ /*
+ * If there are no more characters on the list, then
+ * free the last cblock.
+ */
+ if ((clistp->c_cc == 0) && clistp->c_cl) {
+ cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND);
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+
+ splx(s);
+ return (chr);
+}
+
+/*
+ * Move characters in source clist to destination clist,
+ * preserving quote bits.
+ */
+void
+catq(src_clistp, dest_clistp)
+ struct clist *src_clistp, *dest_clistp;
+{
+ int chr, s;
+
+ s = spltty();
+ /*
+ * If the destination clist is empty (has no cblocks attached),
+ * and there are no possible complications with the resource counters,
+ * then we simply assign the current clist to the destination.
+ */
+ if (!dest_clistp->c_cf
+ && src_clistp->c_cbcount <= src_clistp->c_cbmax
+ && src_clistp->c_cbcount <= dest_clistp->c_cbmax) {
+ dest_clistp->c_cf = src_clistp->c_cf;
+ dest_clistp->c_cl = src_clistp->c_cl;
+ src_clistp->c_cf = src_clistp->c_cl = NULL;
+
+ dest_clistp->c_cc = src_clistp->c_cc;
+ src_clistp->c_cc = 0;
+ dest_clistp->c_cbcount = src_clistp->c_cbcount;
+ src_clistp->c_cbcount = 0;
+
+ splx(s);
+ return;
+ }
+
+ splx(s);
+
+ /*
+ * XXX This should probably be optimized to more than one
+ * character at a time.
+ */
+ while ((chr = getc(src_clistp)) != -1)
+ putc(chr, dest_clistp);
+}
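
The trickiest part of b_to_q() above is the quote-bit range clear, which its comment suggests could be a separate bitzero() routine. Here is that mask arithmetic lifted into a standalone, runnable sketch (bitzero() is the hypothetical name from the comment; NBBY is 8 bits per byte):

	#include <stdio.h>
	#include <string.h>

	#define NBBY 8

	/*
	 * Clear bits startbit..endbit (inclusive) in the bitmap `q',
	 * preserving all other bits -- the same scheme b_to_q() uses
	 * on a cblock's quote bits.
	 */
	static void
	bitzero(unsigned char *q, int startbit, int endbit)
	{
		unsigned char *firstbyte = q + startbit / NBBY;
		unsigned char *lastbyte = q + endbit / NBBY;
		unsigned char startmask, endmask;
		int num_between;

		startmask = 0xff >> (NBBY - startbit % NBBY);	/* keep bits below the range */
		endmask = 0xff << (endbit % NBBY + 1);		/* keep bits above it */

		if (firstbyte != lastbyte) {
			*firstbyte &= startmask;
			*lastbyte &= endmask;
			num_between = lastbyte - firstbyte - 1;
			if (num_between)
				memset(firstbyte + 1, 0, num_between);
		} else
			*firstbyte &= (startmask | endmask);
	}

	int
	main(void)
	{
		unsigned char q[4];

		memset(q, 0xff, sizeof q);
		bitzero(q, 3, 12);	/* spans bytes 0 and 1 */
		printf("%02x %02x %02x %02x\n", q[0], q[1], q[2], q[3]);
		/* expect: 07 e0 ff ff */
		return (0);
	}
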
diff --git a/sys/kern/subr_disklabel.c b/sys/kern/subr_disklabel.c
new file mode 100644
index 0000000..94315de
--- /dev/null
+++ b/sys/kern/subr_disklabel.c
@@ -0,0 +1,406 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94
+ * $Id$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/disklabel.h>
+#include <sys/diskslice.h>
+#include <sys/syslog.h>
+
+/*
+ * Seek sort for disks.
+ *
+ * The argument ap structure holds a b_act activity chain pointer on which we
+ * keep two queues, sorted in ascending block order. The first queue holds
+ * those requests which are positioned after the current block (in the first
+ * request); the second holds requests which came in after their block number
+ * was passed. Thus we implement a one way scan, retracting after reaching the
+ * end of the drive to the first request on the second queue, at which time it
+ * becomes the first queue.
+ *
+ * A one-way scan is natural because of the way UNIX read-ahead blocks are
+ * allocated.
+ */
+
+void
+tqdisksort(ap, bp)
+ struct buf_queue_head *ap;
+ register struct buf *bp;
+{
+ register struct buf *bq;
+ struct buf *bn;
+
+ /* If the queue is empty, then it's easy. */
+ if ((bq = ap->tqh_first) == NULL) {
+ TAILQ_INSERT_HEAD(ap, bp, b_act);
+ return;
+ }
+
+#if 1
+ /* Put new writes after all reads */
+ if ((bp->b_flags & B_READ) == 0) {
+ while (bn = bq->b_act.tqe_next) {
+ if ((bq->b_flags & B_READ) == 0)
+ break;
+ bq = bn;
+ }
+ } else {
+ while (bn = bq->b_act.tqe_next) {
+ if ((bq->b_flags & B_READ) == 0) {
+ if (ap->tqh_first != bq) {
+ bq = *bq->b_act.tqe_prev;
+ }
+ break;
+ }
+ bq = bn;
+ }
+ goto insert;
+ }
+#endif
+
+ /*
+ * If we lie after the first (currently active) request, then we
+ * must locate the second request list and add ourselves to it.
+ */
+ if (bp->b_pblkno < bq->b_pblkno) {
+ while (bn = bq->b_act.tqe_next) {
+ /*
+ * Check for an ``inversion'' in the normally ascending
+ * block numbers, indicating the start of the second
+ * request list.
+ */
+ if (bn->b_pblkno < bq->b_pblkno) {
+ /*
+ * Search the second request list for the first
+ * request at a larger cylinder number. We go
+ * before that; if there is no such request, we
+ * go at end.
+ */
+ do {
+ if (bp->b_pblkno < bn->b_pblkno)
+ goto insert;
+ bq = bn;
+ } while (bn = bq->b_act.tqe_next);
+ goto insert; /* after last */
+ }
+ bq = bn;
+ }
+ /*
+ * No inversions... we will go after the last, and
+ * be the first request in the second request list.
+ */
+ goto insert;
+ }
+ /*
+ * Request is at/after the current request...
+ * sort in the first request list.
+ */
+ while (bn = bq->b_act.tqe_next) {
+ /*
+ * We want to go after the current request if there is an
+ * inversion after it (i.e. it is the end of the first
+ * request list), or if the next request is at a larger block
+ * number than our request.
+ */
+ if (bn->b_pblkno < bq->b_pblkno ||
+ bp->b_pblkno < bn->b_pblkno)
+ goto insert;
+ bq = bn;
+ }
+ /*
+ * Neither a second list nor a larger request... we go at the end of
+ * the first list, which is the same as the end of the whole shebang.
+ */
+insert:
+ TAILQ_INSERT_AFTER(ap, bq, bp, b_act);
+}
+
+
+/*
+ * Attempt to read a disk label from a device using the indicated strategy
+ * routine. The label must be partly set up before this: secpercyl, secsize
+ * and anything required in the strategy routine (e.g., dummy bounds for the
+ * partition containing the label) must be filled in before calling us.
+ * Returns NULL on success and an error string on failure.
+ */
+char *
+readdisklabel(dev, strat, lp)
+ dev_t dev;
+ d_strategy_t *strat;
+ register struct disklabel *lp;
+{
+ register struct buf *bp;
+ struct disklabel *dlp;
+ char *msg = NULL;
+
+ bp = geteblk((int)lp->d_secsize);
+ bp->b_dev = dev;
+ bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE);
+ bp->b_bcount = lp->d_secsize;
+ bp->b_flags &= ~B_INVAL;
+ bp->b_flags |= B_BUSY | B_READ;
+ (*strat)(bp);
+ if (biowait(bp))
+ msg = "I/O error";
+ else for (dlp = (struct disklabel *)bp->b_data;
+ dlp <= (struct disklabel *)((char *)bp->b_data +
+ DEV_BSIZE - sizeof(*dlp));
+ dlp = (struct disklabel *)((char *)dlp + sizeof(long))) {
+ if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC) {
+ if (msg == NULL)
+ msg = "no disk label";
+ } else if (dlp->d_npartitions > MAXPARTITIONS ||
+ dkcksum(dlp) != 0)
+ msg = "disk label corrupted";
+ else {
+ *lp = *dlp;
+ msg = NULL;
+ break;
+ }
+ }
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ return (msg);
+}
+
+/*
+ * Check new disk label for sensibility before setting it.
+ */
+int
+setdisklabel(olp, nlp, openmask)
+ register struct disklabel *olp, *nlp;
+ u_long openmask;
+{
+	register int i;
+ register struct partition *opp, *npp;
+
+ /*
+ * Check it is actually a disklabel we are looking at.
+ */
+ if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC ||
+ dkcksum(nlp) != 0)
+ return (EINVAL);
+ /*
+ * For each partition that we think is open,
+ */
+ while ((i = ffs((long)openmask)) != 0) {
+ i--;
+ /*
+ * Check it is not changing....
+ */
+ openmask &= ~(1 << i);
+ if (nlp->d_npartitions <= i)
+ return (EBUSY);
+ opp = &olp->d_partitions[i];
+ npp = &nlp->d_partitions[i];
+ if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size)
+ return (EBUSY);
+ /*
+ * Copy internally-set partition information
+ * if new label doesn't include it. XXX
+ * (If we are using it then we had better stay the same type)
+ * This is possibly dubious, as someone else noted (XXX)
+ */
+ if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) {
+ npp->p_fstype = opp->p_fstype;
+ npp->p_fsize = opp->p_fsize;
+ npp->p_frag = opp->p_frag;
+ npp->p_cpg = opp->p_cpg;
+ }
+ }
+ nlp->d_checksum = 0;
+ nlp->d_checksum = dkcksum(nlp);
+ *olp = *nlp;
+ return (0);
+}
+
+/*
+ * Write disk label back to device after modification.
+ */
+int
+writedisklabel(dev, strat, lp)
+ dev_t dev;
+ d_strategy_t *strat;
+ register struct disklabel *lp;
+{
+ struct buf *bp;
+ struct disklabel *dlp;
+ int labelpart;
+ int error = 0;
+
+ labelpart = dkpart(dev);
+ if (lp->d_partitions[labelpart].p_offset != 0) {
+ if (lp->d_partitions[0].p_offset != 0)
+ return (EXDEV); /* not quite right */
+ labelpart = 0;
+ }
+ bp = geteblk((int)lp->d_secsize);
+ bp->b_dev = dkmodpart(dev, labelpart);
+ bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE);
+ bp->b_bcount = lp->d_secsize;
+#if 1
+ /*
+ * We read the label first to see if it's there,
+ * in which case we will put ours at the same offset into the block..
+ * (I think this is stupid [Julian])
+ * Note that you can't write a label out over a corrupted label!
+ * (also stupid.. how do you write the first one? by raw writes?)
+ */
+ bp->b_flags &= ~B_INVAL;
+ bp->b_flags |= B_BUSY | B_READ;
+ (*strat)(bp);
+ error = biowait(bp);
+ if (error)
+ goto done;
+ for (dlp = (struct disklabel *)bp->b_data;
+ dlp <= (struct disklabel *)
+ ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp));
+ dlp = (struct disklabel *)((char *)dlp + sizeof(long))) {
+ if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC &&
+ dkcksum(dlp) == 0) {
+ *dlp = *lp;
+ bp->b_flags &= ~(B_DONE | B_READ);
+ bp->b_flags |= B_BUSY | B_WRITE;
+ (*strat)(bp);
+ error = biowait(bp);
+ goto done;
+ }
+ }
+ error = ESRCH;
+done:
+#else
+ bzero(bp->b_data, lp->d_secsize);
+ dlp = (struct disklabel *)bp->b_data;
+ *dlp = *lp;
+ bp->b_flags &= ~B_INVAL;
+ bp->b_flags |= B_BUSY | B_WRITE;
+ (*strat)(bp);
+ error = biowait(bp);
+#endif
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ return (error);
+}
+
+/*
+ * Compute checksum for disk label.
+ */
+u_int
+dkcksum(lp)
+ register struct disklabel *lp;
+{
+ register u_short *start, *end;
+ register u_short sum = 0;
+
+ start = (u_short *)lp;
+ end = (u_short *)&lp->d_partitions[lp->d_npartitions];
+ while (start < end)
+ sum ^= *start++;
+ return (sum);
+}
+
+/*
+ * Disk error is the preface to plaintive error messages
+ * about failing disk transfers. It prints messages of the form
+
+hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d)
+
+ * if the offset of the error in the transfer and a disk label
+ * are both available. blkdone should be -1 if the position of the error
+ * is unknown; the disklabel pointer may be null from drivers that have not
+ * been converted to use them. The message is printed with printf
+ * if pri is LOG_PRINTF, otherwise it uses log at the specified priority.
+ * The message should be completed (with at least a newline) with printf
+ * or addlog, respectively. There is no trailing space.
+ */
+void
+diskerr(bp, dname, what, pri, blkdone, lp)
+ register struct buf *bp;
+ char *dname, *what;
+ int pri, blkdone;
+ register struct disklabel *lp;
+{
+ int unit = dkunit(bp->b_dev);
+ int slice = dkslice(bp->b_dev);
+ int part = dkpart(bp->b_dev);
+ register int (*pr) __P((const char *, ...));
+ char partname[2];
+ char *sname;
+ int sn;
+
+ if (pri != LOG_PRINTF) {
+ log(pri, "");
+ pr = addlog;
+ } else
+ pr = printf;
+ sname = dsname(dname, unit, slice, part, partname);
+ (*pr)("%s%s: %s %sing fsbn ", sname, partname, what,
+ bp->b_flags & B_READ ? "read" : "writ");
+ sn = bp->b_blkno;
+ if (bp->b_bcount <= DEV_BSIZE)
+ (*pr)("%d", sn);
+ else {
+ if (blkdone >= 0) {
+ sn += blkdone;
+ (*pr)("%d of ", sn);
+ }
+ (*pr)("%d-%d", bp->b_blkno,
+ bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE);
+ }
+ if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) {
+#ifdef tahoe
+ sn *= DEV_BSIZE / lp->d_secsize; /* XXX */
+#endif
+ sn += lp->d_partitions[part].p_offset;
+ /*
+ * XXX should add slice offset and not print the slice,
+ * but we don't know the slice pointer.
+ * XXX should print bp->b_pblkno so that this will work
+ * independent of slices, labels and bad sector remapping,
+ * but some drivers don't set bp->b_pblkno.
+ */
+ (*pr)(" (%s bn %d; cn %d", sname, sn, sn / lp->d_secpercyl);
+ sn %= lp->d_secpercyl;
+ (*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors);
+ }
+}
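
dkcksum() above folds the label into one 16-bit word with XOR, which is why setdisklabel() zeroes d_checksum before recomputing it: a label stored that way XORs to zero overall, so verification is simply "checksum equals zero", as readdisklabel() checks. A standalone sketch of that store-then-verify round trip (the 8-word "label" is illustrative, not a real struct disklabel):

	#include <stdio.h>

	/*
	 * XOR-fold a buffer of 16-bit words, as dkcksum() does over a
	 * disklabel up to the end of its partition table.
	 */
	static unsigned short
	xor16(const unsigned short *p, int nwords)
	{
		unsigned short sum = 0;

		while (nwords-- > 0)
			sum ^= *p++;
		return (sum);
	}

	int
	main(void)
	{
		unsigned short label[8] = { 0x8257, 0x1234, 0, 0, 0, 0, 0, 0 };

		label[2] = 0;			/* checksum field, cleared first */
		label[2] = xor16(label, 8);	/* then stored */

		printf("verify: 0x%x\n", xor16(label, 8));	/* prints 0x0 */
		return (0);
	}
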
diff --git a/sys/kern/subr_diskmbr.c b/sys/kern/subr_diskmbr.c
new file mode 100644
index 0000000..8983e950c
--- /dev/null
+++ b/sys/kern/subr_diskmbr.c
@@ -0,0 +1,456 @@
+/*-
+ * Copyright (c) 1994 Bruce D. Evans.
+ * All rights reserved.
+ *
+ * Copyright (c) 1982, 1986, 1988 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91
+ * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $
+ * $Id$
+ */
+
+#include <stddef.h>
+#include <sys/param.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/disklabel.h>
+#define DOSPTYP_EXTENDED 5
+#define DOSPTYP_ONTRACK 84
+#include <sys/diskslice.h>
+#include <sys/malloc.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+
+#define TRACE(str) do { if (dsi_debug) printf str; } while (0)
+
+static volatile u_char dsi_debug;
+
+static struct dos_partition historical_bogus_partition_table[NDOSPART] = {
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+ { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, },
+ { 0x80, 0, 1, 0, DOSPTYP_386BSD, 255, 255, 255, 0, 50000, },
+};
+
+static int check_part __P((char *sname, struct dos_partition *dp,
+ u_long offset, int nsectors, int ntracks,
+ u_long mbr_offset));
+static void extended __P((char *dname, dev_t dev, d_strategy_t *strat,
+ struct disklabel *lp, struct diskslices *ssp,
+ u_long ext_offset, u_long ext_size,
+ u_long base_ext_offset, int nsectors, int ntracks,
+ u_long mbr_offset));
+
+static int
+check_part(sname, dp, offset, nsectors, ntracks, mbr_offset)
+ char *sname;
+ struct dos_partition *dp;
+ u_long offset;
+ int nsectors;
+ int ntracks;
+ u_long mbr_offset;
+{
+ int chs_ecyl;
+ int chs_esect;
+ int chs_scyl;
+ int chs_ssect;
+ int error;
+ u_long esector;
+ u_long esector1;
+ u_long secpercyl;
+ u_long ssector;
+ u_long ssector1;
+
+ secpercyl = (u_long)nsectors * ntracks;
+ chs_scyl = DPCYL(dp->dp_scyl, dp->dp_ssect);
+ chs_ssect = DPSECT(dp->dp_ssect);
+ ssector = chs_ssect - 1 + dp->dp_shd * nsectors + chs_scyl * secpercyl
+ + mbr_offset;
+ ssector1 = offset + dp->dp_start;
+
+ /*
+ * If ssector1 is on a cylinder >= 1024, then ssector can't be right.
+ * Allow the C/H/S for it to be 1023/ntracks-1/nsectors, or correct
+ * apart from the cylinder being reduced modulo 1024.
+ */
+ if (ssector < ssector1
+ && ((chs_ssect == nsectors && dp->dp_shd == ntracks - 1
+ && chs_scyl == 1023)
+ || (ssector1 - ssector) % (1024 * secpercyl) == 0)
+ || (dp->dp_scyl == 255 && dp->dp_shd == 255
+ && dp->dp_ssect == 255)) {
+ TRACE(("%s: C/H/S start %d/%d/%d, start %lu: allow\n",
+ sname, chs_scyl, dp->dp_shd, chs_ssect, ssector1));
+ ssector = ssector1;
+ }
+
+ chs_ecyl = DPCYL(dp->dp_ecyl, dp->dp_esect);
+ chs_esect = DPSECT(dp->dp_esect);
+ esector = chs_esect - 1 + dp->dp_ehd * nsectors + chs_ecyl * secpercyl
+ + mbr_offset;
+ esector1 = ssector1 + dp->dp_size - 1;
+
+ /* Allow certain bogus C/H/S values for esector, as above. */
+ if (esector < esector1
+ && ((chs_esect == nsectors && dp->dp_ehd == ntracks - 1
+ && chs_ecyl == 1023)
+ || (esector1 - esector) % (1024 * secpercyl) == 0)
+ || (dp->dp_ecyl == 255 && dp->dp_ehd == 255
+ && dp->dp_esect == 255)) {
+ TRACE(("%s: C/H/S end %d/%d/%d, end %lu: allow\n",
+ sname, chs_ecyl, dp->dp_ehd, chs_esect, esector1));
+ esector = esector1;
+ }
+
+ error = (ssector == ssector1 && esector == esector1) ? 0 : EINVAL;
+ if (bootverbose)
+ printf("%s: type 0x%x, start %lu, end = %lu, size %lu %s\n",
+ sname, dp->dp_typ, ssector1, esector1, dp->dp_size,
+ error ? "" : ": OK");
+ if (ssector != ssector1 && bootverbose)
+ printf("%s: C/H/S start %d/%d/%d (%lu) != start %lu: invalid\n",
+ sname, chs_scyl, dp->dp_shd, chs_ssect,
+ ssector, ssector1);
+ if (esector != esector1 && bootverbose)
+ printf("%s: C/H/S end %d/%d/%d (%lu) != end %lu: invalid\n",
+ sname, chs_ecyl, dp->dp_ehd, chs_esect,
+ esector, esector1);
+ return (error);
+}
+
+int
+dsinit(dname, dev, strat, lp, sspp)
+ char *dname;
+ dev_t dev;
+ d_strategy_t *strat;
+ struct disklabel *lp;
+ struct diskslices **sspp;
+{
+ struct buf *bp;
+ u_char *cp;
+ int dospart;
+ struct dos_partition *dp;
+ struct dos_partition *dp0;
+ int error;
+ int max_ncyls;
+ int max_nsectors;
+ int max_ntracks;
+ u_long mbr_offset;
+ char partname[2];
+ u_long secpercyl;
+ char *sname;
+ struct diskslice *sp;
+ struct diskslices *ssp;
+
+ /*
+ * Allocate a dummy slices "struct" and initialize it to contain
+ * only an empty compatibility slice (pointing to itself) and a
+ * whole disk slice (covering the disk as described by the label).
+ * If there is an error, then the dummy struct becomes final.
+ */
+ ssp = malloc(offsetof(struct diskslices, dss_slices)
+ + BASE_SLICE * sizeof *sp, M_DEVBUF, M_WAITOK);
+ *sspp = ssp;
+ ssp->dss_first_bsd_slice = COMPATIBILITY_SLICE;
+ ssp->dss_nslices = BASE_SLICE;
+ sp = &ssp->dss_slices[0];
+ bzero(sp, BASE_SLICE * sizeof *sp);
+ sp[WHOLE_DISK_SLICE].ds_size = lp->d_secperunit;
+
+ mbr_offset = DOSBBSECTOR;
+reread_mbr:
+ /* Read master boot record. */
+ bp = geteblk((int)lp->d_secsize);
+ bp->b_dev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART);
+ bp->b_blkno = mbr_offset;
+ bp->b_bcount = lp->d_secsize;
+ bp->b_flags |= B_BUSY | B_READ;
+ (*strat)(bp);
+ if (biowait(bp) != 0) {
+ diskerr(bp, dname, "error reading primary partition table",
+ LOG_PRINTF, 0, lp);
+ printf("\n");
+ error = EIO;
+ goto done;
+ }
+
+ /* Weakly verify it. */
+ cp = bp->b_un.b_addr;
+ sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART,
+ partname);
+ if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) {
+ printf("%s: invalid primary partition table: no magic\n",
+ sname);
+ error = EINVAL;
+ goto done;
+ }
+ dp0 = (struct dos_partition *)(cp + DOSPARTOFF);
+
+ /* Check for "Ontrack Diskmanager". */
+ for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) {
+ if (dp->dp_typ == DOSPTYP_ONTRACK) {
+ if (bootverbose)
+ printf(
+ "%s: Found \"Ontrack Disk Manager\" on this disk.\n", sname);
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ mbr_offset = 63;
+ goto reread_mbr;
+ }
+ }
+
+ if (bcmp(dp0, historical_bogus_partition_table,
+ sizeof historical_bogus_partition_table) == 0) {
+ TRACE(("%s: invalid primary partition table: historical\n",
+ sname));
+ error = EINVAL;
+ goto done;
+ }
+
+ /* Guess the geometry. */
+ /*
+ * TODO:
+ * Perhaps skip entries with 0 size.
+ * Perhaps only look at entries of type DOSPTYP_386BSD.
+ */
+ max_ncyls = 0;
+ max_nsectors = 0;
+ max_ntracks = 0;
+ for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) {
+ int ncyls;
+ int nsectors;
+ int ntracks;
+
+ ncyls = DPCYL(dp->dp_ecyl, dp->dp_esect) + 1;
+ if (max_ncyls < ncyls)
+ max_ncyls = ncyls;
+ nsectors = DPSECT(dp->dp_esect);
+ if (max_nsectors < nsectors)
+ max_nsectors = nsectors;
+ ntracks = dp->dp_ehd + 1;
+ if (max_ntracks < ntracks)
+ max_ntracks = ntracks;
+ }
+
+ /*
+ * Check that we have guessed the geometry right by checking the
+ * partition entries.
+ */
+ /*
+ * TODO:
+ * As above.
+ * Check for overlaps.
+ * Check against d_secperunit if the latter is reliable.
+ */
+ error = 0;
+ for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) {
+ if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0
+ && dp->dp_start == 0 && dp->dp_size == 0)
+ continue;
+ sname = dsname(dname, dkunit(dev), BASE_SLICE + dospart,
+ RAW_PART, partname);
+
+ /*
+ * Temporarily ignore errors from this check. We could
+ * simplify things by accepting the table earlier if we
+ * always ignore errors here. Perhaps we should always
+ * accept the table if the magic is right but not let
+ * bad entries affect the geometry.
+ */
+ check_part(sname, dp, mbr_offset, max_nsectors, max_ntracks,
+ mbr_offset);
+ }
+ if (error != 0)
+ goto done;
+
+ /*
+ * Accept the DOS partition table.
+ * First adjust the label (we have been careful not to change it
+ * before we can guarantee success).
+ */
+ secpercyl = (u_long)max_nsectors * max_ntracks;
+ if (secpercyl != 0) {
+ u_long secperunit;
+
+ lp->d_nsectors = max_nsectors;
+ lp->d_ntracks = max_ntracks;
+ lp->d_secpercyl = secpercyl;
+ secperunit = secpercyl * max_ncyls;
+ if (lp->d_secperunit < secperunit)
+ lp->d_secperunit = secperunit;
+ lp->d_ncylinders = lp->d_secperunit / secpercyl;
+ }
+
+ /*
+ * Free the dummy slices "struct" and allocate a real new one.
+ * Initialize special slices as above.
+ */
+ free(ssp, M_DEVBUF);
+ ssp = malloc(offsetof(struct diskslices, dss_slices)
+#define MAX_SLICES_SUPPORTED MAX_SLICES /* was (BASE_SLICE + NDOSPART) */
+ + MAX_SLICES_SUPPORTED * sizeof *sp, M_DEVBUF, M_WAITOK);
+ *sspp = ssp;
+ ssp->dss_first_bsd_slice = COMPATIBILITY_SLICE;
+ sp = &ssp->dss_slices[0];
+ bzero(sp, MAX_SLICES_SUPPORTED * sizeof *sp);
+ sp[WHOLE_DISK_SLICE].ds_size = lp->d_secperunit;
+
+ /* Initialize normal slices. */
+ sp += BASE_SLICE;
+ for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++, sp++) {
+ sp->ds_offset = mbr_offset + dp->dp_start;
+ sp->ds_size = dp->dp_size;
+ sp->ds_type = dp->dp_typ;
+#if 0
+ lp->d_subtype |= (lp->d_subtype & 3) | dospart
+ | DSTYPE_INDOSPART;
+#endif
+ }
+ ssp->dss_nslices = BASE_SLICE + NDOSPART;
+
+ /* Handle extended partitions. */
+ sp -= NDOSPART;
+ for (dospart = 0; dospart < NDOSPART; dospart++, sp++)
+ if (sp->ds_type == DOSPTYP_EXTENDED)
+ extended(dname, bp->b_dev, strat, lp, ssp,
+ sp->ds_offset, sp->ds_size, sp->ds_offset,
+ max_nsectors, max_ntracks, mbr_offset);
+
+done:
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ if (error == EINVAL)
+ error = 0;
+ return (error);
+}
+
+void
+extended(dname, dev, strat, lp, ssp, ext_offset, ext_size, base_ext_offset,
+ nsectors, ntracks, mbr_offset)
+ char *dname;
+ dev_t dev;
+ struct disklabel *lp;
+ d_strategy_t *strat;
+ struct diskslices *ssp;
+ u_long ext_offset;
+ u_long ext_size;
+ u_long base_ext_offset;
+ int nsectors;
+ int ntracks;
+ u_long mbr_offset;
+{
+ struct buf *bp;
+ u_char *cp;
+ int dospart;
+ struct dos_partition *dp;
+ u_long ext_offsets[NDOSPART];
+ u_long ext_sizes[NDOSPART];
+ char partname[2];
+ int slice;
+ char *sname;
+ struct diskslice *sp;
+
+ /* Read extended boot record. */
+ bp = geteblk((int)lp->d_secsize);
+ bp->b_dev = dev;
+ bp->b_blkno = ext_offset;
+ bp->b_bcount = lp->d_secsize;
+ bp->b_flags |= B_BUSY | B_READ;
+ (*strat)(bp);
+ if (biowait(bp) != 0) {
+ diskerr(bp, dname, "error reading extended partition table",
+ LOG_PRINTF, 0, lp);
+ printf("\n");
+ goto done;
+ }
+
+ /* Weakly verify it. */
+ cp = bp->b_un.b_addr;
+ if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) {
+ sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART,
+ partname);
+ printf("%s: invalid extended partition table: no magic\n",
+ sname);
+ goto done;
+ }
+
+ for (dospart = 0,
+ dp = (struct dos_partition *)(bp->b_un.b_addr + DOSPARTOFF),
+ slice = ssp->dss_nslices, sp = &ssp->dss_slices[slice];
+ dospart < NDOSPART; dospart++, dp++) {
+ ext_sizes[dospart] = 0;
+ if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0
+ && dp->dp_start == 0 && dp->dp_size == 0)
+ continue;
+ if (dp->dp_typ == DOSPTYP_EXTENDED) {
+ char buf[32];
+
+ sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE,
+ RAW_PART, partname);
+ strcpy(buf, sname);
+ if (strlen(buf) < sizeof buf - 11)
+ strcat(buf, "<extended>");
+ check_part(buf, dp, base_ext_offset, nsectors,
+ ntracks, mbr_offset);
+ ext_offsets[dospart] = base_ext_offset + dp->dp_start;
+ ext_sizes[dospart] = dp->dp_size;
+ } else {
+ sname = dsname(dname, dkunit(dev), slice, RAW_PART,
+ partname);
+ check_part(sname, dp, ext_offset, nsectors, ntracks,
+ mbr_offset);
+ if (slice >= MAX_SLICES) {
+ printf("%s: too many slices\n", sname);
+ slice++;
+ continue;
+ }
+ sp->ds_offset = ext_offset + dp->dp_start;
+ sp->ds_size = dp->dp_size;
+ sp->ds_type = dp->dp_typ;
+ ssp->dss_nslices++;
+ slice++;
+ sp++;
+ }
+ }
+
+ /* If we found any more slices, recursively find all the subslices. */
+ for (dospart = 0; dospart < NDOSPART; dospart++)
+ if (ext_sizes[dospart] != 0)
+ extended(dname, dev, strat, lp, ssp,
+ ext_offsets[dospart], ext_sizes[dospart],
+ base_ext_offset, nsectors, ntracks,
+ mbr_offset);
+
+done:
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+}
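
check_part() above validates an MBR entry by converting its packed C/H/S start and end back to absolute sector numbers and comparing them with the 32-bit dp_start/dp_size fields (with escape hatches for the usual >= 1024-cylinder truncations and the all-0xff convention). The conversion itself, as a standalone sketch; the DPCYL/DPSECT macros are reproduced on the assumption that they unpack the two cylinder high bits from the sector byte, as the kernel headers do:

	#include <stdio.h>

	/* Bits 6-7 of the MBR sector byte are cylinder bits 8-9. */
	#define DPCYL(c, s)	((c) + (((s) & 0xc0) << 2))
	#define DPSECT(s)	((s) & 0x3f)

	/* C/H/S -> absolute sector, as check_part() recomputes it. */
	static unsigned long
	chs_to_lba(int cyl_lo, int head, int sect_byte,
	    int nsectors, int ntracks, unsigned long mbr_offset)
	{
		unsigned long secpercyl = (unsigned long)nsectors * ntracks;

		return (DPSECT(sect_byte) - 1 + head * nsectors +
		    DPCYL(cyl_lo, sect_byte) * secpercyl + mbr_offset);
	}

	int
	main(void)
	{
		/*
		 * C/H/S 1/0/1 on 63-sector, 16-track geometry: the
		 * matching dp_start would be sector 1008.
		 */
		printf("%lu\n", chs_to_lba(1, 0, 1, 63, 16, 0));
		return (0);
	}
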
diff --git a/sys/kern/subr_diskslice.c b/sys/kern/subr_diskslice.c
new file mode 100644
index 0000000..44e01b0
--- /dev/null
+++ b/sys/kern/subr_diskslice.c
@@ -0,0 +1,1066 @@
+/*-
+ * Copyright (c) 1994 Bruce D. Evans.
+ * All rights reserved.
+ *
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Copyright (c) 1982, 1986, 1988 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)wd.c 7.2 (Berkeley) 5/9/91
+ * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $
+ * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91
+ * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $
+ * $Id: subr_diskslice.c,v 1.35 1997/02/22 09:39:15 peter Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif
+#include <sys/disklabel.h>
+#include <sys/diskslice.h>
+#include <sys/dkbad.h>
+#include <sys/fcntl.h>
+#include <sys/malloc.h>
+#include <sys/stat.h>
+#include <sys/syslog.h>
+#include <sys/systm.h>
+#include <sys/vnode.h>
+
+#include <ufs/ufs/dinode.h>
+#include <ufs/ffs/fs.h>
+
+#define TRACE(str) do { if (ds_debug) printf str; } while (0)
+
+typedef u_char bool_t;
+
+static volatile bool_t ds_debug;
+
+static void dsiodone __P((struct buf *bp));
+static char *fixlabel __P((char *sname, struct diskslice *sp,
+ struct disklabel *lp, int writeflag));
+static void free_ds_label __P((struct diskslices *ssp, int slice));
+#ifdef DEVFS
+static void free_ds_labeldevs __P((struct diskslices *ssp, int slice));
+#endif
+static void partition_info __P((char *sname, int part, struct partition *pp));
+static void slice_info __P((char *sname, struct diskslice *sp));
+static void set_ds_bad __P((struct diskslices *ssp, int slice,
+ struct dkbad_intern *btp));
+static void set_ds_label __P((struct diskslices *ssp, int slice,
+ struct disklabel *lp));
+#ifdef DEVFS
+static void set_ds_labeldevs __P((char *dname, dev_t dev,
+ struct diskslices *ssp));
+static void set_ds_labeldevs_unaliased __P((char *dname, dev_t dev,
+ struct diskslices *ssp));
+#endif
+static void set_ds_wlabel __P((struct diskslices *ssp, int slice,
+ int wlabel));
+
+/*
+ * Determine the size of the transfer, and make sure it is
+ * within the boundaries of the partition. Adjust transfer
+ * if needed, and signal errors or early completion.
+ *
+ * XXX TODO:
+ * o Do bad sector remapping. May need to split buffer.
+ * o Split buffers that are too big for the device.
+ * o Check for overflow.
+ * o Finish cleaning this up.
+ */
+int
+dscheck(bp, ssp)
+ struct buf *bp;
+ struct diskslices *ssp;
+{
+ daddr_t blkno;
+ daddr_t labelsect;
+ struct disklabel *lp;
+ u_long maxsz;
+ char *msg;
+ struct partition *pp;
+ struct diskslice *sp;
+ long sz;
+
+ if (bp->b_blkno < 0) {
+ Debugger("Slice code got negative blocknumber");
+ bp->b_error = EINVAL;
+ goto bad;
+ }
+
+ sp = &ssp->dss_slices[dkslice(bp->b_dev)];
+ lp = sp->ds_label;
+ sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT;
+ if (lp == NULL) {
+ blkno = bp->b_blkno;
+ labelsect = -LABELSECTOR - 1;
+ maxsz = sp->ds_size;
+ } else {
+ labelsect = lp->d_partitions[LABEL_PART].p_offset;
+if (labelsect != 0) Debugger("labelsect != 0 in dscheck()");
+ pp = &lp->d_partitions[dkpart(bp->b_dev)];
+ blkno = pp->p_offset + bp->b_blkno;
+ maxsz = pp->p_size;
+ if (sp->ds_bad != NULL && ds_debug) {
+ daddr_t newblkno;
+
+ newblkno = transbad144(sp->ds_bad, blkno);
+ if (newblkno != blkno)
+ printf("should map bad block %lu -> %lu\n",
+ blkno, newblkno);
+ }
+ }
+
+ /* overwriting disk label ? */
+ /* XXX should also protect bootstrap in first 8K */
+ if (blkno <= LABELSECTOR + labelsect &&
+#if LABELSECTOR != 0
+ bp->b_blkno + sz > LABELSECTOR + labelsect &&
+#endif
+ (bp->b_flags & B_READ) == 0 && sp->ds_wlabel == 0) {
+ bp->b_error = EROFS;
+ goto bad;
+ }
+
+#if defined(DOSBBSECTOR) && defined(notyet)
+ /* overwriting master boot record? */
+ if (blkno <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 &&
+ sp->ds_wlabel == 0) {
+ bp->b_error = EROFS;
+ goto bad;
+ }
+#endif
+
+ /* beyond partition? */
+ if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) {
+ /* if exactly at end of disk, return an EOF */
+ if (bp->b_blkno == maxsz) {
+ bp->b_resid = bp->b_bcount;
+ return (0);
+ }
+ /* or truncate if part of it fits */
+ sz = maxsz - bp->b_blkno;
+ if (sz <= 0) {
+ bp->b_error = EINVAL;
+ goto bad;
+ }
+ bp->b_bcount = sz << DEV_BSHIFT;
+ }
+
+ bp->b_pblkno = blkno + sp->ds_offset;
+
+ /*
+ * Snoop on label accesses if the slice offset is nonzero. Fudge
+ * offsets in the label to keep the in-core label coherent with
+ * the on-disk one.
+ */
+ if (blkno <= LABELSECTOR + labelsect
+#if LABELSECTOR != 0
+ && bp->b_blkno + sz > LABELSECTOR + labelsect
+#endif
+ && sp->ds_offset != 0) {
+ struct iodone_chain *ic;
+
+		ic = malloc(sizeof *ic, M_DEVBUF, M_WAITOK);
+ ic->ic_prev_flags = bp->b_flags;
+ ic->ic_prev_iodone = bp->b_iodone;
+ ic->ic_prev_iodone_chain = bp->b_iodone_chain;
+ ic->ic_args[0].ia_long = (LABELSECTOR + labelsect - blkno)
+ << DEV_BSHIFT;
+ if (lp)
+ ic->ic_args[0].ia_long *= lp->d_secsize / DEV_BSIZE;
+ ic->ic_args[1].ia_ptr = sp;
+ bp->b_flags |= B_CALL;
+ bp->b_iodone = dsiodone;
+ bp->b_iodone_chain = ic;
+ if (!(bp->b_flags & B_READ)) {
+ /*
+ * XXX even disklabel(8) writes directly so we need
+ * to adjust writes. Perhaps we should drop support
+ * for DIOCWLABEL (always write protect labels) and
+ * require the use of DIOCWDINFO.
+ *
+ * XXX probably need to copy the data to avoid even
+ * temporarily corrupting the in-core copy.
+ */
+ if (bp->b_vp != NULL)
+ bp->b_vp->v_numoutput++;
+ msg = fixlabel((char *)NULL, sp,
+ (struct disklabel *)
+ (bp->b_data + ic->ic_args[0].ia_long),
+ TRUE);
+ if (msg != NULL) {
+ printf("%s\n", msg);
+ bp->b_error = EROFS;
+ goto bad;
+ }
+ }
+ }
+ return (1);
+
+bad:
+ bp->b_flags |= B_ERROR;
+ return (-1);
+}
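
[Aside; illustration only, not part of the commit: a minimal user-level
sketch of the end-of-partition clipping that dscheck() performs above.
The 100-block partition and the request sizes are made-up values, and
DEV_BSHIFT is assumed to be 9 (512-byte blocks), as in sys/param.h.]

    #include <stdio.h>

    #define DEV_BSHIFT 9                /* 512-byte blocks assumed */

    static void
    clip(long blkno, long sz, long maxsz)
    {
        if (blkno == maxsz) {           /* exactly at the end: EOF, not error */
            printf("blkno %ld: EOF\n", blkno);
            return;
        }
        if (blkno + sz > maxsz) {
            sz = maxsz - blkno;         /* truncate if part of it fits */
            if (sz <= 0) {
                printf("blkno %ld: EINVAL\n", blkno);
                return;
            }
        }
        printf("blkno %ld: transfer %ld blocks (%ld bytes)\n",
            blkno, sz, sz << DEV_BSHIFT);
    }

    int
    main(void)
    {
        clip(98, 4, 100);               /* truncated to 2 blocks */
        clip(100, 4, 100);              /* EOF case */
        clip(102, 4, 100);              /* wholly past the end: EINVAL */
        return (0);
    }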
+
+void
+dsclose(dev, mode, ssp)
+ dev_t dev;
+ int mode;
+ struct diskslices *ssp;
+{
+ u_char mask;
+ struct diskslice *sp;
+
+ sp = &ssp->dss_slices[dkslice(dev)];
+ mask = 1 << dkpart(dev);
+ switch (mode) {
+ case S_IFBLK:
+ sp->ds_bopenmask &= ~mask;
+ break;
+ case S_IFCHR:
+ sp->ds_copenmask &= ~mask;
+ break;
+ }
+ sp->ds_openmask = sp->ds_bopenmask | sp->ds_copenmask;
+}
+
+void
+dsgone(sspp)
+ struct diskslices **sspp;
+{
+ int slice;
+ struct diskslice *sp;
+ struct diskslices *ssp;
+
+ for (slice = 0, ssp = *sspp; slice < ssp->dss_nslices; slice++) {
+ sp = &ssp->dss_slices[slice];
+ if (sp->ds_bad != NULL) {
+ free(sp->ds_bad, M_DEVBUF);
+ set_ds_bad(ssp, slice, (struct dkbad_intern *)NULL);
+ }
+#ifdef DEVFS
+ if (sp->ds_bdev != NULL)
+ devfs_remove_dev(sp->ds_bdev);
+ if (sp->ds_cdev != NULL)
+ devfs_remove_dev(sp->ds_cdev);
+#endif
+ free_ds_label(ssp, slice);
+ }
+ free(ssp, M_DEVBUF);
+ *sspp = NULL;
+}
+
+/*
+ * For the "write" commands (DIOCSBAD, DIOCSDINFO and DIOCWDINFO), this
+ * is subject to the same restriction as dsopen().
+ */
+int
+dsioctl(dname, dev, cmd, data, flags, sspp, strat, setgeom)
+ char *dname;
+ dev_t dev;
+ int cmd;
+ caddr_t data;
+ int flags;
+ struct diskslices **sspp;
+ d_strategy_t *strat;
+ ds_setgeom_t *setgeom;
+{
+ int error;
+ struct disklabel *lp;
+ int old_wlabel;
+ int slice;
+ struct diskslice *sp;
+ struct diskslices *ssp;
+
+ slice = dkslice(dev);
+ ssp = *sspp;
+ sp = &ssp->dss_slices[slice];
+ lp = sp->ds_label;
+ switch (cmd) {
+
+ case DIOCGDINFO:
+ if (lp == NULL)
+ return (EINVAL);
+ *(struct disklabel *)data = *lp;
+ return (0);
+
+#ifdef notyet
+ case DIOCGDINFOP:
+ if (lp == NULL)
+ return (EINVAL);
+ *(struct disklabel **)data = lp;
+ return (0);
+#endif
+
+ case DIOCGPART:
+ if (lp == NULL)
+ return (EINVAL);
+ ((struct partinfo *)data)->disklab = lp;
+ ((struct partinfo *)data)->part
+ = &lp->d_partitions[dkpart(dev)];
+ return (0);
+
+ case DIOCGSLICEINFO:
+ *(struct diskslices *)data = *ssp;
+ return (0);
+
+ case DIOCSBAD:
+ if (slice == WHOLE_DISK_SLICE)
+ return (ENODEV);
+ if (!(flags & FWRITE))
+ return (EBADF);
+ if (lp == NULL)
+ return (EINVAL);
+ if (sp->ds_bad != NULL)
+ free(sp->ds_bad, M_DEVBUF);
+ set_ds_bad(ssp, slice, internbad144((struct dkbad *)data, lp));
+ return (0);
+
+ case DIOCSDINFO:
+ if (slice == WHOLE_DISK_SLICE)
+ return (ENODEV);
+ if (!(flags & FWRITE))
+ return (EBADF);
+ lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK);
+ if (sp->ds_label == NULL)
+ bzero(lp, sizeof *lp);
+ else
+ bcopy(sp->ds_label, lp, sizeof *lp);
+ error = setdisklabel(lp, (struct disklabel *)data,
+ sp->ds_label != NULL
+ ? sp->ds_openmask : (u_long)0);
+ /* XXX why doesn't setdisklabel() check this? */
+ if (error == 0 && lp->d_partitions[RAW_PART].p_offset != 0)
+ error = EINVAL;
+#if 0 /* XXX */
+ if (error != 0 && setgeom != NULL)
+ error = setgeom(lp);
+#endif
+ if (error != 0) {
+ free(lp, M_DEVBUF);
+ return (error);
+ }
+ free_ds_label(ssp, slice);
+ set_ds_label(ssp, slice, lp);
+#ifdef DEVFS
+ set_ds_labeldevs(dname, dev, ssp);
+#endif
+ return (0);
+
+ case DIOCSYNCSLICEINFO:
+ if (slice != WHOLE_DISK_SLICE || dkpart(dev) != RAW_PART)
+ return (EINVAL);
+ if (!*(int *)data)
+ for (slice = 0; slice < ssp->dss_nslices; slice++) {
+ u_char openmask;
+
+ openmask = ssp->dss_slices[slice].ds_openmask;
+ if (openmask
+ && (slice != WHOLE_DISK_SLICE
+ || openmask & ~(1 << RAW_PART)))
+ return (EBUSY);
+ }
+
+ /*
+		 * Temporarily forget the current slices struct and reread
+		 * the one on the disk.
+ * XXX should wait for current accesses on this disk to
+ * complete, then lock out future accesses and opens.
+ */
+ *sspp = NULL;
+ lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK);
+ *lp = *ssp->dss_slices[WHOLE_DISK_SLICE].ds_label;
+ error = dsopen(dname, dev,
+ ssp->dss_slices[WHOLE_DISK_SLICE].ds_copenmask
+ & (1 << RAW_PART) ? S_IFCHR : S_IFBLK,
+ sspp, lp, strat, setgeom, ssp->dss_bdevsw,
+ ssp->dss_cdevsw);
+ if (error != 0) {
+ free(lp, M_DEVBUF);
+ *sspp = ssp;
+ return (error);
+ }
+
+ /*
+ * Reopen everything. This is a no-op except in the "force"
+ * case and when the raw bdev and cdev are both open. Abort
+ * if anything fails.
+ */
+ for (slice = 0; slice < ssp->dss_nslices; slice++) {
+ u_char openmask;
+ int part;
+
+ for (openmask = ssp->dss_slices[slice].ds_bopenmask,
+ part = 0; openmask; openmask >>= 1, part++) {
+ if (!(openmask & 1))
+ continue;
+ error = dsopen(dname,
+ dkmodslice(dkmodpart(dev, part),
+ slice),
+ S_IFBLK, sspp, lp, strat,
+ setgeom, ssp->dss_bdevsw,
+ ssp->dss_cdevsw);
+ if (error != 0) {
+ /* XXX should free devfs toks. */
+ free(lp, M_DEVBUF);
+ /* XXX should restore devfs toks. */
+ *sspp = ssp;
+ return (EBUSY);
+ }
+ }
+ for (openmask = ssp->dss_slices[slice].ds_copenmask,
+ part = 0; openmask; openmask >>= 1, part++) {
+ if (!(openmask & 1))
+ continue;
+ error = dsopen(dname,
+ dkmodslice(dkmodpart(dev, part),
+ slice),
+ S_IFCHR, sspp, lp, strat,
+ setgeom, ssp->dss_bdevsw,
+ ssp->dss_cdevsw);
+ if (error != 0) {
+ /* XXX should free devfs toks. */
+ free(lp, M_DEVBUF);
+ /* XXX should restore devfs toks. */
+ *sspp = ssp;
+ return (EBUSY);
+ }
+ }
+ }
+
+ /* XXX devfs tokens? */
+ free(lp, M_DEVBUF);
+ dsgone(&ssp);
+ return (0);
+
+ case DIOCWDINFO:
+ error = dsioctl(dname, dev, DIOCSDINFO, data, flags, &ssp,
+ strat, setgeom);
+ if (error != 0)
+ return (error);
+ /*
+ * XXX this used to hack on dk_openpart to fake opening
+ * partition 0 in case that is used instead of dkpart(dev).
+ */
+ old_wlabel = sp->ds_wlabel;
+ set_ds_wlabel(ssp, slice, TRUE);
+ error = writedisklabel(dev, strat, sp->ds_label);
+ /* XXX should invalidate in-core label if write failed. */
+ set_ds_wlabel(ssp, slice, old_wlabel);
+ return (error);
+
+ case DIOCWLABEL:
+ if (slice == WHOLE_DISK_SLICE)
+ return (ENODEV);
+ if (!(flags & FWRITE))
+ return (EBADF);
+ set_ds_wlabel(ssp, slice, *(int *)data != 0);
+ return (0);
+
+ default:
+ return (-1);
+ }
+}
+
+static void
+dsiodone(bp)
+ struct buf *bp;
+{
+ struct iodone_chain *ic;
+ char *msg;
+
+ ic = bp->b_iodone_chain;
+ bp->b_flags = (ic->ic_prev_flags & B_CALL)
+ | (bp->b_flags & ~(B_CALL | B_DONE));
+ bp->b_iodone = ic->ic_prev_iodone;
+ bp->b_iodone_chain = ic->ic_prev_iodone_chain;
+ if (!(bp->b_flags & B_READ)
+ || (!(bp->b_flags & B_ERROR) && bp->b_error == 0)) {
+ msg = fixlabel((char *)NULL, ic->ic_args[1].ia_ptr,
+ (struct disklabel *)
+ (bp->b_data + ic->ic_args[0].ia_long),
+ FALSE);
+ if (msg != NULL)
+ printf("%s\n", msg);
+ }
+ free(ic, M_DEVBUF);
+ biodone(bp);
+}
+
+int
+dsisopen(ssp)
+ struct diskslices *ssp;
+{
+ int slice;
+
+ if (ssp == NULL)
+ return (0);
+ for (slice = 0; slice < ssp->dss_nslices; slice++)
+ if (ssp->dss_slices[slice].ds_openmask)
+ return (1);
+ return (0);
+}
+
+char *
+dsname(dname, unit, slice, part, partname)
+ char *dname;
+ int unit;
+ int slice;
+ int part;
+ char *partname;
+{
+ static char name[32];
+
+ if (strlen(dname) > 16)
+ dname = "nametoolong";
+ sprintf(name, "%s%d", dname, unit);
+ partname[0] = '\0';
+ if (slice != WHOLE_DISK_SLICE || part != RAW_PART) {
+ partname[0] = 'a' + part;
+ partname[1] = '\0';
+ if (slice != COMPATIBILITY_SLICE)
+ sprintf(name + strlen(name), "s%d", slice - 1);
+ }
+ return (name);
+}
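
[Aside; illustration only, not part of the commit: a user-level copy of
dsname() above showing the names it composes. The constants are assumed
to carry their historical values from <sys/diskslice.h> and
<sys/disklabel.h>: COMPATIBILITY_SLICE 0, WHOLE_DISK_SLICE 1, RAW_PART 2.]

    #include <stdio.h>
    #include <string.h>

    #define COMPATIBILITY_SLICE 0
    #define WHOLE_DISK_SLICE    1
    #define RAW_PART            2

    static char *
    demo_dsname(const char *dname, int unit, int slice, int part,
        char *partname)
    {
        static char name[32];

        sprintf(name, "%s%d", dname, unit);
        partname[0] = '\0';
        if (slice != WHOLE_DISK_SLICE || part != RAW_PART) {
            partname[0] = 'a' + part;
            partname[1] = '\0';
            if (slice != COMPATIBILITY_SLICE)
                sprintf(name + strlen(name), "s%d", slice - 1);
        }
        return (name);
    }

    int
    main(void)
    {
        char partname[2];

        /* raw partition of the whole-disk slice: no letter, no "sN" */
        printf("%s%s\n", demo_dsname("wd", 0, WHOLE_DISK_SLICE,
            RAW_PART, partname), partname);             /* wd0 */
        /* partition 'a' of the compatibility slice */
        printf("%s%s\n", demo_dsname("wd", 0, COMPATIBILITY_SLICE,
            0, partname), partname);                    /* wd0a */
        /* partition 'e' of the first real slice (slice index 2) */
        printf("%s%s\n", demo_dsname("wd", 0, 2, 4, partname),
            partname);                                  /* wd0s1e */
        return (0);
    }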
+
+/*
+ * This should only be called when the unit is inactive and the strategy
+ * routine should not allow it to become active unless we call it. Our
+ * strategy routine must be special to allow activity.
+ */
+int
+dsopen(dname, dev, mode, sspp, lp, strat, setgeom, bdevsw, cdevsw)
+ char *dname;
+ dev_t dev;
+ int mode;
+ struct diskslices **sspp;
+ struct disklabel *lp;
+ d_strategy_t *strat;
+ ds_setgeom_t *setgeom;
+ struct bdevsw *bdevsw;
+ struct cdevsw *cdevsw;
+{
+ struct dkbad *btp;
+ dev_t dev1;
+ int error;
+ struct disklabel *lp1;
+ char *msg;
+ u_char mask;
+#ifdef DEVFS
+ int mynor;
+#endif
+ bool_t need_init;
+ int part;
+ char partname[2];
+ int slice;
+ char *sname;
+ struct diskslice *sp;
+ struct diskslices *ssp;
+ int unit;
+
+ /*
+ * XXX reinitialize the slice table unless there is an open device
+ * on the unit. This should only be done if the media has changed.
+ */
+ ssp = *sspp;
+ need_init = !dsisopen(ssp);
+ if (ssp != NULL && need_init)
+ dsgone(sspp);
+ if (need_init) {
+ TRACE(("dsinit\n"));
+ error = dsinit(dname, dev, strat, lp, sspp);
+ if (error != 0) {
+ dsgone(sspp);
+ return (error);
+ }
+ lp->d_npartitions = RAW_PART + 1;
+ lp->d_partitions[RAW_PART].p_size = lp->d_secperunit;
+ ssp = *sspp;
+#ifdef DEVFS
+ ssp->dss_bdevsw = bdevsw;
+ ssp->dss_cdevsw = cdevsw;
+#endif
+
+ /*
+		 * If there are no real slices, then make the compatibility
+ * slice cover the whole disk.
+ */
+ if (ssp->dss_nslices == BASE_SLICE)
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_size
+ = lp->d_secperunit;
+
+ /* Point the compatibility slice at the BSD slice, if any. */
+ for (slice = BASE_SLICE; slice < ssp->dss_nslices; slice++) {
+ sp = &ssp->dss_slices[slice];
+ if (sp->ds_type == DOSPTYP_386BSD /* XXX */) {
+ ssp->dss_first_bsd_slice = slice;
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_offset
+ = sp->ds_offset;
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_size
+ = sp->ds_size;
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_type
+ = sp->ds_type;
+ break;
+ }
+ }
+
+ lp1 = malloc(sizeof *lp1, M_DEVBUF, M_WAITOK);
+ *lp1 = *lp;
+
+ /*
+ * Initialize defaults for the label for the whole disk so
+ * that it can be used as a template for disklabel(8).
+ * d_rpm = 3600 is unlikely to be correct for a modern
+ * disk, but d_rpm is normally irrelevant.
+ */
+ if (lp1->d_rpm == 0)
+ lp1->d_rpm = 3600;
+ if (lp1->d_interleave == 0)
+ lp1->d_interleave = 1;
+ if (lp1->d_npartitions == 0)
+ lp1->d_npartitions = MAXPARTITIONS;
+ if (lp1->d_bbsize == 0)
+ lp1->d_bbsize = BBSIZE;
+ if (lp1->d_sbsize == 0)
+ lp1->d_sbsize = SBSIZE;
+
+ ssp->dss_slices[WHOLE_DISK_SLICE].ds_label = lp1;
+ ssp->dss_slices[WHOLE_DISK_SLICE].ds_wlabel = TRUE;
+ if (setgeom != NULL) {
+ error = setgeom(lp);
+ if (error != 0) {
+ dsgone(sspp);
+ return (error);
+ }
+ }
+ }
+
+ unit = dkunit(dev);
+
+ /*
+ * Initialize secondary info for all slices. It is needed for more
+ * than the current slice in the DEVFS case.
+ */
+ for (slice = 0; slice < ssp->dss_nslices; slice++) {
+ sp = &ssp->dss_slices[slice];
+ if (sp->ds_label != NULL)
+ continue;
+ dev1 = dkmodslice(dkmodpart(dev, RAW_PART), slice);
+ sname = dsname(dname, unit, slice, RAW_PART, partname);
+#ifdef DEVFS
+ if (slice != COMPATIBILITY_SLICE && sp->ds_bdev == NULL
+ && sp->ds_size != 0) {
+ mynor = minor(dev1);
+ sp->ds_bdev =
+ devfs_add_devswf(bdevsw, mynor, DV_BLK,
+ UID_ROOT, GID_OPERATOR, 0640,
+ "%s", sname);
+ sp->ds_cdev =
+ devfs_add_devswf(cdevsw, mynor, DV_CHR,
+ UID_ROOT, GID_OPERATOR, 0640,
+ "r%s", sname);
+ }
+#endif
+ /*
+ * XXX this should probably only be done for the need_init
+ * case, but there may be a problem with DIOCSYNCSLICEINFO.
+ */
+ set_ds_wlabel(ssp, slice, TRUE); /* XXX invert */
+ lp1 = malloc(sizeof *lp1, M_DEVBUF, M_WAITOK);
+ *lp1 = *lp;
+ TRACE(("readdisklabel\n"));
+ msg = readdisklabel(dev1, strat, lp1);
+#if 0 /* XXX */
+ if (msg == NULL && setgeom != NULL && setgeom(lp1) != 0)
+ msg = "setgeom failed";
+#endif
+ if (msg == NULL)
+ msg = fixlabel(sname, sp, lp1, FALSE);
+ if (msg != NULL) {
+ free(lp1, M_DEVBUF);
+ if (sp->ds_type == DOSPTYP_386BSD /* XXX */)
+ log(LOG_WARNING, "%s: cannot find label (%s)\n",
+ sname, msg);
+ continue;
+ }
+ if (lp1->d_flags & D_BADSECT) {
+ btp = malloc(sizeof *btp, M_DEVBUF, M_WAITOK);
+ TRACE(("readbad144\n"));
+ msg = readbad144(dev1, strat, lp1, btp);
+ if (msg != NULL) {
+ log(LOG_WARNING,
+ "%s: cannot find bad sector table (%s)\n",
+ sname, msg);
+ free(btp, M_DEVBUF);
+ free(lp1, M_DEVBUF);
+ continue;
+ }
+ set_ds_bad(ssp, slice, internbad144(btp, lp1));
+ free(btp, M_DEVBUF);
+ if (sp->ds_bad == NULL) {
+ free(lp1, M_DEVBUF);
+ continue;
+ }
+ }
+ set_ds_label(ssp, slice, lp1);
+#ifdef DEVFS
+ set_ds_labeldevs(dname, dev1, ssp);
+#endif
+ set_ds_wlabel(ssp, slice, FALSE);
+ }
+
+ slice = dkslice(dev);
+ if (slice >= ssp->dss_nslices)
+ return (ENXIO);
+ sp = &ssp->dss_slices[slice];
+ part = dkpart(dev);
+ if (part != RAW_PART
+ && (sp->ds_label == NULL || part >= sp->ds_label->d_npartitions))
+ return (EINVAL); /* XXX needs translation */
+ mask = 1 << part;
+ switch (mode) {
+ case S_IFBLK:
+ sp->ds_bopenmask |= mask;
+ break;
+ case S_IFCHR:
+ sp->ds_copenmask |= mask;
+ break;
+ }
+ sp->ds_openmask = sp->ds_bopenmask | sp->ds_copenmask;
+ return (0);
+}
+
+int
+dssize(dev, sspp, dopen, dclose)
+ dev_t dev;
+ struct diskslices **sspp;
+ d_open_t dopen;
+ d_close_t dclose;
+{
+ struct disklabel *lp;
+ int part;
+ int slice;
+ struct diskslices *ssp;
+
+ slice = dkslice(dev);
+ part = dkpart(dev);
+ ssp = *sspp;
+ if (ssp == NULL || slice >= ssp->dss_nslices
+ || !(ssp->dss_slices[slice].ds_bopenmask & (1 << part))) {
+ if (dopen(dev, FREAD, S_IFBLK, (struct proc *)NULL) != 0)
+ return (-1);
+ dclose(dev, FREAD, S_IFBLK, (struct proc *)NULL);
+ ssp = *sspp;
+ }
+ lp = ssp->dss_slices[slice].ds_label;
+ if (lp == NULL)
+ return (-1);
+ return ((int)lp->d_partitions[part].p_size);
+}
+
+static void
+free_ds_label(ssp, slice)
+ struct diskslices *ssp;
+ int slice;
+{
+ struct disklabel *lp;
+ struct diskslice *sp;
+
+ sp = &ssp->dss_slices[slice];
+ lp = sp->ds_label;
+ if (lp == NULL)
+ return;
+#ifdef DEVFS
+ free_ds_labeldevs(ssp, slice);
+ if (slice == COMPATIBILITY_SLICE)
+ free_ds_labeldevs(ssp, ssp->dss_first_bsd_slice);
+ else if (slice == ssp->dss_first_bsd_slice)
+ free_ds_labeldevs(ssp, COMPATIBILITY_SLICE);
+#endif
+ free(lp, M_DEVBUF);
+ set_ds_label(ssp, slice, (struct disklabel *)NULL);
+}
+
+#ifdef DEVFS
+static void
+free_ds_labeldevs(ssp, slice)
+ struct diskslices *ssp;
+ int slice;
+{
+ struct disklabel *lp;
+ int part;
+ struct diskslice *sp;
+
+ sp = &ssp->dss_slices[slice];
+ lp = sp->ds_label;
+ if (lp == NULL)
+ return;
+ for (part = 0; part < lp->d_npartitions; part++) {
+ if (sp->ds_bdevs[part] != NULL) {
+ devfs_remove_dev(sp->ds_bdevs[part]);
+ sp->ds_bdevs[part] = NULL;
+ }
+ if (sp->ds_cdevs[part] != NULL) {
+ devfs_remove_dev(sp->ds_cdevs[part]);
+ sp->ds_cdevs[part] = NULL;
+ }
+ }
+}
+#endif
+
+static char *
+fixlabel(sname, sp, lp, writeflag)
+ char *sname;
+ struct diskslice *sp;
+ struct disklabel *lp;
+ int writeflag;
+{
+ u_long end;
+ u_long offset;
+ int part;
+ struct partition *pp;
+ u_long start;
+ bool_t warned;
+
+ /* These errors "can't happen" so don't bother reporting details. */
+ if (lp->d_magic != DISKMAGIC || lp->d_magic2 != DISKMAGIC)
+ return ("fixlabel: invalid magic");
+ if (dkcksum(lp) != 0)
+ return ("fixlabel: invalid checksum");
+
+ pp = &lp->d_partitions[RAW_PART];
+ if (writeflag) {
+ start = 0;
+ offset = sp->ds_offset;
+ } else {
+ start = sp->ds_offset;
+ offset = -sp->ds_offset;
+ }
+ if (pp->p_offset != start) {
+ if (sname != NULL) {
+ printf(
+"%s: rejecting BSD label: raw partition offset != slice offset\n",
+ sname);
+ slice_info(sname, sp);
+ partition_info(sname, RAW_PART, pp);
+ }
+ return ("fixlabel: raw partition offset != slice offset");
+ }
+ if (pp->p_size != sp->ds_size) {
+ if (sname != NULL) {
+ printf("%s: raw partition size != slice size\n", sname);
+ slice_info(sname, sp);
+ partition_info(sname, RAW_PART, pp);
+ }
+ if (pp->p_size > sp->ds_size) {
+ if (sname == NULL)
+ return ("fixlabel: raw partition size > slice size");
+ printf("%s: truncating raw partition\n", sname);
+ pp->p_size = sp->ds_size;
+ }
+ }
+ end = start + sp->ds_size;
+ if (start > end)
+ return ("fixlabel: slice wraps");
+ if (lp->d_secpercyl <= 0)
+ return ("fixlabel: d_secpercyl <= 0");
+ pp -= RAW_PART;
+ warned = FALSE;
+ for (part = 0; part < lp->d_npartitions; part++, pp++) {
+ if (pp->p_offset != 0 || pp->p_size != 0) {
+ if (pp->p_offset < start
+ || pp->p_offset + pp->p_size > end
+ || pp->p_offset + pp->p_size < pp->p_offset) {
+ if (sname != NULL) {
+ printf(
+"%s: rejecting partition in BSD label: it isn't entirely within the slice\n",
+ sname);
+ if (!warned) {
+ slice_info(sname, sp);
+ warned = TRUE;
+ }
+ partition_info(sname, part, pp);
+ }
+ /* XXX else silently discard junk. */
+ bzero(pp, sizeof *pp);
+ } else
+ pp->p_offset += offset;
+ }
+ }
+ lp->d_ncylinders = sp->ds_size / lp->d_secpercyl;
+ lp->d_secperunit = sp->ds_size;
+ lp->d_checksum = 0;
+ lp->d_checksum = dkcksum(lp);
+ return (NULL);
+}
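
[Aside; illustration only, not part of the commit: the offset fudging
that fixlabel() applies above, with a made-up slice starting at absolute
sector 63. On disk the label stores absolute partition offsets; the
in-core copy is kept slice-relative, so the raw partition offset is 0.]

    #include <stdio.h>

    int
    main(void)
    {
        unsigned long ds_offset = 63;   /* hypothetical slice start */

        /* reading (writeflag == 0): offset = -ds_offset, so the
           on-disk partition offset 63 becomes 0 in core */
        printf("in-core: %lu\n", 63 - ds_offset);       /* 0 */

        /* writing (writeflag != 0): offset = +ds_offset, so the
           slice-relative in-core offset 0 goes back out as 63 */
        printf("on-disk: %lu\n", 0 + ds_offset);        /* 63 */
        return (0);
    }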
+
+static void
+partition_info(sname, part, pp)
+ char *sname;
+ int part;
+ struct partition *pp;
+{
+ printf("%s%c: start %lu, end %lu, size %lu\n", sname, 'a' + part,
+ pp->p_offset, pp->p_offset + pp->p_size - 1, pp->p_size);
+}
+
+static void
+slice_info(sname, sp)
+ char *sname;
+ struct diskslice *sp;
+{
+ printf("%s: start %lu, end %lu, size %lu\n", sname,
+ sp->ds_offset, sp->ds_offset + sp->ds_size - 1, sp->ds_size);
+}
+
+/*
+ * Most changes to ds_bad, ds_label and ds_wlabel are made using the
+ * following functions to ensure coherency of the compatibility slice
+ * with the first BSD slice. The openmask fields are _not_ shared and
+ * the other fields (ds_offset and ds_size) aren't changed after they
+ * are initialized.
+ */
+static void
+set_ds_bad(ssp, slice, btp)
+ struct diskslices *ssp;
+ int slice;
+ struct dkbad_intern *btp;
+{
+ ssp->dss_slices[slice].ds_bad = btp;
+ if (slice == COMPATIBILITY_SLICE)
+ ssp->dss_slices[ssp->dss_first_bsd_slice].ds_bad = btp;
+ else if (slice == ssp->dss_first_bsd_slice)
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_bad = btp;
+}
+
+static void
+set_ds_label(ssp, slice, lp)
+ struct diskslices *ssp;
+ int slice;
+ struct disklabel *lp;
+{
+ ssp->dss_slices[slice].ds_label = lp;
+ if (slice == COMPATIBILITY_SLICE)
+ ssp->dss_slices[ssp->dss_first_bsd_slice].ds_label = lp;
+ else if (slice == ssp->dss_first_bsd_slice)
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_label = lp;
+}
+
+#ifdef DEVFS
+static void
+set_ds_labeldevs(dname, dev, ssp)
+ char *dname;
+ dev_t dev;
+ struct diskslices *ssp;
+{
+ int slice;
+
+ set_ds_labeldevs_unaliased(dname, dev, ssp);
+ if (ssp->dss_first_bsd_slice == COMPATIBILITY_SLICE)
+ return;
+ slice = dkslice(dev);
+ if (slice == COMPATIBILITY_SLICE)
+ set_ds_labeldevs_unaliased(dname,
+ dkmodslice(dev, ssp->dss_first_bsd_slice), ssp);
+ else if (slice == ssp->dss_first_bsd_slice)
+ set_ds_labeldevs_unaliased(dname,
+ dkmodslice(dev, COMPATIBILITY_SLICE), ssp);
+}
+
+static void
+set_ds_labeldevs_unaliased(dname, dev, ssp)
+ char *dname;
+ dev_t dev;
+ struct diskslices *ssp;
+{
+ struct disklabel *lp;
+ int mynor;
+ int part;
+ char partname[2];
+ struct partition *pp;
+ int slice;
+ char *sname;
+ struct diskslice *sp;
+
+ slice = dkslice(dev);
+ sp = &ssp->dss_slices[slice];
+ if (sp->ds_size == 0)
+ return;
+ lp = sp->ds_label;
+ for (part = 0; part < lp->d_npartitions; part++) {
+ pp = &lp->d_partitions[part];
+ if (pp->p_size == 0)
+ continue;
+ sname = dsname(dname, dkunit(dev), slice, part, partname);
+ if (part == RAW_PART && sp->ds_bdev != NULL) {
+ sp->ds_bdevs[part] =
+ devfs_link(sp->ds_bdev,
+ "%s%s", sname, partname);
+ sp->ds_cdevs[part] =
+ devfs_link(sp->ds_cdev,
+ "r%s%s", sname, partname);
+ } else {
+ mynor = minor(dkmodpart(dev, part));
+ sp->ds_bdevs[part] =
+ devfs_add_devswf(ssp->dss_bdevsw, mynor, DV_BLK,
+ UID_ROOT, GID_OPERATOR, 0640,
+ "%s%s", sname, partname);
+ sp->ds_cdevs[part] =
+ devfs_add_devswf(ssp->dss_cdevsw, mynor, DV_CHR,
+ UID_ROOT, GID_OPERATOR, 0640,
+ "r%s%s", sname, partname);
+ }
+ }
+}
+#endif /* DEVFS */
+
+static void
+set_ds_wlabel(ssp, slice, wlabel)
+ struct diskslices *ssp;
+ int slice;
+ int wlabel;
+{
+ ssp->dss_slices[slice].ds_wlabel = wlabel;
+ if (slice == COMPATIBILITY_SLICE)
+ ssp->dss_slices[ssp->dss_first_bsd_slice].ds_wlabel = wlabel;
+ else if (slice == ssp->dss_first_bsd_slice)
+ ssp->dss_slices[COMPATIBILITY_SLICE].ds_wlabel = wlabel;
+}
diff --git a/sys/kern/subr_dkbad.c b/sys/kern/subr_dkbad.c
new file mode 100644
index 0000000..8fef863
--- /dev/null
+++ b/sys/kern/subr_dkbad.c
@@ -0,0 +1,159 @@
+/*-
+ * Copyright (c) 1994 Bruce D. Evans.
+ * All rights reserved.
+ *
+ * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * William Jolitz.
+ *
+ * Copyright (c) 1982, 1986, 1988 Regents of the University of California.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)wd.c 7.2 (Berkeley) 5/9/91
+ * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $
+ * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91
+ * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $
+ * $Id$
+ */
+
+#include <sys/param.h>
+#include <sys/buf.h>
+#include <sys/conf.h>
+#include <sys/disklabel.h>
+#include <sys/dkbad.h>
+#include <sys/malloc.h>
+
+/*
+ * Internalize the bad sector table.
+ * TODO:
+ * o Fix types.
+ * Type long should be daddr_t since we compare with blkno's.
+ * Sentinel -1 should be ((daddr_t)-1).
+ * o Can remove explicit test for sentinel if it is a positive
+ * (unsigned or not) value larger than all possible blkno's.
+ * o Check that the table is sorted.
+ * o Use faster searches.
+ * o Use the internal table in wddump().
+ * o Don't duplicate so much code.
+ *	o Do all bad block handling in a driver-independent file.
+ * o Remove limit of 126 spare sectors.
+ */
+struct dkbad_intern *
+internbad144(btp, lp)
+ struct dkbad *btp;
+ struct disklabel *lp;
+{
+ struct dkbad_intern *bip;
+ int i;
+
+ bip = malloc(sizeof *bip, M_DEVBUF, M_WAITOK);
+ /*
+ * Spare sectors are allocated beginning with the last sector of
+ * the second last track of the disk (the last track is used for
+ * the bad sector list).
+ */
+ bip->bi_maxspare = lp->d_secperunit - lp->d_nsectors - 1;
+ bip->bi_nbad = DKBAD_MAXBAD;
+ i = 0;
+ for (; i < DKBAD_MAXBAD && btp->bt_bad[i].bt_cyl != DKBAD_NOCYL; i++)
+ bip->bi_bad[i] = btp->bt_bad[i].bt_cyl * lp->d_secpercyl
+ + (btp->bt_bad[i].bt_trksec >> 8)
+ * lp->d_nsectors
+ + (btp->bt_bad[i].bt_trksec & 0x00ff);
+ bip->bi_bad[i] = -1;
+ return (bip);
+}
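
[Aside; illustration only, not part of the commit: the linearization that
internbad144() performs above. bt_trksec packs the track number in the
high byte and the sector in the low byte; the geometry is made up.]

    #include <stdio.h>

    int
    main(void)
    {
        int d_nsectors = 17;            /* sectors per track (made up) */
        int d_secpercyl = 17 * 8;       /* 8 tracks per cylinder */
        int bt_cyl = 100;
        int bt_trksec = (3 << 8) | 5;   /* track 3, sector 5 */

        /* 100 * 136 + 3 * 17 + 5 = 13656 */
        printf("bad blkno = %d\n",
            bt_cyl * d_secpercyl + (bt_trksec >> 8) * d_nsectors +
            (bt_trksec & 0x00ff));
        return (0);
    }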
+
+char *
+readbad144(dev, strat, lp, bdp)
+ dev_t dev;
+ d_strategy_t *strat;
+ struct disklabel *lp;
+ struct dkbad *bdp;
+{
+ struct buf *bp;
+ struct dkbad *db;
+ int i;
+ char *msg;
+
+ bp = geteblk((int)lp->d_secsize);
+ i = 0;
+ do {
+ /* Read a bad sector table. */
+ bp->b_dev = dev;
+ bp->b_blkno = lp->d_secperunit - lp->d_nsectors + i;
+ if (lp->d_secsize > DEV_BSIZE)
+ bp->b_blkno *= lp->d_secsize / DEV_BSIZE;
+ else
+ bp->b_blkno /= DEV_BSIZE / lp->d_secsize;
+ bp->b_bcount = lp->d_secsize;
+ bp->b_flags = B_BUSY | B_READ;
+ (*strat)(bp);
+
+ /* If successful, validate, otherwise try another. */
+ if (biowait(bp) == 0) {
+ db = (struct dkbad *)(bp->b_un.b_addr);
+ if (db->bt_mbz == 0 && db->bt_flag == DKBAD_MAGIC) {
+ msg = NULL;
+ *bdp = *db;
+ break;
+ }
+ msg = "bad sector table corrupted";
+ } else
+ msg = "bad sector table I/O error";
+ } while ((bp->b_flags & B_ERROR) && (i += 2) < 10 &&
+ i < lp->d_nsectors);
+ bp->b_flags |= B_INVAL | B_AGE;
+ brelse(bp);
+ return (msg);
+}
+
+daddr_t
+transbad144(bip, blkno)
+ struct dkbad_intern *bip;
+ daddr_t blkno;
+{
+ int i;
+
+ /*
+ * List is sorted, so the search can terminate when it is past our
+ * sector.
+ */
+ for (i = 0; bip->bi_bad[i] != -1 && bip->bi_bad[i] <= blkno; i++)
+ if (bip->bi_bad[i] == blkno)
+ /*
+ * Spare sectors are allocated in decreasing order.
+ */
+ return (bip->bi_maxspare - i);
+ return (blkno);
+}
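
[Aside; illustration only, not part of the commit: how transbad144()
above remaps a block through the sorted table. Spare sectors are handed
out downward from bi_maxspare, one per table index; the entries and
bi_maxspare below are made-up values.]

    #include <stdio.h>

    int
    main(void)
    {
        long bi_bad[] = { 1000, 5000, 9000, -1 };   /* sorted, -1 ends */
        long bi_maxspare = 99999;
        long blkno = 5000;
        int i;

        for (i = 0; bi_bad[i] != -1 && bi_bad[i] <= blkno; i++)
            if (bi_bad[i] == blkno) {
                printf("%ld -> spare %ld\n", blkno,
                    bi_maxspare - i);               /* 5000 -> 99998 */
                return (0);
            }
        printf("%ld -> %ld (not bad)\n", blkno, blkno);
        return (0);
    }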
diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c
index 792a1ce..1418709 100644
--- a/sys/kern/subr_log.c
+++ b/sys/kern/subr_log.c
@@ -30,7 +30,8 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)subr_log.c 8.3 (Berkeley) 2/14/95
+ * @(#)subr_log.c 8.1 (Berkeley) 6/10/93
+ * $Id: subr_log.c,v 1.21 1997/03/23 03:36:22 bde Exp $
*/
/*
@@ -39,18 +40,37 @@
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/conf.h>
#include <sys/proc.h>
#include <sys/vnode.h>
-#include <sys/ioctl.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
#include <sys/msgbuf.h>
-#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/signalvar.h>
+#include <sys/kernel.h>
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
#define LOG_RDPRI (PZERO + 1)
#define LOG_ASYNC 0x04
#define LOG_RDWAIT 0x08
-struct logsoftc {
+static d_open_t logopen;
+static d_close_t logclose;
+static d_read_t logread;
+static d_ioctl_t logioctl;
+static d_select_t logselect;
+
+#define CDEV_MAJOR 7
+static struct cdevsw log_cdevsw =
+ { logopen, logclose, logread, nowrite, /*7*/
+ logioctl, nostop, nullreset, nodevtotty,/* klog */
+ logselect, nommap, NULL, "log", NULL, -1 };
+
+static struct logsoftc {
int sc_state; /* see above for possibilities */
struct selinfo sc_selp; /* process waiting on select call */
int sc_pgid; /* process/group for async I/O */
@@ -59,36 +79,21 @@ struct logsoftc {
int log_open; /* also used in log() */
/*ARGSUSED*/
-int
+static int
logopen(dev, flags, mode, p)
dev_t dev;
int flags, mode;
struct proc *p;
{
- register struct msgbuf *mbp = msgbufp;
-
if (log_open)
return (EBUSY);
log_open = 1;
logsoftc.sc_pgid = p->p_pid; /* signal process only */
- /*
- * Potential race here with putchar() but since putchar should be
- * called by autoconf, msg_magic should be initialized by the time
- * we get here.
- */
- if (mbp->msg_magic != MSG_MAGIC) {
- register int i;
-
- mbp->msg_magic = MSG_MAGIC;
- mbp->msg_bufx = mbp->msg_bufr = 0;
- for (i=0; i < MSG_BSIZE; i++)
- mbp->msg_bufc[i] = 0;
- }
return (0);
}
/*ARGSUSED*/
-int
+static int
logclose(dev, flag, mode, p)
dev_t dev;
int flag, mode;
@@ -101,7 +106,7 @@ logclose(dev, flag, mode, p)
}
/*ARGSUSED*/
-int
+static int
logread(dev, uio, flag)
dev_t dev;
struct uio *uio;
@@ -119,8 +124,8 @@ logread(dev, uio, flag)
return (EWOULDBLOCK);
}
logsoftc.sc_state |= LOG_RDWAIT;
- if (error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH,
- "klog", 0)) {
+ if ((error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH,
+ "klog", 0))) {
splx(s);
return (error);
}
@@ -140,14 +145,14 @@ logread(dev, uio, flag)
if (error)
break;
mbp->msg_bufr += l;
- if (mbp->msg_bufr < 0 || mbp->msg_bufr >= MSG_BSIZE)
+ if (mbp->msg_bufr >= MSG_BSIZE)
mbp->msg_bufr = 0;
}
return (error);
}
/*ARGSUSED*/
-int
+static int
logselect(dev, rw, p)
dev_t dev;
int rw;
@@ -179,8 +184,8 @@ logwakeup()
selwakeup(&logsoftc.sc_selp);
if (logsoftc.sc_state & LOG_ASYNC) {
if (logsoftc.sc_pgid < 0)
- gsignal(-logsoftc.sc_pgid, SIGIO);
- else if (p = pfind(logsoftc.sc_pgid))
+ gsignal(-logsoftc.sc_pgid, SIGIO);
+ else if ((p = pfind(logsoftc.sc_pgid)))
psignal(p, SIGIO);
}
if (logsoftc.sc_state & LOG_RDWAIT) {
@@ -190,10 +195,10 @@ logwakeup()
}
/*ARGSUSED*/
-int
+static int
logioctl(dev, com, data, flag, p)
dev_t dev;
- u_long com;
+ int com;
caddr_t data;
int flag;
struct proc *p;
@@ -232,7 +237,33 @@ logioctl(dev, com, data, flag, p)
break;
default:
- return (-1);
+ return (ENOTTY);
}
return (0);
}
+
+static int log_devsw_installed = 0;
+#ifdef DEVFS
+static void *log_devfs_token;
+#endif
+
+static void
+log_drvinit(void *unused)
+{
+ dev_t dev;
+
+	if (!log_devsw_installed) {
+		dev = makedev(CDEV_MAJOR, 0);
+		cdevsw_add(&dev, &log_cdevsw, NULL);
+ log_devsw_installed = 1;
+#ifdef DEVFS
+ log_devfs_token = devfs_add_devswf(&log_cdevsw, 0, DV_CHR,
+ UID_ROOT, GID_WHEEL, 0600,
+ "klog");
+#endif
+ }
+}
+
+SYSINIT(logdev, SI_SUB_DRIVERS, SI_ORDER_MIDDLE + CDEV_MAJOR, log_drvinit, NULL)
diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c
new file mode 100644
index 0000000..f7d41bf
--- /dev/null
+++ b/sys/kern/subr_param.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 1980, 1986, 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)param.c 8.3 (Berkeley) 8/20/94
+ * $Id$
+ */
+
+#include "opt_sysvipc.h"
+#include "opt_param.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/socket.h>
+#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/file.h>
+#include <sys/callout.h>
+#include <sys/clist.h>
+#include <sys/mbuf.h>
+#include <sys/kernel.h>
+
+#include <ufs/ufs/quota.h>
+
+#ifdef SYSVSHM
+#include <machine/vmparam.h>
+#include <sys/shm.h>
+#endif
+#ifdef SYSVSEM
+#include <sys/sem.h>
+#endif
+#ifdef SYSVMSG
+#include <sys/msg.h>
+#endif
+
+/*
+ * System parameter formulae.
+ *
+ * This file is copied into each directory where we compile
+ * the kernel; it should be modified there to suit local taste
+ * if necessary.
+ *
+ * Compiled with -DMAXUSERS=xx
+ */
+
+#ifndef HZ
+#define HZ 100
+#endif
+int hz = HZ;
+int tick = 1000000 / HZ;
+int tickadj = 30000 / (60 * HZ); /* can adjust 30ms in 60s */
+#define NPROC (20 + 16 * MAXUSERS)
+int maxproc = NPROC; /* maximum # of processes */
+int maxprocperuid = NPROC-1; /* maximum # of processes per user */
+int maxfiles = NPROC*2; /* system wide open files limit */
+int maxfilesperproc = NPROC*2; /* per-process open files limit */
+int ncallout = 16 + NPROC; /* maximum # of timer events */
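
[Aside; illustration only, not part of the commit: the values these
formulae produce for a hypothetical kernel built with -DMAXUSERS=10.]

    NPROC           = 20 + 16 * 10 = 180
    maxproc         = 180          maxprocperuid   = 179
    maxfiles        = 360          maxfilesperproc = 360
    ncallout        = 16 + 180     = 196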
+
+/* maximum # of mbuf clusters */
+#ifndef NMBCLUSTERS
+#define NMBCLUSTERS (512 + MAXUSERS * 16)
+#endif
+int nmbclusters = NMBCLUSTERS;
+
+/* allocate 1/4th amount of virtual address space for mbufs XXX */
+int nmbufs = NMBCLUSTERS * 4;
+
+int fscale = FSCALE; /* kernel uses `FSCALE', user uses `fscale' */
+
+/*
+ * Values in support of System V compatible shared memory. XXX
+ */
+#ifdef SYSVSHM
+#ifndef SHMMAX
+#define SHMMAX (SHMMAXPGS*PAGE_SIZE)
+#endif
+#ifndef SHMMIN
+#define SHMMIN 1
+#endif
+#ifndef SHMMNI
+#define SHMMNI 32 /* <= SHMMMNI in shm.h */
+#endif
+#ifndef SHMSEG
+#define SHMSEG 8
+#endif
+#ifndef SHMALL
+#define SHMALL (SHMMAXPGS)
+#endif
+
+struct shminfo shminfo = {
+ SHMMAX,
+ SHMMIN,
+ SHMMNI,
+ SHMSEG,
+ SHMALL
+};
+#endif
+
+/*
+ * Values in support of System V compatible semaphores.
+ */
+
+#ifdef SYSVSEM
+
+struct seminfo seminfo = {
+ SEMMAP, /* # of entries in semaphore map */
+ SEMMNI, /* # of semaphore identifiers */
+ SEMMNS, /* # of semaphores in system */
+ SEMMNU, /* # of undo structures in system */
+ SEMMSL, /* max # of semaphores per id */
+ SEMOPM, /* max # of operations per semop call */
+ SEMUME, /* max # of undo entries per process */
+ SEMUSZ, /* size in bytes of undo structure */
+ SEMVMX, /* semaphore maximum value */
+ SEMAEM /* adjust on exit max value */
+};
+#endif
+
+/*
+ * Values in support of System V compatible messages.
+ */
+
+#ifdef SYSVMSG
+
+struct msginfo msginfo = {
+ MSGMAX, /* max chars in a message */
+ MSGMNI, /* # of message queue identifiers */
+ MSGMNB, /* max chars in a queue */
+ MSGTQL, /* max messages in system */
+ MSGSSZ, /* size of a message segment */
+ /* (must be small power of 2 greater than 4) */
+ MSGSEG /* number of message segments */
+};
+#endif
+
+/*
+ * These may be set to nonzero here or by patching.
+ * If they are nonzero at bootstrap time then they are
+ * initialized to values dependent on the memory size.
+ */
+#ifdef NBUF
+int nbuf = NBUF;
+#else
+int nbuf = 0;
+#endif
+int nswbuf = 0;
+
+/*
+ * These have to be allocated somewhere; allocating
+ * them here forces loader errors if this file is omitted
+ * (if they've been externed everywhere else; hah!).
+ */
+struct buf *swbuf;
diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c
index 8a9a44e..4b3ed36 100644
--- a/sys/kern/subr_prf.c
+++ b/sys/kern/subr_prf.c
@@ -35,23 +35,21 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)subr_prf.c 8.4 (Berkeley) 5/4/95
+ * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94
+ * $Id$
*/
+#include "opt_ddb.h"
+
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/buf.h>
-#include <sys/conf.h>
-#include <sys/reboot.h>
#include <sys/msgbuf.h>
#include <sys/proc.h>
-#include <sys/ioctl.h>
-#include <sys/vnode.h>
-#include <sys/file.h>
#include <sys/tty.h>
#include <sys/tprintf.h>
#include <sys/syslog.h>
#include <sys/malloc.h>
+#include <machine/cons.h>
/*
* Note that stdarg.h and the ANSI style va_start macro is used for both
@@ -59,71 +57,20 @@
*/
#include <machine/stdarg.h>
-#ifdef KADB
-#include <machine/kdbparam.h>
-#endif
-
#define TOCONS 0x01
#define TOTTY 0x02
#define TOLOG 0x04
struct tty *constty; /* pointer to console "window" tty */
-extern cnputc(); /* standard console putc */
-int (*v_putc)() = cnputc; /* routine to putc on virtual console */
-
-void logpri __P((int level));
-static void putchar __P((int ch, int flags, struct tty *tp));
+static void (*v_putc)(int) = cnputc; /* routine to putc on virtual console */
+static void logpri __P((int level));
+static void msglogchar(int c, void *dummyarg);
+struct putchar_arg { int flags; struct tty *tty; };
+static void putchar __P((int ch, void *arg));
static char *ksprintn __P((u_long num, int base, int *len));
-void kprintf __P((const char *fmt, int flags, struct tty *tp, va_list ap));
-
-int consintr = 1; /* Ok to handle console interrupts? */
-/*
- * Variable panicstr contains argument to first call to panic; used as flag
- * to indicate that the kernel has already called panic.
- */
-const char *panicstr;
-
-/*
- * Panic is called on unresolvable fatal errors. It prints "panic: mesg",
- * and then reboots. If we are called twice, then we avoid trying to sync
- * the disks as this often leads to recursive panics.
- */
-#ifdef __GNUC__
-volatile void boot(int flags); /* boot() does not return */
-volatile /* panic() does not return */
-#endif
-void
-#ifdef __STDC__
-panic(const char *fmt, ...)
-#else
-panic(fmt, va_alist)
- char *fmt;
-#endif
-{
- int bootopt;
- va_list ap;
-
- bootopt = RB_AUTOBOOT | RB_DUMP;
- if (panicstr)
- bootopt |= RB_NOSYNC;
- else
- panicstr = fmt;
-
- va_start(ap, fmt);
- printf("panic: %r\n", fmt, ap);
- va_end(ap);
-
-#ifdef KGDB
- kgdb_panic();
-#endif
-#ifdef KADB
- if (boothowto & RB_KDB)
- kdbpanic();
-#endif
- boot(bootopt);
-}
+static int consintr = 1; /* Ok to handle console interrupts? */
/*
* Warn that a system table is full.
@@ -142,19 +89,17 @@ tablefull(tab)
* the queue does not clear in a reasonable time.
*/
void
-#ifdef __STDC__
uprintf(const char *fmt, ...)
-#else
-uprintf(fmt, va_alist)
- char *fmt;
-#endif
{
- register struct proc *p = curproc;
+ struct proc *p = curproc;
va_list ap;
+ struct putchar_arg pca;
if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) {
va_start(ap, fmt);
- kprintf(fmt, TOTTY, p->p_session->s_ttyp, ap);
+ pca.tty = p->p_session->s_ttyp;
+ pca.flags = TOTTY;
+ kvprintf(fmt, putchar, &pca, 10, ap);
va_end(ap);
}
}
@@ -185,18 +130,13 @@ tprintf_close(sess)
* with the given session.
*/
void
-#ifdef __STDC__
tprintf(tpr_t tpr, const char *fmt, ...)
-#else
-tprintf(tpr, fmt, va_alist)
- tpr_t tpr;
- char *fmt;
-#endif
{
register struct session *sess = (struct session *)tpr;
struct tty *tp = NULL;
int flags = TOLOG;
va_list ap;
+ struct putchar_arg pca;
logpri(LOG_INFO);
if (sess && sess->s_ttyvp && ttycheckoutq(sess->s_ttyp, 0)) {
@@ -204,7 +144,9 @@ tprintf(tpr, fmt, va_alist)
tp = sess->s_ttyp;
}
va_start(ap, fmt);
- kprintf(fmt, flags, tp, ap);
+ pca.tty = tp;
+ pca.flags = flags;
+ kvprintf(fmt, putchar, &pca, 10, ap);
va_end(ap);
logwakeup();
}
@@ -215,18 +157,14 @@ tprintf(tpr, fmt, va_alist)
* be revoke(2)'d away. Other callers should use tprintf.
*/
void
-#ifdef __STDC__
ttyprintf(struct tty *tp, const char *fmt, ...)
-#else
-ttyprintf(tp, fmt, va_alist)
- struct tty *tp;
- char *fmt;
-#endif
{
va_list ap;
-
+ struct putchar_arg pca;
va_start(ap, fmt);
- kprintf(fmt, TOTTY, tp, ap);
+ pca.tty = tp;
+ pca.flags = TOTTY;
+ kvprintf(fmt, putchar, &pca, 10, ap);
va_end(ap);
}
@@ -238,13 +176,7 @@ extern int log_open;
* log yet, it writes to the console also.
*/
void
-#ifdef __STDC__
log(int level, const char *fmt, ...)
-#else
-log(level, fmt, va_alist)
- int level;
- char *fmt;
-#endif
{
register int s;
va_list ap;
@@ -252,73 +184,157 @@ log(level, fmt, va_alist)
s = splhigh();
logpri(level);
va_start(ap, fmt);
- kprintf(fmt, TOLOG, NULL, ap);
- splx(s);
+
+ kvprintf(fmt, msglogchar, NULL, 10, ap);
va_end(ap);
+
+ splx(s);
if (!log_open) {
+ struct putchar_arg pca;
va_start(ap, fmt);
- kprintf(fmt, TOCONS, NULL, ap);
+ pca.tty = NULL;
+ pca.flags = TOCONS;
+ kvprintf(fmt, putchar, &pca, 10, ap);
va_end(ap);
}
logwakeup();
}
-void
+static void
logpri(level)
int level;
{
- register int ch;
register char *p;
- putchar('<', TOLOG, NULL);
- for (p = ksprintn((u_long)level, 10, NULL); ch = *p--;)
- putchar(ch, TOLOG, NULL);
- putchar('>', TOLOG, NULL);
+ msglogchar('<', NULL);
+ for (p = ksprintn((u_long)level, 10, NULL); *p;)
+ msglogchar(*p--, NULL);
+ msglogchar('>', NULL);
}
-void
-#ifdef __STDC__
+int
addlog(const char *fmt, ...)
-#else
-addlog(fmt, va_alist)
- char *fmt;
-#endif
{
register int s;
va_list ap;
+ int retval;
s = splhigh();
va_start(ap, fmt);
- kprintf(fmt, TOLOG, NULL, ap);
+ retval = kvprintf(fmt, msglogchar, NULL, 10, ap);
splx(s);
va_end(ap);
if (!log_open) {
+ struct putchar_arg pca;
va_start(ap, fmt);
- kprintf(fmt, TOCONS, NULL, ap);
+ pca.tty = NULL;
+ pca.flags = TOCONS;
+ kvprintf(fmt, putchar, &pca, 10, ap);
va_end(ap);
}
logwakeup();
+ return (retval);
}
-void
-#ifdef __STDC__
+int
printf(const char *fmt, ...)
-#else
-printf(fmt, va_alist)
- char *fmt;
-#endif
{
va_list ap;
register int savintr;
+ struct putchar_arg pca;
+ int retval;
savintr = consintr; /* disable interrupts */
consintr = 0;
va_start(ap, fmt);
- kprintf(fmt, TOCONS | TOLOG, NULL, ap);
+ pca.tty = NULL;
+ pca.flags = TOCONS | TOLOG;
+ retval = kvprintf(fmt, putchar, &pca, 10, ap);
va_end(ap);
if (!panicstr)
logwakeup();
consintr = savintr; /* reenable interrupts */
+ return retval;
+}
+
+void
+vprintf(const char *fmt, va_list ap)
+{
+ register int savintr;
+ struct putchar_arg pca;
+
+ savintr = consintr; /* disable interrupts */
+ consintr = 0;
+ pca.tty = NULL;
+ pca.flags = TOCONS | TOLOG;
+ kvprintf(fmt, putchar, &pca, 10, ap);
+ if (!panicstr)
+ logwakeup();
+ consintr = savintr; /* reenable interrupts */
+}
+
+/*
+ * Print a character on console or user's terminal. If destination is
+ * the console then the last MSGBUFS characters are saved in msgbuf for
+ * inspection later.
+ */
+static void
+putchar(int c, void *arg)
+{
+ struct putchar_arg *ap = (struct putchar_arg*) arg;
+ int flags = ap->flags;
+ struct tty *tp = ap->tty;
+ if (panicstr)
+ constty = NULL;
+ if ((flags & TOCONS) && tp == NULL && constty) {
+ tp = constty;
+ flags |= TOTTY;
+ }
+ if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 &&
+ (flags & TOCONS) && tp == constty)
+ constty = NULL;
+ if ((flags & TOLOG))
+ msglogchar(c, NULL);
+ if ((flags & TOCONS) && constty == NULL && c != '\0')
+ (*v_putc)(c);
+}
+
+/*
+ * Scaled down version of sprintf(3).
+ */
+int
+sprintf(char *buf, const char *cfmt, ...)
+{
+ int retval;
+ va_list ap;
+
+ va_start(ap, cfmt);
+ retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap);
+ buf[retval] = '\0';
+ va_end(ap);
+ return retval;
+}
+
+/*
+ * Put a number (base <= 16) in a buffer in reverse order; return an
+ * optional length and a pointer to the NULL terminated (preceded?)
+ * buffer.
+ */
+static char *
+ksprintn(ul, base, lenp)
+ register u_long ul;
+ register int base, *lenp;
+{ /* A long in base 8, plus NULL. */
+ static char buf[sizeof(long) * NBBY / 3 + 2];
+ register char *p;
+
+ p = buf;
+ do {
+ *++p = hex2ascii(ul % base);
+ } while (ul /= base);
+ if (lenp)
+ *lenp = p - buf;
+ return (p);
}
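
[Aside; illustration only, not part of the commit: ksprintn() above
stores the digits in reverse order and returns a pointer to the most
significant one, so callers emit them with "while (*p) PCHAR(*p--)".
This user-level sketch replaces hex2ascii() with a table lookup.]

    #include <stdio.h>

    int
    main(void)
    {
        static char buf[16];            /* buf[0] stays '\0' */
        char *p = buf;
        unsigned long ul = 0xabc;       /* 2748 decimal */

        do {
            *++p = "0123456789abcdef"[ul % 16];
        } while (ul /= 16);
        while (*p)                      /* prints "abc" */
            putchar(*p--);
        putchar('\n');
        return (0);
    }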
/*
@@ -337,110 +353,178 @@ printf(fmt, va_alist)
* the next characters (up to a control character, i.e. a character <= 32),
* give the name of the register. Thus:
*
- * kprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n");
+ * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n");
*
* would produce output:
*
* reg=3<BITTWO,BITONE>
*
- * The format %r passes an additional format string and argument list
- * recursively. Its usage is:
- *
- * fn(char *fmt, ...)
- * {
- * va_list ap;
- * va_start(ap, fmt);
- * printf("prefix: %r: suffix\n", fmt, ap);
- * va_end(ap);
- * }
- *
- * Space or zero padding and a field width are supported for the numeric
- * formats only.
+ * XXX: %D -- Hexdump, takes pointer and separator string:
+ * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX
+ *	("%*D", len, ptr, " ") -> XX XX XX XX ...
*/
-void
-kprintf(fmt, flags, tp, ap)
- register const char *fmt;
- int flags;
- struct tty *tp;
- va_list ap;
+int
+kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap)
{
- register char *p, *q;
- register int ch, n;
+#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; }
+ char *p, *q, *d;
+ u_char *up;
+ int ch, n;
u_long ul;
- int base, lflag, tmp, width;
+ int base, lflag, tmp, width, ladjust, sharpflag, neg, sign, dot;
+ int dwidth;
char padc;
+ int retval = 0;
+
+ if (!func)
+ d = (char *) arg;
+ else
+ d = NULL;
+
+ if (fmt == NULL)
+ fmt = "(fmt null)\n";
+
+ if (radix < 2 || radix > 36)
+ radix = 10;
for (;;) {
padc = ' ';
width = 0;
- while ((ch = *(u_char *)fmt++) != '%') {
- if (ch == '\0')
- return;
- putchar(ch, flags, tp);
+ while ((ch = (u_char)*fmt++) != '%') {
+ if (ch == '\0')
+ return retval;
+ PCHAR(ch);
}
- lflag = 0;
-reswitch: switch (ch = *(u_char *)fmt++) {
- case '0':
- padc = '0';
+ lflag = 0; ladjust = 0; sharpflag = 0; neg = 0;
+ sign = 0; dot = 0; dwidth = 0;
+reswitch: switch (ch = (u_char)*fmt++) {
+ case '.':
+ dot = 1;
goto reswitch;
- case '1': case '2': case '3': case '4':
- case '5': case '6': case '7': case '8': case '9':
- for (width = 0;; ++fmt) {
- width = width * 10 + ch - '0';
- ch = *fmt;
- if (ch < '0' || ch > '9')
- break;
+ case '#':
+ sharpflag = 1;
+ goto reswitch;
+ case '+':
+ sign = 1;
+ goto reswitch;
+ case '-':
+ ladjust = 1;
+ goto reswitch;
+ case '%':
+ PCHAR(ch);
+ break;
+ case '*':
+ if (!dot) {
+ width = va_arg(ap, int);
+ if (width < 0) {
+ ladjust = !ladjust;
+ width = -width;
+ }
+ } else {
+ dwidth = va_arg(ap, int);
}
goto reswitch;
- case 'l':
- lflag = 1;
+ case '0':
+ if (!dot) {
+ padc = '0';
+ goto reswitch;
+ }
+ case '1': case '2': case '3': case '4':
+ case '5': case '6': case '7': case '8': case '9':
+ for (n = 0;; ++fmt) {
+ n = n * 10 + ch - '0';
+ ch = *fmt;
+ if (ch < '0' || ch > '9')
+ break;
+ }
+ if (dot)
+ dwidth = n;
+ else
+ width = n;
goto reswitch;
case 'b':
ul = va_arg(ap, int);
p = va_arg(ap, char *);
- for (q = ksprintn(ul, *p++, NULL); ch = *q--;)
- putchar(ch, flags, tp);
+ for (q = ksprintn(ul, *p++, NULL); *q;)
+ PCHAR(*q--);
if (!ul)
break;
- for (tmp = 0; n = *p++;) {
+ for (tmp = 0; *p;) {
+ n = *p++;
if (ul & (1 << (n - 1))) {
- putchar(tmp ? ',' : '<', flags, tp);
+ PCHAR(tmp ? ',' : '<');
for (; (n = *p) > ' '; ++p)
- putchar(n, flags, tp);
+ PCHAR(n);
tmp = 1;
} else
for (; *p > ' '; ++p)
continue;
}
if (tmp)
- putchar('>', flags, tp);
+ PCHAR('>');
break;
case 'c':
- putchar(va_arg(ap, int), flags, tp);
- break;
- case 'r':
- p = va_arg(ap, char *);
- kprintf(p, flags, tp, va_arg(ap, va_list));
+ PCHAR(va_arg(ap, int));
break;
- case 's':
+ case 'D':
+ up = va_arg(ap, u_char *);
p = va_arg(ap, char *);
- while (ch = *p++)
- putchar(ch, flags, tp);
+ if (!width)
+ width = 16;
+			while (width--) {
+ PCHAR(hex2ascii(*up >> 4));
+ PCHAR(hex2ascii(*up & 0x0f));
+ up++;
+ if (width)
+					for (q = p; *q; q++)
+ PCHAR(*q);
+ }
break;
case 'd':
ul = lflag ? va_arg(ap, long) : va_arg(ap, int);
- if ((long)ul < 0) {
- putchar('-', flags, tp);
- ul = -(long)ul;
- }
+ sign = 1;
base = 10;
goto number;
+ case 'l':
+ lflag = 1;
+ goto reswitch;
+ case 'n':
+ ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
+ base = radix;
+ goto number;
case 'o':
ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
base = 8;
goto number;
+ case 'p':
+ ul = (u_long)va_arg(ap, void *);
+ base = 16;
+ PCHAR('0');
+ PCHAR('x');
+ goto number;
+ case 's':
+ p = va_arg(ap, char *);
+ if (p == NULL)
+ p = "(null)";
+ if (!dot)
+ n = strlen (p);
+ else
+ for (n = 0; n < dwidth && p[n]; n++)
+ continue;
+
+ width -= n;
+
+ if (!ladjust && width > 0)
+ while (width--)
+ PCHAR(padc);
+ while (n--)
+ PCHAR(*p++);
+ if (ladjust && width > 0)
+ while (width--)
+ PCHAR(padc);
+ break;
case 'u':
ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
base = 10;
@@ -448,56 +532,71 @@ reswitch: switch (ch = *(u_char *)fmt++) {
case 'x':
ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
base = 16;
-number: p = ksprintn(ul, base, &tmp);
- if (width && (width -= tmp) > 0)
+number: if (sign && (long)ul < 0L) {
+ neg = 1;
+ ul = -(long)ul;
+ }
+ p = ksprintn(ul, base, &tmp);
+ if (sharpflag && ul != 0) {
+ if (base == 8)
+ tmp++;
+ else if (base == 16)
+ tmp += 2;
+ }
+ if (neg)
+ tmp++;
+
+ if (!ladjust && width && (width -= tmp) > 0)
+ while (width--)
+ PCHAR(padc);
+ if (neg)
+ PCHAR('-');
+ if (sharpflag && ul != 0) {
+ if (base == 8) {
+ PCHAR('0');
+ } else if (base == 16) {
+ PCHAR('0');
+ PCHAR('x');
+ }
+ }
+
+ while (*p)
+ PCHAR(*p--);
+
+ if (ladjust && width && (width -= tmp) > 0)
while (width--)
- putchar(padc, flags, tp);
- while (ch = *p--)
- putchar(ch, flags, tp);
+ PCHAR(padc);
+
break;
default:
- putchar('%', flags, tp);
+ PCHAR('%');
if (lflag)
- putchar('l', flags, tp);
- /* FALLTHROUGH */
- case '%':
- putchar(ch, flags, tp);
+ PCHAR('l');
+ PCHAR(ch);
+ break;
}
}
+#undef PCHAR
}
/*
- * Print a character on console or users terminal. If destination is
- * the console then the last MSGBUFS characters are saved in msgbuf for
- * inspection later.
+ * Put character in log buffer.
*/
static void
-putchar(c, flags, tp)
- register int c;
- int flags;
- struct tty *tp;
+msglogchar(int c, void *dummyarg)
{
- extern int msgbufmapped;
- register struct msgbuf *mbp;
+ struct msgbuf *mbp;
- if (panicstr)
- constty = NULL;
- if ((flags & TOCONS) && tp == NULL && constty) {
- tp = constty;
- flags |= TOTTY;
- }
- if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 &&
- (flags & TOCONS) && tp == constty)
- constty = NULL;
- if ((flags & TOLOG) &&
- c != '\0' && c != '\r' && c != 0177 && msgbufmapped) {
+ if (c != '\0' && c != '\r' && c != 0177 && msgbufmapped) {
mbp = msgbufp;
- if (mbp->msg_magic != MSG_MAGIC) {
- bzero((caddr_t)mbp, sizeof(*mbp));
+ if (mbp->msg_magic != MSG_MAGIC ||
+ mbp->msg_bufx >= MSG_BSIZE ||
+ mbp->msg_bufr >= MSG_BSIZE) {
+ bzero(mbp, sizeof(struct msgbuf));
mbp->msg_magic = MSG_MAGIC;
}
mbp->msg_bufc[mbp->msg_bufx++] = c;
- if (mbp->msg_bufx < 0 || mbp->msg_bufx >= MSG_BSIZE)
+ if (mbp->msg_bufx >= MSG_BSIZE)
mbp->msg_bufx = 0;
/* If the buffer is full, keep the most recent data. */
if (mbp->msg_bufr == mbp->msg_bufx) {
@@ -505,102 +604,4 @@ putchar(c, flags, tp)
mbp->msg_bufr = 0;
}
}
- if ((flags & TOCONS) && constty == NULL && c != '\0')
- (*v_putc)(c);
-}
-
-/*
- * Scaled down version of sprintf(3).
- */
-#ifdef __STDC__
-sprintf(char *buf, const char *cfmt, ...)
-#else
-sprintf(buf, cfmt, va_alist)
- char *buf, *cfmt;
-#endif
-{
- register const char *fmt = cfmt;
- register char *p, *bp;
- register int ch, base;
- u_long ul;
- int lflag;
- va_list ap;
-
- va_start(ap, cfmt);
- for (bp = buf; ; ) {
- while ((ch = *(u_char *)fmt++) != '%')
- if ((*bp++ = ch) == '\0')
- return ((bp - buf) - 1);
-
- lflag = 0;
-reswitch: switch (ch = *(u_char *)fmt++) {
- case 'l':
- lflag = 1;
- goto reswitch;
- case 'c':
- *bp++ = va_arg(ap, int);
- break;
- case 's':
- p = va_arg(ap, char *);
- while (*bp++ = *p++)
- continue;
- --bp;
- break;
- case 'd':
- ul = lflag ? va_arg(ap, long) : va_arg(ap, int);
- if ((long)ul < 0) {
- *bp++ = '-';
- ul = -(long)ul;
- }
- base = 10;
- goto number;
- break;
- case 'o':
- ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
- base = 8;
- goto number;
- break;
- case 'u':
- ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
- base = 10;
- goto number;
- break;
- case 'x':
- ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int);
- base = 16;
-number: for (p = ksprintn(ul, base, NULL); ch = *p--;)
- *bp++ = ch;
- break;
- default:
- *bp++ = '%';
- if (lflag)
- *bp++ = 'l';
- /* FALLTHROUGH */
- case '%':
- *bp++ = ch;
- }
- }
- va_end(ap);
-}
-
-/*
- * Put a number (base <= 16) in a buffer in reverse order; return an
- * optional length and a pointer to the NULL terminated (preceded?)
- * buffer.
- */
-static char *
-ksprintn(ul, base, lenp)
- register u_long ul;
- register int base, *lenp;
-{ /* A long in base 8, plus NULL. */
- static char buf[sizeof(long) * NBBY / 3 + 2];
- register char *p;
-
- p = buf;
- do {
- *++p = "0123456789abcdef"[ul % base];
- } while (ul /= base);
- if (lenp)
- *lenp = p - buf;
- return (p);
}
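
[Aside; illustration only, not part of the commit: a user-level
reduction of the %b handler in the new kvprintf() above, showing how the
bit-name string "\10\2BITTWO\1BITONE" decodes the value 3 into
"3<BITTWO,BITONE>". The demo hardcodes octal output since the leading
'\10' byte selects base 8 for the number.]

    #include <stdio.h>

    int
    main(void)
    {
        unsigned long ul = 3;
        const char *p = "\10\2BITTWO\1BITONE";
        int n, tmp = 0;

        p++;                        /* skip base byte; '\10' = base 8 */
        printf("%lo", ul);          /* the numeric part: "3" */
        while ((n = *p++) != '\0') {
            if (ul & (1 << (n - 1))) {
                putchar(tmp ? ',' : '<');
                for (; *p > ' '; p++)   /* bit is set: print its name */
                    putchar(*p);
                tmp = 1;
            } else
                for (; *p > ' '; p++)   /* bit clear: skip the name */
                    continue;
        }
        if (tmp)
            putchar('>');
        putchar('\n');              /* prints "3<BITTWO,BITONE>" */
        return (0);
    }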
diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c
index 237553d..08ba35f 100644
--- a/sys/kern/subr_prof.c
+++ b/sys/kern/subr_prof.c
@@ -30,17 +30,17 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)subr_prof.c 8.4 (Berkeley) 2/14/95
+ * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93
+ * $Id$
*/
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/sysproto.h>
#include <sys/kernel.h>
#include <sys/proc.h>
-#include <sys/user.h>
-
-#include <sys/mount.h>
-#include <sys/syscallargs.h>
+#include <sys/resourcevar.h>
+#include <sys/sysctl.h>
#include <machine/cpu.h>
@@ -48,26 +48,57 @@
#include <sys/malloc.h>
#include <sys/gmon.h>
-/*
- * Froms is actually a bunch of unsigned shorts indexing tos
- */
+static void kmstartup __P((void *));
+SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL)
+
struct gmonparam _gmonparam = { GMON_PROF_OFF };
+extern char btext[];
extern char etext[];
+#ifdef GUPROF
+void
+nullfunc_loop_profiled()
+{
+ int i;
+
+ for (i = 0; i < CALIB_SCALE; i++)
+ nullfunc_profiled();
+}
+
+#define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */
+
void
-kmstartup()
+nullfunc_profiled()
+{
+}
+#endif /* GUPROF */
+
+static void
+kmstartup(dummy)
+ void *dummy;
{
char *cp;
struct gmonparam *p = &_gmonparam;
+#ifdef GUPROF
+ int cputime_overhead;
+ int empty_loop_time;
+ int i;
+ int mcount_overhead;
+ int mexitcount_overhead;
+ int nullfunc_loop_overhead;
+ int nullfunc_loop_profiled_time;
+ fptrint_t tmp_addr;
+#endif
+
/*
* Round lowpc and highpc to multiples of the density we're using
* so the rest of the scaling (here and in gprof) stays in ints.
*/
- p->lowpc = ROUNDDOWN(KERNBASE, HISTFRACTION * sizeof(HISTCOUNTER));
+ p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER));
p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER));
p->textsize = p->highpc - p->lowpc;
- printf("Profiling kernel, textsize=%d [%x..%x]\n",
+	printf("Profiling kernel, textsize=%lu [%lx..%lx]\n",
p->textsize, p->lowpc, p->highpc);
p->kcountsize = p->textsize / HISTFRACTION;
p->hashfraction = HASHFRACTION;
@@ -87,25 +118,168 @@ kmstartup()
bzero(cp, p->kcountsize + p->tossize + p->fromssize);
p->tos = (struct tostruct *)cp;
cp += p->tossize;
- p->kcount = (u_short *)cp;
+ p->kcount = (HISTCOUNTER *)cp;
cp += p->kcountsize;
p->froms = (u_short *)cp;
+
+#ifdef GUPROF
+ /* Initialize pointers to overhead counters. */
+ p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime));
+ p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount));
+ p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount));
+
+ /*
+ * Disable interrupts to avoid interference while we calibrate
+ * things.
+ */
+ disable_intr();
+
+ /*
+ * Determine overheads.
+ * XXX this needs to be repeated for each useful timer/counter.
+ */
+ cputime_overhead = 0;
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+ cputime_overhead += cputime();
+
+ empty_loop();
+ startguprof(p);
+ empty_loop();
+ empty_loop_time = cputime();
+
+ nullfunc_loop_profiled();
+
+ /*
+ * Start profiling. There won't be any normal function calls since
+ * interrupts are disabled, but we will call the profiling routines
+ * directly to determine their overheads.
+ */
+ p->state = GMON_PROF_HIRES;
+
+ startguprof(p);
+ nullfunc_loop_profiled();
+
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+#if defined(i386) && __GNUC__ >= 2
+ asm("pushl %0; call __mcount; popl %%ecx"
+ :
+ : "i" (profil)
+ : "ax", "bx", "cx", "dx", "memory");
+#else
+#error
+#endif
+ mcount_overhead = KCOUNT(p, PC_TO_I(p, profil));
+
+ startguprof(p);
+ for (i = 0; i < CALIB_SCALE; i++)
+#if defined(i386) && __GNUC__ >= 2
+ asm("call mexitcount; 1:"
+ : : : "ax", "bx", "cx", "dx", "memory");
+ asm("movl $1b,%0" : "=rm" (tmp_addr));
+#else
+#error
+#endif
+ mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr));
+
+ p->state = GMON_PROF_OFF;
+ stopguprof(p);
+
+ enable_intr();
+
+ nullfunc_loop_profiled_time = 0;
+ for (tmp_addr = (fptrint_t)nullfunc_loop_profiled;
+ tmp_addr < (fptrint_t)nullfunc_loop_profiled_end;
+ tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER))
+ nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr));
+#define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE)
+#define c2n(count, freq) ((int)((count) * 1000000000LL / freq))
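+	/*
+	 * c2n() converts a tick count at `freq' ticks/sec to nanoseconds;
+	 * e.g. 200 ticks at a profrate of 100000000 Hz is
+	 * 200 * 1000000000 / 100000000 = 2000 nsec.  CALIB_DOSCALE() then
+	 * reduces a CALIB_SCALE-iteration total to a per-call figure.
+	 */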
+ printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n",
+ CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)),
+ CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)),
+ CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)),
+ CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)),
+ CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate)));
+ cputime_overhead -= empty_loop_time;
+ mcount_overhead -= empty_loop_time;
+ mexitcount_overhead -= empty_loop_time;
+
+ /*-
+ * Profiling overheads are determined by the times between the
+ * following events:
+ * MC1: mcount() is called
+ * MC2: cputime() (called from mcount()) latches the timer
+ * MC3: mcount() completes
+ * ME1: mexitcount() is called
+ * ME2: cputime() (called from mexitcount()) latches the timer
+ * ME3: mexitcount() completes.
+ * The times between the events vary slightly depending on instruction
+ * combination and cache misses, etc. Attempt to determine the
+ * minimum times. These can be subtracted from the profiling times
+ * without much risk of reducing the profiling times below what they
+ * would be when profiling is not configured. Abbreviate:
+ * ab = minimum time between MC1 and MC3
+	 *	a = minimum time between MC1 and MC2
+ * b = minimum time between MC2 and MC3
+ * cd = minimum time between ME1 and ME3
+ * c = minimum time between ME1 and ME2
+ * d = minimum time between ME2 and ME3.
+ * These satisfy the relations:
+ * ab <= mcount_overhead (just measured)
+ * a + b <= ab
+ * cd <= mexitcount_overhead (just measured)
+ * c + d <= cd
+ * a + d <= nullfunc_loop_profiled_time (just measured)
+ * a >= 0, b >= 0, c >= 0, d >= 0.
+ * Assume that ab and cd are equal to the minimums.
+ */
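+	/*
+	 * The assignments below split each measured overhead into a
+	 * pre-latch and a post-latch part (the *_pre_* and *_post_*
+	 * fields) using the relations above.
+	 */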
+ p->cputime_overhead = CALIB_DOSCALE(cputime_overhead);
+ p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead);
+ p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead
+ - cputime_overhead);
+ nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time;
+ p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead
+ - nullfunc_loop_overhead)
+ / 4);
+ p->mexitcount_pre_overhead = p->mexitcount_overhead
+ + p->cputime_overhead
+ - p->mexitcount_post_overhead;
+ p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead)
+ - p->mexitcount_post_overhead;
+ p->mcount_post_overhead = p->mcount_overhead
+ + p->cputime_overhead
+ - p->mcount_pre_overhead;
+ printf(
+"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n",
+ c2n(p->cputime_overhead, p->profrate),
+ c2n(p->mcount_overhead, p->profrate),
+ c2n(p->mcount_pre_overhead, p->profrate),
+ c2n(p->mcount_post_overhead, p->profrate),
+ c2n(p->cputime_overhead, p->profrate),
+ c2n(p->mexitcount_overhead, p->profrate),
+ c2n(p->mexitcount_pre_overhead, p->profrate),
+ c2n(p->mexitcount_post_overhead, p->profrate));
+ printf(
+"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n",
+ p->cputime_overhead, p->mcount_overhead,
+ p->mcount_pre_overhead, p->mcount_post_overhead,
+ p->cputime_overhead, p->mexitcount_overhead,
+ p->mexitcount_pre_overhead, p->mexitcount_post_overhead);
+#endif /* GUPROF */
}
/*
* Return kernel profiling information.
*/
-int
-sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen, p)
- int *name;
- u_int namelen;
- void *oldp;
- size_t *oldlenp;
- void *newp;
- size_t newlen;
+static int
+sysctl_kern_prof SYSCTL_HANDLER_ARGS
{
+ int *name = (int *) arg1;
+ u_int namelen = arg2;
struct gmonparam *gp = &_gmonparam;
int error;
+ int state;
/* all sysctl names at this level are terminal */
if (namelen != 1)
@@ -113,30 +287,50 @@ sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen, p)
switch (name[0]) {
case GPROF_STATE:
- error = sysctl_int(oldp, oldlenp, newp, newlen, &gp->state);
+ state = gp->state;
+ error = sysctl_handle_int(oidp, &state, 0, req);
if (error)
return (error);
- if (gp->state == GMON_PROF_OFF)
+ if (!req->newptr)
+ return (0);
+ if (state == GMON_PROF_OFF) {
+ gp->state = state;
stopprofclock(&proc0);
- else
+ stopguprof(gp);
+ } else if (state == GMON_PROF_ON) {
+ gp->state = GMON_PROF_OFF;
+ stopguprof(gp);
+ gp->profrate = profhz;
startprofclock(&proc0);
+ gp->state = state;
+#ifdef GUPROF
+ } else if (state == GMON_PROF_HIRES) {
+ gp->state = GMON_PROF_OFF;
+ stopprofclock(&proc0);
+ startguprof(gp);
+ gp->state = state;
+#endif
+ } else if (state != gp->state)
+ return (EINVAL);
return (0);
case GPROF_COUNT:
- return (sysctl_struct(oldp, oldlenp, newp, newlen,
- gp->kcount, gp->kcountsize));
+ return (sysctl_handle_opaque(oidp,
+ gp->kcount, gp->kcountsize, req));
case GPROF_FROMS:
- return (sysctl_struct(oldp, oldlenp, newp, newlen,
- gp->froms, gp->fromssize));
+ return (sysctl_handle_opaque(oidp,
+ gp->froms, gp->fromssize, req));
case GPROF_TOS:
- return (sysctl_struct(oldp, oldlenp, newp, newlen,
- gp->tos, gp->tossize));
+ return (sysctl_handle_opaque(oidp,
+ gp->tos, gp->tossize, req));
case GPROF_GMONPARAM:
- return (sysctl_rdstruct(oldp, oldlenp, newp, gp, sizeof *gp));
+ return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req));
default:
return (EOPNOTSUPP);
}
/* NOTREACHED */
}
+
+SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, "");
#endif /* GPROF */
/*
@@ -145,24 +339,27 @@ sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen, p)
* The scale factor is a fixed point number with 16 bits of fraction, so that
* 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct profil_args {
+ caddr_t samples;
+ u_int size;
+ u_int offset;
+ u_int scale;
+};
+#endif
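+/*
+ * Example (sketch): a scale argument of 0x8000 is the fixed-point
+ * encoding of 0.5; a scale of 0, as checked below, simply stops the
+ * profiling clock for the process.
+ */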
/* ARGSUSED */
int
profil(p, uap, retval)
struct proc *p;
- register struct profil_args /* {
- syscallarg(caddr_t) samples;
- syscallarg(u_int) size;
- syscallarg(u_int) offset;
- syscallarg(u_int) scale;
- } */ *uap;
- register_t *retval;
+ register struct profil_args *uap;
+ int *retval;
{
register struct uprof *upp;
int s;
- if (SCARG(uap, scale) > (1 << 16))
+ if (uap->scale > (1 << 16))
return (EINVAL);
- if (SCARG(uap, scale) == 0) {
+ if (uap->scale == 0) {
stopprofclock(p);
return (0);
}
@@ -170,10 +367,10 @@ profil(p, uap, retval)
/* Block profile interrupts while changing state. */
s = splstatclock();
- upp->pr_off = SCARG(uap, offset);
- upp->pr_scale = SCARG(uap, scale);
- upp->pr_base = SCARG(uap, samples);
- upp->pr_size = SCARG(uap, size);
+ upp->pr_off = uap->offset;
+ upp->pr_scale = uap->scale;
+ upp->pr_base = uap->samples;
+ upp->pr_size = uap->size;
startprofclock(p);
splx(s);
diff --git a/sys/kern/subr_rlist.c b/sys/kern/subr_rlist.c
new file mode 100644
index 0000000..ef29ce3
--- /dev/null
+++ b/sys/kern/subr_rlist.c
@@ -0,0 +1,311 @@
+/*
+ * Copyright (c) 1992 William F. Jolitz, TeleMuse
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This software is a component of "386BSD" developed by
+ William F. Jolitz, TeleMuse.
+ * 4. Neither the name of the developer nor the name "386BSD"
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS A COMPONENT OF 386BSD DEVELOPED BY WILLIAM F. JOLITZ
+ * AND IS INTENDED FOR RESEARCH AND EDUCATIONAL PURPOSES ONLY. THIS
+ * SOFTWARE SHOULD NOT BE CONSIDERED TO BE A COMMERCIAL PRODUCT.
+ * THE DEVELOPER URGES THAT USERS WHO REQUIRE A COMMERCIAL PRODUCT
+ * NOT MAKE USE THIS WORK.
+ *
+ * FOR USERS WHO WISH TO UNDERSTAND THE 386BSD SYSTEM DEVELOPED
+ * BY WILLIAM F. JOLITZ, WE RECOMMEND THE USER STUDY WRITTEN
+ * REFERENCES SUCH AS THE "PORTING UNIX TO THE 386" SERIES
+ * (BEGINNING JANUARY 1991 "DR. DOBBS JOURNAL", USA AND BEGINNING
+ * JUNE 1991 "UNIX MAGAZIN", GERMANY) BY WILLIAM F. JOLITZ AND
+ * LYNNE GREER JOLITZ, AS WELL AS OTHER BOOKS ON UNIX AND THE
+ * ON-LINE 386BSD USER MANUAL BEFORE USE. A BOOK DISCUSSING THE INTERNALS
+ * OF 386BSD ENTITLED "386BSD FROM THE INSIDE OUT" WILL BE AVAILABLE LATE 1992.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE DEVELOPER ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE DEVELOPER BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ * Changes Copyright (C) 1995, David Greenman & John Dyson; This software may
+ * be used, modified, copied, distributed, and sold, in both source and
+ * binary form provided that the above copyright and these terms are
+ * retained. Under no circumstances is the author responsible for the proper
+ * functioning of this software, nor does the author assume any responsibility
+ * for damages incurred with its use.
+ *
+ * $Id$
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/malloc.h>
+#include <sys/rlist.h>
+#include <sys/proc.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+/*
+ * Resource lists.
+ */
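+/*
+ * Example: freeing blocks 10-19 and then 20-29 coalesces into a
+ * single entry [10, 29]; a subsequent rlist_alloc() of size 5
+ * returns start 10 and leaves [15, 29] on the list.
+ */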
+
+#define RLIST_MIN 128
+static int rlist_count=0;
+static struct rlist *rlfree;
+
+static struct rlist *rlist_malloc __P((void));
+
+static struct rlist *
+rlist_malloc()
+{
+ struct rlist *rl;
+ int i;
+	while (rlist_count < RLIST_MIN) {
+		int s = splhigh();
+		rl = (struct rlist *)kmem_alloc(kernel_map, PAGE_SIZE);
+		splx(s);
+		if (!rl)
+			break;
+
+		for (i = 0; i < PAGE_SIZE / sizeof(*rl); i++) {
+			rl->rl_next = rlfree;
+			rlfree = rl;
+			rlist_count++;
+			rl++;
+		}
+	}
+
+	if ((rl = rlfree) == 0)
+ panic("Cannot get an rlist entry");
+
+ --rlist_count;
+ rlfree = rl->rl_next;
+ return rl;
+}
+
+inline static void
+rlist_mfree(struct rlist *rl)
+{
+ rl->rl_next = rlfree;
+ rlfree = rl;
+ ++rlist_count;
+}
+
+void
+rlist_free(rlh, start, end)
+ struct rlisthdr *rlh;
+ u_int start, end;
+{
+ struct rlist **rlp = &rlh->rlh_list;
+ struct rlist *prev_rlp = NULL, *cur_rlp = *rlp, *next_rlp = NULL;
+ int s;
+
+ s = splhigh();
+ while (rlh->rlh_lock & RLH_LOCKED) {
+ rlh->rlh_lock |= RLH_DESIRED;
+ tsleep(rlh, PSWP, "rlistf", 0);
+ }
+ rlh->rlh_lock |= RLH_LOCKED;
+ splx(s);
+
+ /*
+ * Traverse the list looking for an entry after the one we want
+ * to insert.
+ */
+ while (cur_rlp != NULL) {
+ if (start < cur_rlp->rl_start)
+ break;
+#ifdef DIAGNOSTIC
+ if (prev_rlp) {
+ if (prev_rlp->rl_end + 1 == cur_rlp->rl_start)
+ panic("rlist_free: missed coalesce opportunity");
+ if (prev_rlp->rl_end == cur_rlp->rl_start)
+ panic("rlist_free: entries overlap");
+ if (prev_rlp->rl_end > cur_rlp->rl_start)
+ panic("entries out of order");
+ }
+#endif
+ prev_rlp = cur_rlp;
+ cur_rlp = cur_rlp->rl_next;
+ }
+
+ if (cur_rlp != NULL) {
+
+ if (end >= cur_rlp->rl_start)
+ panic("rlist_free: free end overlaps already freed area");
+
+ if (prev_rlp) {
+ if (start <= prev_rlp->rl_end)
+ panic("rlist_free: free start overlaps already freed area");
+ /*
+ * Attempt to append
+ */
+ if (prev_rlp->rl_end + 1 == start) {
+ prev_rlp->rl_end = end;
+ /*
+ * Attempt to prepend and coalesce
+ */
+ if (end + 1 == cur_rlp->rl_start) {
+ prev_rlp->rl_end = cur_rlp->rl_end;
+ prev_rlp->rl_next = cur_rlp->rl_next;
+ rlist_mfree(cur_rlp);
+ }
+ goto done;
+ }
+ }
+ /*
+ * Attempt to prepend
+ */
+ if (end + 1 == cur_rlp->rl_start) {
+ cur_rlp->rl_start = start;
+ goto done;
+ }
+ }
+ /*
+ * Reached the end of the list without finding a larger entry.
+ * Append to last entry if there is one and it's adjacent.
+ */
+ if (prev_rlp) {
+ if (start <= prev_rlp->rl_end)
+ panic("rlist_free: free start overlaps already freed area at list tail");
+ /*
+ * Attempt to append
+ */
+ if (prev_rlp->rl_end + 1 == start) {
+ prev_rlp->rl_end = end;
+ goto done;
+ }
+ }
+
+ /*
+ * Could neither append nor prepend; allocate a new entry.
+ */
+ next_rlp = cur_rlp;
+ cur_rlp = rlist_malloc();
+ cur_rlp->rl_start = start;
+ cur_rlp->rl_end = end;
+ cur_rlp->rl_next = next_rlp;
+ if (prev_rlp) {
+ prev_rlp->rl_next = cur_rlp;
+ } else {
+ /*
+ * No previous - this entry is the new list head.
+ */
+ *rlp = cur_rlp;
+ }
+
+done:
+ rlh->rlh_lock &= ~RLH_LOCKED;
+ if (rlh->rlh_lock & RLH_DESIRED) {
+ wakeup(rlh);
+ rlh->rlh_lock &= ~RLH_DESIRED;
+ }
+ return;
+}
+
+/*
+ * Obtain a region of the desired size from a resource list.
+ * If nothing of that size is available, return 0.  Otherwise,
+ * return 1 and store the start of the allocated region in
+ * "*loc".  (Note: loc may be NULL if the caller does not want
+ * the value.)
+ */
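+/*
+ * Sketch of a caller (the rlisthdr `myhdr' is hypothetical):
+ *
+ *	u_int start;
+ *
+ *	if (!rlist_alloc(&myhdr, 16, &start))
+ *		return (ENOSPC);
+ *	(blocks start through start + 15 now belong to the caller)
+ */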
+int
+rlist_alloc (rlh, size, loc)
+ struct rlisthdr *rlh;
+ unsigned size, *loc;
+{
+ struct rlist **rlp = &rlh->rlh_list;
+ register struct rlist *lp;
+ int s;
+ register struct rlist *olp = 0;
+
+ s = splhigh();
+ while (rlh->rlh_lock & RLH_LOCKED) {
+ rlh->rlh_lock |= RLH_DESIRED;
+ tsleep(rlh, PSWP, "rlistf", 0);
+ }
+ rlh->rlh_lock |= RLH_LOCKED;
+ splx(s);
+
+ /* walk list, allocating first thing that's big enough (first fit) */
+ for (; *rlp; rlp = &((*rlp)->rl_next))
+		if (size <= (*rlp)->rl_end - (*rlp)->rl_start + 1) {
+
+ /* hand it to the caller */
+ if (loc) *loc = (*rlp)->rl_start;
+ (*rlp)->rl_start += size;
+
+ /* did we eat this element entirely? */
+ if ((*rlp)->rl_start > (*rlp)->rl_end) {
+ lp = (*rlp)->rl_next;
+ rlist_mfree(*rlp);
+ /*
+				 * if the deleted element was at the front
+				 * of the list, adjust *rlp; otherwise don't.
+ */
+ if (olp) {
+ olp->rl_next = lp;
+ } else {
+ *rlp = lp;
+ }
+ }
+
+ rlh->rlh_lock &= ~RLH_LOCKED;
+ if (rlh->rlh_lock & RLH_DESIRED) {
+ wakeup(rlh);
+ rlh->rlh_lock &= ~RLH_DESIRED;
+ }
+ return (1);
+ } else {
+ olp = *rlp;
+ }
+
+ rlh->rlh_lock &= ~RLH_LOCKED;
+ if (rlh->rlh_lock & RLH_DESIRED) {
+ wakeup(rlh);
+ rlh->rlh_lock &= ~RLH_DESIRED;
+ }
+ /* nothing in list that's big enough */
+ return (0);
+}
+
+/*
+ * When finished with this resource list, reclaim all of its space
+ * and mark it empty.
+ */
+void
+rlist_destroy (rlh)
+ struct rlisthdr *rlh;
+{
+ struct rlist **rlp = &rlh->rlh_list;
+ struct rlist *lp, *nlp;
+
+ lp = *rlp;
+ *rlp = 0;
+ for (; lp; lp = nlp) {
+ nlp = lp->rl_next;
+ rlist_mfree(lp);
+ }
+}
diff --git a/sys/kern/subr_rmap.c b/sys/kern/subr_rmap.c
deleted file mode 100644
index 2f31173..0000000
--- a/sys/kern/subr_rmap.c
+++ /dev/null
@@ -1,81 +0,0 @@
-/*-
- * Copyright (c) 1982, 1986, 1993
- * The Regents of the University of California. All rights reserved.
- * (c) UNIX System Laboratories, Inc.
- * All or some portions of this file are derived from material licensed
- * to the University of California by American Telephone and Telegraph
- * Co. or Unix System Laboratories, Inc. and are reproduced herein with
- * the permission of UNIX System Laboratories, Inc.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in the
- * documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- * from: @(#)subr_rmap.c 8.1 (Berkeley) 6/10/93
- */
-
-#include <sys/param.h>
-#include <sys/map.h>
-#include <sys/proc.h>
-
-void
-rminit(a1, a2, a3, a4, a5)
- struct map *a1;
- long a2, a3;
- char *a4;
- int a5;
-{
-
- /*
- * Body deleted.
- */
- return;
-}
-
-long
-rmalloc(a1, a2)
- struct map *a1;
- long a2;
-{
-
- /*
- * Body deleted.
- */
- return (0);
-}
-
-void
-rmfree(a1, a2, a3)
- struct map *a1;
- long a2, a3;
-{
-
- /*
- * Body deleted.
- */
- return;
-}
diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c
new file mode 100644
index 0000000..9dca842
--- /dev/null
+++ b/sys/kern/subr_trap.c
@@ -0,0 +1,940 @@
+/*-
+ * Copyright (C) 1994, David Greenman
+ * Copyright (c) 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the University of Utah, and William Jolitz.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)trap.c 7.4 (Berkeley) 5/13/91
+ * $Id$
+ */
+
+/*
+ * 386 Trap and System call handling
+ */
+
+#include "opt_ktrace.h"
+#include "opt_ddb.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/acct.h>
+#include <sys/kernel.h>
+#include <sys/syscall.h>
+#include <sys/sysent.h>
+#include <sys/queue.h>
+#include <sys/vmmeter.h>
+#ifdef KTRACE
+#include <sys/ktrace.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+#include <vm/vm_extern.h>
+
+#include <sys/user.h>
+
+#include <machine/cpu.h>
+#include <machine/md_var.h>
+#include <machine/psl.h>
+#include <machine/reg.h>
+#include <machine/trap.h>
+#include <machine/../isa/isa_device.h>
+
+#ifdef POWERFAIL_NMI
+#include <sys/syslog.h>
+#include <machine/clock.h>
+#endif
+
+#include "isa.h"
+#include "npx.h"
+
+int (*pmath_emulate) __P((struct trapframe *));
+
+extern void trap __P((struct trapframe frame));
+extern int trapwrite __P((unsigned addr));
+extern void syscall __P((struct trapframe frame));
+
+static int trap_pfault __P((struct trapframe *, int));
+static void trap_fatal __P((struct trapframe *));
+void dblfault_handler __P((void));
+
+extern inthand_t IDTVEC(syscall);
+
+#define MAX_TRAP_MSG 28
+static char *trap_msg[] = {
+ "", /* 0 unused */
+ "privileged instruction fault", /* 1 T_PRIVINFLT */
+ "", /* 2 unused */
+ "breakpoint instruction fault", /* 3 T_BPTFLT */
+ "", /* 4 unused */
+ "", /* 5 unused */
+ "arithmetic trap", /* 6 T_ARITHTRAP */
+ "system forced exception", /* 7 T_ASTFLT */
+ "", /* 8 unused */
+ "general protection fault", /* 9 T_PROTFLT */
+ "trace trap", /* 10 T_TRCTRAP */
+ "", /* 11 unused */
+ "page fault", /* 12 T_PAGEFLT */
+ "", /* 13 unused */
+ "alignment fault", /* 14 T_ALIGNFLT */
+ "", /* 15 unused */
+ "", /* 16 unused */
+ "", /* 17 unused */
+ "integer divide fault", /* 18 T_DIVIDE */
+ "non-maskable interrupt trap", /* 19 T_NMI */
+ "overflow trap", /* 20 T_OFLOW */
+ "FPU bounds check fault", /* 21 T_BOUND */
+ "FPU device not available", /* 22 T_DNA */
+ "double fault", /* 23 T_DOUBLEFLT */
+ "FPU operand fetch fault", /* 24 T_FPOPFLT */
+ "invalid TSS fault", /* 25 T_TSSFLT */
+ "segment not present fault", /* 26 T_SEGNPFLT */
+ "stack fault", /* 27 T_STKFLT */
+ "machine check trap", /* 28 T_MCHK */
+};
+
+static void userret __P((struct proc *p, struct trapframe *frame,
+ u_quad_t oticks));
+
+static inline void
+userret(p, frame, oticks)
+ struct proc *p;
+ struct trapframe *frame;
+ u_quad_t oticks;
+{
+ int sig, s;
+
+ while ((sig = CURSIG(p)) != 0)
+ postsig(sig);
+ p->p_priority = p->p_usrpri;
+ if (want_resched) {
+ /*
+ * Since we are curproc, clock will normally just change
+ * our priority without moving us from one queue to another
+		 * (since the running process is not on a queue).
+		 * If that happens after we call setrunqueue() on ourselves
+		 * but before we mi_switch(), we might not be on the queue
+		 * indicated by our priority.
+ */
+ s = splhigh();
+ setrunqueue(p);
+ p->p_stats->p_ru.ru_nivcsw++;
+ mi_switch();
+ splx(s);
+ while ((sig = CURSIG(p)) != 0)
+ postsig(sig);
+ }
+ /*
+ * Charge system time if profiling.
+ */
+ if (p->p_flag & P_PROFIL)
+ addupc_task(p, frame->tf_eip,
+ (u_int)(p->p_sticks - oticks) * psratio);
+
+ curpriority = p->p_priority;
+}
+
+/*
+ * Exception, fault, and trap interface to the FreeBSD kernel.
+ * This common code is called from assembly language IDT gate entry
+ * routines that prepare a suitable stack frame, and restore this
+ * frame after the exception has been processed.
+ */
+
+void
+trap(frame)
+ struct trapframe frame;
+{
+ struct proc *p = curproc;
+ u_quad_t sticks = 0;
+ int i = 0, ucode = 0, type, code;
+#ifdef DEBUG
+ u_long eva;
+#endif
+
+ type = frame.tf_trapno;
+ code = frame.tf_err;
+
+ if (ISPL(frame.tf_cs) == SEL_UPL) {
+ /* user trap */
+
+ sticks = p->p_sticks;
+ p->p_md.md_regs = (int *)&frame;
+
+ switch (type) {
+ case T_PRIVINFLT: /* privileged instruction fault */
+ ucode = type;
+ i = SIGILL;
+ break;
+
+ case T_BPTFLT: /* bpt instruction fault */
+ case T_TRCTRAP: /* trace trap */
+ frame.tf_eflags &= ~PSL_T;
+ i = SIGTRAP;
+ break;
+
+ case T_ARITHTRAP: /* arithmetic trap */
+ ucode = code;
+ i = SIGFPE;
+ break;
+
+ case T_ASTFLT: /* Allow process switch */
+ astoff();
+ cnt.v_soft++;
+ if (p->p_flag & P_OWEUPC) {
+ p->p_flag &= ~P_OWEUPC;
+ addupc_task(p, p->p_stats->p_prof.pr_addr,
+ p->p_stats->p_prof.pr_ticks);
+ }
+ goto out;
+
+ case T_PROTFLT: /* general protection fault */
+ case T_SEGNPFLT: /* segment not present fault */
+ case T_STKFLT: /* stack fault */
+ case T_TSSFLT: /* invalid TSS fault */
+ case T_DOUBLEFLT: /* double fault */
+ default:
+			ucode = code + BUS_SEGM_FAULT;
+ i = SIGBUS;
+ break;
+
+ case T_PAGEFLT: /* page fault */
+ i = trap_pfault(&frame, TRUE);
+ if (i == -1)
+ return;
+ if (i == 0)
+ goto out;
+
+ ucode = T_PAGEFLT;
+ break;
+
+ case T_DIVIDE: /* integer divide fault */
+ ucode = FPE_INTDIV_TRAP;
+ i = SIGFPE;
+ break;
+
+#if NISA > 0
+ case T_NMI:
+#ifdef POWERFAIL_NMI
+ goto handle_powerfail;
+#else /* !POWERFAIL_NMI */
+#ifdef DDB
+ /* NMI can be hooked up to a pushbutton for debugging */
+ printf ("NMI ... going to debugger\n");
+ if (kdb_trap (type, 0, &frame))
+ return;
+#endif /* DDB */
+ /* machine/parity/power fail/"kitchen sink" faults */
+ if (isa_nmi(code) == 0) return;
+ panic("NMI indicates hardware failure");
+#endif /* POWERFAIL_NMI */
+#endif /* NISA > 0 */
+
+ case T_OFLOW: /* integer overflow fault */
+ ucode = FPE_INTOVF_TRAP;
+ i = SIGFPE;
+ break;
+
+ case T_BOUND: /* bounds check fault */
+ ucode = FPE_SUBRNG_TRAP;
+ i = SIGFPE;
+ break;
+
+ case T_DNA:
+#if NNPX > 0
+ /* if a transparent fault (due to context switch "late") */
+ if (npxdna())
+ return;
+#endif
+ if (!pmath_emulate) {
+ i = SIGFPE;
+ ucode = FPE_FPU_NP_TRAP;
+ break;
+ }
+ i = (*pmath_emulate)(&frame);
+ if (i == 0) {
+ if (!(frame.tf_eflags & PSL_T))
+ return;
+ frame.tf_eflags &= ~PSL_T;
+ i = SIGTRAP;
+ }
+ /* else ucode = emulator_only_knows() XXX */
+ break;
+
+ case T_FPOPFLT: /* FPU operand fetch fault */
+ ucode = T_FPOPFLT;
+ i = SIGILL;
+ break;
+ }
+ } else {
+ /* kernel trap */
+
+ switch (type) {
+ case T_PAGEFLT: /* page fault */
+ (void) trap_pfault(&frame, FALSE);
+ return;
+
+ case T_DNA:
+#if NNPX > 0
+ /*
+ * The kernel is apparently using npx for copying.
+ * XXX this should be fatal unless the kernel has
+ * registered such use.
+ */
+ if (npxdna())
+ return;
+#endif
+ break;
+
+ case T_PROTFLT: /* general protection fault */
+ case T_SEGNPFLT: /* segment not present fault */
+ /*
+ * Invalid segment selectors and out of bounds
+ * %eip's and %esp's can be set up in user mode.
+ * This causes a fault in kernel mode when the
+ * kernel tries to return to user mode. We want
+ * to get this fault so that we can fix the
+ * problem here and not have to check all the
+ * selectors and pointers when the user changes
+ * them.
+ */
+#define MAYBE_DORETI_FAULT(where, whereto) \
+ do { \
+ if (frame.tf_eip == (int)where) { \
+ frame.tf_eip = (int)whereto; \
+ return; \
+ } \
+ } while (0)
+
+ if (intr_nesting_level == 0) {
+ MAYBE_DORETI_FAULT(doreti_iret,
+ doreti_iret_fault);
+ MAYBE_DORETI_FAULT(doreti_popl_ds,
+ doreti_popl_ds_fault);
+ MAYBE_DORETI_FAULT(doreti_popl_es,
+ doreti_popl_es_fault);
+ if (curpcb && curpcb->pcb_onfault) {
+ frame.tf_eip = (int)curpcb->pcb_onfault;
+ return;
+ }
+ }
+ break;
+
+ case T_TSSFLT:
+ /*
+ * PSL_NT can be set in user mode and isn't cleared
+ * automatically when the kernel is entered. This
+ * causes a TSS fault when the kernel attempts to
+ * `iret' because the TSS link is uninitialized. We
+ * want to get this fault so that we can fix the
+ * problem here and not every time the kernel is
+ * entered.
+ */
+ if (frame.tf_eflags & PSL_NT) {
+ frame.tf_eflags &= ~PSL_NT;
+ return;
+ }
+ break;
+
+ case T_TRCTRAP: /* trace trap */
+ if (frame.tf_eip == (int)IDTVEC(syscall)) {
+ /*
+ * We've just entered system mode via the
+ * syscall lcall. Continue single stepping
+ * silently until the syscall handler has
+ * saved the flags.
+ */
+ return;
+ }
+ if (frame.tf_eip == (int)IDTVEC(syscall) + 1) {
+ /*
+ * The syscall handler has now saved the
+ * flags. Stop single stepping it.
+ */
+ frame.tf_eflags &= ~PSL_T;
+ return;
+ }
+ /*
+ * Fall through.
+ */
+ case T_BPTFLT:
+ /*
+ * If DDB is enabled, let it handle the debugger trap.
+ * Otherwise, debugger traps "can't happen".
+ */
+#ifdef DDB
+ if (kdb_trap (type, 0, &frame))
+ return;
+#endif
+ break;
+
+#if NISA > 0
+ case T_NMI:
+#ifdef POWERFAIL_NMI
+#ifndef TIMER_FREQ
+# define TIMER_FREQ 1193182
+#endif
+ handle_powerfail:
+ {
+ static unsigned lastalert = 0;
+
+			if (time.tv_sec - lastalert > 10) {
+ log(LOG_WARNING, "NMI: power fail\n");
+ sysbeep(TIMER_FREQ/880, hz);
+ lastalert = time.tv_sec;
+ }
+ return;
+ }
+#else /* !POWERFAIL_NMI */
+#ifdef DDB
+ /* NMI can be hooked up to a pushbutton for debugging */
+ printf ("NMI ... going to debugger\n");
+ if (kdb_trap (type, 0, &frame))
+ return;
+#endif /* DDB */
+ /* machine/parity/power fail/"kitchen sink" faults */
+ if (isa_nmi(code) == 0) return;
+ /* FALL THROUGH */
+#endif /* POWERFAIL_NMI */
+#endif /* NISA > 0 */
+ }
+
+ trap_fatal(&frame);
+ return;
+ }
+
+ trapsignal(p, i, ucode);
+
+#ifdef DEBUG
+ eva = rcr2();
+ if (type <= MAX_TRAP_MSG) {
+ uprintf("fatal process exception: %s",
+ trap_msg[type]);
+ if ((type == T_PAGEFLT) || (type == T_PROTFLT))
+ uprintf(", fault VA = 0x%x", eva);
+ uprintf("\n");
+ }
+#endif
+
+out:
+ userret(p, &frame, sticks);
+}
+
+#ifdef notyet
+/*
+ * This version doesn't allow a page fault to user space while
+ * in the kernel. The rest of the kernel needs to be made "safe"
+ * before this can be used. I think the only things remaining
+ * to be made safe are the iBCS2 code and the process tracing/
+ * debugging code.
+ */
+static int
+trap_pfault(frame, usermode)
+ struct trapframe *frame;
+ int usermode;
+{
+ vm_offset_t va;
+ struct vmspace *vm = NULL;
+ vm_map_t map = 0;
+ int rv = 0;
+ vm_prot_t ftype;
+ int eva;
+ struct proc *p = curproc;
+
+ if (frame->tf_err & PGEX_W)
+ ftype = VM_PROT_READ | VM_PROT_WRITE;
+ else
+ ftype = VM_PROT_READ;
+
+ eva = rcr2();
+ va = trunc_page((vm_offset_t)eva);
+
+ if (va < VM_MIN_KERNEL_ADDRESS) {
+ vm_offset_t v;
+ vm_page_t mpte;
+
+ if (p == NULL ||
+ (!usermode && va < VM_MAXUSER_ADDRESS &&
+ (intr_nesting_level != 0 || curpcb == NULL ||
+ curpcb->pcb_onfault == NULL))) {
+ trap_fatal(frame);
+ return (-1);
+ }
+
+ /*
+ * This is a fault on non-kernel virtual memory.
+ * vm is initialized above to NULL. If curproc is NULL
+ * or curproc->p_vmspace is NULL the fault is fatal.
+ */
+ vm = p->p_vmspace;
+ if (vm == NULL)
+ goto nogo;
+
+ map = &vm->vm_map;
+
+ /*
+ * Keep swapout from messing with us during this
+ * critical time.
+ */
+ ++p->p_lock;
+
+ /*
+ * Grow the stack if necessary
+ */
+ if ((caddr_t)va > vm->vm_maxsaddr
+ && (caddr_t)va < (caddr_t)USRSTACK) {
+ if (!grow(p, va)) {
+ rv = KERN_FAILURE;
+ --p->p_lock;
+ goto nogo;
+ }
+ }
+
+ /* Fault in the user page: */
+ rv = vm_fault(map, va, ftype, FALSE);
+
+ --p->p_lock;
+ } else {
+ /*
+ * Don't allow user-mode faults in kernel address space.
+ */
+ if (usermode)
+ goto nogo;
+
+ /*
+		 * Since we know that kernel virtual addresses
+ * always have pte pages mapped, we just have to fault
+ * the page.
+ */
+ rv = vm_fault(kernel_map, va, ftype, FALSE);
+ }
+
+ if (rv == KERN_SUCCESS)
+ return (0);
+nogo:
+ if (!usermode) {
+ if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) {
+ frame->tf_eip = (int)curpcb->pcb_onfault;
+ return (0);
+ }
+ trap_fatal(frame);
+ return (-1);
+ }
+
+ /* kludge to pass faulting virtual address to sendsig */
+ frame->tf_err = eva;
+
+ return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
+}
+#endif
+
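+/*
+ * (Return convention, as consumed by trap() above: -1 means the
+ * fault was fatal and has been reported, 0 means it was handled,
+ * and any other value is the signal number to post.)
+ */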
+int
+trap_pfault(frame, usermode)
+ struct trapframe *frame;
+ int usermode;
+{
+ vm_offset_t va;
+ struct vmspace *vm = NULL;
+ vm_map_t map = 0;
+ int rv = 0;
+ vm_prot_t ftype;
+ int eva;
+ struct proc *p = curproc;
+
+ eva = rcr2();
+ va = trunc_page((vm_offset_t)eva);
+
+ if (va >= KERNBASE) {
+ /*
+ * Don't allow user-mode faults in kernel address space.
+ */
+ if (usermode)
+ goto nogo;
+
+ map = kernel_map;
+ } else {
+ /*
+ * This is a fault on non-kernel virtual memory.
+ * vm is initialized above to NULL. If curproc is NULL
+ * or curproc->p_vmspace is NULL the fault is fatal.
+ */
+ if (p != NULL)
+ vm = p->p_vmspace;
+
+ if (vm == NULL)
+ goto nogo;
+
+ map = &vm->vm_map;
+ }
+
+ if (frame->tf_err & PGEX_W)
+ ftype = VM_PROT_READ | VM_PROT_WRITE;
+ else
+ ftype = VM_PROT_READ;
+
+ if (map != kernel_map) {
+ /*
+ * Keep swapout from messing with us during this
+ * critical time.
+ */
+ ++p->p_lock;
+
+ /*
+ * Grow the stack if necessary
+ */
+ if ((caddr_t)va > vm->vm_maxsaddr
+ && (caddr_t)va < (caddr_t)USRSTACK) {
+ if (!grow(p, va)) {
+ rv = KERN_FAILURE;
+ --p->p_lock;
+ goto nogo;
+ }
+ }
+
+ /* Fault in the user page: */
+ rv = vm_fault(map, va, ftype, FALSE);
+
+ --p->p_lock;
+ } else {
+ /*
+		 * Since we know that kernel virtual addresses
+ * always have pte pages mapped, we just have to fault
+ * the page.
+ */
+ rv = vm_fault(map, va, ftype, FALSE);
+ }
+
+ if (rv == KERN_SUCCESS)
+ return (0);
+nogo:
+ if (!usermode) {
+ if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) {
+ frame->tf_eip = (int)curpcb->pcb_onfault;
+ return (0);
+ }
+ trap_fatal(frame);
+ return (-1);
+ }
+
+ /* kludge to pass faulting virtual address to sendsig */
+ frame->tf_err = eva;
+
+ return((rv == KERN_PROTECTION_FAILURE) ? SIGBUS : SIGSEGV);
+}
+
+static void
+trap_fatal(frame)
+ struct trapframe *frame;
+{
+ int code, type, eva, ss, esp;
+ struct soft_segment_descriptor softseg;
+
+ code = frame->tf_err;
+ type = frame->tf_trapno;
+ eva = rcr2();
+ sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg);
+
+ if (type <= MAX_TRAP_MSG)
+ printf("\n\nFatal trap %d: %s while in %s mode\n",
+ type, trap_msg[type],
+ ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel");
+ if (type == T_PAGEFLT) {
+ printf("fault virtual address = 0x%x\n", eva);
+ printf("fault code = %s %s, %s\n",
+ code & PGEX_U ? "user" : "supervisor",
+ code & PGEX_W ? "write" : "read",
+ code & PGEX_P ? "protection violation" : "page not present");
+ }
+ printf("instruction pointer = 0x%x:0x%x\n",
+ frame->tf_cs & 0xffff, frame->tf_eip);
+ if (ISPL(frame->tf_cs) == SEL_UPL) {
+ ss = frame->tf_ss & 0xffff;
+ esp = frame->tf_esp;
+ } else {
+ ss = GSEL(GDATA_SEL, SEL_KPL);
+ esp = (int)&frame->tf_esp;
+ }
+ printf("stack pointer = 0x%x:0x%x\n", ss, esp);
+ printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp);
+ printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n",
+ softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type);
+ printf(" = DPL %d, pres %d, def32 %d, gran %d\n",
+ softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32,
+ softseg.ssd_gran);
+ printf("processor eflags = ");
+ if (frame->tf_eflags & PSL_T)
+ printf("trace trap, ");
+ if (frame->tf_eflags & PSL_I)
+ printf("interrupt enabled, ");
+ if (frame->tf_eflags & PSL_NT)
+ printf("nested task, ");
+ if (frame->tf_eflags & PSL_RF)
+ printf("resume, ");
+ if (frame->tf_eflags & PSL_VM)
+ printf("vm86, ");
+ printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12);
+ printf("current process = ");
+ if (curproc) {
+ printf("%lu (%s)\n",
+ (u_long)curproc->p_pid, curproc->p_comm ?
+ curproc->p_comm : "");
+ } else {
+ printf("Idle\n");
+ }
+ printf("interrupt mask = ");
+ if ((cpl & net_imask) == net_imask)
+ printf("net ");
+ if ((cpl & tty_imask) == tty_imask)
+ printf("tty ");
+ if ((cpl & bio_imask) == bio_imask)
+ printf("bio ");
+ if (cpl == 0)
+ printf("none");
+ printf("\n");
+
+#ifdef KDB
+ if (kdb_trap(&psl))
+ return;
+#endif
+#ifdef DDB
+ if (kdb_trap (type, 0, frame))
+ return;
+#endif
+ if (type <= MAX_TRAP_MSG)
+ panic(trap_msg[type]);
+ else
+ panic("unknown/reserved trap");
+}
+
+/*
+ * Double fault handler. Called when a fault occurs while writing
+ * a frame for a trap/exception onto the stack. This usually occurs
+ * when the stack overflows (such is the case with infinite recursion,
+ * for example).
+ *
+ * XXX Note that the current PTD gets replaced by IdlePTD when the
+ * task switch occurs. This means that the stack that was active at
+ * the time of the double fault is not available at <kstack> unless
+ * the machine was idle when the double fault occurred. The downside
+ * of this is that "trace <ebp>" in ddb won't work.
+ */
+void
+dblfault_handler()
+{
+ struct pcb *pcb = curpcb;
+
+ if (pcb != NULL) {
+ printf("\nFatal double fault:\n");
+ printf("eip = 0x%x\n", pcb->pcb_tss.tss_eip);
+ printf("esp = 0x%x\n", pcb->pcb_tss.tss_esp);
+ printf("ebp = 0x%x\n", pcb->pcb_tss.tss_ebp);
+ }
+
+ panic("double fault");
+}
+
+/*
+ * Compensate for 386 brain damage (missing URKR).
+ * This is a little simpler than the pagefault handler in trap() because
+ * the page tables have already been faulted in and high addresses
+ * are thrown out early for other reasons.
+ */
+int
+trapwrite(addr)
+ unsigned addr;
+{
+ struct proc *p;
+ vm_offset_t va;
+ struct vmspace *vm;
+ int rv;
+
+ va = trunc_page((vm_offset_t)addr);
+ /*
+ * XXX - MAX is END. Changed > to >= for temp. fix.
+ */
+ if (va >= VM_MAXUSER_ADDRESS)
+ return (1);
+
+ p = curproc;
+ vm = p->p_vmspace;
+
+ ++p->p_lock;
+
+ if ((caddr_t)va >= vm->vm_maxsaddr
+ && (caddr_t)va < (caddr_t)USRSTACK) {
+ if (!grow(p, va)) {
+ --p->p_lock;
+ return (1);
+ }
+ }
+
+ /*
+ * fault the data page
+ */
+ rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, FALSE);
+
+ --p->p_lock;
+
+ if (rv != KERN_SUCCESS)
+ return 1;
+
+ return (0);
+}
+
+/*
+ * System call request from POSIX system call gate interface to kernel.
+ * Like trap(), argument is call by reference.
+ */
+void
+syscall(frame)
+ struct trapframe frame;
+{
+ caddr_t params;
+ int i;
+ struct sysent *callp;
+ struct proc *p = curproc;
+ u_quad_t sticks;
+ int error;
+ int args[8], rval[2];
+ u_int code;
+
+ sticks = p->p_sticks;
+ if (ISPL(frame.tf_cs) != SEL_UPL)
+ panic("syscall");
+
+ p->p_md.md_regs = (int *)&frame;
+ params = (caddr_t)frame.tf_esp + sizeof(int);
+ code = frame.tf_eax;
+ if (p->p_sysent->sv_prepsyscall) {
+ (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, &params);
+ } else {
+ /*
+ * Need to check if this is a 32 bit or 64 bit syscall.
+ */
+ if (code == SYS_syscall) {
+ /*
+ * Code is first argument, followed by actual args.
+ */
+ code = fuword(params);
+ params += sizeof(int);
+ } else if (code == SYS___syscall) {
+ /*
+ * Like syscall, but code is a quad, so as to maintain
+ * quad alignment for the rest of the arguments.
+ */
+ code = fuword(params);
+ params += sizeof(quad_t);
+ }
+ }
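+	/*
+	 * At this point `code' is the final syscall number and
+	 * `params' points at its arguments on the user stack; e.g.
+	 * for the indirect form syscall(SYS_write, ...), tf_eax held
+	 * SYS_syscall and SYS_write was fetched from the stack above.
+	 */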
+
+ if (p->p_sysent->sv_mask)
+ code &= p->p_sysent->sv_mask;
+
+ if (code >= p->p_sysent->sv_size)
+ callp = &p->p_sysent->sv_table[0];
+ else
+ callp = &p->p_sysent->sv_table[code];
+
+ if (params && (i = callp->sy_narg * sizeof(int)) &&
+ (error = copyin(params, (caddr_t)args, (u_int)i))) {
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_SYSCALL))
+ ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
+#endif
+ goto bad;
+ }
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_SYSCALL))
+ ktrsyscall(p->p_tracep, code, callp->sy_narg, args);
+#endif
+ rval[0] = 0;
+ rval[1] = frame.tf_edx;
+
+ error = (*callp->sy_call)(p, args, rval);
+
+ switch (error) {
+
+ case 0:
+ /*
+ * Reinitialize proc pointer `p' as it may be different
+ * if this is a child returning from fork syscall.
+ */
+ p = curproc;
+ frame.tf_eax = rval[0];
+ frame.tf_edx = rval[1];
+ frame.tf_eflags &= ~PSL_C;
+ break;
+
+ case ERESTART:
+ /*
+ * Reconstruct pc, assuming lcall $X,y is 7 bytes,
+ * int 0x80 is 2 bytes. We saved this in tf_err.
+ */
+ frame.tf_eip -= frame.tf_err;
+ break;
+
+ case EJUSTRETURN:
+ break;
+
+ default:
+bad:
+ if (p->p_sysent->sv_errsize)
+ if (error >= p->p_sysent->sv_errsize)
+ error = -1; /* XXX */
+ else
+ error = p->p_sysent->sv_errtbl[error];
+ frame.tf_eax = error;
+ frame.tf_eflags |= PSL_C;
+ break;
+ }
+
+ if (frame.tf_eflags & PSL_T) {
+ /* Traced syscall. */
+ frame.tf_eflags &= ~PSL_T;
+ trapsignal(p, SIGTRAP, 0);
+ }
+
+ userret(p, &frame, sticks);
+
+#ifdef KTRACE
+ if (KTRPOINT(p, KTR_SYSRET))
+ ktrsysret(p->p_tracep, code, error, rval[0]);
+#endif
+}
diff --git a/sys/kern/subr_xxx.c b/sys/kern/subr_xxx.c
index 45b2d64..5ff7dcc 100644
--- a/sys/kern/subr_xxx.c
+++ b/sys/kern/subr_xxx.c
@@ -30,88 +30,282 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)subr_xxx.c 8.3 (Berkeley) 3/29/95
+ * @(#)subr_xxx.c 8.1 (Berkeley) 6/10/93
+ * $Id$
*/
/*
- * Miscellaneous trivial functions, including many
- * that are often inline-expanded or done in assembler.
+ * Miscellaneous trivial functions.
*/
#include <sys/param.h>
#include <sys/systm.h>
-#include <machine/cpu.h>
-
/*
- * Unsupported device function (e.g. writing to read-only device).
+ * Return error for operation not supported
+ * on a specific object or file type.
*/
int
-enodev()
+eopnotsupp()
{
- return (ENODEV);
+ return (EOPNOTSUPP);
}
/*
- * Unconfigured device function; driver not configured.
+ * Return error for an invalid operation
+ * on a specific object or file type.
*/
int
-enxio()
+einval()
{
- return (ENXIO);
+ return (EINVAL);
}
/*
- * Unsupported ioctl function.
+ * Generic null operation, always returns success.
*/
int
-enoioctl()
+nullop()
{
- return (ENOTTY);
+ return (0);
}
+#include <sys/conf.h>
+
/*
- * Unsupported system function.
- * This is used for an otherwise-reasonable operation
- * that is not supported by the current system binary.
+ * Unsupported devswitch functions (e.g., writing to a read-only device).
+ * XXX may belong elsewhere.
*/
+
+int
+noopen(dev, flags, fmt, p)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
+{
+
+ return (ENODEV);
+}
+
+int
+noclose(dev, flags, fmt, p)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
+{
+
+ return (ENODEV);
+}
+
+int
+noread(dev, uio, ioflag)
+ dev_t dev;
+ struct uio *uio;
+ int ioflag;
+{
+
+ return (ENODEV);
+}
+
+int
+nowrite(dev, uio, ioflag)
+ dev_t dev;
+ struct uio *uio;
+ int ioflag;
+{
+
+ return (ENODEV);
+}
+
+int
+noioctl(dev, cmd, data, flags, p)
+ dev_t dev;
+ int cmd;
+ caddr_t data;
+ int flags;
+ struct proc *p;
+{
+
+ return (ENODEV);
+}
+
+void
+nostop(tp, rw)
+ struct tty *tp;
+ int rw;
+{
+
+}
+
+int
+noreset(dev)
+ dev_t dev;
+{
+
+ printf("noreset(0x%x) called\n", dev);
+ return (ENODEV);
+}
+
+struct tty *
+nodevtotty(dev)
+ dev_t dev;
+{
+
+ return (NULL);
+}
+
+int
+noselect(dev, rw, p)
+ dev_t dev;
+ int rw;
+ struct proc *p;
+{
+
+ /* XXX is this distinguished from 1 for data available? */
+ return (ENODEV);
+}
+
+int
+nommap(dev, offset, nprot)
+ dev_t dev;
+ int offset;
+ int nprot;
+{
+
+ /* Don't return ENODEV. That would allow mapping address ENODEV! */
+ return (-1);
+}
+
int
-enosys()
+nodump(dev)
+ dev_t dev;
{
- return (ENOSYS);
+ return (ENODEV);
}
/*
- * Return error for operation not supported
- * on a specific object or file type.
+ * Null devswitch functions (for when the operation always succeeds).
+ * XXX may belong elsewhere.
+ * XXX not all are here (e.g., seltrue() isn't).
+ */
+
+/*
+ * XXX this is probably bogus. Any device that uses it isn't checking the
+ * minor number.
*/
int
-eopnotsupp()
+nullopen(dev, flags, fmt, p)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
{
- return (EOPNOTSUPP);
+ return (0);
+}
+
+int
+nullclose(dev, flags, fmt, p)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
+{
+
+ return (0);
}
/*
- * Return error for an inval operation
- * on a specific object or file type.
+ * Unconfigured devswitch functions (for unconfigured drivers).
+ * XXX may belong elsewhere.
*/
+
int
-einval()
+nxopen(dev, flags, fmt, p)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
{
- return (EINVAL);
+ return (ENXIO);
}
/*
- * Generic null operation, always returns success.
+ * XXX all nx functions except nxopen() should probably go away. They
+ * probably can't be called for non-open devices.
*/
+
int
-nullop()
+nxclose(dev, flags, fmt, p)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
{
- return (0);
+ printf("nxclose(0x%x) called\n", dev);
+ return (ENXIO);
+}
+
+int
+nxread(dev, uio, ioflag)
+ dev_t dev;
+ struct uio *uio;
+ int ioflag;
+{
+
+ printf("nxread(0x%x) called\n", dev);
+ return (ENXIO);
+}
+
+int
+nxwrite(dev, uio, ioflag)
+ dev_t dev;
+ struct uio *uio;
+ int ioflag;
+{
+
+ printf("nxwrite(0x%x) called\n", dev);
+ return (ENXIO);
+}
+
+int
+nxioctl(dev, cmd, data, flags, p)
+ dev_t dev;
+ int cmd;
+ caddr_t data;
+ int flags;
+ struct proc *p;
+{
+
+ printf("nxioctl(0x%x) called\n", dev);
+ return (ENXIO);
+}
+
+int
+nxselect(dev, rw, p)
+ dev_t dev;
+ int rw;
+ struct proc *p;
+{
+
+ printf("nxselect(0x%x) called\n", dev);
+
+ /* XXX is this distinguished from 1 for data available? */
+ return (ENXIO);
+}
+
+int
+nxdump(dev)
+ dev_t dev;
+{
+
+ printf("nxdump(0x%x) called\n", dev);
+ return (ENXIO);
}
diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c
index 08385b3..2bcfd68 100644
--- a/sys/kern/sys_generic.c
+++ b/sys/kern/sys_generic.c
@@ -35,15 +35,24 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95
+ * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94
+ * $Id: sys_generic.c,v 1.25 1997/03/23 03:36:23 bde Exp $
*/
+#include "opt_ktrace.h"
+
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/sysproto.h>
#include <sys/filedesc.h>
-#include <sys/ioctl.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
+#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/proc.h>
+#include <sys/stat.h>
+#include <sys/signalvar.h>
+#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/uio.h>
#include <sys/kernel.h>
@@ -52,23 +61,26 @@
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
+#include <vm/vm.h>
-#include <sys/mount.h>
-#include <sys/syscallargs.h>
+static int selscan __P((struct proc *, fd_mask **, fd_mask **, int, int *));
/*
* Read system call.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct read_args {
+ int fd;
+ char *buf;
+ u_int nbyte;
+};
+#endif
/* ARGSUSED */
int
read(p, uap, retval)
struct proc *p;
- register struct read_args /* {
- syscallarg(int) fd;
- syscallarg(char *) buf;
- syscallarg(u_int) nbyte;
- } */ *uap;
- register_t *retval;
+ register struct read_args *uap;
+ int *retval;
{
register struct file *fp;
register struct filedesc *fdp = p->p_fd;
@@ -79,15 +91,19 @@ read(p, uap, retval)
struct iovec ktriov;
#endif
- if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
+ if (((u_int)uap->fd) >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
(fp->f_flag & FREAD) == 0)
return (EBADF);
- aiov.iov_base = (caddr_t)SCARG(uap, buf);
- aiov.iov_len = SCARG(uap, nbyte);
+ aiov.iov_base = (caddr_t)uap->buf;
+ aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
- auio.uio_resid = SCARG(uap, nbyte);
+
+ auio.uio_resid = uap->nbyte;
+ if (auio.uio_resid < 0)
+ return (EINVAL);
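+	/*
+	 * (uio_resid is signed, so an nbyte above INT_MAX wraps
+	 * negative and is rejected here rather than miscounted.)
+	 */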
+
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_procp = p;
@@ -98,16 +114,15 @@ read(p, uap, retval)
if (KTRPOINT(p, KTR_GENIO))
ktriov = aiov;
#endif
- cnt = SCARG(uap, nbyte);
- if (error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred))
+ cnt = uap->nbyte;
+ if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)))
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
cnt -= auio.uio_resid;
#ifdef KTRACE
if (KTRPOINT(p, KTR_GENIO) && error == 0)
- ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, &ktriov,
- cnt, error);
+ ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktriov, cnt, error);
#endif
*retval = cnt;
return (error);
@@ -116,15 +131,18 @@ read(p, uap, retval)
/*
* Scatter read system call.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct readv_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+};
+#endif
int
readv(p, uap, retval)
struct proc *p;
- register struct readv_args /* {
- syscallarg(int) fd;
- syscallarg(struct iovec *) iovp;
- syscallarg(u_int) iovcnt;
- } */ *uap;
- register_t *retval;
+ register struct readv_args *uap;
+ int *retval;
{
register struct file *fp;
register struct filedesc *fdp = p->p_fd;
@@ -138,14 +156,14 @@ readv(p, uap, retval)
struct iovec *ktriov = NULL;
#endif
- if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
+ if (((u_int)uap->fd) >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
(fp->f_flag & FREAD) == 0)
return (EBADF);
/* note: can't use iovlen until iovcnt is validated */
- iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec);
- if (SCARG(uap, iovcnt) > UIO_SMALLIOV) {
- if (SCARG(uap, iovcnt) > UIO_MAXIOV)
+ iovlen = uap->iovcnt * sizeof (struct iovec);
+ if (uap->iovcnt > UIO_SMALLIOV) {
+ if (uap->iovcnt > UIO_MAXIOV)
return (EINVAL);
MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
needfree = iov;
@@ -154,19 +172,19 @@ readv(p, uap, retval)
needfree = NULL;
}
auio.uio_iov = iov;
- auio.uio_iovcnt = SCARG(uap, iovcnt);
+ auio.uio_iovcnt = uap->iovcnt;
auio.uio_rw = UIO_READ;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_procp = p;
- if (error = copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen))
+ if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
goto done;
auio.uio_resid = 0;
- for (i = 0; i < SCARG(uap, iovcnt); i++) {
- if (auio.uio_resid + iov->iov_len < auio.uio_resid) {
+ for (i = 0; i < uap->iovcnt; i++) {
+ auio.uio_resid += iov->iov_len;
+ if (auio.uio_resid < 0) {
error = EINVAL;
goto done;
}
- auio.uio_resid += iov->iov_len;
iov++;
}
#ifdef KTRACE
@@ -179,7 +197,7 @@ readv(p, uap, retval)
}
#endif
cnt = auio.uio_resid;
- if (error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred))
+ if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)))
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
@@ -187,7 +205,7 @@ readv(p, uap, retval)
#ifdef KTRACE
if (ktriov != NULL) {
if (error == 0)
- ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, ktriov,
+ ktrgenio(p->p_tracep, uap->fd, UIO_READ, ktriov,
cnt, error);
FREE(ktriov, M_TEMP);
}
@@ -202,15 +220,18 @@ done:
/*
* Write system call
*/
+#ifndef _SYS_SYSPROTO_H_
+struct write_args {
+ int fd;
+ char *buf;
+ u_int nbyte;
+};
+#endif
int
write(p, uap, retval)
struct proc *p;
- register struct write_args /* {
- syscallarg(int) fd;
- syscallarg(char *) buf;
- syscallarg(u_int) nbyte;
- } */ *uap;
- register_t *retval;
+ register struct write_args *uap;
+ int *retval;
{
register struct file *fp;
register struct filedesc *fdp = p->p_fd;
@@ -221,15 +242,15 @@ write(p, uap, retval)
struct iovec ktriov;
#endif
- if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
+ if (((u_int)uap->fd) >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
(fp->f_flag & FWRITE) == 0)
return (EBADF);
- aiov.iov_base = (caddr_t)SCARG(uap, buf);
- aiov.iov_len = SCARG(uap, nbyte);
+ aiov.iov_base = (caddr_t)uap->buf;
+ aiov.iov_len = uap->nbyte;
auio.uio_iov = &aiov;
auio.uio_iovcnt = 1;
- auio.uio_resid = SCARG(uap, nbyte);
+ auio.uio_resid = uap->nbyte;
auio.uio_rw = UIO_WRITE;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_procp = p;
@@ -240,8 +261,8 @@ write(p, uap, retval)
if (KTRPOINT(p, KTR_GENIO))
ktriov = aiov;
#endif
- cnt = SCARG(uap, nbyte);
- if (error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred)) {
+ cnt = uap->nbyte;
+ if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
@@ -251,7 +272,7 @@ write(p, uap, retval)
cnt -= auio.uio_resid;
#ifdef KTRACE
if (KTRPOINT(p, KTR_GENIO) && error == 0)
- ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE,
+ ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
&ktriov, cnt, error);
#endif
*retval = cnt;
@@ -261,15 +282,18 @@ write(p, uap, retval)
/*
* Gather write system call
*/
+#ifndef _SYS_SYSPROTO_H_
+struct writev_args {
+ int fd;
+ struct iovec *iovp;
+ u_int iovcnt;
+};
+#endif
int
writev(p, uap, retval)
struct proc *p;
- register struct writev_args /* {
- syscallarg(int) fd;
- syscallarg(struct iovec *) iovp;
- syscallarg(u_int) iovcnt;
- } */ *uap;
- register_t *retval;
+ register struct writev_args *uap;
+ int *retval;
{
register struct file *fp;
register struct filedesc *fdp = p->p_fd;
@@ -283,14 +307,14 @@ writev(p, uap, retval)
struct iovec *ktriov = NULL;
#endif
- if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL ||
+ if (((u_int)uap->fd) >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL ||
(fp->f_flag & FWRITE) == 0)
return (EBADF);
/* note: can't use iovlen until iovcnt is validated */
- iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec);
- if (SCARG(uap, iovcnt) > UIO_SMALLIOV) {
- if (SCARG(uap, iovcnt) > UIO_MAXIOV)
+ iovlen = uap->iovcnt * sizeof (struct iovec);
+ if (uap->iovcnt > UIO_SMALLIOV) {
+ if (uap->iovcnt > UIO_MAXIOV)
return (EINVAL);
MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK);
needfree = iov;
@@ -299,19 +323,19 @@ writev(p, uap, retval)
needfree = NULL;
}
auio.uio_iov = iov;
- auio.uio_iovcnt = SCARG(uap, iovcnt);
+ auio.uio_iovcnt = uap->iovcnt;
auio.uio_rw = UIO_WRITE;
auio.uio_segflg = UIO_USERSPACE;
auio.uio_procp = p;
- if (error = copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen))
+ if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen)))
goto done;
auio.uio_resid = 0;
- for (i = 0; i < SCARG(uap, iovcnt); i++) {
- if (auio.uio_resid + iov->iov_len < auio.uio_resid) {
+ for (i = 0; i < uap->iovcnt; i++) {
+ auio.uio_resid += iov->iov_len;
+ if (auio.uio_resid < 0) {
error = EINVAL;
goto done;
}
- auio.uio_resid += iov->iov_len;
iov++;
}
#ifdef KTRACE
@@ -324,7 +348,7 @@ writev(p, uap, retval)
}
#endif
cnt = auio.uio_resid;
- if (error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred)) {
+ if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) {
if (auio.uio_resid != cnt && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
@@ -335,7 +359,7 @@ writev(p, uap, retval)
#ifdef KTRACE
if (ktriov != NULL) {
if (error == 0)
- ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE,
+ ktrgenio(p->p_tracep, uap->fd, UIO_WRITE,
ktriov, cnt, error);
FREE(ktriov, M_TEMP);
}
@@ -350,21 +374,23 @@ done:
/*
* Ioctl system call
*/
+#ifndef _SYS_SYSPROTO_H_
+struct ioctl_args {
+ int fd;
+ int com;
+ caddr_t data;
+};
+#endif
/* ARGSUSED */
int
ioctl(p, uap, retval)
struct proc *p;
- register struct ioctl_args /* {
- syscallarg(int) fd;
- syscallarg(u_long) com;
- syscallarg(caddr_t) data;
- } */ *uap;
- register_t *retval;
+ register struct ioctl_args *uap;
+ int *retval;
{
register struct file *fp;
register struct filedesc *fdp;
- register u_long com;
- register int error;
+ register int com, error;
register u_int size;
caddr_t data, memp;
int tmp;
@@ -372,19 +398,19 @@ ioctl(p, uap, retval)
char stkbuf[STK_PARAMS];
fdp = p->p_fd;
- if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles ||
- (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL)
+ if ((u_int)uap->fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[uap->fd]) == NULL)
return (EBADF);
if ((fp->f_flag & (FREAD | FWRITE)) == 0)
return (EBADF);
- switch (com = SCARG(uap, com)) {
+ switch (com = uap->com) {
case FIONCLEX:
- fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE;
+ fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE;
return (0);
case FIOCLEX:
- fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE;
+ fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE;
return (0);
}
@@ -403,14 +429,14 @@ ioctl(p, uap, retval)
data = stkbuf;
if (com&IOC_IN) {
if (size) {
- error = copyin(SCARG(uap, data), data, (u_int)size);
+ error = copyin(uap->data, data, (u_int)size);
if (error) {
if (memp)
free(memp, M_IOCTLOPS);
return (error);
}
} else
- *(caddr_t *)data = SCARG(uap, data);
+ *(caddr_t *)data = uap->data;
} else if ((com&IOC_OUT) && size)
/*
* Zero the buffer so the user always
@@ -418,12 +444,12 @@ ioctl(p, uap, retval)
*/
bzero(data, size);
else if (com&IOC_VOID)
- *(caddr_t *)data = SCARG(uap, data);
+ *(caddr_t *)data = uap->data;
switch (com) {
case FIONBIO:
- if (tmp = *(int *)data)
+ if ((tmp = *(int *)data))
fp->f_flag |= FNONBLOCK;
else
fp->f_flag &= ~FNONBLOCK;
@@ -431,7 +457,7 @@ ioctl(p, uap, retval)
break;
case FIOASYNC:
- if (tmp = *(int *)data)
+ if ((tmp = *(int *)data))
fp->f_flag |= FASYNC;
else
fp->f_flag &= ~FASYNC;
@@ -456,7 +482,7 @@ ioctl(p, uap, retval)
tmp = p1->p_pgrp->pg_id;
}
error = (*fp->f_ops->fo_ioctl)
- (fp, TIOCSPGRP, (caddr_t)&tmp, p);
+ (fp, (int)TIOCSPGRP, (caddr_t)&tmp, p);
break;
case FIOGETOWN:
@@ -465,7 +491,7 @@ ioctl(p, uap, retval)
*(int *)data = ((struct socket *)fp->f_data)->so_pgid;
break;
}
- error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p);
+ error = (*fp->f_ops->fo_ioctl)(fp, (int)TIOCGPGRP, data, p);
*(int *)data = -*(int *)data;
break;
@@ -476,7 +502,7 @@ ioctl(p, uap, retval)
* already set and checked above.
*/
if (error == 0 && (com&IOC_OUT) && size)
- error = copyout(data, SCARG(uap, data), (u_int)size);
+ error = copyout(data, uap->data, (u_int)size);
break;
}
if (memp)
@@ -484,49 +510,88 @@ ioctl(p, uap, retval)
return (error);
}
-int selwait, nselcoll;
+static int nselcoll;
+int selwait;
/*
* Select system call.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct select_args {
+ int nd;
+ fd_set *in, *ou, *ex;
+ struct timeval *tv;
+};
+#endif
int
select(p, uap, retval)
register struct proc *p;
- register struct select_args /* {
- syscallarg(u_int) nd;
- syscallarg(fd_set *) in;
- syscallarg(fd_set *) ou;
- syscallarg(fd_set *) ex;
- syscallarg(struct timeval *) tv;
- } */ *uap;
- register_t *retval;
+ register struct select_args *uap;
+ int *retval;
{
- fd_set ibits[3], obits[3];
+ /*
+ * The magic 2048 here is chosen to be just enough for FD_SETSIZE
+ * infds with the new FD_SETSIZE of 1024, and more than enough for
+ * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE
+ * of 256.
+ */
+ fd_mask s_selbits[howmany(2048, NFDBITS)];
+ fd_mask *ibits[3], *obits[3], *selbits, *sbp;
struct timeval atv;
- int s, ncoll, error, timo = 0;
- u_int ni;
+ int s, ncoll, error, timo;
+ u_int nbufbytes, ncpbytes, nfdbits;
- bzero((caddr_t)ibits, sizeof(ibits));
- bzero((caddr_t)obits, sizeof(obits));
- if (SCARG(uap, nd) > FD_SETSIZE)
+ if (uap->nd < 0)
return (EINVAL);
- if (SCARG(uap, nd) > p->p_fd->fd_nfiles) {
- /* forgiving; slightly wrong */
- SCARG(uap, nd) = p->p_fd->fd_nfiles;
- }
- ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask);
+ if (uap->nd > p->p_fd->fd_nfiles)
+ uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */
+
+ /*
+ * Allocate just enough bits for the non-null fd_sets. Use the
+ * preallocated auto buffer if possible.
+ */
+ nfdbits = roundup(uap->nd, NFDBITS);
+ ncpbytes = nfdbits / NBBY;
+ nbufbytes = 0;
+ if (uap->in != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (uap->ou != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (uap->ex != NULL)
+ nbufbytes += 2 * ncpbytes;
+ if (nbufbytes <= sizeof s_selbits)
+ selbits = &s_selbits[0];
+ else
+ selbits = malloc(nbufbytes, M_SELECT, M_WAITOK);
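+	/*
+	 * A worked example of the sizing above (illustrative; assumes the
+	 * usual 32-bit fd_mask): sizeof s_selbits is 2048 bits == 256
+	 * bytes.  With nd == 256 (the old FD_SETSIZE) and all three sets
+	 * non-null, ncpbytes == 32 and nbufbytes == 3 * 2 * 32 == 192, so
+	 * the auto buffer suffices.  With nd == 1024 and one set non-null,
+	 * nbufbytes == 2 * 128 == 256, which still just fits; all three
+	 * sets at nd == 1024 need 768 bytes and fall back to malloc().
+	 */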
+ /*
+ * Assign pointers into the bit buffers and fetch the input bits.
+ * Put the output buffers together so that they can be bzeroed
+ * together.
+ */
+ sbp = selbits;
#define getbits(name, x) \
- if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \
- (caddr_t)&ibits[x], ni))) \
- goto done;
+ do { \
+ if (uap->name == NULL) \
+ ibits[x] = NULL; \
+ else { \
+ ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \
+ obits[x] = sbp; \
+ sbp += ncpbytes / sizeof *sbp; \
+ error = copyin(uap->name, ibits[x], ncpbytes); \
+ if (error != 0) \
+ goto done; \
+ } \
+ } while (0)
getbits(in, 0);
getbits(ou, 1);
getbits(ex, 2);
#undef getbits
+ if (nbufbytes != 0)
+ bzero(selbits, nbufbytes / 2);
- if (SCARG(uap, tv)) {
- error = copyin((caddr_t)SCARG(uap, tv), (caddr_t)&atv,
+ if (uap->tv) {
+ error = copyin((caddr_t)uap->tv, (caddr_t)&atv,
sizeof (atv));
if (error)
goto done;
@@ -535,31 +600,28 @@ select(p, uap, retval)
goto done;
}
s = splclock();
- timevaladd(&atv, (struct timeval *)&time);
+ timevaladd(&atv, &time);
+ timo = hzto(&atv);
+ /*
+ * Avoid inadvertently sleeping forever.
+ */
+ if (timo == 0)
+ timo = 1;
splx(s);
- }
+ } else
+ timo = 0;
retry:
ncoll = nselcoll;
p->p_flag |= P_SELECT;
- error = selscan(p, ibits, obits, SCARG(uap, nd), retval);
+ error = selscan(p, ibits, obits, uap->nd, retval);
if (error || *retval)
goto done;
s = splhigh();
- if (SCARG(uap, tv)) {
- if (timercmp(&time, &atv, >=)) {
- splx(s);
- goto done;
- }
- /*
- * If poll wait was tiny, this could be zero; we will
- * have to round it up to avoid sleeping forever. If
- * we retry below, the timercmp above will get us out.
- * Note that if wait was 0, the timercmp will prevent
- * us from getting here the first time.
- */
- timo = hzto(&atv);
- if (timo == 0)
- timo = 1;
+ /* this should be timercmp(&time, &atv, >=) */
+ if (uap->tv && (time.tv_sec > atv.tv_sec ||
+ (time.tv_sec == atv.tv_sec && time.tv_usec >= atv.tv_usec))) {
+ splx(s);
+ goto done;
}
if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) {
splx(s);
@@ -578,8 +640,7 @@ done:
if (error == EWOULDBLOCK)
error = 0;
#define putbits(name, x) \
- if (SCARG(uap, name) && (error2 = copyout((caddr_t)&obits[x], \
- (caddr_t)SCARG(uap, name), ni))) \
+ if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \
error = error2;
if (error == 0) {
int error2;
@@ -589,15 +650,16 @@ done:
putbits(ex, 2);
#undef putbits
}
+ if (selbits != &s_selbits[0])
+ free(selbits, M_SELECT);
return (error);
}
-int
+static int
selscan(p, ibits, obits, nfd, retval)
struct proc *p;
- fd_set *ibits, *obits;
- int nfd;
- register_t *retval;
+ fd_mask **ibits, **obits;
+ int nfd, *retval;
{
register struct filedesc *fdp = p->p_fd;
register int msk, i, j, fd;
@@ -607,15 +669,18 @@ selscan(p, ibits, obits, nfd, retval)
static int flag[3] = { FREAD, FWRITE, 0 };
for (msk = 0; msk < 3; msk++) {
+ if (ibits[msk] == NULL)
+ continue;
for (i = 0; i < nfd; i += NFDBITS) {
- bits = ibits[msk].fds_bits[i/NFDBITS];
+ bits = ibits[msk][i/NFDBITS];
while ((j = ffs(bits)) && (fd = i + --j) < nfd) {
bits &= ~(1 << j);
fp = fdp->fd_ofiles[fd];
if (fp == NULL)
return (EBADF);
if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) {
- FD_SET(fd, &obits[msk]);
+ obits[msk][(fd)/NFDBITS] |=
+ (1 << ((fd) % NFDBITS));
n++;
}
}
diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c
new file mode 100644
index 0000000..5beac60
--- /dev/null
+++ b/sys/kern/sys_pipe.c
@@ -0,0 +1,1107 @@
+/*
+ * Copyright (c) 1996 John S. Dyson
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice immediately at the beginning of the file, without modification,
+ * this list of conditions, and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. Absolutely no warranty of function or purpose is made by the author
+ * John S. Dyson.
+ * 4. Modifications may be freely made to this file if the above conditions
+ * are met.
+ *
+ * $Id: sys_pipe.c,v 1.26 1997/03/23 03:36:24 bde Exp $
+ */
+
+#ifndef OLD_PIPE
+
+/*
+ * This file contains a high-performance replacement for the socket-based
+ * pipes scheme originally used in FreeBSD/4.4Lite. It does not support
+ * all features of sockets, but does do everything that pipes normally
+ * do.
+ */
+
+/*
+ * This code has two modes of operation, a small write mode and a large
+ * write mode. The small write mode acts like conventional pipes with
+ * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the
+ * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT
+ * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and
+ * the receiving process can copy it directly from the pages in the sending
+ * process.
+ *
+ * If the sending process receives a signal, it is possible that it will
+ * go away, and certainly its address space can change, because control
+ * is returned back to the user-mode side. In that case, the pipe code
+ * arranges to copy the buffer supplied by the user process, to a pageable
+ * kernel buffer, and the receiving process will grab the data from the
+ * pageable kernel buffer. Since signals don't happen all that often,
+ * the copy operation is normally eliminated.
+ *
+ * The constant PIPE_MINDIRECT is chosen to make sure that buffering will
+ * happen for small transfers so that the system will not spend all of
+ * its time context switching. PIPE_SIZE is constrained by the
+ * amount of kernel virtual memory.
+ */
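+
+/*
+ * A condensed sketch of the choice described above (illustrative only;
+ * the authoritative test is in pipe_write() below):
+ *
+ *	if (uio->uio_iov->iov_len >= PIPE_MINDIRECT &&
+ *	    (fp->f_flag & FNONBLOCK) == 0 && pipe kva is available)
+ *		pipe_direct_write(wpipe, uio);	  map and wire sender pages
+ *	else
+ *		uiomove() into the circular pipe_buffer;
+ */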
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/proc.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/protosw.h>
+#include <sys/stat.h>
+#include <sys/filedesc.h>
+#include <sys/malloc.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
+#include <sys/select.h>
+#include <sys/signalvar.h>
+#include <sys/errno.h>
+#include <sys/queue.h>
+#include <sys/vmmeter.h>
+#include <sys/kernel.h>
+#include <sys/sysproto.h>
+#include <sys/pipe.h>
+
+#include <vm/vm.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_param.h>
+#include <sys/lock.h>
+#include <vm/vm_object.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_page.h>
+
+/*
+ * Use this define if you want to disable *fancy* VM things. Expect an
+ * approx 30% decrease in transfer rate. This could be useful for
+ * NetBSD or OpenBSD.
+ */
+/* #define PIPE_NODIRECT */
+
+/*
+ * interfaces to the outside world
+ */
+static int pipe_read __P((struct file *fp, struct uio *uio,
+ struct ucred *cred));
+static int pipe_write __P((struct file *fp, struct uio *uio,
+ struct ucred *cred));
+static int pipe_close __P((struct file *fp, struct proc *p));
+static int pipe_select __P((struct file *fp, int which, struct proc *p));
+static int pipe_ioctl __P((struct file *fp, int cmd, caddr_t data, struct proc *p));
+
+static struct fileops pipeops =
+ { pipe_read, pipe_write, pipe_ioctl, pipe_select, pipe_close };
+
+/*
+ * Default pipe buffer size(s), this can be kind-of large now because pipe
+ * space is pageable. The pipe code will try to maintain locality of
+ * reference for performance reasons, so small amounts of outstanding I/O
+ * will not wipe the cache.
+ */
+#define MINPIPESIZE (PIPE_SIZE/3)
+#define MAXPIPESIZE (2*PIPE_SIZE/3)
+
+/*
+ * Maximum amount of kva for pipes -- this is kind-of a soft limit, but
+ * is there so that on large systems, we don't exhaust it.
+ */
+#define MAXPIPEKVA (8*1024*1024)
+
+/*
+ * Limit for direct transfers; we cannot, of course, limit
+ * the amount of kva for pipes in general.
+ */
+#define LIMITPIPEKVA (16*1024*1024)
+
+/*
+ * Limit the number of "big" pipes
+ */
+#define LIMITBIGPIPES 32
+int nbigpipe;
+
+static int amountpipekva;
+
+static void pipeclose __P((struct pipe *cpipe));
+static void pipeinit __P((struct pipe *cpipe));
+static __inline int pipelock __P((struct pipe *cpipe, int catch));
+static __inline void pipeunlock __P((struct pipe *cpipe));
+static __inline void pipeselwakeup __P((struct pipe *cpipe));
+#ifndef PIPE_NODIRECT
+static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio));
+static void pipe_destroy_write_buffer __P((struct pipe *wpipe));
+static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio));
+static void pipe_clone_write_buffer __P((struct pipe *wpipe));
+#endif
+static void pipespace __P((struct pipe *cpipe));
+
+/*
+ * The pipe system call for the DTYPE_PIPE type of pipes
+ */
+
+/* ARGSUSED */
+int
+pipe(p, uap, retval)
+ struct proc *p;
+ struct pipe_args /* {
+ int dummy;
+ } */ *uap;
+ int retval[];
+{
+ register struct filedesc *fdp = p->p_fd;
+ struct file *rf, *wf;
+ struct pipe *rpipe, *wpipe;
+ int fd, error;
+
+ rpipe = malloc( sizeof (*rpipe), M_TEMP, M_WAITOK);
+ pipeinit(rpipe);
+ rpipe->pipe_state |= PIPE_DIRECTOK;
+ wpipe = malloc( sizeof (*wpipe), M_TEMP, M_WAITOK);
+ pipeinit(wpipe);
+ wpipe->pipe_state |= PIPE_DIRECTOK;
+
+ error = falloc(p, &rf, &fd);
+ if (error)
+ goto free2;
+ retval[0] = fd;
+ rf->f_flag = FREAD | FWRITE;
+ rf->f_type = DTYPE_PIPE;
+ rf->f_ops = &pipeops;
+ rf->f_data = (caddr_t)rpipe;
+ error = falloc(p, &wf, &fd);
+ if (error)
+ goto free3;
+ wf->f_flag = FREAD | FWRITE;
+ wf->f_type = DTYPE_PIPE;
+ wf->f_ops = &pipeops;
+ wf->f_data = (caddr_t)wpipe;
+ retval[1] = fd;
+
+ rpipe->pipe_peer = wpipe;
+ wpipe->pipe_peer = rpipe;
+
+ return (0);
+free3:
+ ffree(rf);
+ fdp->fd_ofiles[retval[0]] = 0;
+free2:
+ (void)pipeclose(wpipe);
+ (void)pipeclose(rpipe);
+ return (error);
+}
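+
+/*
+ * Userland usage sketch (illustrative; plain POSIX pipe(2)/fork(2)
+ * semantics, independent of this implementation):
+ *
+ *	int fd[2];
+ *	char buf[5];
+ *
+ *	if (pipe(fd) == -1)
+ *		err(1, "pipe");
+ *	if (fork() == 0) {
+ *		close(fd[0]);
+ *		(void)write(fd[1], "ping", 5);	  fd[1] is the write end
+ *		_exit(0);
+ *	}
+ *	close(fd[1]);
+ *	(void)read(fd[0], buf, sizeof buf);	  fd[0] is the read end
+ */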
+
+/*
+ * Allocate kva for pipe circular buffer, the space is pageable
+ */
+static void
+pipespace(cpipe)
+ struct pipe *cpipe;
+{
+ int npages, error;
+
+ npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE;
+ /*
+	 * Create an object; I don't like the idea of paging to/from
+ * kernel_object.
+ * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
+ */
+ cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages);
+ cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map);
+
+ /*
+ * Insert the object into the kernel map, and allocate kva for it.
+ * The map entry is, by default, pageable.
+ * XXX -- minor change needed here for NetBSD/OpenBSD VM systems.
+ */
+ error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0,
+ (vm_offset_t *) &cpipe->pipe_buffer.buffer,
+ cpipe->pipe_buffer.size, 1,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+
+ if (error != KERN_SUCCESS)
+ panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error);
+ amountpipekva += cpipe->pipe_buffer.size;
+}
+
+/*
+ * initialize and allocate VM and memory for pipe
+ */
+static void
+pipeinit(cpipe)
+ struct pipe *cpipe;
+{
+ int s;
+
+ cpipe->pipe_buffer.in = 0;
+ cpipe->pipe_buffer.out = 0;
+ cpipe->pipe_buffer.cnt = 0;
+ cpipe->pipe_buffer.size = PIPE_SIZE;
+
+ /* Buffer kva gets dynamically allocated */
+ cpipe->pipe_buffer.buffer = NULL;
+ /* cpipe->pipe_buffer.object = invalid */
+
+ cpipe->pipe_state = 0;
+ cpipe->pipe_peer = NULL;
+ cpipe->pipe_busy = 0;
+ gettime(&cpipe->pipe_ctime);
+ cpipe->pipe_atime = cpipe->pipe_ctime;
+ cpipe->pipe_mtime = cpipe->pipe_ctime;
+ bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel);
+ cpipe->pipe_pgid = NO_PID;
+
+#ifndef PIPE_NODIRECT
+ /*
+ * pipe data structure initializations to support direct pipe I/O
+ */
+ cpipe->pipe_map.cnt = 0;
+ cpipe->pipe_map.kva = 0;
+ cpipe->pipe_map.pos = 0;
+ cpipe->pipe_map.npages = 0;
+ /* cpipe->pipe_map.ms[] = invalid */
+#endif
+}
+
+
+/*
+ * lock a pipe for I/O, blocking other access
+ */
+static __inline int
+pipelock(cpipe, catch)
+ struct pipe *cpipe;
+ int catch;
+{
+ int error;
+ while (cpipe->pipe_state & PIPE_LOCK) {
+ cpipe->pipe_state |= PIPE_LWANT;
+		if ((error = tsleep(cpipe,
+			catch ? (PRIBIO|PCATCH) : PRIBIO, "pipelk", 0))) {
+ return error;
+ }
+ }
+ cpipe->pipe_state |= PIPE_LOCK;
+ return 0;
+}
+
+/*
+ * unlock a pipe I/O lock
+ */
+static __inline void
+pipeunlock(cpipe)
+ struct pipe *cpipe;
+{
+ cpipe->pipe_state &= ~PIPE_LOCK;
+ if (cpipe->pipe_state & PIPE_LWANT) {
+ cpipe->pipe_state &= ~PIPE_LWANT;
+ wakeup(cpipe);
+ }
+}
+
+static __inline void
+pipeselwakeup(cpipe)
+ struct pipe *cpipe;
+{
+ struct proc *p;
+
+ if (cpipe->pipe_state & PIPE_SEL) {
+ cpipe->pipe_state &= ~PIPE_SEL;
+ selwakeup(&cpipe->pipe_sel);
+ }
+ if (cpipe->pipe_state & PIPE_ASYNC) {
+ if (cpipe->pipe_pgid < 0)
+ gsignal(-cpipe->pipe_pgid, SIGIO);
+ else if ((p = pfind(cpipe->pipe_pgid)) != NULL)
+ psignal(p, SIGIO);
+ }
+}
+
+/* ARGSUSED */
+static int
+pipe_read(fp, uio, cred)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+{
+
+ struct pipe *rpipe = (struct pipe *) fp->f_data;
+ int error = 0;
+ int nread = 0;
+ u_int size;
+
+ ++rpipe->pipe_busy;
+ while (uio->uio_resid) {
+ /*
+ * normal pipe buffer receive
+ */
+ if (rpipe->pipe_buffer.cnt > 0) {
+ size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out;
+ if (size > rpipe->pipe_buffer.cnt)
+ size = rpipe->pipe_buffer.cnt;
+ if (size > (u_int) uio->uio_resid)
+ size = (u_int) uio->uio_resid;
+ if ((error = pipelock(rpipe,1)) == 0) {
+ error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out],
+ size, uio);
+ pipeunlock(rpipe);
+ }
+ if (error) {
+ break;
+ }
+ rpipe->pipe_buffer.out += size;
+ if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size)
+ rpipe->pipe_buffer.out = 0;
+
+ rpipe->pipe_buffer.cnt -= size;
+ nread += size;
+#ifndef PIPE_NODIRECT
+ /*
+ * Direct copy, bypassing a kernel buffer.
+ */
+ } else if ((size = rpipe->pipe_map.cnt) &&
+ (rpipe->pipe_state & PIPE_DIRECTW)) {
+ caddr_t va;
+ if (size > (u_int) uio->uio_resid)
+ size = (u_int) uio->uio_resid;
+ if ((error = pipelock(rpipe,1)) == 0) {
+ va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos;
+ error = uiomove(va, size, uio);
+ pipeunlock(rpipe);
+ }
+ if (error)
+ break;
+ nread += size;
+ rpipe->pipe_map.pos += size;
+ rpipe->pipe_map.cnt -= size;
+ if (rpipe->pipe_map.cnt == 0) {
+ rpipe->pipe_state &= ~PIPE_DIRECTW;
+ wakeup(rpipe);
+ }
+#endif
+ } else {
+ /*
+ * detect EOF condition
+ */
+ if (rpipe->pipe_state & PIPE_EOF) {
+ /* XXX error = ? */
+ break;
+ }
+ /*
+ * If the "write-side" has been blocked, wake it up now.
+ */
+ if (rpipe->pipe_state & PIPE_WANTW) {
+ rpipe->pipe_state &= ~PIPE_WANTW;
+ wakeup(rpipe);
+ }
+ if (nread > 0)
+ break;
+
+ if (fp->f_flag & FNONBLOCK) {
+ error = EAGAIN;
+ break;
+ }
+
+ /*
+ * If there is no more to read in the pipe, reset
+ * its pointers to the beginning. This improves
+ * cache hit stats.
+ */
+
+ if ((error = pipelock(rpipe,1)) == 0) {
+ if (rpipe->pipe_buffer.cnt == 0) {
+ rpipe->pipe_buffer.in = 0;
+ rpipe->pipe_buffer.out = 0;
+ }
+ pipeunlock(rpipe);
+ } else {
+ break;
+ }
+
+ if (rpipe->pipe_state & PIPE_WANTW) {
+ rpipe->pipe_state &= ~PIPE_WANTW;
+ wakeup(rpipe);
+ }
+
+ rpipe->pipe_state |= PIPE_WANTR;
+			if ((error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0))) {
+ break;
+ }
+ }
+ }
+
+ if (error == 0)
+ gettime(&rpipe->pipe_atime);
+
+ --rpipe->pipe_busy;
+ if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) {
+ rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW);
+ wakeup(rpipe);
+ } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) {
+ /*
+ * If there is no more to read in the pipe, reset
+ * its pointers to the beginning. This improves
+ * cache hit stats.
+ */
+ if (rpipe->pipe_buffer.cnt == 0) {
+ if ((error == 0) && (error = pipelock(rpipe,1)) == 0) {
+ rpipe->pipe_buffer.in = 0;
+ rpipe->pipe_buffer.out = 0;
+ pipeunlock(rpipe);
+ }
+ }
+
+ /*
+ * If the "write-side" has been blocked, wake it up now.
+ */
+ if (rpipe->pipe_state & PIPE_WANTW) {
+ rpipe->pipe_state &= ~PIPE_WANTW;
+ wakeup(rpipe);
+ }
+ }
+
+ if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF)
+ pipeselwakeup(rpipe);
+
+ return error;
+}
+
+#ifndef PIPE_NODIRECT
+/*
+ * Map the sending process's buffer into kernel space and wire it.
+ * This is similar to a physical write operation.
+ */
+static int
+pipe_build_write_buffer(wpipe, uio)
+ struct pipe *wpipe;
+ struct uio *uio;
+{
+ u_int size;
+ int i;
+ vm_offset_t addr, endaddr, paddr;
+
+ size = (u_int) uio->uio_iov->iov_len;
+ if (size > wpipe->pipe_buffer.size)
+ size = wpipe->pipe_buffer.size;
+
+ endaddr = round_page(uio->uio_iov->iov_base + size);
+ for(i = 0, addr = trunc_page(uio->uio_iov->iov_base);
+ addr < endaddr;
+ addr += PAGE_SIZE, i+=1) {
+
+ vm_page_t m;
+
+ vm_fault_quick( (caddr_t) addr, VM_PROT_READ);
+ paddr = pmap_kextract(addr);
+ if (!paddr) {
+ int j;
+ for(j=0;j<i;j++)
+ vm_page_unwire(wpipe->pipe_map.ms[j]);
+ return EFAULT;
+ }
+
+ m = PHYS_TO_VM_PAGE(paddr);
+ vm_page_wire(m);
+ wpipe->pipe_map.ms[i] = m;
+ }
+
+/*
+ * set up the control block
+ */
+ wpipe->pipe_map.npages = i;
+ wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK;
+ wpipe->pipe_map.cnt = size;
+
+/*
+ * and map the buffer
+ */
+ if (wpipe->pipe_map.kva == 0) {
+ /*
+ * We need to allocate space for an extra page because the
+ * address range might (will) span pages at times.
+ */
+ wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map,
+ wpipe->pipe_buffer.size + PAGE_SIZE);
+ amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE;
+ }
+ pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms,
+ wpipe->pipe_map.npages);
+
+/*
+ * and update the uio data
+ */
+
+ uio->uio_iov->iov_len -= size;
+ uio->uio_iov->iov_base += size;
+ if (uio->uio_iov->iov_len == 0)
+ uio->uio_iov++;
+ uio->uio_resid -= size;
+ uio->uio_offset += size;
+ return 0;
+}
+
+/*
+ * unmap and unwire the process buffer
+ */
+static void
+pipe_destroy_write_buffer(wpipe)
+	struct pipe *wpipe;
+{
+ int i;
+ if (wpipe->pipe_map.kva) {
+ pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages);
+
+ if (amountpipekva > MAXPIPEKVA) {
+ vm_offset_t kva = wpipe->pipe_map.kva;
+ wpipe->pipe_map.kva = 0;
+ kmem_free(kernel_map, kva,
+ wpipe->pipe_buffer.size + PAGE_SIZE);
+ amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
+ }
+ }
+ for (i=0;i<wpipe->pipe_map.npages;i++)
+ vm_page_unwire(wpipe->pipe_map.ms[i]);
+}
+
+/*
+ * In the case of a signal, the writing process might go away. This
+ * code copies the data into the circular buffer so that the source
+ * pages can be freed without loss of data.
+ */
+static void
+pipe_clone_write_buffer(wpipe)
+	struct pipe *wpipe;
+{
+ int size;
+ int pos;
+
+ size = wpipe->pipe_map.cnt;
+ pos = wpipe->pipe_map.pos;
+ bcopy((caddr_t) wpipe->pipe_map.kva+pos,
+ (caddr_t) wpipe->pipe_buffer.buffer,
+ size);
+
+ wpipe->pipe_buffer.in = size;
+ wpipe->pipe_buffer.out = 0;
+ wpipe->pipe_buffer.cnt = size;
+ wpipe->pipe_state &= ~PIPE_DIRECTW;
+
+ pipe_destroy_write_buffer(wpipe);
+}
+
+/*
+ * This implements the pipe buffer write mechanism. Note that only
+ * a direct write OR a normal pipe write can be pending at any given time.
+ * If there are any characters in the pipe buffer, the direct write will
+ * be deferred until the receiving process grabs all of the bytes from
+ * the pipe buffer. Then the direct mapping write is set-up.
+ */
+static int
+pipe_direct_write(wpipe, uio)
+ struct pipe *wpipe;
+ struct uio *uio;
+{
+ int error;
+retry:
+ while (wpipe->pipe_state & PIPE_DIRECTW) {
+ if ( wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ wpipe->pipe_state |= PIPE_WANTW;
+ error = tsleep(wpipe,
+ PRIBIO|PCATCH, "pipdww", 0);
+ if (error)
+ goto error1;
+ if (wpipe->pipe_state & PIPE_EOF) {
+ error = EPIPE;
+ goto error1;
+ }
+ }
+ wpipe->pipe_map.cnt = 0; /* transfer not ready yet */
+ if (wpipe->pipe_buffer.cnt > 0) {
+ if ( wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+
+ wpipe->pipe_state |= PIPE_WANTW;
+ error = tsleep(wpipe,
+ PRIBIO|PCATCH, "pipdwc", 0);
+ if (error)
+ goto error1;
+ if (wpipe->pipe_state & PIPE_EOF) {
+ error = EPIPE;
+ goto error1;
+ }
+ goto retry;
+ }
+
+ wpipe->pipe_state |= PIPE_DIRECTW;
+
+ error = pipe_build_write_buffer(wpipe, uio);
+ if (error) {
+ wpipe->pipe_state &= ~PIPE_DIRECTW;
+ goto error1;
+ }
+
+ error = 0;
+ while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) {
+ if (wpipe->pipe_state & PIPE_EOF) {
+ pipelock(wpipe, 0);
+ pipe_destroy_write_buffer(wpipe);
+ pipeunlock(wpipe);
+ pipeselwakeup(wpipe);
+ error = EPIPE;
+ goto error1;
+ }
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ pipeselwakeup(wpipe);
+ error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0);
+ }
+
+ pipelock(wpipe,0);
+ if (wpipe->pipe_state & PIPE_DIRECTW) {
+ /*
+ * this bit of trickery substitutes a kernel buffer for
+ * the process that might be going away.
+ */
+ pipe_clone_write_buffer(wpipe);
+ } else {
+ pipe_destroy_write_buffer(wpipe);
+ }
+ pipeunlock(wpipe);
+ return error;
+
+error1:
+ wakeup(wpipe);
+ return error;
+}
+#endif
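+
+/*
+ * Direct-write life cycle, summarized (illustrative; the real state
+ * transitions are in the functions above):
+ *
+ *	pipe_direct_write()
+ *	    wait while PIPE_DIRECTW is set or the buffer is non-empty
+ *	    pipe_build_write_buffer()	  wire and map the sender's pages
+ *	    sleep until the reader drains pipe_map.cnt
+ *	    normal completion: pipe_destroy_write_buffer()
+ *	    signal arrives:    pipe_clone_write_buffer() copies the data
+ *			       into the kernel buffer first
+ */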
+
+static int
+pipe_write(fp, uio, cred)
+ struct file *fp;
+ struct uio *uio;
+ struct ucred *cred;
+{
+ int error = 0;
+ int orig_resid;
+
+ struct pipe *wpipe, *rpipe;
+
+ rpipe = (struct pipe *) fp->f_data;
+ wpipe = rpipe->pipe_peer;
+
+ /*
+ * detect loss of pipe read side, issue SIGPIPE if lost.
+ */
+ if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) {
+ return EPIPE;
+ }
+
+ /*
+ * If it is advantageous to resize the pipe buffer, do
+ * so.
+ */
+ if ((uio->uio_resid > PIPE_SIZE) &&
+ (nbigpipe < LIMITBIGPIPES) &&
+ (wpipe->pipe_state & PIPE_DIRECTW) == 0 &&
+ (wpipe->pipe_buffer.size <= PIPE_SIZE) &&
+ (wpipe->pipe_buffer.cnt == 0)) {
+
+ if (wpipe->pipe_buffer.buffer) {
+ amountpipekva -= wpipe->pipe_buffer.size;
+ kmem_free(kernel_map,
+ (vm_offset_t)wpipe->pipe_buffer.buffer,
+ wpipe->pipe_buffer.size);
+ }
+
+#ifndef PIPE_NODIRECT
+ if (wpipe->pipe_map.kva) {
+ amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE;
+ kmem_free(kernel_map,
+ wpipe->pipe_map.kva,
+ wpipe->pipe_buffer.size + PAGE_SIZE);
+ }
+#endif
+
+ wpipe->pipe_buffer.in = 0;
+ wpipe->pipe_buffer.out = 0;
+ wpipe->pipe_buffer.cnt = 0;
+ wpipe->pipe_buffer.size = BIG_PIPE_SIZE;
+ wpipe->pipe_buffer.buffer = NULL;
+ ++nbigpipe;
+
+#ifndef PIPE_NODIRECT
+ wpipe->pipe_map.cnt = 0;
+ wpipe->pipe_map.kva = 0;
+ wpipe->pipe_map.pos = 0;
+ wpipe->pipe_map.npages = 0;
+#endif
+
+ }
+
+	if (wpipe->pipe_buffer.buffer == NULL) {
+ if ((error = pipelock(wpipe,1)) == 0) {
+ pipespace(wpipe);
+ pipeunlock(wpipe);
+ } else {
+ return error;
+ }
+ }
+
+ ++wpipe->pipe_busy;
+ orig_resid = uio->uio_resid;
+ while (uio->uio_resid) {
+ int space;
+#ifndef PIPE_NODIRECT
+ /*
+ * If the transfer is large, we can gain performance if
+ * we do process-to-process copies directly.
+ * If the write is non-blocking, we don't use the
+ * direct write mechanism.
+ */
+		if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) &&
+		    (fp->f_flag & FNONBLOCK) == 0 &&
+		    (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA))) {
+ error = pipe_direct_write( wpipe, uio);
+ if (error) {
+ break;
+ }
+ continue;
+ }
+#endif
+
+ /*
+		 * Pipe buffered writes cannot proceed concurrently with
+ * direct writes. We wait until the currently executing
+ * direct write is completed before we start filling the
+ * pipe buffer.
+ */
+ retrywrite:
+ while (wpipe->pipe_state & PIPE_DIRECTW) {
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ error = tsleep(wpipe,
+ PRIBIO|PCATCH, "pipbww", 0);
+ if (error)
+ break;
+ }
+
+ space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt;
+
+ /* Writes of size <= PIPE_BUF must be atomic. */
+ /* XXX perhaps they need to be contiguous to be atomic? */
+ if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF))
+ space = 0;
+
+ if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) {
+ /*
+			 * This sets the maximum transfer to a segment of
+ * the buffer.
+ */
+ int size = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in;
+ /*
+ * space is the size left in the buffer
+ */
+ if (size > space)
+ size = space;
+ /*
+ * now limit it to the size of the uio transfer
+ */
+ if (size > uio->uio_resid)
+ size = uio->uio_resid;
+ if ((error = pipelock(wpipe,1)) == 0) {
+ /*
+ * It is possible for a direct write to
+ * slip in on us... handle it here...
+ */
+ if (wpipe->pipe_state & PIPE_DIRECTW) {
+ pipeunlock(wpipe);
+ goto retrywrite;
+ }
+ error = uiomove( &wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in],
+ size, uio);
+ pipeunlock(wpipe);
+ }
+ if (error)
+ break;
+
+ wpipe->pipe_buffer.in += size;
+ if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size)
+ wpipe->pipe_buffer.in = 0;
+
+ wpipe->pipe_buffer.cnt += size;
+ } else {
+ /*
+ * If the "read-side" has been blocked, wake it up now.
+ */
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+
+ /*
+ * don't block on non-blocking I/O
+ */
+ if (fp->f_flag & FNONBLOCK) {
+ error = EAGAIN;
+ break;
+ }
+
+ /*
+ * We have no more space and have something to offer,
+ * wake up selects.
+ */
+ pipeselwakeup(wpipe);
+
+ wpipe->pipe_state |= PIPE_WANTW;
+			if ((error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0))) {
+ break;
+ }
+ /*
+ * If read side wants to go away, we just issue a signal
+ * to ourselves.
+ */
+ if (wpipe->pipe_state & PIPE_EOF) {
+ error = EPIPE;
+ break;
+ }
+ }
+ }
+
+ --wpipe->pipe_busy;
+ if ((wpipe->pipe_busy == 0) &&
+ (wpipe->pipe_state & PIPE_WANT)) {
+ wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR);
+ wakeup(wpipe);
+ } else if (wpipe->pipe_buffer.cnt > 0) {
+ /*
+ * If we have put any characters in the buffer, we wake up
+ * the reader.
+ */
+ if (wpipe->pipe_state & PIPE_WANTR) {
+ wpipe->pipe_state &= ~PIPE_WANTR;
+ wakeup(wpipe);
+ }
+ }
+
+ /*
+ * Don't return EPIPE if I/O was successful
+ */
+ if ((wpipe->pipe_buffer.cnt == 0) &&
+ (uio->uio_resid == 0) &&
+ (error == EPIPE))
+ error = 0;
+
+ if (error == 0)
+ gettime(&wpipe->pipe_mtime);
+
+ /*
+ * We have something to offer,
+	 * so wake up select.
+ */
+ if (wpipe->pipe_buffer.cnt)
+ pipeselwakeup(wpipe);
+
+ return error;
+}
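+
+/*
+ * PIPE_BUF atomicity sketch (illustrative userland code; it relies only
+ * on the guarantee enforced above, where writes of at most PIPE_BUF
+ * bytes are never split):
+ *
+ *	char rec[PIPE_BUF];	  one complete record per write
+ *
+ *	(void)write(fd[1], rec, sizeof rec);	  never interleaved with
+ *						  another writer's record
+ */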
+
+/*
+ * we implement a very minimal set of ioctls for compatibility with sockets.
+ */
+static int
+pipe_ioctl(fp, cmd, data, p)
+ struct file *fp;
+ int cmd;
+ register caddr_t data;
+ struct proc *p;
+{
+ register struct pipe *mpipe = (struct pipe *)fp->f_data;
+
+ switch (cmd) {
+
+ case FIONBIO:
+ return (0);
+
+ case FIOASYNC:
+ if (*(int *)data) {
+ mpipe->pipe_state |= PIPE_ASYNC;
+ } else {
+ mpipe->pipe_state &= ~PIPE_ASYNC;
+ }
+ return (0);
+
+ case FIONREAD:
+ if (mpipe->pipe_state & PIPE_DIRECTW)
+ *(int *)data = mpipe->pipe_map.cnt;
+ else
+ *(int *)data = mpipe->pipe_buffer.cnt;
+ return (0);
+
+ case TIOCSPGRP:
+ mpipe->pipe_pgid = *(int *)data;
+ return (0);
+
+ case TIOCGPGRP:
+ *(int *)data = mpipe->pipe_pgid;
+ return (0);
+
+ }
+ return (ENOTTY);
+}
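+
+/*
+ * FIONREAD usage sketch (illustrative userland code; fd[0] is assumed
+ * to be the read end of a pipe created with pipe(2)):
+ *
+ *	int nready;
+ *
+ *	if (ioctl(fd[0], FIONREAD, &nready) == 0)
+ *		printf("%d bytes buffered\n", nready);
+ */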
+
+static int
+pipe_select(fp, which, p)
+ struct file *fp;
+ int which;
+ struct proc *p;
+{
+ register struct pipe *rpipe = (struct pipe *)fp->f_data;
+ struct pipe *wpipe;
+
+ wpipe = rpipe->pipe_peer;
+ switch (which) {
+
+ case FREAD:
+ if ( (rpipe->pipe_state & PIPE_DIRECTW) ||
+ (rpipe->pipe_buffer.cnt > 0) ||
+ (rpipe->pipe_state & PIPE_EOF)) {
+ return (1);
+ }
+ selrecord(p, &rpipe->pipe_sel);
+ rpipe->pipe_state |= PIPE_SEL;
+ break;
+
+ case FWRITE:
+ if ((wpipe == NULL) ||
+ (wpipe->pipe_state & PIPE_EOF) ||
+ (((wpipe->pipe_state & PIPE_DIRECTW) == 0) &&
+ (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) {
+ return (1);
+ }
+ selrecord(p, &wpipe->pipe_sel);
+ wpipe->pipe_state |= PIPE_SEL;
+ break;
+
+ case 0:
+ if ((rpipe->pipe_state & PIPE_EOF) ||
+ (wpipe == NULL) ||
+ (wpipe->pipe_state & PIPE_EOF)) {
+ return (1);
+ }
+
+ selrecord(p, &rpipe->pipe_sel);
+ rpipe->pipe_state |= PIPE_SEL;
+ break;
+ }
+ return (0);
+}
+
+int
+pipe_stat(pipe, ub)
+ register struct pipe *pipe;
+ register struct stat *ub;
+{
+ bzero((caddr_t)ub, sizeof (*ub));
+ ub->st_mode = S_IFIFO;
+ ub->st_blksize = pipe->pipe_buffer.size;
+ ub->st_size = pipe->pipe_buffer.cnt;
+ ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize;
+ TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, &ub->st_atimespec);
+ TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec);
+ TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec);
+ /*
+ * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev,
+ * st_flags, st_gen.
+ * XXX (st_dev, st_ino) should be unique.
+ */
+ return 0;
+}
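+
+/*
+ * st_blocks example (illustrative; assumes the default PIPE_SIZE of
+ * 16384 bytes): with 2048 bytes buffered, st_blocks is
+ * (2048 + 16384 - 1) / 16384 == 1; an empty pipe yields 0.
+ */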
+
+/* ARGSUSED */
+static int
+pipe_close(fp, p)
+ struct file *fp;
+ struct proc *p;
+{
+ struct pipe *cpipe = (struct pipe *)fp->f_data;
+
+ pipeclose(cpipe);
+ fp->f_data = NULL;
+ return 0;
+}
+
+/*
+ * shut down the pipe
+ */
+static void
+pipeclose(cpipe)
+ struct pipe *cpipe;
+{
+ struct pipe *ppipe;
+ if (cpipe) {
+
+ pipeselwakeup(cpipe);
+
+ /*
+ * If the other side is blocked, wake it up saying that
+ * we want to close it down.
+ */
+ while (cpipe->pipe_busy) {
+ wakeup(cpipe);
+ cpipe->pipe_state |= PIPE_WANT|PIPE_EOF;
+ tsleep(cpipe, PRIBIO, "pipecl", 0);
+ }
+
+ /*
+ * Disconnect from peer
+ */
+		if ((ppipe = cpipe->pipe_peer) != NULL) {
+ pipeselwakeup(ppipe);
+
+ ppipe->pipe_state |= PIPE_EOF;
+ wakeup(ppipe);
+ ppipe->pipe_peer = NULL;
+ }
+
+ /*
+ * free resources
+ */
+ if (cpipe->pipe_buffer.buffer) {
+ if (cpipe->pipe_buffer.size > PIPE_SIZE)
+ --nbigpipe;
+ amountpipekva -= cpipe->pipe_buffer.size;
+ kmem_free(kernel_map,
+ (vm_offset_t)cpipe->pipe_buffer.buffer,
+ cpipe->pipe_buffer.size);
+ }
+#ifndef PIPE_NODIRECT
+ if (cpipe->pipe_map.kva) {
+ amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE;
+ kmem_free(kernel_map,
+ cpipe->pipe_map.kva,
+ cpipe->pipe_buffer.size + PAGE_SIZE);
+ }
+#endif
+ free(cpipe, M_TEMP);
+ }
+}
+#endif
diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c
index 4cc40ba..7a538b6 100644
--- a/sys/kern/sys_process.c
+++ b/sys/kern/sys_process.c
@@ -1,11 +1,6 @@
-/*-
- * Copyright (c) 1982, 1986, 1989, 1993
- * The Regents of the University of California. All rights reserved.
- * (c) UNIX System Laboratories, Inc.
- * All or some portions of this file are derived from material licensed
- * to the University of California by American Telephone and Telegraph
- * Co. or Unix System Laboratories, Inc. and are reproduced herein with
- * the permission of UNIX System Laboratories, Inc.
+/*
+ * Copyright (c) 1994, Sean Eric Fagan
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -17,16 +12,14 @@
* documentation and/or other materials provided with the distribution.
* 3. All advertising materials mentioning features or use of this software
* must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
+ * This product includes software developed by Sean Eric Fagan.
+ * 4. The name of the author may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
*
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
@@ -35,40 +28,481 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93
+ * $Id$
*/
#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
#include <sys/proc.h>
+#include <sys/vnode.h>
+#include <sys/ptrace.h>
#include <sys/errno.h>
+#include <sys/queue.h>
+
+#include <machine/reg.h>
+#include <machine/psl.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+
+#include <sys/user.h>
+#include <miscfs/procfs/procfs.h>
+
+/* use the equivalent procfs code */
+#if 0
+static int
+pread (struct proc *procp, unsigned int addr, unsigned int *retval) {
+ int rv;
+ vm_map_t map, tmap;
+ vm_object_t object;
+ vm_offset_t kva = 0;
+ int page_offset; /* offset into page */
+ vm_offset_t pageno; /* page number */
+ vm_map_entry_t out_entry;
+ vm_prot_t out_prot;
+ boolean_t wired, single_use;
+ vm_pindex_t pindex;
+
+ /* Map page into kernel space */
+
+ map = &procp->p_vmspace->vm_map;
+
+ page_offset = addr - trunc_page(addr);
+ pageno = trunc_page(addr);
+
+ tmap = map;
+ rv = vm_map_lookup (&tmap, pageno, VM_PROT_READ, &out_entry,
+ &object, &pindex, &out_prot, &wired, &single_use);
+
+ if (rv != KERN_SUCCESS)
+ return EINVAL;
+
+ vm_map_lookup_done (tmap, out_entry);
+
+ /* Find space in kernel_map for the page we're interested in */
+ rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex),
+ &kva, PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0);
+
+ if (!rv) {
+ vm_object_reference (object);
+
+ rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0);
+ if (!rv) {
+ *retval = 0;
+ bcopy ((caddr_t)kva + page_offset,
+ retval, sizeof *retval);
+ }
+ vm_map_remove (kernel_map, kva, kva + PAGE_SIZE);
+ }
+
+ return rv;
+}
+
+static int
+pwrite (struct proc *procp, unsigned int addr, unsigned int datum) {
+ int rv;
+ vm_map_t map, tmap;
+ vm_object_t object;
+ vm_offset_t kva = 0;
+ int page_offset; /* offset into page */
+ vm_offset_t pageno; /* page number */
+ vm_map_entry_t out_entry;
+ vm_prot_t out_prot;
+ boolean_t wired, single_use;
+ vm_pindex_t pindex;
+ boolean_t fix_prot = 0;
+
+ /* Map page into kernel space */
+
+ map = &procp->p_vmspace->vm_map;
+
+ page_offset = addr - trunc_page(addr);
+ pageno = trunc_page(addr);
+
+ /*
+ * Check the permissions for the area we're interested in.
+ */
+
+ if (vm_map_check_protection (map, pageno, pageno + PAGE_SIZE,
+ VM_PROT_WRITE) == FALSE) {
+ /*
+ * If the page was not writable, we make it so.
+ * XXX It is possible a page may *not* be read/executable,
+ * if a process changes that!
+ */
+ fix_prot = 1;
+ /* The page isn't writable, so let's try making it so... */
+ if ((rv = vm_map_protect (map, pageno, pageno + PAGE_SIZE,
+ VM_PROT_ALL, 0)) != KERN_SUCCESS)
+ return EFAULT; /* I guess... */
+ }
+
+ /*
+ * Now we need to get the page. out_entry, out_prot, wired, and
+ * single_use aren't used. One would think the vm code would be
+ * a *bit* nicer... We use tmap because vm_map_lookup() can
+ * change the map argument.
+ */
+
+ tmap = map;
+ rv = vm_map_lookup (&tmap, pageno, VM_PROT_WRITE, &out_entry,
+ &object, &pindex, &out_prot, &wired, &single_use);
+ if (rv != KERN_SUCCESS) {
+ return EINVAL;
+ }
+
+ /*
+ * Okay, we've got the page. Let's release tmap.
+ */
+
+ vm_map_lookup_done (tmap, out_entry);
+
+ /*
+ * Fault the page in...
+ */
+
+ rv = vm_fault(map, pageno, VM_PROT_WRITE|VM_PROT_READ, FALSE);
+ if (rv != KERN_SUCCESS)
+ return EFAULT;
+
+ /* Find space in kernel_map for the page we're interested in */
+ rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex),
+ &kva, PAGE_SIZE, 0,
+ VM_PROT_ALL, VM_PROT_ALL, 0);
+ if (!rv) {
+ vm_object_reference (object);
+
+ rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0);
+ if (!rv) {
+ bcopy (&datum, (caddr_t)kva + page_offset, sizeof datum);
+ }
+ vm_map_remove (kernel_map, kva, kva + PAGE_SIZE);
+ }
+
+ if (fix_prot)
+ vm_map_protect (map, pageno, pageno + PAGE_SIZE,
+ VM_PROT_READ|VM_PROT_EXECUTE, 0);
+ return rv;
+}
+#endif
/*
* Process debugging system call.
*/
+#ifndef _SYS_SYSPROTO_H_
struct ptrace_args {
int req;
pid_t pid;
caddr_t addr;
int data;
};
-ptrace(a1, a2, a3)
- struct proc *a1;
- struct ptrace_args *a2;
- int *a3;
+#endif
+
+int
+ptrace(curp, uap, retval)
+ struct proc *curp;
+ struct ptrace_args *uap;
+ int *retval;
{
+ struct proc *p;
+ struct iovec iov;
+ struct uio uio;
+ int error = 0;
+ int write;
+ int s;
+
+ if (uap->req == PT_TRACE_ME)
+ p = curp;
+ else {
+ if ((p = pfind(uap->pid)) == NULL)
+ return ESRCH;
+ }
/*
- * Body deleted.
+ * Permissions check
*/
- return (ENOSYS);
-}
+ switch (uap->req) {
+ case PT_TRACE_ME:
+ /* Always legal. */
+ break;
-trace_req(a1)
- struct proc *a1;
-{
+ case PT_ATTACH:
+ /* Self */
+ if (p->p_pid == curp->p_pid)
+ return EINVAL;
+
+ /* Already traced */
+ if (p->p_flag & P_TRACED)
+ return EBUSY;
+
+ /* not owned by you, has done setuid (unless you're root) */
+ if ((p->p_cred->p_ruid != curp->p_cred->p_ruid) ||
+ (p->p_flag & P_SUGID)) {
+			if ((error = suser(curp->p_ucred, &curp->p_acflag)))
+ return error;
+ }
+
+ /* OK */
+ break;
+
+ case PT_READ_I:
+ case PT_READ_D:
+ case PT_READ_U:
+ case PT_WRITE_I:
+ case PT_WRITE_D:
+ case PT_WRITE_U:
+ case PT_CONTINUE:
+ case PT_KILL:
+ case PT_STEP:
+ case PT_DETACH:
+#ifdef PT_GETREGS
+ case PT_GETREGS:
+#endif
+#ifdef PT_SETREGS
+ case PT_SETREGS:
+#endif
+#ifdef PT_GETFPREGS
+ case PT_GETFPREGS:
+#endif
+#ifdef PT_SETFPREGS
+ case PT_SETFPREGS:
+#endif
+ /* not being traced... */
+ if ((p->p_flag & P_TRACED) == 0)
+ return EPERM;
+
+ /* not being traced by YOU */
+ if (p->p_pptr != curp)
+ return EBUSY;
+
+ /* not currently stopped */
+ if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0)
+ return EBUSY;
+
+ /* OK */
+ break;
+
+ default:
+ return EINVAL;
+ }
+
+#ifdef FIX_SSTEP
+ /*
+ * Single step fixup ala procfs
+ */
+ FIX_SSTEP(p);
+#endif
/*
- * Body deleted.
+ * Actually do the requests
*/
- return (0);
+
+ write = 0;
+ *retval = 0;
+
+ switch (uap->req) {
+ case PT_TRACE_ME:
+ /* set my trace flag and "owner" so it can read/write me */
+ p->p_flag |= P_TRACED;
+ p->p_oppid = p->p_pptr->p_pid;
+ return 0;
+
+ case PT_ATTACH:
+ /* security check done above */
+ p->p_flag |= P_TRACED;
+ p->p_oppid = p->p_pptr->p_pid;
+ if (p->p_pptr != curp)
+ proc_reparent(p, curp);
+ uap->data = SIGSTOP;
+ goto sendsig; /* in PT_CONTINUE below */
+
+ case PT_STEP:
+ case PT_CONTINUE:
+ case PT_DETACH:
+ if ((unsigned)uap->data >= NSIG)
+ return EINVAL;
+
+ PHOLD(p);
+
+ if (uap->req == PT_STEP) {
+ if ((error = ptrace_single_step (p))) {
+ PRELE(p);
+ return error;
+ }
+ }
+
+ if (uap->addr != (caddr_t)1) {
+ fill_eproc (p, &p->p_addr->u_kproc.kp_eproc);
+ if ((error = ptrace_set_pc (p, (u_int)uap->addr))) {
+ PRELE(p);
+ return error;
+ }
+ }
+ PRELE(p);
+
+ if (uap->req == PT_DETACH) {
+ /* reset process parent */
+ if (p->p_oppid != p->p_pptr->p_pid) {
+ struct proc *pp;
+
+ pp = pfind(p->p_oppid);
+ proc_reparent(p, pp ? pp : initproc);
+ }
+
+ p->p_flag &= ~(P_TRACED | P_WAITED);
+ p->p_oppid = 0;
+
+ /* should we send SIGCHLD? */
+
+ }
+
+ sendsig:
+ /* deliver or queue signal */
+ s = splhigh();
+ if (p->p_stat == SSTOP) {
+ p->p_xstat = uap->data;
+ setrunnable(p);
+ } else if (uap->data) {
+ psignal(p, uap->data);
+ }
+ splx(s);
+ return 0;
+
+ case PT_WRITE_I:
+ case PT_WRITE_D:
+ write = 1;
+ /* fallthrough */
+ case PT_READ_I:
+ case PT_READ_D:
+ /* write = 0 set above */
+ iov.iov_base = write ? (caddr_t)&uap->data : (caddr_t)retval;
+ iov.iov_len = sizeof(int);
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = (off_t)(u_long)uap->addr;
+ uio.uio_resid = sizeof(int);
+ uio.uio_segflg = UIO_SYSSPACE; /* ie: the uap */
+ uio.uio_rw = write ? UIO_WRITE : UIO_READ;
+ uio.uio_procp = p;
+ error = procfs_domem(curp, p, NULL, &uio);
+ if (uio.uio_resid != 0) {
+ /*
+ * XXX procfs_domem() doesn't currently return ENOSPC,
+ * so I think write() can bogusly return 0.
+ * XXX what happens for short writes? We don't want
+ * to write partial data.
+ * XXX procfs_domem() returns EPERM for other invalid
+ * addresses. Convert this to EINVAL. Does this
+ * clobber returns of EPERM for other reasons?
+ */
+ if (error == 0 || error == ENOSPC || error == EPERM)
+ error = EINVAL; /* EOF */
+ }
+ return (error);
+
+ case PT_READ_U:
+ if ((u_int)uap->addr > (UPAGES * PAGE_SIZE - sizeof(int))) {
+ return EFAULT;
+ }
+ error = 0;
+ PHOLD(p); /* user had damn well better be incore! */
+ if (p->p_flag & P_INMEM) {
+ p->p_addr->u_kproc.kp_proc = *p;
+ fill_eproc (p, &p->p_addr->u_kproc.kp_eproc);
+ *retval = *(int*)((u_int)p->p_addr + (u_int)uap->addr);
+ } else {
+ *retval = 0;
+ error = EFAULT;
+ }
+ PRELE(p);
+ return error;
+
+ case PT_WRITE_U:
+ PHOLD(p); /* user had damn well better be incore! */
+ if (p->p_flag & P_INMEM) {
+ p->p_addr->u_kproc.kp_proc = *p;
+ fill_eproc (p, &p->p_addr->u_kproc.kp_eproc);
+ error = ptrace_write_u(p, (vm_offset_t)uap->addr, uap->data);
+ } else {
+ error = EFAULT;
+ }
+ PRELE(p);
+ return error;
+
+ case PT_KILL:
+ uap->data = SIGKILL;
+ goto sendsig; /* in PT_CONTINUE above */
+
+#ifdef PT_SETREGS
+ case PT_SETREGS:
+ write = 1;
+ /* fallthrough */
+#endif /* PT_SETREGS */
+#ifdef PT_GETREGS
+ case PT_GETREGS:
+ /* write = 0 above */
+#endif /* PT_GETREGS */
+#if defined(PT_SETREGS) || defined(PT_GETREGS)
+ if (!procfs_validregs(p)) /* no P_SYSTEM procs please */
+ return EINVAL;
+ else {
+ iov.iov_base = uap->addr;
+ iov.iov_len = sizeof(struct reg);
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = 0;
+ uio.uio_resid = sizeof(struct reg);
+ uio.uio_segflg = UIO_USERSPACE;
+ uio.uio_rw = write ? UIO_WRITE : UIO_READ;
+ uio.uio_procp = curp;
+ return (procfs_doregs(curp, p, NULL, &uio));
+ }
+#endif /* defined(PT_SETREGS) || defined(PT_GETREGS) */
+
+#ifdef PT_SETFPREGS
+ case PT_SETFPREGS:
+ write = 1;
+ /* fallthrough */
+#endif /* PT_SETFPREGS */
+#ifdef PT_GETFPREGS
+ case PT_GETFPREGS:
+ /* write = 0 above */
+#endif /* PT_GETFPREGS */
+#if defined(PT_SETFPREGS) || defined(PT_GETFPREGS)
+ if (!procfs_validfpregs(p)) /* no P_SYSTEM procs please */
+ return EINVAL;
+ else {
+ iov.iov_base = uap->addr;
+ iov.iov_len = sizeof(struct fpreg);
+ uio.uio_iov = &iov;
+ uio.uio_iovcnt = 1;
+ uio.uio_offset = 0;
+ uio.uio_resid = sizeof(struct fpreg);
+ uio.uio_segflg = UIO_USERSPACE;
+ uio.uio_rw = write ? UIO_WRITE : UIO_READ;
+ uio.uio_procp = curp;
+ return (procfs_dofpregs(curp, p, NULL, &uio));
+ }
+#endif /* defined(PT_SETFPREGS) || defined(PT_GETFPREGS) */
+
+ default:
+ break;
+ }
+
+ return 0;
+}
+
+int
+trace_req(p)
+ struct proc *p;
+{
+ return 1;
}
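+
+/*
+ * ptrace(2) usage sketch (illustrative userland code, error handling
+ * omitted): attach to a process, peek one int from its data space, then
+ * detach and let it resume where it stopped.
+ *
+ *	ptrace(PT_ATTACH, pid, 0, 0);
+ *	waitpid(pid, &status, 0);		  stops with SIGSTOP
+ *	val = ptrace(PT_READ_D, pid, addr, 0);
+ *	ptrace(PT_DETACH, pid, (caddr_t)1, 0);	  addr 1: keep current pc
+ */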
diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c
index abc2dc7..c3e6615 100644
--- a/sys/kern/sys_socket.c
+++ b/sys/kern/sys_socket.c
@@ -30,28 +30,39 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)sys_socket.c 8.3 (Berkeley) 2/14/95
+ * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93
+ * $Id: sys_socket.c,v 1.11 1997/03/23 03:36:25 bde Exp $
*/
#include <sys/param.h>
+#include <sys/queue.h>
#include <sys/systm.h>
#include <sys/proc.h>
+#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
-#include <sys/ioctl.h>
+#include <sys/filio.h> /* XXX */
+#include <sys/sockio.h>
#include <sys/stat.h>
#include <net/if.h>
#include <net/route.h>
+static int soo_read __P((struct file *fp, struct uio *uio,
+ struct ucred *cred));
+static int soo_write __P((struct file *fp, struct uio *uio,
+ struct ucred *cred));
+static int soo_close __P((struct file *fp, struct proc *p));
+
struct fileops socketops =
{ soo_read, soo_write, soo_ioctl, soo_select, soo_close };
/* ARGSUSED */
-int
+static int
soo_read(fp, uio, cred)
struct file *fp;
struct uio *uio;
@@ -63,7 +74,7 @@ soo_read(fp, uio, cred)
}
/* ARGSUSED */
-int
+static int
soo_write(fp, uio, cred)
struct file *fp;
struct uio *uio;
@@ -77,7 +88,7 @@ soo_write(fp, uio, cred)
int
soo_ioctl(fp, cmd, data, p)
struct file *fp;
- u_long cmd;
+ int cmd;
register caddr_t data;
struct proc *p;
{
@@ -129,8 +140,7 @@ soo_ioctl(fp, cmd, data, p)
return (ifioctl(so, cmd, data, p));
if (IOCGROUP(cmd) == 'r')
return (rtioctl(cmd, data, p));
- return ((*so->so_proto->pr_usrreq)(so, PRU_CONTROL,
- (struct mbuf *)cmd, (struct mbuf *)data, (struct mbuf *)0));
+ return ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, 0));
}
int
@@ -183,13 +193,11 @@ soo_stat(so, ub)
bzero((caddr_t)ub, sizeof (*ub));
ub->st_mode = S_IFSOCK;
- return ((*so->so_proto->pr_usrreq)(so, PRU_SENSE,
- (struct mbuf *)ub, (struct mbuf *)0,
- (struct mbuf *)0));
+ return ((*so->so_proto->pr_usrreqs->pru_sense)(so, ub));
}
/* ARGSUSED */
-int
+static int
soo_close(fp, p)
struct file *fp;
struct proc *p;
diff --git a/sys/kern/syscalls.c b/sys/kern/syscalls.c
index 91cbdc9..e938376 100644
--- a/sys/kern/syscalls.c
+++ b/sys/kern/syscalls.c
@@ -2,7 +2,7 @@
* System call names.
*
* DO NOT EDIT-- this file is automatically generated.
- * created from @(#)syscalls.master 8.6 (Berkeley) 3/30/95
+ * created from Id: syscalls.master,v 1.33 1997/02/22 09:39:21 peter Exp
*/
char *syscallnames[] = {
@@ -14,10 +14,10 @@ char *syscallnames[] = {
"open", /* 5 = open */
"close", /* 6 = close */
"wait4", /* 7 = wait4 */
- "compat_43_creat", /* 8 = compat_43 creat */
+ "old.creat", /* 8 = old creat */
"link", /* 9 = link */
"unlink", /* 10 = unlink */
- "#11 (obsolete execv)", /* 11 = obsolete execv */
+ "obs_execv", /* 11 = obsolete execv */
"chdir", /* 12 = chdir */
"fchdir", /* 13 = fchdir */
"mknod", /* 14 = mknod */
@@ -25,7 +25,7 @@ char *syscallnames[] = {
"chown", /* 16 = chown */
"break", /* 17 = break */
"getfsstat", /* 18 = getfsstat */
- "compat_43_lseek", /* 19 = compat_43 lseek */
+ "old.lseek", /* 19 = old lseek */
"getpid", /* 20 = getpid */
"mount", /* 21 = mount */
"unmount", /* 22 = unmount */
@@ -44,18 +44,14 @@ char *syscallnames[] = {
"fchflags", /* 35 = fchflags */
"sync", /* 36 = sync */
"kill", /* 37 = kill */
- "compat_43_stat", /* 38 = compat_43 stat */
+ "old.stat", /* 38 = old stat */
"getppid", /* 39 = getppid */
- "compat_43_lstat", /* 40 = compat_43 lstat */
+ "old.lstat", /* 40 = old lstat */
"dup", /* 41 = dup */
"pipe", /* 42 = pipe */
"getegid", /* 43 = getegid */
"profil", /* 44 = profil */
-#ifdef KTRACE
"ktrace", /* 45 = ktrace */
-#else
- "#45 (unimplemented ktrace)", /* 45 = unimplemented ktrace */
-#endif
"sigaction", /* 46 = sigaction */
"getgid", /* 47 = getgid */
"sigprocmask", /* 48 = sigprocmask */
@@ -72,83 +68,75 @@ char *syscallnames[] = {
"execve", /* 59 = execve */
"umask", /* 60 = umask */
"chroot", /* 61 = chroot */
- "compat_43_fstat", /* 62 = compat_43 fstat */
- "compat_43_getkerninfo", /* 63 = compat_43 getkerninfo */
- "compat_43_getpagesize", /* 64 = compat_43 getpagesize */
+ "old.fstat", /* 62 = old fstat */
+ "old.getkerninfo", /* 63 = old getkerninfo */
+ "old.getpagesize", /* 64 = old getpagesize */
"msync", /* 65 = msync */
"vfork", /* 66 = vfork */
- "#67 (obsolete vread)", /* 67 = obsolete vread */
- "#68 (obsolete vwrite)", /* 68 = obsolete vwrite */
+ "obs_vread", /* 67 = obsolete vread */
+ "obs_vwrite", /* 68 = obsolete vwrite */
"sbrk", /* 69 = sbrk */
"sstk", /* 70 = sstk */
- "compat_43_mmap", /* 71 = compat_43 mmap */
+ "old.mmap", /* 71 = old mmap */
"vadvise", /* 72 = vadvise */
"munmap", /* 73 = munmap */
"mprotect", /* 74 = mprotect */
"madvise", /* 75 = madvise */
- "#76 (obsolete vhangup)", /* 76 = obsolete vhangup */
- "#77 (obsolete vlimit)", /* 77 = obsolete vlimit */
+ "obs_vhangup", /* 76 = obsolete vhangup */
+ "obs_vlimit", /* 77 = obsolete vlimit */
"mincore", /* 78 = mincore */
"getgroups", /* 79 = getgroups */
"setgroups", /* 80 = setgroups */
"getpgrp", /* 81 = getpgrp */
"setpgid", /* 82 = setpgid */
"setitimer", /* 83 = setitimer */
- "compat_43_wait", /* 84 = compat_43 wait */
+ "old.wait", /* 84 = old wait */
"swapon", /* 85 = swapon */
"getitimer", /* 86 = getitimer */
- "compat_43_gethostname", /* 87 = compat_43 gethostname */
- "compat_43_sethostname", /* 88 = compat_43 sethostname */
+ "old.gethostname", /* 87 = old gethostname */
+ "old.sethostname", /* 88 = old sethostname */
"getdtablesize", /* 89 = getdtablesize */
"dup2", /* 90 = dup2 */
- "#91 (unimplemented getdopt)", /* 91 = unimplemented getdopt */
+ "#91", /* 91 = getdopt */
"fcntl", /* 92 = fcntl */
"select", /* 93 = select */
- "#94 (unimplemented setdopt)", /* 94 = unimplemented setdopt */
+ "#94", /* 94 = setdopt */
"fsync", /* 95 = fsync */
"setpriority", /* 96 = setpriority */
"socket", /* 97 = socket */
"connect", /* 98 = connect */
- "compat_43_accept", /* 99 = compat_43 accept */
+ "old.accept", /* 99 = old accept */
"getpriority", /* 100 = getpriority */
- "compat_43_send", /* 101 = compat_43 send */
- "compat_43_recv", /* 102 = compat_43 recv */
+ "old.send", /* 101 = old send */
+ "old.recv", /* 102 = old recv */
"sigreturn", /* 103 = sigreturn */
"bind", /* 104 = bind */
"setsockopt", /* 105 = setsockopt */
"listen", /* 106 = listen */
- "#107 (obsolete vtimes)", /* 107 = obsolete vtimes */
- "compat_43_sigvec", /* 108 = compat_43 sigvec */
- "compat_43_sigblock", /* 109 = compat_43 sigblock */
- "compat_43_sigsetmask", /* 110 = compat_43 sigsetmask */
+ "obs_vtimes", /* 107 = obsolete vtimes */
+ "old.sigvec", /* 108 = old sigvec */
+ "old.sigblock", /* 109 = old sigblock */
+ "old.sigsetmask", /* 110 = old sigsetmask */
"sigsuspend", /* 111 = sigsuspend */
- "compat_43_sigstack", /* 112 = compat_43 sigstack */
- "compat_43_recvmsg", /* 113 = compat_43 recvmsg */
- "compat_43_sendmsg", /* 114 = compat_43 sendmsg */
-#ifdef TRACE
- "vtrace", /* 115 = vtrace */
-#else
- "#115 (obsolete vtrace)", /* 115 = obsolete vtrace */
-#endif
+ "old.sigstack", /* 112 = old sigstack */
+ "old.recvmsg", /* 113 = old recvmsg */
+ "old.sendmsg", /* 114 = old sendmsg */
+ "obs_vtrace", /* 115 = obsolete vtrace */
"gettimeofday", /* 116 = gettimeofday */
"getrusage", /* 117 = getrusage */
"getsockopt", /* 118 = getsockopt */
-#ifdef vax
- "resuba", /* 119 = resuba */
-#else
- "#119 (unimplemented resuba)", /* 119 = unimplemented resuba */
-#endif
+ "#119", /* 119 = resuba */
"readv", /* 120 = readv */
"writev", /* 121 = writev */
"settimeofday", /* 122 = settimeofday */
"fchown", /* 123 = fchown */
"fchmod", /* 124 = fchmod */
- "compat_43_recvfrom", /* 125 = compat_43 recvfrom */
- "compat_43_setreuid", /* 126 = compat_43 setreuid */
- "compat_43_setregid", /* 127 = compat_43 setregid */
+ "old.recvfrom", /* 125 = old recvfrom */
+ "setreuid", /* 126 = setreuid */
+ "setregid", /* 127 = setregid */
"rename", /* 128 = rename */
- "compat_43_truncate", /* 129 = compat_43 truncate */
- "compat_43_ftruncate", /* 130 = compat_43 ftruncate */
+ "old.truncate", /* 129 = old truncate */
+ "old.ftruncate", /* 130 = old ftruncate */
"flock", /* 131 = flock */
"mkfifo", /* 132 = mkfifo */
"sendto", /* 133 = sendto */
@@ -157,60 +145,56 @@ char *syscallnames[] = {
"mkdir", /* 136 = mkdir */
"rmdir", /* 137 = rmdir */
"utimes", /* 138 = utimes */
- "#139 (obsolete 4.2 sigreturn)", /* 139 = obsolete 4.2 sigreturn */
+ "obs_4.2", /* 139 = obsolete 4.2 sigreturn */
"adjtime", /* 140 = adjtime */
- "compat_43_getpeername", /* 141 = compat_43 getpeername */
- "compat_43_gethostid", /* 142 = compat_43 gethostid */
- "compat_43_sethostid", /* 143 = compat_43 sethostid */
- "compat_43_getrlimit", /* 144 = compat_43 getrlimit */
- "compat_43_setrlimit", /* 145 = compat_43 setrlimit */
- "compat_43_killpg", /* 146 = compat_43 killpg */
+ "old.getpeername", /* 141 = old getpeername */
+ "old.gethostid", /* 142 = old gethostid */
+ "old.sethostid", /* 143 = old sethostid */
+ "old.getrlimit", /* 144 = old getrlimit */
+ "old.setrlimit", /* 145 = old setrlimit */
+ "old.killpg", /* 146 = old killpg */
"setsid", /* 147 = setsid */
"quotactl", /* 148 = quotactl */
- "compat_43_quota", /* 149 = compat_43 quota */
- "compat_43_getsockname", /* 150 = compat_43 getsockname */
- "#151 (unimplemented)", /* 151 = unimplemented */
- "#152 (unimplemented)", /* 152 = unimplemented */
- "#153 (unimplemented)", /* 153 = unimplemented */
- "#154 (unimplemented)", /* 154 = unimplemented */
+ "old.quota", /* 149 = old quota */
+ "old.getsockname", /* 150 = old getsockname */
+ "#151", /* 151 = sem_lock */
+ "#152", /* 152 = sem_wakeup */
+ "#153", /* 153 = asyncdaemon */
+ "#154", /* 154 = nosys */
#ifdef NFS
"nfssvc", /* 155 = nfssvc */
#else
- "#155 (unimplemented nfssvc)", /* 155 = unimplemented nfssvc */
+ "#155", /* 155 = nosys */
#endif
- "compat_43_getdirentries", /* 156 = compat_43 getdirentries */
+ "old.getdirentries", /* 156 = old getdirentries */
"statfs", /* 157 = statfs */
"fstatfs", /* 158 = fstatfs */
- "#159 (unimplemented)", /* 159 = unimplemented */
- "#160 (unimplemented)", /* 160 = unimplemented */
-#ifdef NFS
+ "#159", /* 159 = nosys */
+ "#160", /* 160 = nosys */
+#if defined(NFS) && !defined (NFS_NOSERVER)
"getfh", /* 161 = getfh */
#else
- "#161 (unimplemented getfh)", /* 161 = unimplemented getfh */
-#endif
- "#162 (unimplemented getdomainname)", /* 162 = unimplemented getdomainname */
- "#163 (unimplemented setdomainname)", /* 163 = unimplemented setdomainname */
- "#164 (unimplemented)", /* 164 = unimplemented */
- "#165 (unimplemented)", /* 165 = unimplemented */
- "#166 (unimplemented)", /* 166 = unimplemented */
- "#167 (unimplemented)", /* 167 = unimplemented */
- "#168 (unimplemented)", /* 168 = unimplemented */
- "#169 (unimplemented semsys)", /* 169 = unimplemented semsys */
- "#170 (unimplemented msgsys)", /* 170 = unimplemented msgsys */
-#if defined(SYSVSHM) && !defined(alpha)
- "compat_43_shmsys", /* 171 = compat_43 shmsys */
-#else
- "#171 (unimplemented shmsys)", /* 171 = unimplemented shmsys */
+ "#161", /* 161 = nosys */
#endif
- "#172 (unimplemented)", /* 172 = unimplemented */
- "#173 (unimplemented)", /* 173 = unimplemented */
- "#174 (unimplemented)", /* 174 = unimplemented */
- "#175 (unimplemented)", /* 175 = unimplemented */
- "#176 (unimplemented)", /* 176 = unimplemented */
- "#177 (unimplemented)", /* 177 = unimplemented */
- "#178 (unimplemented)", /* 178 = unimplemented */
- "#179 (unimplemented)", /* 179 = unimplemented */
- "#180 (unimplemented)", /* 180 = unimplemented */
+ "getdomainname", /* 162 = getdomainname */
+ "setdomainname", /* 163 = setdomainname */
+ "uname", /* 164 = uname */
+ "sysarch", /* 165 = sysarch */
+ "rtprio", /* 166 = rtprio */
+ "#167", /* 167 = nosys */
+ "#168", /* 168 = nosys */
+ "semsys", /* 169 = semsys */
+ "msgsys", /* 170 = msgsys */
+ "shmsys", /* 171 = shmsys */
+ "#172", /* 172 = nosys */
+ "#173", /* 173 = nosys */
+ "#174", /* 174 = nosys */
+ "#175", /* 175 = nosys */
+ "ntp_adjtime", /* 176 = ntp_adjtime */
+ "#177", /* 177 = sfork */
+ "#178", /* 178 = getdescriptor */
+ "#179", /* 179 = setdescriptor */
+ "#180", /* 180 = nosys */
"setgid", /* 181 = setgid */
"setegid", /* 182 = setegid */
"seteuid", /* 183 = seteuid */
@@ -220,17 +204,17 @@ char *syscallnames[] = {
"lfs_segclean", /* 186 = lfs_segclean */
"lfs_segwait", /* 187 = lfs_segwait */
#else
- "#184 (unimplemented lfs_bmapv)", /* 184 = unimplemented lfs_bmapv */
- "#185 (unimplemented lfs_markv)", /* 185 = unimplemented lfs_markv */
- "#186 (unimplemented lfs_segclean)", /* 186 = unimplemented lfs_segclean */
- "#187 (unimplemented lfs_segwait)", /* 187 = unimplemented lfs_segwait */
+ "#184", /* 184 = nosys */
+ "#185", /* 185 = nosys */
+ "#186", /* 186 = nosys */
+ "#187", /* 187 = nosys */
#endif
"stat", /* 188 = stat */
"fstat", /* 189 = fstat */
"lstat", /* 190 = lstat */
"pathconf", /* 191 = pathconf */
"fpathconf", /* 192 = fpathconf */
- "#193 (unimplemented)", /* 193 = unimplemented */
+ "#193", /* 193 = nosys */
"getrlimit", /* 194 = getrlimit */
"setrlimit", /* 195 = setrlimit */
"getdirentries", /* 196 = getdirentries */
@@ -242,38 +226,51 @@ char *syscallnames[] = {
"__sysctl", /* 202 = __sysctl */
"mlock", /* 203 = mlock */
"munlock", /* 204 = munlock */
- "undelete", /* 205 = undelete */
- "#206 (unimplemented)", /* 206 = unimplemented */
- "#207 (unimplemented)", /* 207 = unimplemented */
- "#208 (unimplemented)", /* 208 = unimplemented */
- "#209 (unimplemented)", /* 209 = unimplemented */
- "#210 (unimplemented)", /* 210 = unimplemented */
- "#211 (unimplemented)", /* 211 = unimplemented */
- "#212 (unimplemented)", /* 212 = unimplemented */
- "#213 (unimplemented)", /* 213 = unimplemented */
- "#214 (unimplemented)", /* 214 = unimplemented */
- "#215 (unimplemented)", /* 215 = unimplemented */
- "#216 (unimplemented)", /* 216 = unimplemented */
- "#217 (unimplemented)", /* 217 = unimplemented */
- "#218 (unimplemented)", /* 218 = unimplemented */
- "#219 (unimplemented)", /* 219 = unimplemented */
- "#220 (unimplemented semctl)", /* 220 = unimplemented semctl */
- "#221 (unimplemented semget)", /* 221 = unimplemented semget */
- "#222 (unimplemented semop)", /* 222 = unimplemented semop */
- "#223 (unimplemented semconfig)", /* 223 = unimplemented semconfig */
- "#224 (unimplemented msgctl)", /* 224 = unimplemented msgctl */
- "#225 (unimplemented msgget)", /* 225 = unimplemented msgget */
- "#226 (unimplemented msgsnd)", /* 226 = unimplemented msgsnd */
- "#227 (unimplemented msgrcv)", /* 227 = unimplemented msgrcv */
-#if defined(SYSVSHM) && 0
+ "utrace", /* 205 = utrace */
+ "undelete", /* 206 = undelete */
+ "#207", /* 207 = nosys */
+ "#208", /* 208 = nosys */
+ "#209", /* 209 = nosys */
+ "lkmnosys", /* 210 = lkmnosys */
+ "lkmnosys", /* 211 = lkmnosys */
+ "lkmnosys", /* 212 = lkmnosys */
+ "lkmnosys", /* 213 = lkmnosys */
+ "lkmnosys", /* 214 = lkmnosys */
+ "lkmnosys", /* 215 = lkmnosys */
+ "lkmnosys", /* 216 = lkmnosys */
+ "lkmnosys", /* 217 = lkmnosys */
+ "lkmnosys", /* 218 = lkmnosys */
+ "lkmnosys", /* 219 = lkmnosys */
+ "__semctl", /* 220 = __semctl */
+ "semget", /* 221 = semget */
+ "semop", /* 222 = semop */
+ "semconfig", /* 223 = semconfig */
+ "msgctl", /* 224 = msgctl */
+ "msgget", /* 225 = msgget */
+ "msgsnd", /* 226 = msgsnd */
+ "msgrcv", /* 227 = msgrcv */
"shmat", /* 228 = shmat */
"shmctl", /* 229 = shmctl */
"shmdt", /* 230 = shmdt */
"shmget", /* 231 = shmget */
-#else
- "#228 (unimplemented shmat)", /* 228 = unimplemented shmat */
- "#229 (unimplemented shmctl)", /* 229 = unimplemented shmctl */
- "#230 (unimplemented shmdt)", /* 230 = unimplemented shmdt */
- "#231 (unimplemented shmget)", /* 231 = unimplemented shmget */
-#endif
+ "#232", /* 232 = nosys */
+ "#233", /* 233 = nosys */
+ "#234", /* 234 = nosys */
+ "#235", /* 235 = nosys */
+ "#236", /* 236 = nosys */
+ "#237", /* 237 = nosys */
+ "#238", /* 238 = nosys */
+ "#239", /* 239 = nosys */
+ "#240", /* 240 = nosys */
+ "#241", /* 241 = nosys */
+ "#242", /* 242 = nosys */
+ "#243", /* 243 = nosys */
+ "#244", /* 244 = nosys */
+ "#245", /* 245 = nosys */
+ "#246", /* 246 = nosys */
+ "#247", /* 247 = nosys */
+ "#248", /* 248 = nosys */
+ "#249", /* 249 = nosys */
+ "minherit", /* 250 = minherit */
+ "rfork", /* 251 = rfork */
};
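
For orientation: entries in syscallnames[] that begin with `#' mark unimplemented or reserved slots, so consumers must not print them as real call names. A minimal, hypothetical lookup helper (lookup_syscall_name() is illustrative, not part of this tree):

/*
 * Hypothetical helper: map a syscall number to a printable name.
 * Assumes the generated syscallnames[] table above.
 */
extern char *syscallnames[];

static const char *
lookup_syscall_name(int code, int ncalls)
{
	if (code < 0 || code >= ncalls)
		return ("out-of-range");
	if (syscallnames[code][0] == '#')
		return ("unimplemented");
	return (syscallnames[code]);
}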
diff --git a/sys/kern/syscalls.conf b/sys/kern/syscalls.conf
deleted file mode 100644
index 71b82ce..0000000
--- a/sys/kern/syscalls.conf
+++ /dev/null
@@ -1,12 +0,0 @@
-# @(#)syscalls.conf 8.1 (Berkeley) 2/14/95
-
-sysnames="syscalls.c"
-sysnumhdr="../sys/syscall.h"
-syssw="init_sysent.c"
-sysarghdr="../sys/syscallargs.h"
-compatopts="compat_43"
-libcompatopts=""
-
-switchname="sysent"
-namesname="syscallnames"
-constprefix="SYS_"
diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master
index b57cd73..b0921d4 100644
--- a/sys/kern/syscalls.master
+++ b/sys/kern/syscalls.master
@@ -1,38 +1,32 @@
- @(#)syscalls.master 8.6 (Berkeley) 3/30/95
-; System call name/number "master" file.
-; (See syscalls.conf to see what it is processed into.)
+ $Id$
+; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94
;
-; Fields: number type [type-dependent ...]
+; System call name/number master file.
+; Processed to create init_sysent.c, syscalls.c and syscall.h.
+
+; Columns: number type nargs namespc name alt{name,tag,rtyp}/comments
; number system call number, must be in order
-; type one of STD, OBSOL, UNIMPL, NODEF, NOARGS, or one of
-; the compatibility options defined in syscalls.conf.
-;
+; type one of STD, OBSOL, UNIMPL, COMPAT
+; namespc one of POSIX, BSD, NOHIDE
+;	name		pseudo-prototype of syscall routine
+; If one of the following alts is different, then all appear:
+; altname name of system call if different
+; alttag name of args struct tag if different from [o]`name'"_args"
+; altrtyp return type if not int (bogus - syscalls always return int)
+; for UNIMPL/OBSOL, name continues with comments
+
; types:
; STD always included
-; OBSOL obsolete, not included in system
-; UNIMPL unimplemented, not included in system
-; NODEF included, but don't define the syscall number
-; NOARGS included, but don't define the syscall args structure
-;
-; The compat options are defined in the syscalls.conf file, and the
-; compat option name is prefixed to the syscall name. Other than
-; that, they're like NODEF (for 'compat' options), or STD (for
-; 'libcompat' options).
-;
-; The type-dependent arguments are as follows:
-; For STD, NODEF, NOARGS, and compat syscalls:
-; { pseudo-proto } [alias]
-; For other syscalls:
-; [comment]
-;
+; COMPAT included on COMPAT #ifdef
+; LIBCOMPAT included on COMPAT #ifdef, and placed in syscall.h
+; OBSOL obsolete, not included in system, only specifies name
+; UNIMPL not implemented, placeholder only
+
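+; For illustration (approximate; makesyscalls.sh produces the real
+; output): an entry such as
+;	3	STD	POSIX	{ int read(int fd, char *buf, u_int nbyte); }
+; expands into a sysent slot, a name-table entry and a number:
+;	{ 3, (sy_call_t *)read },	/* init_sysent.c */
+;	"read",				/* syscalls.c */
+;	#define	SYS_read	3	/* syscall.h */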
; #ifdef's, etc. may be included, and are copied to the output files.
-; #include's are copied to the syscall switch definition file only.
#include <sys/param.h>
-#include <sys/systm.h>
-#include <sys/signal.h>
-#include <sys/mount.h>
-#include <sys/syscallargs.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
; Reserved/unimplemented system calls in the range 0-150 inclusive
; are reserved for use in future Berkeley releases.
@@ -40,316 +34,359 @@
; redistributions should be placed in the reserved range at the end
; of the current calls.
-0 STD { int nosys(void); } syscall
-1 STD { int exit(int rval); }
-2 STD { int fork(void); }
-3 STD { int read(int fd, char *buf, u_int nbyte); }
-4 STD { int write(int fd, char *buf, u_int nbyte); }
-5 STD { int open(char *path, int flags, int mode); }
-6 STD { int close(int fd); }
-7 STD { int wait4(int pid, int *status, int options, \
- struct rusage *rusage); }
-8 COMPAT_43 { int creat(char *path, int mode); }
-9 STD { int link(char *path, char *link); }
-10 STD { int unlink(char *path); }
-11 OBSOL execv
-12 STD { int chdir(char *path); }
-13 STD { int fchdir(int fd); }
-14 STD { int mknod(char *path, int mode, int dev); }
-15 STD { int chmod(char *path, int mode); }
-16 STD { int chown(char *path, int uid, int gid); }
-17 STD { int obreak(char *nsize); } break
-18 STD { int getfsstat(struct statfs *buf, long bufsize, \
+0 STD NOHIDE { int nosys(void); } syscall nosys_args int
+1 STD NOHIDE { void exit(int rval); } exit rexit_args void
+2 STD POSIX { int fork(void); }
+3 STD POSIX { int read(int fd, char *buf, u_int nbyte); }
+4 STD POSIX { int write(int fd, char *buf, u_int nbyte); }
+5 STD POSIX { int open(char *path, int flags, int mode); }
+; XXX should be { int open(const char *path, int flags, ...); }
+; but we're not ready for `const' or varargs.
+; XXX man page says `mode_t mode'.
+6 STD POSIX { int close(int fd); }
+7 STD BSD { int wait4(int pid, int *status, int options, \
+ struct rusage *rusage); } wait4 wait_args int
+8 COMPAT BSD { int creat(char *path, int mode); }
+9 STD POSIX { int link(char *path, char *link); }
+10 STD POSIX { int unlink(char *path); }
+11 OBSOL NOHIDE execv
+12 STD POSIX { int chdir(char *path); }
+13 STD BSD { int fchdir(int fd); }
+14 STD POSIX { int mknod(char *path, int mode, int dev); }
+15 STD POSIX { int chmod(char *path, int mode); }
+16 STD POSIX { int chown(char *path, int uid, int gid); }
+17 STD BSD { int obreak(char *nsize); } break obreak_args int
+18 STD BSD { int getfsstat(struct statfs *buf, long bufsize, \
int flags); }
-19 COMPAT_43 { long lseek(int fd, long offset, int whence); }
-20 STD { pid_t getpid(void); }
-21 STD { int mount(char *type, char *path, int flags, \
+19 COMPAT POSIX { long lseek(int fd, long offset, int whence); }
+20 STD POSIX { pid_t getpid(void); }
+21 STD BSD { int mount(char *type, char *path, int flags, \
caddr_t data); }
-22 STD { int unmount(char *path, int flags); }
-23 STD { int setuid(uid_t uid); }
-24 STD { uid_t getuid(void); }
-25 STD { uid_t geteuid(void); }
-26 STD { int ptrace(int req, pid_t pid, caddr_t addr, \
+; XXX 4.4lite2 uses `char *type' but we're not ready for that.
+; XXX `path' should have type `const char *' but we're not ready for that.
+22 STD BSD { int unmount(char *path, int flags); }
+23 STD POSIX { int setuid(uid_t uid); }
+24 STD POSIX { uid_t getuid(void); }
+25 STD POSIX { uid_t geteuid(void); }
+26 STD BSD { int ptrace(int req, pid_t pid, caddr_t addr, \
int data); }
-27 STD { int recvmsg(int s, struct msghdr *msg, int flags); }
-28 STD { int sendmsg(int s, caddr_t msg, int flags); }
-29 STD { int recvfrom(int s, caddr_t buf, size_t len, \
+27 STD BSD { int recvmsg(int s, struct msghdr *msg, int flags); }
+28 STD BSD { int sendmsg(int s, caddr_t msg, int flags); }
+29 STD BSD { int recvfrom(int s, caddr_t buf, size_t len, \
int flags, caddr_t from, int *fromlenaddr); }
-30 STD { int accept(int s, caddr_t name, int *anamelen); }
-31 STD { int getpeername(int fdes, caddr_t asa, int *alen); }
-32 STD { int getsockname(int fdes, caddr_t asa, int *alen); }
-33 STD { int access(char *path, int flags); }
-34 STD { int chflags(char *path, int flags); }
-35 STD { int fchflags(int fd, int flags); }
-36 STD { int sync(void); }
-37 STD { int kill(int pid, int signum); }
-38 COMPAT_43 { int stat(char *path, struct ostat *ub); }
-39 STD { pid_t getppid(void); }
-40 COMPAT_43 { int lstat(char *path, struct ostat *ub); }
-41 STD { int dup(u_int fd); }
-42 STD { int pipe(void); }
-43 STD { gid_t getegid(void); }
-44 STD { int profil(caddr_t samples, u_int size, \
+30 STD BSD { int accept(int s, caddr_t name, int *anamelen); }
+31 STD BSD { int getpeername(int fdes, caddr_t asa, int *alen); }
+32 STD BSD { int getsockname(int fdes, caddr_t asa, int *alen); }
+33 STD POSIX { int access(char *path, int flags); }
+34 STD BSD { int chflags(char *path, int flags); }
+35 STD BSD { int fchflags(int fd, int flags); }
+36 STD BSD { int sync(void); }
+37 STD POSIX { int kill(int pid, int signum); }
+38 COMPAT POSIX { int stat(char *path, struct ostat *ub); }
+39 STD POSIX { pid_t getppid(void); }
+40 COMPAT POSIX { int lstat(char *path, struct ostat *ub); }
+41 STD POSIX { int dup(u_int fd); }
+42 STD POSIX { int pipe(void); }
+43 STD POSIX { gid_t getegid(void); }
+44 STD BSD { int profil(caddr_t samples, u_int size, \
u_int offset, u_int scale); }
-#ifdef KTRACE
-45 STD { int ktrace(char *fname, int ops, int facs, \
+45 STD BSD { int ktrace(char *fname, int ops, int facs, \
int pid); }
-#else
-45 UNIMPL ktrace
-#endif
-46 STD { int sigaction(int signum, struct sigaction *nsa, \
+46 STD POSIX { int sigaction(int signum, struct sigaction *nsa, \
struct sigaction *osa); }
-47 STD { gid_t getgid(void); }
-48 STD { int sigprocmask(int how, sigset_t mask); }
-49 STD { int getlogin(char *namebuf, u_int namelen); }
-50 STD { int setlogin(char *namebuf); }
-51 STD { int acct(char *path); }
-52 STD { int sigpending(void); }
-53 STD { int sigaltstack(struct sigaltstack *nss, \
+47 STD POSIX { gid_t getgid(void); }
+48 STD POSIX { int sigprocmask(int how, sigset_t mask); }
+; XXX note nonstandard (bogus) calling convention - the libc stub passes
+; us the mask, not a pointer to it, and we return the old mask as the
+; (int) return value.
+49 STD BSD { int getlogin(char *namebuf, u_int namelen); }
+50 STD BSD { int setlogin(char *namebuf); }
+51 STD BSD { int acct(char *path); }
+52 STD POSIX { int sigpending(void); }
+53 STD BSD { int sigaltstack(struct sigaltstack *nss, \
struct sigaltstack *oss); }
-54 STD { int ioctl(int fd, u_long com, caddr_t data); }
-55 STD { int reboot(int opt); }
-56 STD { int revoke(char *path); }
-57 STD { int symlink(char *path, char *link); }
-58 STD { int readlink(char *path, char *buf, int count); }
-59 STD { int execve(char *path, char **argp, char **envp); }
-60 STD { int umask(int newmask); }
-61 STD { int chroot(char *path); }
-62 COMPAT_43 { int fstat(int fd, struct ostat *sb); }
-63 COMPAT_43 { int getkerninfo(int op, char *where, int *size, \
- int arg); }
-64 COMPAT_43 { int getpagesize(void); }
-65 STD { int msync(caddr_t addr, int len); }
-66 STD { int vfork(void); }
-67 OBSOL vread
-68 OBSOL vwrite
-69 STD { int sbrk(int incr); }
-70 STD { int sstk(int incr); }
-71 COMPAT_43 { int mmap(caddr_t addr, int len, int prot, \
+54 STD POSIX { int ioctl(int fd, u_long com, caddr_t data); }
+55 STD BSD { int reboot(int opt); }
+56 STD POSIX { int revoke(char *path); }
+57 STD POSIX { int symlink(char *path, char *link); }
+58 STD POSIX { int readlink(char *path, char *buf, int count); }
+59 STD POSIX { int execve(char *fname, char **argv, char **envv); }
+60 STD POSIX { int umask(int newmask); } umask umask_args int
+61 STD BSD { int chroot(char *path); }
+62 COMPAT POSIX { int fstat(int fd, struct ostat *sb); }
+63 COMPAT BSD { int getkerninfo(int op, char *where, int *size, \
+ int arg); } getkerninfo getkerninfo_args int
+64 COMPAT BSD { int getpagesize(void); } \
+ getpagesize getpagesize_args int
+65 STD BSD { int msync(caddr_t addr, size_t len, int flags); }
+66 STD BSD { int vfork(void); }
+67 OBSOL NOHIDE vread
+68 OBSOL NOHIDE vwrite
+69 STD BSD { int sbrk(int incr); }
+70 STD BSD { int sstk(int incr); }
+71 COMPAT BSD { int mmap(caddr_t addr, int len, int prot, \
int flags, int fd, long pos); }
-72 STD { int ovadvise(int anom); } vadvise
-73 STD { int munmap(caddr_t addr, int len); }
-74 STD { int mprotect(caddr_t addr, int len, int prot); }
-75 STD { int madvise(caddr_t addr, int len, int behav); }
-76 OBSOL vhangup
-77 OBSOL vlimit
-78 STD { int mincore(caddr_t addr, int len, char *vec); }
-79 STD { int getgroups(u_int gidsetsize, gid_t *gidset); }
-80 STD { int setgroups(u_int gidsetsize, gid_t *gidset); }
-81 STD { int getpgrp(void); }
-82 STD { int setpgid(int pid, int pgid); }
-83 STD { int setitimer(u_int which, struct itimerval *itv, \
+72 STD BSD { int ovadvise(int anom); } vadvise ovadvise_args int
+73 STD BSD { int munmap(caddr_t addr, size_t len); }
+74 STD BSD { int mprotect(caddr_t addr, size_t len, int prot); }
+75 STD BSD { int madvise(caddr_t addr, size_t len, int behav); }
+76 OBSOL NOHIDE vhangup
+77 OBSOL NOHIDE vlimit
+78 STD BSD { int mincore(caddr_t addr, size_t len, char *vec); }
+79 STD POSIX { int getgroups(u_int gidsetsize, gid_t *gidset); }
+80 STD POSIX { int setgroups(u_int gidsetsize, gid_t *gidset); }
+81 STD POSIX { int getpgrp(void); }
+82 STD POSIX { int setpgid(int pid, int pgid); }
+83 STD BSD { int setitimer(u_int which, struct itimerval *itv, \
struct itimerval *oitv); }
-84 COMPAT_43 { int wait(void); }
-85 STD { int swapon(char *name); }
-86 STD { int getitimer(u_int which, struct itimerval *itv); }
-87 COMPAT_43 { int gethostname(char *hostname, u_int len); }
-88 COMPAT_43 { int sethostname(char *hostname, u_int len); }
-89 STD { int getdtablesize(void); }
-90 STD { int dup2(u_int from, u_int to); }
-91 UNIMPL getdopt
-92 STD { int fcntl(int fd, int cmd, void *arg); }
-93 STD { int select(u_int nd, fd_set *in, fd_set *ou, \
+84 COMPAT BSD { int wait(void); }
+85 STD BSD { int swapon(char *name); }
+86 STD BSD { int getitimer(u_int which, struct itimerval *itv); }
+87 COMPAT BSD { int gethostname(char *hostname, u_int len); } \
+ gethostname gethostname_args int
+88 COMPAT BSD { int sethostname(char *hostname, u_int len); } \
+ sethostname sethostname_args int
+89 STD BSD { int getdtablesize(void); }
+90 STD POSIX { int dup2(u_int from, u_int to); }
+91 UNIMPL BSD getdopt
+92 STD POSIX { int fcntl(int fd, int cmd, int arg); }
+; XXX should be { int fcntl(int fd, int cmd, ...); }
+; but we're not ready for varargs.
+; XXX man page says `int arg' too.
+93 STD BSD { int select(int nd, fd_set *in, fd_set *ou, \
fd_set *ex, struct timeval *tv); }
-94 UNIMPL setdopt
-95 STD { int fsync(int fd); }
-96 STD { int setpriority(int which, int who, int prio); }
-97 STD { int socket(int domain, int type, int protocol); }
-98 STD { int connect(int s, caddr_t name, int namelen); }
-99 COMPAT_43 { int accept(int s, caddr_t name, int *anamelen); }
-100 STD { int getpriority(int which, int who); }
-101 COMPAT_43 { int send(int s, caddr_t buf, int len, int flags); }
-102 COMPAT_43 { int recv(int s, caddr_t buf, int len, int flags); }
-103 STD { int sigreturn(struct sigcontext *sigcntxp); }
-104 STD { int bind(int s, caddr_t name, int namelen); }
-105 STD { int setsockopt(int s, int level, int name, \
+94 UNIMPL BSD setdopt
+95 STD POSIX { int fsync(int fd); }
+96 STD BSD { int setpriority(int which, int who, int prio); }
+97 STD BSD { int socket(int domain, int type, int protocol); }
+98 STD BSD { int connect(int s, caddr_t name, int namelen); }
+99 CPT_NOA BSD { int accept(int s, caddr_t name, int *anamelen); } \
+ accept accept_args int
+100 STD BSD { int getpriority(int which, int who); }
+101 COMPAT BSD { int send(int s, caddr_t buf, int len, int flags); }
+102 COMPAT BSD { int recv(int s, caddr_t buf, int len, int flags); }
+103 STD BSD { int sigreturn(struct sigcontext *sigcntxp); }
+104 STD BSD { int bind(int s, caddr_t name, int namelen); }
+105 STD BSD { int setsockopt(int s, int level, int name, \
caddr_t val, int valsize); }
-106 STD { int listen(int s, int backlog); }
-107 OBSOL vtimes
-108 COMPAT_43 { int sigvec(int signum, struct sigvec *nsv, \
+106 STD BSD { int listen(int s, int backlog); }
+107 OBSOL NOHIDE vtimes
+108 COMPAT BSD { int sigvec(int signum, struct sigvec *nsv, \
struct sigvec *osv); }
-109 COMPAT_43 { int sigblock(int mask); }
-110 COMPAT_43 { int sigsetmask(int mask); }
-111 STD { int sigsuspend(int mask); }
-112 COMPAT_43 { int sigstack(struct sigstack *nss, \
+109 COMPAT BSD { int sigblock(int mask); }
+110 COMPAT BSD { int sigsetmask(int mask); }
+111 STD POSIX { int sigsuspend(sigset_t mask); }
+; XXX note nonstandard (bogus) calling convention - the libc stub passes
+; us the mask, not a pointer to it.
+112 COMPAT BSD { int sigstack(struct sigstack *nss, \
struct sigstack *oss); }
-113 COMPAT_43 { int recvmsg(int s, struct omsghdr *msg, int flags); }
-114 COMPAT_43 { int sendmsg(int s, caddr_t msg, int flags); }
-#ifdef TRACE
-115 STD { int vtrace(int request, int value); }
-#else
-115 OBSOL vtrace
-#endif
-116 STD { int gettimeofday(struct timeval *tp, \
+113 COMPAT BSD { int recvmsg(int s, struct omsghdr *msg, int flags); }
+114 COMPAT BSD { int sendmsg(int s, caddr_t msg, int flags); }
+115 OBSOL NOHIDE vtrace
+116 STD BSD { int gettimeofday(struct timeval *tp, \
struct timezone *tzp); }
-117 STD { int getrusage(int who, struct rusage *rusage); }
-118 STD { int getsockopt(int s, int level, int name, \
+117 STD BSD { int getrusage(int who, struct rusage *rusage); }
+118 STD BSD { int getsockopt(int s, int level, int name, \
caddr_t val, int *avalsize); }
-#ifdef vax
-119 STD { int resuba(int value); }
-#else
-119 UNIMPL resuba
-#endif
-120 STD { int readv(int fd, struct iovec *iovp, u_int iovcnt); }
-121 STD { int writev(int fd, struct iovec *iovp, \
+119 UNIMPL NOHIDE resuba (BSD/OS 2.x)
+120 STD BSD { int readv(int fd, struct iovec *iovp, u_int iovcnt); }
+121 STD BSD { int writev(int fd, struct iovec *iovp, \
u_int iovcnt); }
-122 STD { int settimeofday(struct timeval *tv, \
+122 STD BSD { int settimeofday(struct timeval *tv, \
struct timezone *tzp); }
-123 STD { int fchown(int fd, int uid, int gid); }
-124 STD { int fchmod(int fd, int mode); }
-125 COMPAT_43 { int recvfrom(int s, caddr_t buf, size_t len, \
- int flags, caddr_t from, int *fromlenaddr); }
-126 COMPAT_43 { int setreuid(int ruid, int euid); }
-127 COMPAT_43 { int setregid(int rgid, int egid); }
-128 STD { int rename(char *from, char *to); }
-129 COMPAT_43 { int truncate(char *path, long length); }
-130 COMPAT_43 { int ftruncate(int fd, long length); }
-131 STD { int flock(int fd, int how); }
-132 STD { int mkfifo(char *path, int mode); }
-133 STD { int sendto(int s, caddr_t buf, size_t len, \
+123 STD BSD { int fchown(int fd, int uid, int gid); }
+124 STD BSD { int fchmod(int fd, int mode); }
+125 CPT_NOA BSD { int recvfrom(int s, caddr_t buf, size_t len, \
+ int flags, caddr_t from, int *fromlenaddr); } \
+ recvfrom recvfrom_args int
+126 STD BSD { int setreuid(int ruid, int euid); }
+127 STD BSD { int setregid(int rgid, int egid); }
+128 STD POSIX { int rename(char *from, char *to); }
+129 COMPAT BSD { int truncate(char *path, long length); }
+130 COMPAT BSD { int ftruncate(int fd, long length); }
+131 STD BSD { int flock(int fd, int how); }
+132 STD POSIX { int mkfifo(char *path, int mode); }
+133 STD BSD { int sendto(int s, caddr_t buf, size_t len, \
int flags, caddr_t to, int tolen); }
-134 STD { int shutdown(int s, int how); }
-135 STD { int socketpair(int domain, int type, int protocol, \
+134 STD BSD { int shutdown(int s, int how); }
+135 STD BSD { int socketpair(int domain, int type, int protocol, \
int *rsv); }
-136 STD { int mkdir(char *path, int mode); }
-137 STD { int rmdir(char *path); }
-138 STD { int utimes(char *path, struct timeval *tptr); }
-139 OBSOL 4.2 sigreturn
-140 STD { int adjtime(struct timeval *delta, \
+136 STD POSIX { int mkdir(char *path, int mode); }
+137 STD POSIX { int rmdir(char *path); }
+138 STD BSD { int utimes(char *path, struct timeval *tptr); }
+139 OBSOL NOHIDE 4.2 sigreturn
+140 STD BSD { int adjtime(struct timeval *delta, \
struct timeval *olddelta); }
-141 COMPAT_43 { int getpeername(int fdes, caddr_t asa, int *alen); }
-142 COMPAT_43 { int32_t gethostid(void); }
-143 COMPAT_43 { int sethostid(int32_t hostid); }
-144 COMPAT_43 { int getrlimit(u_int which, struct ogetrlimit *rlp); }
-145 COMPAT_43 { int setrlimit(u_int which, struct ogetrlimit *rlp); }
-146 COMPAT_43 { int killpg(int pgid, int signum); }
-147 STD { int setsid(void); }
-148 STD { int quotactl(char *path, int cmd, int uid, \
+141 COMPAT BSD { int getpeername(int fdes, caddr_t asa, int *alen); }
+142 COMPAT BSD { long gethostid(void); }
+143 COMPAT BSD { int sethostid(long hostid); }
+144 COMPAT BSD { int getrlimit(u_int which, struct ogetrlimit *rlp); }
+145 COMPAT BSD { int setrlimit(u_int which, struct ogetrlimit *rlp); }
+146 COMPAT BSD { int killpg(int pgid, int signum); }
+147 STD POSIX { int setsid(void); }
+148 STD BSD { int quotactl(char *path, int cmd, int uid, \
caddr_t arg); }
-149 COMPAT_43 { int quota(void); }
-150 COMPAT_43 { int getsockname(int fdec, caddr_t asa, int *alen); }
+149 COMPAT BSD { int quota(void); }
+150 CPT_NOA BSD { int getsockname(int fdec, caddr_t asa, int *alen); }\
+ getsockname getsockname_args int
; Syscalls 151-180 inclusive are reserved for vendor-specific
; system calls. (This includes various calls added for compatibility
; with other Unix variants.)
; Some of these calls are now supported by BSD...
-151 UNIMPL
-152 UNIMPL
-153 UNIMPL
-154 UNIMPL
+151 UNIMPL NOHIDE sem_lock (BSD/OS 2.x)
+152 UNIMPL NOHIDE sem_wakeup (BSD/OS 2.x)
+153 UNIMPL NOHIDE asyncdaemon (BSD/OS 2.x)
+154 UNIMPL NOHIDE nosys
#ifdef NFS
-155 STD { int nfssvc(int flag, caddr_t argp); }
+155 STD BSD { int nfssvc(int flag, caddr_t argp); }
#else
-155 UNIMPL nfssvc
+155 UNIMPL BSD nosys
#endif
-156 COMPAT_43 { int getdirentries(int fd, char *buf, u_int count, \
+156 COMPAT BSD { int getdirentries(int fd, char *buf, u_int count, \
long *basep); }
-157 STD { int statfs(char *path, struct statfs *buf); }
-158 STD { int fstatfs(int fd, struct statfs *buf); }
-159 UNIMPL
-160 UNIMPL
-#ifdef NFS
-161 STD { int getfh(char *fname, fhandle_t *fhp); }
-#else
-161 UNIMPL getfh
-#endif
-162 UNIMPL getdomainname
-163 UNIMPL setdomainname
-164 UNIMPL
-165 UNIMPL
-166 UNIMPL
-167 UNIMPL
-168 UNIMPL
-169 UNIMPL semsys
-170 UNIMPL msgsys
-; XXX more generally, never on machines where sizeof(void *) != sizeof(int)
-#if defined(SYSVSHM) && !defined(alpha)
-171 COMPAT_43 { int shmsys(int which, int a2, int a3, int a4); }
+157 STD BSD { int statfs(char *path, struct statfs *buf); }
+158 STD BSD { int fstatfs(int fd, struct statfs *buf); }
+159 UNIMPL NOHIDE nosys
+160 UNIMPL NOHIDE nosys
+#if defined(NFS) && !defined (NFS_NOSERVER)
+161 STD BSD { int getfh(char *fname, struct fhandle *fhp); }
#else
-171 UNIMPL shmsys
+161 UNIMPL BSD nosys
#endif
-172 UNIMPL
-173 UNIMPL
-174 UNIMPL
-175 UNIMPL
-176 UNIMPL
-177 UNIMPL
-178 UNIMPL
-179 UNIMPL
-180 UNIMPL
+162 STD BSD { int getdomainname(char *domainname, int len); }
+163 STD BSD { int setdomainname(char *domainname, int len); }
+164 STD BSD { int uname(struct utsname *name); }
+165 STD BSD { int sysarch(int op, char *parms); }
+166 STD BSD { int rtprio(int function, pid_t pid, \
+ struct rtprio *rtp); }
+167 UNIMPL NOHIDE nosys
+168 UNIMPL NOHIDE nosys
+169 STD BSD { int semsys(int which, int a2, int a3, int a4, \
+ int a5); }
+; XXX should be { int semsys(int which, ...); }
+170 STD BSD { int msgsys(int which, int a2, int a3, int a4, \
+ int a5, int a6); }
+; XXX should be { int msgsys(int which, ...); }
+171 STD BSD { int shmsys(int which, int a2, int a3, int a4); }
+; XXX should be { int shmsys(int which, ...); }
+172 UNIMPL NOHIDE nosys
+173 UNIMPL NOHIDE nosys
+174 UNIMPL NOHIDE nosys
+175 UNIMPL NOHIDE nosys
+176 STD BSD { int ntp_adjtime(struct timex *tp); }
+177 UNIMPL NOHIDE sfork (BSD/OS 2.x)
+178 UNIMPL NOHIDE getdescriptor (BSD/OS 2.x)
+179 UNIMPL NOHIDE setdescriptor (BSD/OS 2.x)
+180 UNIMPL NOHIDE nosys
-; Syscalls 180-209 are used by/reserved for BSD
-181 STD { int setgid(gid_t gid); }
-182 STD { int setegid(gid_t egid); }
-183 STD { int seteuid(uid_t euid); }
+; Syscalls 180-199 are used by/reserved for BSD
+181 STD POSIX { int setgid(gid_t gid); }
+182 STD BSD { int setegid(gid_t egid); }
+183 STD BSD { int seteuid(uid_t euid); }
#ifdef LFS
-184 STD { int lfs_bmapv(fsid_t *fsidp, \
+184 STD BSD { int lfs_bmapv(struct fsid **fsidp, \
struct block_info *blkiov, int blkcnt); }
-185 STD { int lfs_markv(fsid_t *fsidp, \
+185 STD BSD { int lfs_markv(struct fsid **fsidp, \
struct block_info *blkiov, int blkcnt); }
-186 STD { int lfs_segclean(fsid_t *fsidp, u_long segment); }
-187 STD { int lfs_segwait(fsid_t *fsidp, struct timeval *tv); }
+186 STD BSD { int lfs_segclean(struct fsid **fsidp, \
+ u_long segment); }
+187 STD BSD { int lfs_segwait(struct fsid **fsidp, \
+ struct timeval *tv); }
#else
-184 UNIMPL lfs_bmapv
-185 UNIMPL lfs_markv
-186 UNIMPL lfs_segclean
-187 UNIMPL lfs_segwait
+184 UNIMPL BSD nosys
+185 UNIMPL BSD nosys
+186 UNIMPL BSD nosys
+187 UNIMPL BSD nosys
#endif
-188 STD { int stat(char *path, struct stat *ub); }
-189 STD { int fstat(int fd, struct stat *sb); }
-190 STD { int lstat(char *path, struct stat *ub); }
-191 STD { int pathconf(char *path, int name); }
-192 STD { int fpathconf(int fd, int name); }
-193 UNIMPL
-194 STD { int getrlimit(u_int which, struct rlimit *rlp); }
-195 STD { int setrlimit(u_int which, struct rlimit *rlp); }
-196 STD { int getdirentries(int fd, char *buf, u_int count, \
+188 STD POSIX { int stat(char *path, struct stat *ub); }
+189 STD POSIX { int fstat(int fd, struct stat *sb); }
+190 STD POSIX { int lstat(char *path, struct stat *ub); }
+191 STD POSIX { int pathconf(char *path, int name); }
+192 STD POSIX { int fpathconf(int fd, int name); }
+193 UNIMPL NOHIDE nosys
+194 STD BSD { int getrlimit(u_int which, \
+ struct orlimit *rlp); } \
+ getrlimit __getrlimit_args int
+195 STD BSD { int setrlimit(u_int which, \
+ struct orlimit *rlp); } \
+ setrlimit __setrlimit_args int
+196 STD BSD { int getdirentries(int fd, char *buf, u_int count, \
long *basep); }
-197 STD { caddr_t mmap(caddr_t addr, size_t len, int prot, \
+197 STD BSD { caddr_t mmap(caddr_t addr, size_t len, int prot, \
int flags, int fd, long pad, off_t pos); }
-198 STD { int nosys(void); } __syscall
-199 STD { off_t lseek(int fd, int pad, off_t offset, \
+198 STD NOHIDE { int nosys(void); } __syscall __syscall_args int
+199 STD POSIX { off_t lseek(int fd, int pad, off_t offset, \
int whence); }
-200 STD { int truncate(char *path, int pad, off_t length); }
-201 STD { int ftruncate(int fd, int pad, off_t length); }
-202 STD { int __sysctl(int *name, u_int namelen, void *old, \
- size_t *oldlenp, void *new, size_t newlen); }
-203 STD { int mlock(caddr_t addr, size_t len); }
-204 STD { int munlock(caddr_t addr, size_t len); }
-205 STD { int undelete(char *path); }
-206 UNIMPL
-207 UNIMPL
-208 UNIMPL
-209 UNIMPL
-; Syscalls 210-219 are used by/reserved for vendor-specific system calls
-210 UNIMPL
-211 UNIMPL
-212 UNIMPL
-213 UNIMPL
-214 UNIMPL
-215 UNIMPL
-216 UNIMPL
-217 UNIMPL
-218 UNIMPL
-219 UNIMPL
-; System calls 220-240 are reserved for use by BSD
-220 UNIMPL semctl
-221 UNIMPL semget
-222 UNIMPL semop
-223 UNIMPL semconfig
-224 UNIMPL msgctl
-225 UNIMPL msgget
-226 UNIMPL msgsnd
-227 UNIMPL msgrcv
-#if defined(SYSVSHM) && 0
-228 STD { int shmat(int shmid, void *shmaddr, int shmflg); }
-229 STD { int shmctl(int shmid, int cmd, \
+200 STD BSD { int truncate(char *path, int pad, off_t length); }
+201 STD BSD { int ftruncate(int fd, int pad, off_t length); }
+202 STD BSD { int __sysctl(int *name, u_int namelen, void *old, \
+ size_t *oldlenp, void *new, size_t newlen); } \
+ __sysctl sysctl_args int
+; properly, __sysctl should be a NOHIDE, but making an exception
+; here allows us to avoid one in libc/sys/Makefile.inc.
+203 STD BSD { int mlock(caddr_t addr, size_t len); }
+204 STD BSD { int munlock(caddr_t addr, size_t len); }
+205 STD BSD { int utrace(caddr_t addr, size_t len); }
+206 STD BSD { int undelete(char *path); }
+207 UNIMPL NOHIDE nosys
+208 UNIMPL NOHIDE nosys
+209 UNIMPL NOHIDE nosys
+
+;
+; The following are reserved for loadable syscalls
+;
+210 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+211 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+212 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+213 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+214 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+215 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+216 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+217 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+218 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+219 NODEF NOHIDE lkmnosys lkmnosys nosys_args int
+
+;
+; The following were introduced with NetBSD/4.4Lite-2
+;
+220 STD BSD { int __semctl(int semid, int semnum, int cmd, \
+ union semun *arg); }
+221 STD BSD { int semget(key_t key, int nsems, int semflg); }
+222 STD BSD { int semop(int semid, struct sembuf *sops, \
+ u_int nsops); }
+223 STD BSD { int semconfig(int flag); }
+224 STD BSD { int msgctl(int msqid, int cmd, \
+ struct msqid_ds *buf); }
+225 STD BSD { int msgget(key_t key, int msgflg); }
+226 STD BSD { int msgsnd(int msqid, void *msgp, size_t msgsz, \
+ int msgflg); }
+227 STD BSD { int msgrcv(int msqid, void *msgp, size_t msgsz, \
+ long msgtyp, int msgflg); }
+228 STD BSD { int shmat(int shmid, void *shmaddr, int shmflg); }
+229 STD BSD { int shmctl(int shmid, int cmd, \
struct shmid_ds *buf); }
-230 STD { int shmdt(void *shmaddr); }
-231 STD { int shmget(key_t key, int size, int shmflg); }
-#else
-228 UNIMPL shmat
-229 UNIMPL shmctl
-230 UNIMPL shmdt
-231 UNIMPL shmget
-#endif
+230 STD BSD { int shmdt(void *shmaddr); }
+231 STD BSD { int shmget(key_t key, int size, int shmflg); }
+;
+232 UNIMPL NOHIDE nosys
+233 UNIMPL NOHIDE nosys
+234 UNIMPL NOHIDE nosys
+235 UNIMPL NOHIDE nosys
+236 UNIMPL NOHIDE nosys
+237 UNIMPL NOHIDE nosys
+238 UNIMPL NOHIDE nosys
+239 UNIMPL NOHIDE nosys
+240 UNIMPL NOHIDE nosys
+241 UNIMPL NOHIDE nosys
+242 UNIMPL NOHIDE nosys
+243 UNIMPL NOHIDE nosys
+244 UNIMPL NOHIDE nosys
+245 UNIMPL NOHIDE nosys
+246 UNIMPL NOHIDE nosys
+247 UNIMPL NOHIDE nosys
+248 UNIMPL NOHIDE nosys
+249 UNIMPL NOHIDE nosys
+; syscall numbers initially used in OpenBSD
+250 STD BSD { int minherit(caddr_t addr, size_t len, int inherit); }
+251 STD BSD { int rfork(int flags); }
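
The generated sysent[] table is what the machine-dependent trap handler indexes at syscall time. A simplified sketch of that dispatch, assuming the conventional sysent/sy_call layout (dispatch_syscall() and nsysent are illustrative; the real handler also copies in arguments, runs ktrace hooks, and translates errors):

/*
 * Sketch of table-driven syscall dispatch; not the actual trap
 * handler.  `args' has already been copied in from user space.
 */
extern struct sysent sysent[];
extern int nsysent;			/* illustrative: table size */

static int
dispatch_syscall(struct proc *p, u_int code, caddr_t args, int rval[2])
{
	struct sysent *callp;

	/* Out-of-range codes fall back to slot 0 (nosys). */
	callp = (code < nsysent) ? &sysent[code] : &sysent[0];
	return ((*callp->sy_call)(p, args, rval));
}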
diff --git a/sys/kern/sysv_ipc.c b/sys/kern/sysv_ipc.c
new file mode 100644
index 0000000..a1a1965
--- /dev/null
+++ b/sys/kern/sysv_ipc.c
@@ -0,0 +1,297 @@
+/* $Id$ */
+/* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */
+
+/*
+ * Copyright (c) 1994 Herb Peyerl <hpeyerl@novatel.ca>
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Herb Peyerl.
+ * 4. The name of Herb Peyerl may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/systm.h>
+#include <sys/syslog.h>
+#include <sys/sysproto.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <sys/sem.h>
+
+#if defined(SYSVSEM) || defined(SYSVSHM) || defined(SYSVMSG)
+
+/*
+ * Check for ipc permission
+ */
+
+int
+ipcperm(cred, perm, mode)
+ struct ucred *cred;
+ struct ipc_perm *perm;
+ int mode;
+{
+
+ if (cred->cr_uid == 0)
+ return (0);
+
+ /* Check for user match. */
+ if (cred->cr_uid != perm->cuid && cred->cr_uid != perm->uid) {
+ if (mode & IPC_M)
+ return (EPERM);
+ /* Check for group match. */
+ mode >>= 3;
+ if (!groupmember(perm->gid, cred) &&
+ !groupmember(perm->cgid, cred))
+ /* Check for `other' match. */
+ mode >>= 3;
+ }
+
+ if (mode & IPC_M)
+ return (0);
+ return ((mode & perm->mode) == mode ? 0 : EACCES);
+}
+
+#endif /* defined(SYSVSEM) || defined(SYSVSHM) || defined(SYSVMSG) */
+
+
+#if !defined(SYSVSEM) || !defined(SYSVSHM) || !defined(SYSVMSG)
+
+static void sysv_nosys __P((struct proc *p, char *s));
+
+static void
+sysv_nosys(p, s)
+ struct proc *p;
+ char *s;
+{
+ log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n",
+ p->p_comm, p->p_pid, s);
+}
+
+#if !defined(SYSVSEM)
+
+/*
+ * SYSVSEM stubs
+ */
+
+int
+semsys(p, uap, retval)
+ struct proc *p;
+ struct semsys_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVSEM");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+int
+semconfig(p, uap, retval)
+ struct proc *p;
+ struct semconfig_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVSEM");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+int
+__semctl(p, uap, retval)
+ struct proc *p;
+ register struct __semctl_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVSEM");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+int
+semget(p, uap, retval)
+ struct proc *p;
+ register struct semget_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVSEM");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+int
+semop(p, uap, retval)
+ struct proc *p;
+ register struct semop_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVSEM");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+/* called from kern_exit.c */
+void
+semexit(p)
+ struct proc *p;
+{
+ return;
+}
+
+#endif /* !defined(SYSVSEM) */
+
+
+#if !defined(SYSVMSG)
+
+/*
+ * SYSVMSG stubs
+ */
+
+int
+msgsys(p, uap, retval)
+ struct proc *p;
+ /* XXX actually varargs. */
+ struct msgsys_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVMSG");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+int
+msgctl(p, uap, retval)
+ struct proc *p;
+ register struct msgctl_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVMSG");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+int
+msgget(p, uap, retval)
+ struct proc *p;
+ register struct msgget_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVMSG");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+int
+msgsnd(p, uap, retval)
+ struct proc *p;
+ register struct msgsnd_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVMSG");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+int
+msgrcv(p, uap, retval)
+ struct proc *p;
+ register struct msgrcv_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVMSG");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+#endif /* !defined(SYSVMSG) */
+
+
+#if !defined(SYSVSHM)
+
+/*
+ * SYSVSHM stubs
+ */
+
+int
+shmdt(p, uap, retval)
+ struct proc *p;
+ struct shmdt_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVSHM");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+int
+shmat(p, uap, retval)
+ struct proc *p;
+ struct shmat_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVSHM");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+int
+shmctl(p, uap, retval)
+ struct proc *p;
+ struct shmctl_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVSHM");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+int
+shmget(p, uap, retval)
+ struct proc *p;
+ struct shmget_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVSHM");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+int
+shmsys(p, uap, retval)
+ struct proc *p;
+ /* XXX actually varargs. */
+ struct shmsys_args *uap;
+ int *retval;
+{
+ sysv_nosys(p, "SYSVSHM");
+ return nosys(p, (struct nosys_args *)uap, retval);
+};
+
+/* called from kern_fork.c */
+void
+shmfork(p1, p2)
+ struct proc *p1, *p2;
+{
+ return;
+}
+
+/* called from kern_exit.c */
+void
+shmexit(p)
+ struct proc *p;
+{
+ return;
+}
+
+#endif /* !defined(SYSVSHM) */
+
+#endif /* !defined(SYSVSEM) || !defined(SYSVSHM) || !defined(SYSVMSG) */
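
ipcperm() above is the single permission check shared by all three subsystems: IPC_M gates owner/creator-only management operations with EPERM, while read/write requests are checked against the 0666-style mode bits (shifted for group and other) and fail with EACCES. A small, hypothetical caller (check_queue_read() is illustrative):

/*
 * Hypothetical caller: verify read permission on a message queue
 * before exposing its state, as the IPC_STAT path in sysv_msg.c does.
 */
static int
check_queue_read(struct proc *p, struct msqid_ds *msqptr)
{
	int error;

	error = ipcperm(p->p_ucred, &msqptr->msg_perm, IPC_R);
	if (error)
		return (error);		/* EPERM or EACCES */
	return (0);
}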
diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c
new file mode 100644
index 0000000..d6e695f
--- /dev/null
+++ b/sys/kern/sysv_msg.c
@@ -0,0 +1,1034 @@
+/* $Id$ */
+
+/*
+ * Implementation of SVID messages
+ *
+ * Author: Daniel Boulet
+ *
+ * Copyright 1993 Daniel Boulet and RTMX Inc.
+ *
+ * This system call was implemented by Daniel Boulet under contract from RTMX.
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ */
+
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/msg.h>
+#include <sys/sysent.h>
+
+static void msginit __P((void *));
+SYSINIT(sysv_msg, SI_SUB_SYSV_MSG, SI_ORDER_FIRST, msginit, NULL)
+
+#define MSG_DEBUG
+#undef MSG_DEBUG_OK
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgctl_args;
+int msgctl __P((struct proc *p, struct msgctl_args *uap, int *retval));
+struct msgget_args;
+int msgget __P((struct proc *p, struct msgget_args *uap, int *retval));
+struct msgsnd_args;
+int msgsnd __P((struct proc *p, struct msgsnd_args *uap, int *retval));
+struct msgrcv_args;
+int msgrcv __P((struct proc *p, struct msgrcv_args *uap, int *retval));
+#endif
+static void msg_freehdr __P((struct msg *msghdr));
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *msgcalls[] = {
+ (sy_call_t *)msgctl, (sy_call_t *)msgget,
+ (sy_call_t *)msgsnd, (sy_call_t *)msgrcv
+};
+
+static int nfree_msgmaps; /* # of free map entries */
+static short free_msgmaps; /* head of linked list of free map entries */
+static struct msg *free_msghdrs; /* list of free msg headers */
+char *msgpool; /* MSGMAX byte long msg buffer pool */
+struct msgmap *msgmaps; /* MSGSEG msgmap structures */
+struct msg *msghdrs; /* MSGTQL msg headers */
+struct msqid_ds *msqids; /* MSGMNI msqid_ds struct's */
+
+void
+msginit(dummy)
+ void *dummy;
+{
+ register int i;
+
+ /*
+ * msginfo.msgssz should be a power of two for efficiency reasons.
+ * It is also pretty silly if msginfo.msgssz is less than 8
+ * or greater than about 256 so ...
+ */
+
+ i = 8;
+ while (i < 1024 && i != msginfo.msgssz)
+ i <<= 1;
+ if (i != msginfo.msgssz) {
+ printf("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz,
+ msginfo.msgssz);
+ panic("msginfo.msgssz not a small power of 2");
+ }
+
+ if (msginfo.msgseg > 32767) {
+ printf("msginfo.msgseg=%d\n", msginfo.msgseg);
+ panic("msginfo.msgseg > 32767");
+ }
+
+ if (msgmaps == NULL)
+ panic("msgmaps is NULL");
+
+ for (i = 0; i < msginfo.msgseg; i++) {
+ if (i > 0)
+ msgmaps[i-1].next = i;
+ msgmaps[i].next = -1; /* implies entry is available */
+ }
+ free_msgmaps = 0;
+ nfree_msgmaps = msginfo.msgseg;
+
+ if (msghdrs == NULL)
+ panic("msghdrs is NULL");
+
+ for (i = 0; i < msginfo.msgtql; i++) {
+ msghdrs[i].msg_type = 0;
+ if (i > 0)
+ msghdrs[i-1].msg_next = &msghdrs[i];
+ msghdrs[i].msg_next = NULL;
+ }
+ free_msghdrs = &msghdrs[0];
+
+ if (msqids == NULL)
+ panic("msqids is NULL");
+
+ for (i = 0; i < msginfo.msgmni; i++) {
+ msqids[i].msg_qbytes = 0; /* implies entry is available */
+ msqids[i].msg_perm.seq = 0; /* reset to a known value */
+ }
+}
+
+/*
+ * Entry point for all MSG calls
+ */
+int
+msgsys(p, uap, retval)
+ struct proc *p;
+ /* XXX actually varargs. */
+ struct msgsys_args /* {
+ u_int which;
+ int a2;
+ int a3;
+ int a4;
+ int a5;
+ int a6;
+ } */ *uap;
+ int *retval;
+{
+
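+	/*
+	 * Dispatch on `which'; the selected handler reinterprets
+	 * &uap->a2 as its own argument structure, so the layouts of
+	 * the generated *_args structs must line up.
+	 */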
+ if (uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0]))
+ return (EINVAL);
+ return ((*msgcalls[uap->which])(p, &uap->a2, retval));
+}
+
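+/*
+ * Return each segment of a message to the free map list and put the
+ * header back on the free header list.
+ */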
+static void
+msg_freehdr(msghdr)
+ struct msg *msghdr;
+{
+ while (msghdr->msg_ts > 0) {
+ short next;
+ if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg)
+ panic("msghdr->msg_spot out of range");
+ next = msgmaps[msghdr->msg_spot].next;
+ msgmaps[msghdr->msg_spot].next = free_msgmaps;
+ free_msgmaps = msghdr->msg_spot;
+ nfree_msgmaps++;
+ msghdr->msg_spot = next;
+ if (msghdr->msg_ts >= msginfo.msgssz)
+ msghdr->msg_ts -= msginfo.msgssz;
+ else
+ msghdr->msg_ts = 0;
+ }
+ if (msghdr->msg_spot != -1)
+ panic("msghdr->msg_spot != -1");
+ msghdr->msg_next = free_msghdrs;
+ free_msghdrs = msghdr;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgctl_args {
+ int msqid;
+ int cmd;
+ struct msqid_ds *buf;
+};
+#endif
+
+int
+msgctl(p, uap, retval)
+ struct proc *p;
+ register struct msgctl_args *uap;
+ int *retval;
+{
+ int msqid = uap->msqid;
+ int cmd = uap->cmd;
+ struct msqid_ds *user_msqptr = uap->buf;
+ struct ucred *cred = p->p_ucred;
+ int rval, eval;
+ struct msqid_ds msqbuf;
+ register struct msqid_ds *msqptr;
+
+#ifdef MSG_DEBUG_OK
+ printf("call to msgctl(%d, %d, 0x%x)\n", msqid, cmd, user_msqptr);
+#endif
+
+ msqid = IPCID_TO_IX(msqid);
+
+ if (msqid < 0 || msqid >= msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
+ msginfo.msgmni);
+#endif
+ return(EINVAL);
+ }
+
+ msqptr = &msqids[msqid];
+
+ if (msqptr->msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("no such msqid\n");
+#endif
+ return(EINVAL);
+ }
+ if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+#ifdef MSG_DEBUG_OK
+ printf("wrong sequence number\n");
+#endif
+ return(EINVAL);
+ }
+
+ eval = 0;
+ rval = 0;
+
+ switch (cmd) {
+
+ case IPC_RMID:
+ {
+ struct msg *msghdr;
+ if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M)))
+ return(eval);
+ /* Free the message headers */
+ msghdr = msqptr->msg_first;
+ while (msghdr != NULL) {
+ struct msg *msghdr_tmp;
+
+ /* Free the segments of each message */
+ msqptr->msg_cbytes -= msghdr->msg_ts;
+ msqptr->msg_qnum--;
+ msghdr_tmp = msghdr;
+ msghdr = msghdr->msg_next;
+ msg_freehdr(msghdr_tmp);
+ }
+
+ if (msqptr->msg_cbytes != 0)
+ panic("msg_cbytes is screwed up");
+ if (msqptr->msg_qnum != 0)
+ panic("msg_qnum is screwed up");
+
+ msqptr->msg_qbytes = 0; /* Mark it as free */
+
+ wakeup((caddr_t)msqptr);
+ }
+
+ break;
+
+ case IPC_SET:
+ if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M)))
+ return(eval);
+ if ((eval = copyin(user_msqptr, &msqbuf, sizeof(msqbuf))) != 0)
+ return(eval);
+ if (msqbuf.msg_qbytes > msqptr->msg_qbytes && cred->cr_uid != 0)
+ return(EPERM);
+ if (msqbuf.msg_qbytes > msginfo.msgmnb) {
+#ifdef MSG_DEBUG_OK
+ printf("can't increase msg_qbytes beyond %d (truncating)\n",
+ msginfo.msgmnb);
+#endif
+ msqbuf.msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */
+ }
+ if (msqbuf.msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("can't reduce msg_qbytes to 0\n");
+#endif
+ return(EINVAL); /* non-standard errno! */
+ }
+ msqptr->msg_perm.uid = msqbuf.msg_perm.uid; /* change the owner */
+ msqptr->msg_perm.gid = msqbuf.msg_perm.gid; /* change the owner */
+ msqptr->msg_perm.mode = (msqptr->msg_perm.mode & ~0777) |
+ (msqbuf.msg_perm.mode & 0777);
+ msqptr->msg_qbytes = msqbuf.msg_qbytes;
+ msqptr->msg_ctime = time.tv_sec;
+ break;
+
+ case IPC_STAT:
+ if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_R))) {
+#ifdef MSG_DEBUG_OK
+ printf("requester doesn't have read access\n");
+#endif
+ return(eval);
+ }
+ eval = copyout((caddr_t)msqptr, user_msqptr,
+ sizeof(struct msqid_ds));
+ break;
+
+ default:
+#ifdef MSG_DEBUG_OK
+ printf("invalid command %d\n", cmd);
+#endif
+ return(EINVAL);
+ }
+
+ if (eval == 0)
+ *retval = rval;
+ return(eval);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgget_args {
+ key_t key;
+ int msgflg;
+};
+#endif
+
+int
+msgget(p, uap, retval)
+ struct proc *p;
+ register struct msgget_args *uap;
+ int *retval;
+{
+ int msqid, eval;
+ int key = uap->key;
+ int msgflg = uap->msgflg;
+ struct ucred *cred = p->p_ucred;
+ register struct msqid_ds *msqptr = NULL;
+
+#ifdef MSG_DEBUG_OK
+ printf("msgget(0x%x, 0%o)\n", key, msgflg);
+#endif
+
+ if (key != IPC_PRIVATE) {
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
+ msqptr = &msqids[msqid];
+ if (msqptr->msg_qbytes != 0 &&
+ msqptr->msg_perm.key == key)
+ break;
+ }
+ if (msqid < msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("found public key\n");
+#endif
+ if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) {
+#ifdef MSG_DEBUG_OK
+ printf("not exclusive\n");
+#endif
+ return(EEXIST);
+ }
+ if ((eval = ipcperm(cred, &msqptr->msg_perm, msgflg & 0700 ))) {
+#ifdef MSG_DEBUG_OK
+ printf("requester doesn't have 0%o access\n",
+ msgflg & 0700);
+#endif
+ return(eval);
+ }
+ goto found;
+ }
+ }
+
+#ifdef MSG_DEBUG_OK
+ printf("need to allocate the msqid_ds\n");
+#endif
+ if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) {
+ for (msqid = 0; msqid < msginfo.msgmni; msqid++) {
+ /*
+ * Look for an unallocated and unlocked msqid_ds.
+ * msqid_ds's can be locked by msgsnd or msgrcv while
+ * they are copying the message in/out. We can't
+ * re-use the entry until they release it.
+ */
+ msqptr = &msqids[msqid];
+ if (msqptr->msg_qbytes == 0 &&
+ (msqptr->msg_perm.mode & MSG_LOCKED) == 0)
+ break;
+ }
+ if (msqid == msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("no more msqid_ds's available\n");
+#endif
+ return(ENOSPC);
+ }
+#ifdef MSG_DEBUG_OK
+ printf("msqid %d is available\n", msqid);
+#endif
+ msqptr->msg_perm.key = key;
+ msqptr->msg_perm.cuid = cred->cr_uid;
+ msqptr->msg_perm.uid = cred->cr_uid;
+ msqptr->msg_perm.cgid = cred->cr_gid;
+ msqptr->msg_perm.gid = cred->cr_gid;
+ msqptr->msg_perm.mode = (msgflg & 0777);
+ /* Make sure that the returned msqid is unique */
+ msqptr->msg_perm.seq++;
+ msqptr->msg_first = NULL;
+ msqptr->msg_last = NULL;
+ msqptr->msg_cbytes = 0;
+ msqptr->msg_qnum = 0;
+ msqptr->msg_qbytes = msginfo.msgmnb;
+ msqptr->msg_lspid = 0;
+ msqptr->msg_lrpid = 0;
+ msqptr->msg_stime = 0;
+ msqptr->msg_rtime = 0;
+ msqptr->msg_ctime = time.tv_sec;
+ } else {
+#ifdef MSG_DEBUG_OK
+ printf("didn't find it and wasn't asked to create it\n");
+#endif
+ return(ENOENT);
+ }
+
+found:
+ /* Construct the unique msqid */
+ *retval = IXSEQ_TO_IPCID(msqid, msqptr->msg_perm);
+ return(0);
+}
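+
+/*
+ * Note: IXSEQ_TO_IPCID() (sys/ipc.h) packs the slot index into the
+ * low 16 bits of the id and the slot's sequence number above it;
+ * the msg_perm.seq comparisons elsewhere in this file use that to
+ * reject stale ids naming a recycled slot.
+ */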
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgsnd_args {
+ int msqid;
+ void *msgp;
+ size_t msgsz;
+ int msgflg;
+};
+#endif
+
+int
+msgsnd(p, uap, retval)
+ struct proc *p;
+ register struct msgsnd_args *uap;
+ int *retval;
+{
+ int msqid = uap->msqid;
+ void *user_msgp = uap->msgp;
+ size_t msgsz = uap->msgsz;
+ int msgflg = uap->msgflg;
+ int segs_needed, eval;
+ struct ucred *cred = p->p_ucred;
+ register struct msqid_ds *msqptr;
+ register struct msg *msghdr;
+ short next;
+
+#ifdef MSG_DEBUG_OK
+ printf("call to msgsnd(%d, 0x%x, %d, %d)\n", msqid, user_msgp, msgsz,
+ msgflg);
+#endif
+
+ msqid = IPCID_TO_IX(msqid);
+
+ if (msqid < 0 || msqid >= msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
+ msginfo.msgmni);
+#endif
+ return(EINVAL);
+ }
+
+ msqptr = &msqids[msqid];
+ if (msqptr->msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("no such message queue id\n");
+#endif
+ return(EINVAL);
+ }
+ if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+#ifdef MSG_DEBUG_OK
+ printf("wrong sequence number\n");
+#endif
+ return(EINVAL);
+ }
+
+ if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_W))) {
+#ifdef MSG_DEBUG_OK
+ printf("requester doesn't have write access\n");
+#endif
+ return(eval);
+ }
+
+ segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz;
+#ifdef MSG_DEBUG_OK
+ printf("msgsz=%d, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz,
+ segs_needed);
+#endif
+ for (;;) {
+ int need_more_resources = 0;
+
+ /*
+ * check msgsz
+ * (inside this loop in case msg_qbytes changes while we sleep)
+ */
+
+ if (msgsz > msqptr->msg_qbytes) {
+#ifdef MSG_DEBUG_OK
+ printf("msgsz > msqptr->msg_qbytes\n");
+#endif
+ return(EINVAL);
+ }
+
+ if (msqptr->msg_perm.mode & MSG_LOCKED) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid is locked\n");
+#endif
+ need_more_resources = 1;
+ }
+ if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) {
+#ifdef MSG_DEBUG_OK
+ printf("msgsz + msg_cbytes > msg_qbytes\n");
+#endif
+ need_more_resources = 1;
+ }
+ if (segs_needed > nfree_msgmaps) {
+#ifdef MSG_DEBUG_OK
+ printf("segs_needed > nfree_msgmaps\n");
+#endif
+ need_more_resources = 1;
+ }
+ if (free_msghdrs == NULL) {
+#ifdef MSG_DEBUG_OK
+ printf("no more msghdrs\n");
+#endif
+ need_more_resources = 1;
+ }
+
+ if (need_more_resources) {
+ int we_own_it;
+
+ if ((msgflg & IPC_NOWAIT) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("need more resources but caller doesn't want to wait\n");
+#endif
+ return(EAGAIN);
+ }
+
+ if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("we don't own the msqid_ds\n");
+#endif
+ we_own_it = 0;
+ } else {
+			/*
+			 * Force later arrivals to wait for our
+			 * request.
+			 */
+#ifdef MSG_DEBUG_OK
+ printf("we own the msqid_ds\n");
+#endif
+ msqptr->msg_perm.mode |= MSG_LOCKED;
+ we_own_it = 1;
+ }
+#ifdef MSG_DEBUG_OK
+ printf("goodnight\n");
+#endif
+ eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH,
+ "msgwait", 0);
+#ifdef MSG_DEBUG_OK
+ printf("good morning, eval=%d\n", eval);
+#endif
+ if (we_own_it)
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+ if (eval != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("msgsnd: interrupted system call\n");
+#endif
+ return(EINTR);
+ }
+
+ /*
+ * Make sure that the msq queue still exists
+ */
+
+ if (msqptr->msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid deleted\n");
+#endif
+ /* The SVID says to return EIDRM. */
+#ifdef EIDRM
+ return(EIDRM);
+#else
+ /* Unfortunately, BSD doesn't define that code
+ yet! */
+ return(EINVAL);
+#endif
+ }
+
+ } else {
+#ifdef MSG_DEBUG_OK
+ printf("got all the resources that we need\n");
+#endif
+ break;
+ }
+ }
+
+ /*
+ * We have the resources that we need.
+ * Make sure!
+ */
+
+ if (msqptr->msg_perm.mode & MSG_LOCKED)
+ panic("msg_perm.mode & MSG_LOCKED");
+ if (segs_needed > nfree_msgmaps)
+ panic("segs_needed > nfree_msgmaps");
+ if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes)
+ panic("msgsz + msg_cbytes > msg_qbytes");
+ if (free_msghdrs == NULL)
+ panic("no more msghdrs");
+
+ /*
+ * Re-lock the msqid_ds in case we page-fault when copying in the
+ * message
+ */
+
+ if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0)
+ panic("msqid_ds is already locked");
+ msqptr->msg_perm.mode |= MSG_LOCKED;
+
+ /*
+ * Allocate a message header
+ */
+
+ msghdr = free_msghdrs;
+ free_msghdrs = msghdr->msg_next;
+ msghdr->msg_spot = -1;
+ msghdr->msg_ts = msgsz;
+
+ /*
+ * Allocate space for the message
+ */
+
+ while (segs_needed > 0) {
+ if (nfree_msgmaps <= 0)
+ panic("not enough msgmaps");
+ if (free_msgmaps == -1)
+ panic("nil free_msgmaps");
+ next = free_msgmaps;
+ if (next <= -1)
+ panic("next too low #1");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #1");
+#ifdef MSG_DEBUG_OK
+ printf("allocating segment %d to message\n", next);
+#endif
+ free_msgmaps = msgmaps[next].next;
+ nfree_msgmaps--;
+ msgmaps[next].next = msghdr->msg_spot;
+ msghdr->msg_spot = next;
+ segs_needed--;
+ }
+
+ /*
+ * Copy in the message type
+ */
+
+ if ((eval = copyin(user_msgp, &msghdr->msg_type,
+ sizeof(msghdr->msg_type))) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("error %d copying the message type\n", eval);
+#endif
+ msg_freehdr(msghdr);
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+ wakeup((caddr_t)msqptr);
+ return(eval);
+ }
+ user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type);
+
+ /*
+ * Validate the message type
+ */
+
+ if (msghdr->msg_type < 1) {
+ msg_freehdr(msghdr);
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+ wakeup((caddr_t)msqptr);
+#ifdef MSG_DEBUG_OK
+ printf("mtype (%d) < 1\n", msghdr->msg_type);
+#endif
+ return(EINVAL);
+ }
+
+ /*
+ * Copy in the message body
+ */
+
+ next = msghdr->msg_spot;
+ while (msgsz > 0) {
+ size_t tlen;
+ if (msgsz > msginfo.msgssz)
+ tlen = msginfo.msgssz;
+ else
+ tlen = msgsz;
+ if (next <= -1)
+ panic("next too low #2");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #2");
+ if ((eval = copyin(user_msgp, &msgpool[next * msginfo.msgssz],
+ tlen)) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("error %d copying in message segment\n", eval);
+#endif
+ msg_freehdr(msghdr);
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+ wakeup((caddr_t)msqptr);
+ return(eval);
+ }
+ msgsz -= tlen;
+ user_msgp = (char *)user_msgp + tlen;
+ next = msgmaps[next].next;
+ }
+ if (next != -1)
+ panic("didn't use all the msg segments");
+
+ /*
+ * We've got the message. Unlock the msqid_ds.
+ */
+
+ msqptr->msg_perm.mode &= ~MSG_LOCKED;
+
+ /*
+ * Make sure that the msqid_ds is still allocated.
+ */
+
+ if (msqptr->msg_qbytes == 0) {
+ msg_freehdr(msghdr);
+ wakeup((caddr_t)msqptr);
+ /* The SVID says to return EIDRM. */
+#ifdef EIDRM
+ return(EIDRM);
+#else
+ /* Unfortunately, BSD doesn't define that code yet! */
+ return(EINVAL);
+#endif
+ }
+
+ /*
+ * Put the message into the queue
+ */
+
+ if (msqptr->msg_first == NULL) {
+ msqptr->msg_first = msghdr;
+ msqptr->msg_last = msghdr;
+ } else {
+ msqptr->msg_last->msg_next = msghdr;
+ msqptr->msg_last = msghdr;
+ }
+ msqptr->msg_last->msg_next = NULL;
+
+ msqptr->msg_cbytes += msghdr->msg_ts;
+ msqptr->msg_qnum++;
+ msqptr->msg_lspid = p->p_pid;
+ msqptr->msg_stime = time.tv_sec;
+
+ wakeup((caddr_t)msqptr);
+ *retval = 0;
+ return(0);
+}
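+
+/*
+ * Illustrative user-level sketch (not part of this change): the two-step
+ * copyin above means the caller's buffer must lead with the long message
+ * type, followed by the body.  A minimal sender, assuming the standard
+ * <sys/msg.h> user interface:
+ *
+ *	struct mymsg {
+ *		long mtype;		-- must be >= 1 (validated above)
+ *		char mtext[64];
+ *	} m = { 1, "hello" };
+ *
+ *	int id = msgget(IPC_PRIVATE, IPC_CREAT | 0600);
+ *	msgsnd(id, &m, sizeof(m.mtext), 0);
+ *
+ * msgsz counts only the body; with IPC_NOWAIT the call fails with EAGAIN
+ * instead of sleeping in the resource loop above.
+ */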
+
+#ifndef _SYS_SYSPROTO_H_
+struct msgrcv_args {
+ int msqid;
+ void *msgp;
+ size_t msgsz;
+ long msgtyp;
+ int msgflg;
+};
+#endif
+
+int
+msgrcv(p, uap, retval)
+ struct proc *p;
+ register struct msgrcv_args *uap;
+ int *retval;
+{
+ int msqid = uap->msqid;
+ void *user_msgp = uap->msgp;
+ size_t msgsz = uap->msgsz;
+ long msgtyp = uap->msgtyp;
+ int msgflg = uap->msgflg;
+ size_t len;
+ struct ucred *cred = p->p_ucred;
+ register struct msqid_ds *msqptr;
+ register struct msg *msghdr;
+ int eval;
+ short next;
+
+#ifdef MSG_DEBUG_OK
+ printf("call to msgrcv(%d, 0x%x, %d, %ld, %d)\n", msqid, user_msgp,
+ msgsz, msgtyp, msgflg);
+#endif
+
+ msqid = IPCID_TO_IX(msqid);
+
+ if (msqid < 0 || msqid >= msginfo.msgmni) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid,
+ msginfo.msgmni);
+#endif
+ return(EINVAL);
+ }
+
+ msqptr = &msqids[msqid];
+ if (msqptr->msg_qbytes == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("no such message queue id\n");
+#endif
+ return(EINVAL);
+ }
+ if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+#ifdef MSG_DEBUG_OK
+ printf("wrong sequence number\n");
+#endif
+ return(EINVAL);
+ }
+
+ if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_R))) {
+#ifdef MSG_DEBUG_OK
+ printf("requester doesn't have read access\n");
+#endif
+ return(eval);
+ }
+
+ msghdr = NULL;
+ while (msghdr == NULL) {
+ if (msgtyp == 0) {
+ msghdr = msqptr->msg_first;
+ if (msghdr != NULL) {
+ if (msgsz < msghdr->msg_ts &&
+ (msgflg & MSG_NOERROR) == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("first message on the queue is too big (want %d, got %d)\n",
+ msgsz, msghdr->msg_ts);
+#endif
+ return(E2BIG);
+ }
+ if (msqptr->msg_first == msqptr->msg_last) {
+ msqptr->msg_first = NULL;
+ msqptr->msg_last = NULL;
+ } else {
+ msqptr->msg_first = msghdr->msg_next;
+ if (msqptr->msg_first == NULL)
+ panic("msg_first/last screwed up #1");
+ }
+ }
+ } else {
+ struct msg *previous;
+ struct msg **prev;
+
+ previous = NULL;
+ prev = &(msqptr->msg_first);
+ while ((msghdr = *prev) != NULL) {
+ /*
+ * Is this message's type an exact match or is
+ * this message's type less than or equal to
+ * the absolute value of a negative msgtyp?
+ * Note that the second half of this test can
+ * NEVER be true if msgtyp is positive since
+ * msg_type is always positive!
+ */
+
+ if (msgtyp == msghdr->msg_type ||
+ msghdr->msg_type <= -msgtyp) {
+#ifdef MSG_DEBUG_OK
+ printf("found message type %d, requested %d\n",
+ msghdr->msg_type, msgtyp);
+#endif
+ if (msgsz < msghdr->msg_ts &&
+ (msgflg & MSG_NOERROR) == 0) {
+#ifdef MSG_DEBUG_OK
+ printf("requested message on the queue is too big (want %d, got %d)\n",
+ msgsz, msghdr->msg_ts);
+#endif
+ return(E2BIG);
+ }
+ *prev = msghdr->msg_next;
+ if (msghdr == msqptr->msg_last) {
+ if (previous == NULL) {
+ if (prev !=
+ &msqptr->msg_first)
+ panic("msg_first/last screwed up #2");
+ msqptr->msg_first =
+ NULL;
+ msqptr->msg_last =
+ NULL;
+ } else {
+ if (prev ==
+ &msqptr->msg_first)
+ panic("msg_first/last screwed up #3");
+ msqptr->msg_last =
+ previous;
+ }
+ }
+ break;
+ }
+ previous = msghdr;
+ prev = &(msghdr->msg_next);
+ }
+ }
+
+ /*
+ * We've either extracted the msghdr for the appropriate
+ * message or there isn't one.
+ * If there is one then bail out of this loop.
+ */
+
+ if (msghdr != NULL)
+ break;
+
+ /*
+ * Hmph! No message found. Does the user want to wait?
+ */
+
+ if ((msgflg & IPC_NOWAIT) != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("no appropriate message found (msgtyp=%d)\n",
+ msgtyp);
+#endif
+ /* The SVID says to return ENOMSG. */
+#ifdef ENOMSG
+ return(ENOMSG);
+#else
+ /* Unfortunately, BSD doesn't define that code yet! */
+ return(EAGAIN);
+#endif
+ }
+
+ /*
+ * Wait for something to happen
+ */
+
+#ifdef MSG_DEBUG_OK
+ printf("msgrcv: goodnight\n");
+#endif
+ eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, "msgwait",
+ 0);
+#ifdef MSG_DEBUG_OK
+ printf("msgrcv: good morning (eval=%d)\n", eval);
+#endif
+
+ if (eval != 0) {
+#ifdef MSG_DEBUG_OK
+			printf("msgrcv: interrupted system call\n");
+#endif
+ return(EINTR);
+ }
+
+ /*
+		 * Make sure that the message queue still exists
+ */
+
+ if (msqptr->msg_qbytes == 0 ||
+ msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) {
+#ifdef MSG_DEBUG_OK
+ printf("msqid deleted\n");
+#endif
+ /* The SVID says to return EIDRM. */
+#ifdef EIDRM
+ return(EIDRM);
+#else
+ /* Unfortunately, BSD doesn't define that code yet! */
+ return(EINVAL);
+#endif
+ }
+ }
+
+ /*
+ * Return the message to the user.
+ *
+ * First, do the bookkeeping (before we risk being interrupted).
+ */
+
+ msqptr->msg_cbytes -= msghdr->msg_ts;
+ msqptr->msg_qnum--;
+ msqptr->msg_lrpid = p->p_pid;
+ msqptr->msg_rtime = time.tv_sec;
+
+ /*
+ * Make msgsz the actual amount that we'll be returning.
+ * Note that this effectively truncates the message if it is too long
+ * (since msgsz is never increased).
+ */
+
+#ifdef MSG_DEBUG_OK
+ printf("found a message, msgsz=%d, msg_ts=%d\n", msgsz,
+ msghdr->msg_ts);
+#endif
+ if (msgsz > msghdr->msg_ts)
+ msgsz = msghdr->msg_ts;
+
+ /*
+ * Return the type to the user.
+ */
+
+ eval = copyout((caddr_t)&(msghdr->msg_type), user_msgp,
+ sizeof(msghdr->msg_type));
+ if (eval != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("error (%d) copying out message type\n", eval);
+#endif
+ msg_freehdr(msghdr);
+ wakeup((caddr_t)msqptr);
+ return(eval);
+ }
+ user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type);
+
+ /*
+ * Return the segments to the user
+ */
+
+ next = msghdr->msg_spot;
+ for (len = 0; len < msgsz; len += msginfo.msgssz) {
+ size_t tlen;
+
+		if (msgsz - len > msginfo.msgssz)
+			tlen = msginfo.msgssz;
+		else
+			tlen = msgsz - len;
+ if (next <= -1)
+ panic("next too low #3");
+ if (next >= msginfo.msgseg)
+ panic("next out of range #3");
+ eval = copyout((caddr_t)&msgpool[next * msginfo.msgssz],
+ user_msgp, tlen);
+ if (eval != 0) {
+#ifdef MSG_DEBUG_OK
+ printf("error (%d) copying out message segment\n",
+ eval);
+#endif
+ msg_freehdr(msghdr);
+ wakeup((caddr_t)msqptr);
+ return(eval);
+ }
+ user_msgp = (char *)user_msgp + tlen;
+ next = msgmaps[next].next;
+ }
+
+ /*
+ * Done, return the actual number of bytes copied out.
+ */
+
+ msg_freehdr(msghdr);
+ wakeup((caddr_t)msqptr);
+ *retval = msgsz;
+ return(0);
+}
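+
+/*
+ * Illustrative user-level sketch (not part of this change), covering the
+ * three msgtyp cases handled by the search loop above:
+ *
+ *	msgrcv(id, &m, sizeof(m.mtext),  0, 0);    oldest message, any type
+ *	msgrcv(id, &m, sizeof(m.mtext),  5, 0);    oldest message of type 5
+ *	msgrcv(id, &m, sizeof(m.mtext), -5, 0);    first queued message whose
+ *						   type is <= 5
+ *
+ * With MSG_NOERROR in msgflg an oversize message is truncated rather than
+ * rejected with E2BIG; without IPC_NOWAIT the caller sleeps in "msgwait"
+ * until a matching message arrives.
+ */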
diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c
new file mode 100644
index 0000000..e66ddc6
--- /dev/null
+++ b/sys/kern/sysv_sem.c
@@ -0,0 +1,985 @@
+/* $Id$ */
+
+/*
+ * Implementation of SVID semaphores
+ *
+ * Author: Daniel Boulet
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ */
+
+#include "opt_sysvipc.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/sem.h>
+#include <sys/sysent.h>
+
+static void seminit __P((void *));
+SYSINIT(sysv_sem, SI_SUB_SYSV_SEM, SI_ORDER_FIRST, seminit, NULL)
+
+#ifndef _SYS_SYSPROTO_H_
+struct __semctl_args;
+int __semctl __P((struct proc *p, struct __semctl_args *uap, int *retval));
+struct semget_args;
+int semget __P((struct proc *p, struct semget_args *uap, int *retval));
+struct semop_args;
+int semop __P((struct proc *p, struct semop_args *uap, int *retval));
+struct semconfig_args;
+int semconfig __P((struct proc *p, struct semconfig_args *uap,
+ int *retval));
+#endif
+
+static struct sem_undo *semu_alloc __P((struct proc *p));
+static int semundo_adjust __P((struct proc *p, struct sem_undo **supptr,
+ int semid, int semnum, int adjval));
+static void semundo_clear __P((int semid, int semnum));
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+static sy_call_t *semcalls[] = {
+ (sy_call_t *)__semctl, (sy_call_t *)semget,
+ (sy_call_t *)semop, (sy_call_t *)semconfig
+};
+
+static int semtot = 0;
+struct semid_ds *sema; /* semaphore id pool */
+struct sem *sem; /* semaphore pool */
+static struct sem_undo *semu_list; /* list of active undo structures */
+int *semu; /* undo structure pool */
+
+static struct proc *semlock_holder = NULL;
+
+void
+seminit(dummy)
+ void *dummy;
+{
+ register int i;
+
+ if (sema == NULL)
+ panic("sema is NULL");
+ if (semu == NULL)
+ panic("semu is NULL");
+
+ for (i = 0; i < seminfo.semmni; i++) {
+ sema[i].sem_base = 0;
+ sema[i].sem_perm.mode = 0;
+ }
+ for (i = 0; i < seminfo.semmnu; i++) {
+ register struct sem_undo *suptr = SEMU(i);
+ suptr->un_proc = NULL;
+ }
+ semu_list = NULL;
+}
+
+/*
+ * Entry point for all SEM calls
+ */
+int
+semsys(p, uap, retval)
+ struct proc *p;
+ /* XXX actually varargs. */
+ struct semsys_args /* {
+ u_int which;
+ int a2;
+ int a3;
+ int a4;
+ int a5;
+ } */ *uap;
+ int *retval;
+{
+
+ while (semlock_holder != NULL && semlock_holder != p)
+ (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "semsys", 0);
+
+ if (uap->which >= sizeof(semcalls)/sizeof(semcalls[0]))
+ return (EINVAL);
+ return ((*semcalls[uap->which])(p, &uap->a2, retval));
+}
+
+/*
+ * Lock or unlock the entire semaphore facility.
+ *
+ * This will probably eventually evolve into a general purpose semaphore
+ * facility status enquiry mechanism (I don't like the "read /dev/kmem"
+ * approach currently taken by ipcs, and the amount of info that we want
+ * to be able to extract for ipcs is probably beyond the capability of
+ * the getkerninfo facility).
+ *
+ * At the time that the current version of semconfig was written, ipcs is
+ * the only user of the semconfig facility. It uses it to ensure that the
+ * semaphore facility data structures remain static while it fishes around
+ * in /dev/kmem.
+ */
+
+#ifndef _SYS_SYSPROTO_H_
+struct semconfig_args {
+ semconfig_ctl_t flag;
+};
+#endif
+
+int
+semconfig(p, uap, retval)
+ struct proc *p;
+ struct semconfig_args *uap;
+ int *retval;
+{
+ int eval = 0;
+
+ switch (uap->flag) {
+ case SEM_CONFIG_FREEZE:
+ semlock_holder = p;
+ break;
+
+ case SEM_CONFIG_THAW:
+ semlock_holder = NULL;
+ wakeup((caddr_t)&semlock_holder);
+ break;
+
+ default:
+ printf("semconfig: unknown flag parameter value (%d) - ignored\n",
+ uap->flag);
+ eval = EINVAL;
+ break;
+ }
+
+ *retval = 0;
+ return(eval);
+}
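+
+/*
+ * Illustrative sketch (not part of this change), assuming a userland
+ * semconfig(2) wrapper with the kernel signature above: ipcs would
+ * bracket its /dev/kmem reads with
+ *
+ *	semconfig(SEM_CONFIG_FREEZE);	   blocks other semsys() callers
+ *	... read sema[] / sem[] via kvm ...
+ *	semconfig(SEM_CONFIG_THAW);	   wakes them up again
+ */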
+
+/*
+ * Allocate a new sem_undo structure for a process
+ * (returns ptr to structure or NULL if no more room)
+ */
+
+static struct sem_undo *
+semu_alloc(p)
+ struct proc *p;
+{
+ register int i;
+ register struct sem_undo *suptr;
+ register struct sem_undo **supptr;
+ int attempt;
+
+ /*
+ * Try twice to allocate something.
+ * (we'll purge any empty structures after the first pass so
+ * two passes are always enough)
+ */
+
+ for (attempt = 0; attempt < 2; attempt++) {
+ /*
+ * Look for a free structure.
+ * Fill it in and return it if we find one.
+ */
+
+ for (i = 0; i < seminfo.semmnu; i++) {
+ suptr = SEMU(i);
+ if (suptr->un_proc == NULL) {
+ suptr->un_next = semu_list;
+ semu_list = suptr;
+ suptr->un_cnt = 0;
+ suptr->un_proc = p;
+ return(suptr);
+ }
+ }
+
+ /*
+ * We didn't find a free one, if this is the first attempt
+ * then try to free some structures.
+ */
+
+ if (attempt == 0) {
+ /* All the structures are in use - try to free some */
+ int did_something = 0;
+
+ supptr = &semu_list;
+ while ((suptr = *supptr) != NULL) {
+ if (suptr->un_cnt == 0) {
+ suptr->un_proc = NULL;
+ *supptr = suptr->un_next;
+ did_something = 1;
+ } else
+ supptr = &(suptr->un_next);
+ }
+
+			/* If we didn't free anything then just give up */
+ if (!did_something)
+ return(NULL);
+ } else {
+ /*
+ * The second pass failed even though we freed
+ * something after the first pass!
+ * This is IMPOSSIBLE!
+ */
+ panic("semu_alloc - second attempt failed");
+ }
+ }
+ return (NULL);
+}
+
+/*
+ * Adjust a particular entry for a particular proc
+ */
+
+static int
+semundo_adjust(p, supptr, semid, semnum, adjval)
+ register struct proc *p;
+ struct sem_undo **supptr;
+ int semid, semnum;
+ int adjval;
+{
+ register struct sem_undo *suptr;
+ register struct undo *sunptr;
+ int i;
+
+ /* Look for and remember the sem_undo if the caller doesn't provide
+ it */
+
+ suptr = *supptr;
+ if (suptr == NULL) {
+ for (suptr = semu_list; suptr != NULL;
+ suptr = suptr->un_next) {
+ if (suptr->un_proc == p) {
+ *supptr = suptr;
+ break;
+ }
+ }
+ if (suptr == NULL) {
+ if (adjval == 0)
+ return(0);
+ suptr = semu_alloc(p);
+ if (suptr == NULL)
+ return(ENOSPC);
+ *supptr = suptr;
+ }
+ }
+
+ /*
+ * Look for the requested entry and adjust it (delete if adjval becomes
+ * 0).
+ */
+ sunptr = &suptr->un_ent[0];
+ for (i = 0; i < suptr->un_cnt; i++, sunptr++) {
+ if (sunptr->un_id != semid || sunptr->un_num != semnum)
+ continue;
+ if (adjval == 0)
+ sunptr->un_adjval = 0;
+ else
+ sunptr->un_adjval += adjval;
+ if (sunptr->un_adjval == 0) {
+ suptr->un_cnt--;
+ if (i < suptr->un_cnt)
+ suptr->un_ent[i] =
+ suptr->un_ent[suptr->un_cnt];
+ }
+ return(0);
+ }
+
+ /* Didn't find the right entry - create it */
+ if (adjval == 0)
+ return(0);
+ if (suptr->un_cnt != SEMUME) {
+ sunptr = &suptr->un_ent[suptr->un_cnt];
+ suptr->un_cnt++;
+ sunptr->un_adjval = adjval;
+ sunptr->un_id = semid; sunptr->un_num = semnum;
+ } else
+ return(EINVAL);
+ return(0);
+}
+
+static void
+semundo_clear(semid, semnum)
+ int semid, semnum;
+{
+ register struct sem_undo *suptr;
+
+ for (suptr = semu_list; suptr != NULL; suptr = suptr->un_next) {
+ register struct undo *sunptr = &suptr->un_ent[0];
+ register int i = 0;
+
+ while (i < suptr->un_cnt) {
+ if (sunptr->un_id == semid) {
+ if (semnum == -1 || sunptr->un_num == semnum) {
+ suptr->un_cnt--;
+ if (i < suptr->un_cnt) {
+ suptr->un_ent[i] =
+ suptr->un_ent[suptr->un_cnt];
+ continue;
+ }
+ }
+ if (semnum != -1)
+ break;
+ }
+ i++, sunptr++;
+ }
+ }
+}
+
+/*
+ * Note that the user-mode half of this passes a union, not a pointer
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct __semctl_args {
+ int semid;
+ int semnum;
+ int cmd;
+ union semun *arg;
+};
+#endif
+
+int
+__semctl(p, uap, retval)
+ struct proc *p;
+ register struct __semctl_args *uap;
+ int *retval;
+{
+ int semid = uap->semid;
+ int semnum = uap->semnum;
+ int cmd = uap->cmd;
+ union semun *arg = uap->arg;
+ union semun real_arg;
+ struct ucred *cred = p->p_ucred;
+ int i, rval, eval;
+ struct semid_ds sbuf;
+ register struct semid_ds *semaptr;
+
+#ifdef SEM_DEBUG
+ printf("call to semctl(%d, %d, %d, 0x%x)\n", semid, semnum, cmd, arg);
+#endif
+
+ semid = IPCID_TO_IX(semid);
+	if (semid < 0 || semid >= seminfo.semmni)
+ return(EINVAL);
+
+ semaptr = &sema[semid];
+ if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 ||
+ semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid))
+ return(EINVAL);
+
+ eval = 0;
+ rval = 0;
+
+ switch (cmd) {
+ case IPC_RMID:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M)))
+ return(eval);
+ semaptr->sem_perm.cuid = cred->cr_uid;
+ semaptr->sem_perm.uid = cred->cr_uid;
+ semtot -= semaptr->sem_nsems;
+ for (i = semaptr->sem_base - sem; i < semtot; i++)
+ sem[i] = sem[i + semaptr->sem_nsems];
+ for (i = 0; i < seminfo.semmni; i++) {
+ if ((sema[i].sem_perm.mode & SEM_ALLOC) &&
+ sema[i].sem_base > semaptr->sem_base)
+ sema[i].sem_base -= semaptr->sem_nsems;
+ }
+ semaptr->sem_perm.mode = 0;
+ semundo_clear(semid, -1);
+ wakeup((caddr_t)semaptr);
+ break;
+
+ case IPC_SET:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M)))
+ return(eval);
+ if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ return(eval);
+ if ((eval = copyin(real_arg.buf, (caddr_t)&sbuf,
+ sizeof(sbuf))) != 0)
+ return(eval);
+ semaptr->sem_perm.uid = sbuf.sem_perm.uid;
+ semaptr->sem_perm.gid = sbuf.sem_perm.gid;
+ semaptr->sem_perm.mode = (semaptr->sem_perm.mode & ~0777) |
+ (sbuf.sem_perm.mode & 0777);
+ semaptr->sem_ctime = time.tv_sec;
+ break;
+
+ case IPC_STAT:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R)))
+ return(eval);
+ if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ return(eval);
+ eval = copyout((caddr_t)semaptr, real_arg.buf,
+ sizeof(struct semid_ds));
+ break;
+
+ case GETNCNT:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R)))
+ return(eval);
+ if (semnum < 0 || semnum >= semaptr->sem_nsems)
+ return(EINVAL);
+ rval = semaptr->sem_base[semnum].semncnt;
+ break;
+
+ case GETPID:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R)))
+ return(eval);
+ if (semnum < 0 || semnum >= semaptr->sem_nsems)
+ return(EINVAL);
+ rval = semaptr->sem_base[semnum].sempid;
+ break;
+
+ case GETVAL:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R)))
+ return(eval);
+ if (semnum < 0 || semnum >= semaptr->sem_nsems)
+ return(EINVAL);
+ rval = semaptr->sem_base[semnum].semval;
+ break;
+
+ case GETALL:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R)))
+ return(eval);
+ if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ return(eval);
+ for (i = 0; i < semaptr->sem_nsems; i++) {
+ eval = copyout((caddr_t)&semaptr->sem_base[i].semval,
+ &real_arg.array[i], sizeof(real_arg.array[0]));
+ if (eval != 0)
+ break;
+ }
+ break;
+
+ case GETZCNT:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R)))
+ return(eval);
+ if (semnum < 0 || semnum >= semaptr->sem_nsems)
+ return(EINVAL);
+ rval = semaptr->sem_base[semnum].semzcnt;
+ break;
+
+ case SETVAL:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W)))
+ return(eval);
+ if (semnum < 0 || semnum >= semaptr->sem_nsems)
+ return(EINVAL);
+ if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ return(eval);
+ semaptr->sem_base[semnum].semval = real_arg.val;
+ semundo_clear(semid, semnum);
+ wakeup((caddr_t)semaptr);
+ break;
+
+ case SETALL:
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W)))
+ return(eval);
+ if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0)
+ return(eval);
+ for (i = 0; i < semaptr->sem_nsems; i++) {
+ eval = copyin(&real_arg.array[i],
+ (caddr_t)&semaptr->sem_base[i].semval,
+ sizeof(real_arg.array[0]));
+ if (eval != 0)
+ break;
+ }
+ semundo_clear(semid, -1);
+ wakeup((caddr_t)semaptr);
+ break;
+
+ default:
+ return(EINVAL);
+ }
+
+ if (eval == 0)
+ *retval = rval;
+ return(eval);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct semget_args {
+ key_t key;
+ int nsems;
+ int semflg;
+};
+#endif
+
+int
+semget(p, uap, retval)
+ struct proc *p;
+ register struct semget_args *uap;
+ int *retval;
+{
+ int semid, eval;
+ int key = uap->key;
+ int nsems = uap->nsems;
+ int semflg = uap->semflg;
+ struct ucred *cred = p->p_ucred;
+
+#ifdef SEM_DEBUG
+ printf("semget(0x%x, %d, 0%o)\n", key, nsems, semflg);
+#endif
+
+ if (key != IPC_PRIVATE) {
+ for (semid = 0; semid < seminfo.semmni; semid++) {
+ if ((sema[semid].sem_perm.mode & SEM_ALLOC) &&
+ sema[semid].sem_perm.key == key)
+ break;
+ }
+ if (semid < seminfo.semmni) {
+#ifdef SEM_DEBUG
+ printf("found public key\n");
+#endif
+ if ((eval = ipcperm(cred, &sema[semid].sem_perm,
+ semflg & 0700)))
+ return(eval);
+ if (nsems > 0 && sema[semid].sem_nsems < nsems) {
+#ifdef SEM_DEBUG
+ printf("too small\n");
+#endif
+ return(EINVAL);
+ }
+ if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) {
+#ifdef SEM_DEBUG
+ printf("not exclusive\n");
+#endif
+ return(EEXIST);
+ }
+ goto found;
+ }
+ }
+
+#ifdef SEM_DEBUG
+ printf("need to allocate the semid_ds\n");
+#endif
+ if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) {
+ if (nsems <= 0 || nsems > seminfo.semmsl) {
+#ifdef SEM_DEBUG
+ printf("nsems out of range (0<%d<=%d)\n", nsems,
+ seminfo.semmsl);
+#endif
+ return(EINVAL);
+ }
+ if (nsems > seminfo.semmns - semtot) {
+#ifdef SEM_DEBUG
+ printf("not enough semaphores left (need %d, got %d)\n",
+ nsems, seminfo.semmns - semtot);
+#endif
+ return(ENOSPC);
+ }
+ for (semid = 0; semid < seminfo.semmni; semid++) {
+ if ((sema[semid].sem_perm.mode & SEM_ALLOC) == 0)
+ break;
+ }
+ if (semid == seminfo.semmni) {
+#ifdef SEM_DEBUG
+ printf("no more semid_ds's available\n");
+#endif
+ return(ENOSPC);
+ }
+#ifdef SEM_DEBUG
+ printf("semid %d is available\n", semid);
+#endif
+ sema[semid].sem_perm.key = key;
+ sema[semid].sem_perm.cuid = cred->cr_uid;
+ sema[semid].sem_perm.uid = cred->cr_uid;
+ sema[semid].sem_perm.cgid = cred->cr_gid;
+ sema[semid].sem_perm.gid = cred->cr_gid;
+ sema[semid].sem_perm.mode = (semflg & 0777) | SEM_ALLOC;
+ sema[semid].sem_perm.seq =
+ (sema[semid].sem_perm.seq + 1) & 0x7fff;
+ sema[semid].sem_nsems = nsems;
+ sema[semid].sem_otime = 0;
+ sema[semid].sem_ctime = time.tv_sec;
+ sema[semid].sem_base = &sem[semtot];
+ semtot += nsems;
+ bzero(sema[semid].sem_base,
+ sizeof(sema[semid].sem_base[0])*nsems);
+#ifdef SEM_DEBUG
+ printf("sembase = 0x%x, next = 0x%x\n", sema[semid].sem_base,
+ &sem[semtot]);
+#endif
+ } else {
+#ifdef SEM_DEBUG
+ printf("didn't find it and wasn't asked to create it\n");
+#endif
+ return(ENOENT);
+ }
+
+found:
+ *retval = IXSEQ_TO_IPCID(semid, sema[semid].sem_perm);
+ return(0);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct semop_args {
+ int semid;
+ struct sembuf *sops;
+ int nsops;
+};
+#endif
+
+int
+semop(p, uap, retval)
+ struct proc *p;
+ register struct semop_args *uap;
+ int *retval;
+{
+ int semid = uap->semid;
+ int nsops = uap->nsops;
+ struct sembuf sops[MAX_SOPS];
+ register struct semid_ds *semaptr;
+ register struct sembuf *sopptr;
+ register struct sem *semptr;
+ struct sem_undo *suptr = NULL;
+ struct ucred *cred = p->p_ucred;
+ int i, j, eval;
+ int do_wakeup, do_undos;
+
+#ifdef SEM_DEBUG
+ printf("call to semop(%d, 0x%x, %d)\n", semid, sops, nsops);
+#endif
+
+ semid = IPCID_TO_IX(semid); /* Convert back to zero origin */
+
+	if (semid < 0 || semid >= seminfo.semmni)
+ return(EINVAL);
+
+ semaptr = &sema[semid];
+ if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0)
+ return(EINVAL);
+ if (semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid))
+ return(EINVAL);
+
+ if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) {
+#ifdef SEM_DEBUG
+		printf("eval = %d from ipcperm\n", eval);
+#endif
+ return(eval);
+ }
+
+ if (nsops > MAX_SOPS) {
+#ifdef SEM_DEBUG
+ printf("too many sops (max=%d, nsops=%d)\n", MAX_SOPS, nsops);
+#endif
+ return(E2BIG);
+ }
+
+ if ((eval = copyin(uap->sops, &sops, nsops * sizeof(sops[0]))) != 0) {
+#ifdef SEM_DEBUG
+ printf("eval = %d from copyin(%08x, %08x, %d)\n", eval,
+ uap->sops, &sops, nsops * sizeof(sops[0]));
+#endif
+ return(eval);
+ }
+
+ /*
+ * Loop trying to satisfy the vector of requests.
+ * If we reach a point where we must wait, any requests already
+ * performed are rolled back and we go to sleep until some other
+ * process wakes us up. At this point, we start all over again.
+ *
+ * This ensures that from the perspective of other tasks, a set
+ * of requests is atomic (never partially satisfied).
+ */
+ do_undos = 0;
+
+ for (;;) {
+ do_wakeup = 0;
+
+ for (i = 0; i < nsops; i++) {
+ sopptr = &sops[i];
+
+ if (sopptr->sem_num >= semaptr->sem_nsems)
+ return(EFBIG);
+
+ semptr = &semaptr->sem_base[sopptr->sem_num];
+
+#ifdef SEM_DEBUG
+ printf("semop: semaptr=%x, sem_base=%x, semptr=%x, sem[%d]=%d : op=%d, flag=%s\n",
+ semaptr, semaptr->sem_base, semptr,
+ sopptr->sem_num, semptr->semval, sopptr->sem_op,
+ (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait");
+#endif
+
+ if (sopptr->sem_op < 0) {
+ if (semptr->semval + sopptr->sem_op < 0) {
+#ifdef SEM_DEBUG
+ printf("semop: can't do it now\n");
+#endif
+ break;
+ } else {
+ semptr->semval += sopptr->sem_op;
+ if (semptr->semval == 0 &&
+ semptr->semzcnt > 0)
+ do_wakeup = 1;
+ }
+ if (sopptr->sem_flg & SEM_UNDO)
+ do_undos = 1;
+ } else if (sopptr->sem_op == 0) {
+ if (semptr->semval > 0) {
+#ifdef SEM_DEBUG
+ printf("semop: not zero now\n");
+#endif
+ break;
+ }
+ } else {
+ if (semptr->semncnt > 0)
+ do_wakeup = 1;
+ semptr->semval += sopptr->sem_op;
+ if (sopptr->sem_flg & SEM_UNDO)
+ do_undos = 1;
+ }
+ }
+
+ /*
+ * Did we get through the entire vector?
+ */
+ if (i >= nsops)
+ goto done;
+
+ /*
+ * No ... rollback anything that we've already done
+ */
+#ifdef SEM_DEBUG
+ printf("semop: rollback 0 through %d\n", i-1);
+#endif
+ for (j = 0; j < i; j++)
+ semaptr->sem_base[sops[j].sem_num].semval -=
+ sops[j].sem_op;
+
+ /*
+ * If the request that we couldn't satisfy has the
+ * NOWAIT flag set then return with EAGAIN.
+ */
+ if (sopptr->sem_flg & IPC_NOWAIT)
+ return(EAGAIN);
+
+ if (sopptr->sem_op == 0)
+ semptr->semzcnt++;
+ else
+ semptr->semncnt++;
+
+#ifdef SEM_DEBUG
+ printf("semop: good night!\n");
+#endif
+ eval = tsleep((caddr_t)semaptr, (PZERO - 4) | PCATCH,
+ "semwait", 0);
+#ifdef SEM_DEBUG
+ printf("semop: good morning (eval=%d)!\n", eval);
+#endif
+
+ suptr = NULL; /* sem_undo may have been reallocated */
+
+ if (eval != 0)
+ return(EINTR);
+#ifdef SEM_DEBUG
+ printf("semop: good morning!\n");
+#endif
+
+ /*
+ * Make sure that the semaphore still exists
+ */
+ if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 ||
+ semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) {
+ /* The man page says to return EIDRM. */
+ /* Unfortunately, BSD doesn't define that code! */
+#ifdef EIDRM
+ return(EIDRM);
+#else
+ return(EINVAL);
+#endif
+ }
+
+ /*
+ * The semaphore is still alive. Readjust the count of
+ * waiting processes.
+ */
+ if (sopptr->sem_op == 0)
+ semptr->semzcnt--;
+ else
+ semptr->semncnt--;
+ }
+
+done:
+ /*
+ * Process any SEM_UNDO requests.
+ */
+ if (do_undos) {
+ for (i = 0; i < nsops; i++) {
+ /*
+ * We only need to deal with SEM_UNDO's for non-zero
+ * op's.
+ */
+ int adjval;
+
+ if ((sops[i].sem_flg & SEM_UNDO) == 0)
+ continue;
+ adjval = sops[i].sem_op;
+ if (adjval == 0)
+ continue;
+ eval = semundo_adjust(p, &suptr, semid,
+ sops[i].sem_num, -adjval);
+ if (eval == 0)
+ continue;
+
+ /*
+ * Oh-Oh! We ran out of either sem_undo's or undo's.
+ * Rollback the adjustments to this point and then
+			 * rollback the semaphore ups and downs so we can return
+ * with an error with all structures restored. We
+ * rollback the undo's in the exact reverse order that
+ * we applied them. This guarantees that we won't run
+ * out of space as we roll things back out.
+ */
+ for (j = i - 1; j >= 0; j--) {
+ if ((sops[j].sem_flg & SEM_UNDO) == 0)
+ continue;
+ adjval = sops[j].sem_op;
+ if (adjval == 0)
+ continue;
+ if (semundo_adjust(p, &suptr, semid,
+ sops[j].sem_num, adjval) != 0)
+ panic("semop - can't undo undos");
+ }
+
+ for (j = 0; j < nsops; j++)
+ semaptr->sem_base[sops[j].sem_num].semval -=
+ sops[j].sem_op;
+
+#ifdef SEM_DEBUG
+ printf("eval = %d from semundo_adjust\n", eval);
+#endif
+ return(eval);
+ } /* loop through the sops */
+ } /* if (do_undos) */
+
+ /* We're definitely done - set the sempid's */
+ for (i = 0; i < nsops; i++) {
+ sopptr = &sops[i];
+ semptr = &semaptr->sem_base[sopptr->sem_num];
+ semptr->sempid = p->p_pid;
+ }
+
+ /* Do a wakeup if any semaphore was up'd. */
+ if (do_wakeup) {
+#ifdef SEM_DEBUG
+ printf("semop: doing wakeup\n");
+#ifdef SEM_WAKEUP
+ sem_wakeup((caddr_t)semaptr);
+#else
+ wakeup((caddr_t)semaptr);
+#endif
+ printf("semop: back from wakeup\n");
+#else
+ wakeup((caddr_t)semaptr);
+#endif
+ }
+#ifdef SEM_DEBUG
+ printf("semop: done\n");
+#endif
+ *retval = 0;
+ return(0);
+}
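+
+/*
+ * Illustrative user-level sketch (not part of this change): because the
+ * loop above rolls everything back before sleeping, a multi-operation
+ * semop() is atomic as seen by other processes.  Classic P/V on a
+ * one-semaphore set, assuming the standard <sys/sem.h> interface:
+ *
+ *	union semun arg;
+ *	struct sembuf p = { 0, -1, SEM_UNDO };	   wait
+ *	struct sembuf v = { 0,  1, SEM_UNDO };	   signal
+ *
+ *	int id = semget(IPC_PRIVATE, 1, IPC_CREAT | 0600);
+ *	arg.val = 1;
+ *	semctl(id, 0, SETVAL, arg);	   new semaphores start at 0
+ *	semop(id, &p, 1);		   enter critical section
+ *	semop(id, &v, 1);		   leave it
+ *
+ * SEM_UNDO records an adjustment that semexit() below applies if the
+ * process exits while still holding the semaphore.
+ */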
+
+/*
+ * Go through the undo structures for this process and apply the adjustments to
+ * semaphores.
+ */
+void
+semexit(p)
+ struct proc *p;
+{
+ register struct sem_undo *suptr;
+ register struct sem_undo **supptr;
+ int did_something;
+
+ /*
+ * If somebody else is holding the global semaphore facility lock
+ * then sleep until it is released.
+ */
+ while (semlock_holder != NULL && semlock_holder != p) {
+#ifdef SEM_DEBUG
+ printf("semaphore facility locked - sleeping ...\n");
+#endif
+ (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "semext", 0);
+ }
+
+ did_something = 0;
+
+ /*
+ * Go through the chain of undo vectors looking for one
+ * associated with this process.
+ */
+
+ for (supptr = &semu_list; (suptr = *supptr) != NULL;
+ supptr = &suptr->un_next) {
+ if (suptr->un_proc == p)
+ break;
+ }
+
+ if (suptr == NULL)
+ goto unlock;
+
+#ifdef SEM_DEBUG
+ printf("proc @%08x has undo structure with %d entries\n", p,
+ suptr->un_cnt);
+#endif
+
+ /*
+ * If there are any active undo elements then process them.
+ */
+ if (suptr->un_cnt > 0) {
+ int ix;
+
+ for (ix = 0; ix < suptr->un_cnt; ix++) {
+ int semid = suptr->un_ent[ix].un_id;
+ int semnum = suptr->un_ent[ix].un_num;
+ int adjval = suptr->un_ent[ix].un_adjval;
+ struct semid_ds *semaptr;
+
+ semaptr = &sema[semid];
+ if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0)
+ panic("semexit - semid not allocated");
+ if (semnum >= semaptr->sem_nsems)
+ panic("semexit - semnum out of range");
+
+#ifdef SEM_DEBUG
+ printf("semexit: %08x id=%d num=%d(adj=%d) ; sem=%d\n",
+ suptr->un_proc, suptr->un_ent[ix].un_id,
+ suptr->un_ent[ix].un_num,
+ suptr->un_ent[ix].un_adjval,
+ semaptr->sem_base[semnum].semval);
+#endif
+
+ if (adjval < 0) {
+ if (semaptr->sem_base[semnum].semval < -adjval)
+ semaptr->sem_base[semnum].semval = 0;
+ else
+ semaptr->sem_base[semnum].semval +=
+ adjval;
+ } else
+ semaptr->sem_base[semnum].semval += adjval;
+
+#ifdef SEM_WAKEUP
+ sem_wakeup((caddr_t)semaptr);
+#else
+ wakeup((caddr_t)semaptr);
+#endif
+#ifdef SEM_DEBUG
+ printf("semexit: back from wakeup\n");
+#endif
+ }
+ }
+
+ /*
+ * Deallocate the undo vector.
+ */
+#ifdef SEM_DEBUG
+ printf("removing vector\n");
+#endif
+ suptr->un_proc = NULL;
+ *supptr = suptr->un_next;
+
+unlock:
+ /*
+ * If the exiting process is holding the global semaphore facility
+ * lock then release it.
+ */
+ if (semlock_holder == p) {
+ semlock_holder = NULL;
+ wakeup((caddr_t)&semlock_holder);
+ }
+}
diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c
new file mode 100644
index 0000000..9e93923
--- /dev/null
+++ b/sys/kern/sysv_shm.c
@@ -0,0 +1,622 @@
+/* $Id$ */
+/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */
+
+/*
+ * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by Adam Glass and Charles
+ * Hannum.
+ * 4. The names of the authors may not be used to endorse or promote products
+ * derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include "opt_sysvipc.h"
+#include "opt_rlimit.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/shm.h>
+#include <sys/proc.h>
+#include <sys/malloc.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/sysent.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_object.h>
+#include <vm/vm_map.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_pager.h>
+#include <vm/vm_inherit.h>
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmat_args;
+extern int shmat __P((struct proc *p, struct shmat_args *uap, int *retval));
+struct shmctl_args;
+extern int shmctl __P((struct proc *p, struct shmctl_args *uap, int *retval));
+struct shmdt_args;
+extern int shmdt __P((struct proc *p, struct shmdt_args *uap, int *retval));
+struct shmget_args;
+extern int shmget __P((struct proc *p, struct shmget_args *uap, int *retval));
+#endif
+
+static void shminit __P((void *));
+SYSINIT(sysv_shm, SI_SUB_SYSV_SHM, SI_ORDER_FIRST, shminit, NULL)
+
+struct oshmctl_args;
+static int oshmctl __P((struct proc *p, struct oshmctl_args *uap, int *retval));
+static int shmget_allocate_segment __P((struct proc *p, struct shmget_args *uap, int mode, int *retval));
+static int shmget_existing __P((struct proc *p, struct shmget_args *uap, int mode, int segnum, int *retval));
+
+/* XXX casting to (sy_call_t *) is bogus, as usual. */
+sy_call_t *shmcalls[] = {
+ (sy_call_t *)shmat, (sy_call_t *)oshmctl,
+ (sy_call_t *)shmdt, (sy_call_t *)shmget,
+ (sy_call_t *)shmctl
+};
+
+#define SHMSEG_FREE 0x0200
+#define SHMSEG_REMOVED 0x0400
+#define SHMSEG_ALLOCATED 0x0800
+#define SHMSEG_WANTED 0x1000
+
+static int shm_last_free, shm_nused, shm_committed;
+struct shmid_ds *shmsegs;
+
+struct shm_handle {
+ /* vm_offset_t kva; */
+ vm_object_t shm_object;
+};
+
+struct shmmap_state {
+ vm_offset_t va;
+ int shmid;
+};
+
+static void shm_deallocate_segment __P((struct shmid_ds *));
+static int shm_find_segment_by_key __P((key_t));
+static struct shmid_ds *shm_find_segment_by_shmid __P((int));
+static int shm_delete_mapping __P((struct proc *, struct shmmap_state *));
+
+static int
+shm_find_segment_by_key(key)
+ key_t key;
+{
+ int i;
+
+ for (i = 0; i < shminfo.shmmni; i++)
+ if ((shmsegs[i].shm_perm.mode & SHMSEG_ALLOCATED) &&
+ shmsegs[i].shm_perm.key == key)
+ return i;
+ return -1;
+}
+
+static struct shmid_ds *
+shm_find_segment_by_shmid(shmid)
+ int shmid;
+{
+ int segnum;
+ struct shmid_ds *shmseg;
+
+ segnum = IPCID_TO_IX(shmid);
+ if (segnum < 0 || segnum >= shminfo.shmmni)
+ return NULL;
+ shmseg = &shmsegs[segnum];
+ if ((shmseg->shm_perm.mode & (SHMSEG_ALLOCATED | SHMSEG_REMOVED))
+ != SHMSEG_ALLOCATED ||
+ shmseg->shm_perm.seq != IPCID_TO_SEQ(shmid))
+ return NULL;
+ return shmseg;
+}
+
+static void
+shm_deallocate_segment(shmseg)
+ struct shmid_ds *shmseg;
+{
+ struct shm_handle *shm_handle;
+ size_t size;
+
+ shm_handle = shmseg->shm_internal;
+ vm_object_deallocate(shm_handle->shm_object);
+ free((caddr_t)shm_handle, M_SHM);
+ shmseg->shm_internal = NULL;
+ size = round_page(shmseg->shm_segsz);
+ shm_committed -= btoc(size);
+ shm_nused--;
+ shmseg->shm_perm.mode = SHMSEG_FREE;
+}
+
+static int
+shm_delete_mapping(p, shmmap_s)
+ struct proc *p;
+ struct shmmap_state *shmmap_s;
+{
+ struct shmid_ds *shmseg;
+ int segnum, result;
+ size_t size;
+
+ segnum = IPCID_TO_IX(shmmap_s->shmid);
+ shmseg = &shmsegs[segnum];
+ size = round_page(shmseg->shm_segsz);
+ result = vm_map_remove(&p->p_vmspace->vm_map, shmmap_s->va, shmmap_s->va + size);
+ if (result != KERN_SUCCESS)
+ return EINVAL;
+ shmmap_s->shmid = -1;
+ shmseg->shm_dtime = time.tv_sec;
+ if ((--shmseg->shm_nattch <= 0) &&
+ (shmseg->shm_perm.mode & SHMSEG_REMOVED)) {
+ shm_deallocate_segment(shmseg);
+ shm_last_free = segnum;
+ }
+ return 0;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmdt_args {
+ void *shmaddr;
+};
+#endif
+
+int
+shmdt(p, uap, retval)
+ struct proc *p;
+ struct shmdt_args *uap;
+ int *retval;
+{
+ struct shmmap_state *shmmap_s;
+ int i;
+
+ shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
+ if (shmmap_s == NULL)
+ return EINVAL;
+ for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
+ if (shmmap_s->shmid != -1 &&
+ shmmap_s->va == (vm_offset_t)uap->shmaddr)
+ break;
+ if (i == shminfo.shmseg)
+ return EINVAL;
+ return shm_delete_mapping(p, shmmap_s);
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmat_args {
+ int shmid;
+ void *shmaddr;
+ int shmflg;
+};
+#endif
+
+int
+shmat(p, uap, retval)
+ struct proc *p;
+ struct shmat_args *uap;
+ int *retval;
+{
+ int error, i, flags;
+ struct ucred *cred = p->p_ucred;
+ struct shmid_ds *shmseg;
+ struct shmmap_state *shmmap_s = NULL;
+ struct shm_handle *shm_handle;
+ vm_offset_t attach_va;
+ vm_prot_t prot;
+ vm_size_t size;
+ int rv;
+
+ shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
+ if (shmmap_s == NULL) {
+ size = shminfo.shmseg * sizeof(struct shmmap_state);
+ shmmap_s = malloc(size, M_SHM, M_WAITOK);
+ for (i = 0; i < shminfo.shmseg; i++)
+ shmmap_s[i].shmid = -1;
+ p->p_vmspace->vm_shm = (caddr_t)shmmap_s;
+ }
+ shmseg = shm_find_segment_by_shmid(uap->shmid);
+ if (shmseg == NULL)
+ return EINVAL;
+ error = ipcperm(cred, &shmseg->shm_perm,
+ (uap->shmflg & SHM_RDONLY) ? IPC_R : IPC_R|IPC_W);
+ if (error)
+ return error;
+ for (i = 0; i < shminfo.shmseg; i++) {
+ if (shmmap_s->shmid == -1)
+ break;
+ shmmap_s++;
+ }
+ if (i >= shminfo.shmseg)
+ return EMFILE;
+ size = round_page(shmseg->shm_segsz);
+ prot = VM_PROT_READ;
+ if ((uap->shmflg & SHM_RDONLY) == 0)
+ prot |= VM_PROT_WRITE;
+ flags = MAP_ANON | MAP_SHARED;
+ if (uap->shmaddr) {
+ flags |= MAP_FIXED;
+ if (uap->shmflg & SHM_RND)
+ attach_va = (vm_offset_t)uap->shmaddr & ~(SHMLBA-1);
+ else if (((vm_offset_t)uap->shmaddr & (SHMLBA-1)) == 0)
+ attach_va = (vm_offset_t)uap->shmaddr;
+ else
+ return EINVAL;
+ } else {
+ /* This is just a hint to vm_map_find() about where to put it. */
+ attach_va = round_page(p->p_vmspace->vm_taddr + MAXTSIZ + MAXDSIZ);
+ }
+
+ shm_handle = shmseg->shm_internal;
+ vm_object_reference(shm_handle->shm_object);
+ rv = vm_map_find(&p->p_vmspace->vm_map, shm_handle->shm_object,
+ 0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0);
+ if (rv != KERN_SUCCESS) {
+ return ENOMEM;
+ }
+ vm_map_inherit(&p->p_vmspace->vm_map,
+ attach_va, attach_va + size, VM_INHERIT_SHARE);
+
+ shmmap_s->va = attach_va;
+ shmmap_s->shmid = uap->shmid;
+ shmseg->shm_lpid = p->p_pid;
+ shmseg->shm_atime = time.tv_sec;
+ shmseg->shm_nattch++;
+ *retval = attach_va;
+ return 0;
+}
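+
+/*
+ * Illustrative user-level sketch (not part of this change): a typical
+ * create/attach/detach sequence against the code above, assuming the
+ * standard <sys/shm.h> interface:
+ *
+ *	int id = shmget(IPC_PRIVATE, 4096, IPC_CREAT | 0600);
+ *	char *p = shmat(id, NULL, 0);	   NULL lets the kernel pick
+ *					   attach_va past the data segment
+ *	p[0] = 'x';
+ *	shmdt(p);			   last detach of a removed segment
+ *					   frees it in shm_delete_mapping()
+ */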
+
+struct oshmid_ds {
+ struct ipc_perm shm_perm; /* operation perms */
+ int shm_segsz; /* size of segment (bytes) */
+ ushort shm_cpid; /* pid, creator */
+ ushort shm_lpid; /* pid, last operation */
+ short shm_nattch; /* no. of current attaches */
+ time_t shm_atime; /* last attach time */
+ time_t shm_dtime; /* last detach time */
+ time_t shm_ctime; /* last change time */
+ void *shm_handle; /* internal handle for shm segment */
+};
+
+struct oshmctl_args {
+ int shmid;
+ int cmd;
+ struct oshmid_ds *ubuf;
+};
+
+static int
+oshmctl(p, uap, retval)
+ struct proc *p;
+ struct oshmctl_args *uap;
+ int *retval;
+{
+#ifdef COMPAT_43
+ int error;
+ struct ucred *cred = p->p_ucred;
+ struct shmid_ds *shmseg;
+ struct oshmid_ds outbuf;
+
+ shmseg = shm_find_segment_by_shmid(uap->shmid);
+ if (shmseg == NULL)
+ return EINVAL;
+ switch (uap->cmd) {
+ case IPC_STAT:
+ error = ipcperm(cred, &shmseg->shm_perm, IPC_R);
+ if (error)
+ return error;
+ outbuf.shm_perm = shmseg->shm_perm;
+ outbuf.shm_segsz = shmseg->shm_segsz;
+ outbuf.shm_cpid = shmseg->shm_cpid;
+ outbuf.shm_lpid = shmseg->shm_lpid;
+ outbuf.shm_nattch = shmseg->shm_nattch;
+ outbuf.shm_atime = shmseg->shm_atime;
+ outbuf.shm_dtime = shmseg->shm_dtime;
+ outbuf.shm_ctime = shmseg->shm_ctime;
+ outbuf.shm_handle = shmseg->shm_internal;
+ error = copyout((caddr_t)&outbuf, uap->ubuf, sizeof(outbuf));
+ if (error)
+ return error;
+ break;
+ default:
+ /* XXX casting to (sy_call_t *) is bogus, as usual. */
+ return ((sy_call_t *)shmctl)(p, uap, retval);
+ }
+ return 0;
+#else
+ return EINVAL;
+#endif
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmctl_args {
+ int shmid;
+ int cmd;
+ struct shmid_ds *buf;
+};
+#endif
+
+int
+shmctl(p, uap, retval)
+ struct proc *p;
+ struct shmctl_args *uap;
+ int *retval;
+{
+ int error;
+ struct ucred *cred = p->p_ucred;
+ struct shmid_ds inbuf;
+ struct shmid_ds *shmseg;
+
+ shmseg = shm_find_segment_by_shmid(uap->shmid);
+ if (shmseg == NULL)
+ return EINVAL;
+ switch (uap->cmd) {
+ case IPC_STAT:
+ error = ipcperm(cred, &shmseg->shm_perm, IPC_R);
+ if (error)
+ return error;
+ error = copyout((caddr_t)shmseg, uap->buf, sizeof(inbuf));
+ if (error)
+ return error;
+ break;
+ case IPC_SET:
+ error = ipcperm(cred, &shmseg->shm_perm, IPC_M);
+ if (error)
+ return error;
+ error = copyin(uap->buf, (caddr_t)&inbuf, sizeof(inbuf));
+ if (error)
+ return error;
+ shmseg->shm_perm.uid = inbuf.shm_perm.uid;
+ shmseg->shm_perm.gid = inbuf.shm_perm.gid;
+ shmseg->shm_perm.mode =
+ (shmseg->shm_perm.mode & ~ACCESSPERMS) |
+ (inbuf.shm_perm.mode & ACCESSPERMS);
+ shmseg->shm_ctime = time.tv_sec;
+ break;
+ case IPC_RMID:
+ error = ipcperm(cred, &shmseg->shm_perm, IPC_M);
+ if (error)
+ return error;
+ shmseg->shm_perm.key = IPC_PRIVATE;
+ shmseg->shm_perm.mode |= SHMSEG_REMOVED;
+ if (shmseg->shm_nattch <= 0) {
+ shm_deallocate_segment(shmseg);
+ shm_last_free = IPCID_TO_IX(uap->shmid);
+ }
+ break;
+#if 0
+ case SHM_LOCK:
+ case SHM_UNLOCK:
+#endif
+ default:
+ return EINVAL;
+ }
+ return 0;
+}
+
+#ifndef _SYS_SYSPROTO_H_
+struct shmget_args {
+ key_t key;
+ size_t size;
+ int shmflg;
+};
+#endif
+
+static int
+shmget_existing(p, uap, mode, segnum, retval)
+ struct proc *p;
+ struct shmget_args *uap;
+ int mode;
+ int segnum;
+ int *retval;
+{
+ struct shmid_ds *shmseg;
+ struct ucred *cred = p->p_ucred;
+ int error;
+
+ shmseg = &shmsegs[segnum];
+ if (shmseg->shm_perm.mode & SHMSEG_REMOVED) {
+ /*
+ * This segment is in the process of being allocated. Wait
+ * until it's done, and look the key up again (in case the
+ * allocation failed or it was freed).
+ */
+ shmseg->shm_perm.mode |= SHMSEG_WANTED;
+ error = tsleep((caddr_t)shmseg, PLOCK | PCATCH, "shmget", 0);
+ if (error)
+ return error;
+ return EAGAIN;
+ }
+ error = ipcperm(cred, &shmseg->shm_perm, mode);
+ if (error)
+ return error;
+ if (uap->size && uap->size > shmseg->shm_segsz)
+ return EINVAL;
+ if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL))
+ return EEXIST;
+ *retval = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm);
+ return 0;
+}
+
+static int
+shmget_allocate_segment(p, uap, mode, retval)
+ struct proc *p;
+ struct shmget_args *uap;
+ int mode;
+ int *retval;
+{
+ int i, segnum, shmid, size;
+ struct ucred *cred = p->p_ucred;
+ struct shmid_ds *shmseg;
+ struct shm_handle *shm_handle;
+
+ if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax)
+ return EINVAL;
+ if (shm_nused >= shminfo.shmmni) /* any shmids left? */
+ return ENOSPC;
+ size = round_page(uap->size);
+ if (shm_committed + btoc(size) > shminfo.shmall)
+ return ENOMEM;
+ if (shm_last_free < 0) {
+ for (i = 0; i < shminfo.shmmni; i++)
+ if (shmsegs[i].shm_perm.mode & SHMSEG_FREE)
+ break;
+ if (i == shminfo.shmmni)
+ panic("shmseg free count inconsistent");
+ segnum = i;
+ } else {
+ segnum = shm_last_free;
+ shm_last_free = -1;
+ }
+ shmseg = &shmsegs[segnum];
+ /*
+ * In case we sleep in malloc(), mark the segment present but deleted
+	 * so that no one else tries to create the same key.
+ */
+ shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED;
+ shmseg->shm_perm.key = uap->key;
+ shmseg->shm_perm.seq = (shmseg->shm_perm.seq + 1) & 0x7fff;
+ shm_handle = (struct shm_handle *)
+ malloc(sizeof(struct shm_handle), M_SHM, M_WAITOK);
+ shmid = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm);
+
+ /*
+ * We make sure that we have allocated a pager before we need
+ * to.
+ */
+ shm_handle->shm_object =
+ vm_pager_allocate(OBJT_SWAP, 0, OFF_TO_IDX(size),
+ VM_PROT_DEFAULT, 0);
+ shmseg->shm_internal = shm_handle;
+ shmseg->shm_perm.cuid = shmseg->shm_perm.uid = cred->cr_uid;
+ shmseg->shm_perm.cgid = shmseg->shm_perm.gid = cred->cr_gid;
+ shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) |
+ (mode & ACCESSPERMS) | SHMSEG_ALLOCATED;
+ shmseg->shm_segsz = uap->size;
+ shmseg->shm_cpid = p->p_pid;
+ shmseg->shm_lpid = shmseg->shm_nattch = 0;
+ shmseg->shm_atime = shmseg->shm_dtime = 0;
+ shmseg->shm_ctime = time.tv_sec;
+ shm_committed += btoc(size);
+ shm_nused++;
+ if (shmseg->shm_perm.mode & SHMSEG_WANTED) {
+ /*
+ * Somebody else wanted this key while we were asleep. Wake
+ * them up now.
+ */
+ shmseg->shm_perm.mode &= ~SHMSEG_WANTED;
+ wakeup((caddr_t)shmseg);
+ }
+ *retval = shmid;
+ return 0;
+}
+
+int
+shmget(p, uap, retval)
+ struct proc *p;
+ struct shmget_args *uap;
+ int *retval;
+{
+ int segnum, mode, error;
+
+ mode = uap->shmflg & ACCESSPERMS;
+ if (uap->key != IPC_PRIVATE) {
+ again:
+ segnum = shm_find_segment_by_key(uap->key);
+ if (segnum >= 0) {
+ error = shmget_existing(p, uap, mode, segnum, retval);
+ if (error == EAGAIN)
+ goto again;
+ return error;
+ }
+ if ((uap->shmflg & IPC_CREAT) == 0)
+ return ENOENT;
+ }
+ return shmget_allocate_segment(p, uap, mode, retval);
+}
+
+int
+shmsys(p, uap, retval)
+ struct proc *p;
+ /* XXX actually varargs. */
+ struct shmsys_args /* {
+ u_int which;
+ int a2;
+ int a3;
+ int a4;
+ } */ *uap;
+ int *retval;
+{
+
+ if (uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0]))
+ return EINVAL;
+ return ((*shmcalls[uap->which])(p, &uap->a2, retval));
+}
+
+void
+shmfork(p1, p2)
+ struct proc *p1, *p2;
+{
+ struct shmmap_state *shmmap_s;
+ size_t size;
+ int i;
+
+ size = shminfo.shmseg * sizeof(struct shmmap_state);
+ shmmap_s = malloc(size, M_SHM, M_WAITOK);
+ bcopy((caddr_t)p1->p_vmspace->vm_shm, (caddr_t)shmmap_s, size);
+ p2->p_vmspace->vm_shm = (caddr_t)shmmap_s;
+ for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
+ if (shmmap_s->shmid != -1)
+ shmsegs[IPCID_TO_IX(shmmap_s->shmid)].shm_nattch++;
+}
+
+void
+shmexit(p)
+ struct proc *p;
+{
+ struct shmmap_state *shmmap_s;
+ int i;
+
+ shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm;
+ for (i = 0; i < shminfo.shmseg; i++, shmmap_s++)
+ if (shmmap_s->shmid != -1)
+ shm_delete_mapping(p, shmmap_s);
+ free((caddr_t)p->p_vmspace->vm_shm, M_SHM);
+ p->p_vmspace->vm_shm = NULL;
+}
+
+void
+shminit(dummy)
+ void *dummy;
+{
+ int i;
+ for (i = 0; i < shminfo.shmmni; i++) {
+ shmsegs[i].shm_perm.mode = SHMSEG_FREE;
+ shmsegs[i].shm_perm.seq = 0;
+ }
+ shm_last_free = 0;
+ shm_nused = 0;
+ shm_committed = 0;
+}
diff --git a/sys/kern/tty.c b/sys/kern/tty.c
index 5d698b1..f6e14f9 100644
--- a/sys/kern/tty.c
+++ b/sys/kern/tty.c
@@ -35,39 +35,82 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)tty.c 8.13 (Berkeley) 1/9/95
+ * @(#)tty.c 8.8 (Berkeley) 1/21/94
+ * $Id: tty.c,v 1.93 1997/03/23 03:36:26 bde Exp $
*/
+/*-
+ * TODO:
+ * o Fix races for sending the start char in ttyflush().
+ * o Handle inter-byte timeout for "MIN > 0, TIME > 0" in ttyselect().
+ *	  With luck, there will be MIN chars before select() returns.
+ * o Handle CLOCAL consistently for ptys. Perhaps disallow setting it.
+ * o Don't allow input in TS_ZOMBIE case. It would be visible through
+ * FIONREAD.
+ * o Do the new sio locking stuff here and use it to avoid special
+ * case for EXTPROC?
+ * o Lock PENDIN too?
+ * o Move EXTPROC and/or PENDIN to t_state?
+ * o Wrap most of ttioctl in spltty/splx.
+ * o Implement TIOCNOTTY or remove it from <sys/ioctl.h>.
+ * o Send STOP if IXOFF is toggled off while TS_TBLOCK is set.
+ * o Don't allow certain termios flags to affect disciplines other
+ * than TTYDISC. Cancel their effects before switch disciplines
+ *	  than TTYDISC.  Cancel their effects before switching disciplines
+ * discipline.
+ * o Handle c_ispeed = 0 to c_ispeed = c_ospeed conversion here instead
+ * of in drivers and fix drivers that write to tp->t_termios.
+ * o Check for TS_CARR_ON being set while everything is closed and not
+ * waiting for carrier. TS_CARR_ON isn't cleared if nothing is open,
+ * so it would live until the next open even if carrier drops.
+ * o Restore TS_WOPEN since it is useful in pstat. It must be cleared
+ * only when _all_ openers leave open().
+ */
+
+#include "snp.h"
+#include "opt_uconsole.h"
+
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/ioctl.h>
+#include <sys/filio.h>
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#include <sys/ioctl_compat.h>
+#endif
#include <sys/proc.h>
#define TTYDEFCHARS
#include <sys/tty.h>
#undef TTYDEFCHARS
-#include <sys/file.h>
+#include <sys/fcntl.h>
#include <sys/conf.h>
#include <sys/dkstat.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
#include <sys/syslog.h>
+#include <sys/signalvar.h>
+#include <sys/resourcevar.h>
+#include <sys/malloc.h>
+#if NSNP > 0
+#include <sys/snoop.h>
+#endif
#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <sys/lock.h>
+#include <vm/pmap.h>
+#include <vm/vm_map.h>
static int proc_compare __P((struct proc *p1, struct proc *p2));
-static int ttnread __P((struct tty *));
-static void ttyblock __P((struct tty *tp));
-static void ttyecho __P((int, struct tty *tp));
-static void ttyrubo __P((struct tty *, int));
-
-/* Symbolic sleep message strings. */
-char ttclos[] = "ttycls";
-char ttopen[] = "ttyopn";
-char ttybg[] = "ttybg";
-char ttybuf[] = "ttybuf";
-char ttyin[] = "ttyin";
-char ttyout[] = "ttyout";
+static int ttnread __P((struct tty *tp));
+static void ttyecho __P((int c, struct tty *tp));
+static int ttyoutput __P((int c, register struct tty *tp));
+static void ttypend __P((struct tty *tp));
+static void ttyretype __P((struct tty *tp));
+static void ttyrub __P((int c, struct tty *tp));
+static void ttyrubo __P((struct tty *tp, int cnt));
+static void ttyunblock __P((struct tty *tp));
+static int ttywflush __P((struct tty *tp));
/*
* Table with character classes and parity. The 8th bit indicates parity,
@@ -95,7 +138,7 @@ char ttyout[] = "ttyout";
#define TB TAB
#define VT VTAB
-char const char_type[] = {
+static u_char const char_type[] = {
E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */
O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */
O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */
@@ -148,6 +191,17 @@ char const char_type[] = {
#define ISSET(t, f) ((t) & (f))
/*
+ * Input control starts when we would not be able to fit the maximum
+ * contents of the ping-pong buffers and finishes when we would be able
+ * to fit that much plus 1/8 more.
+ */
+#define I_HIGH_WATER (TTYHOG - 2 * 256) /* XXX */
+#define I_LOW_WATER ((TTYHOG - 2 * 256) * 7 / 8) /* XXX */
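+
+/*
+ * For example, with the traditional TTYHOG of 1024 (an assumption; see
+ * <sys/tty.h>), flow control starts at 512 queued chars and is released
+ * again once the queues drain to 448 (512 * 7 / 8).
+ */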
+
+#undef MAX_INPUT /* XXX wrong in <sys/syslimits.h> */
+#define MAX_INPUT TTYHOG
+
+/*
* Initial open of tty, or (re)entry to standard tty line discipline.
*/
int
@@ -161,9 +215,20 @@ ttyopen(device, tp)
tp->t_dev = device;
if (!ISSET(tp->t_state, TS_ISOPEN)) {
SET(tp->t_state, TS_ISOPEN);
+ if (ISSET(tp->t_cflag, CLOCAL))
+ SET(tp->t_state, TS_CONNECTED);
bzero(&tp->t_winsize, sizeof(tp->t_winsize));
}
- CLR(tp->t_state, TS_WOPEN);
+
+ /*
+ * Initialize or restore a cblock allocation policy suitable for
+ * the standard line discipline.
+ */
+ clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512);
+ clist_alloc_cblocks(&tp->t_outq, TTMAXHIWAT + OBUFSIZ + 100,
+ TTMAXHIWAT + OBUFSIZ + 100);
+ clist_alloc_cblocks(&tp->t_rawq, TTYHOG, TTYHOG);
+
splx(s);
return (0);
}
@@ -172,22 +237,36 @@ ttyopen(device, tp)
* Handle close() on a tty line: flush and set to initial state,
* bumping generation number so that pending read/write calls
* can detect recycling of the tty.
+ * XXX our caller should have done `spltty(); l_close(); ttyclose();'
+ * and l_close() should have flushed, but we repeat the spltty() and
+ * the flush in case there are buggy callers.
*/
int
ttyclose(tp)
register struct tty *tp;
{
- extern struct tty *constty; /* Temporary virtual console. */
+ int s;
+ s = spltty();
if (constty == tp)
constty = NULL;
ttyflush(tp, FREAD | FWRITE);
+ clist_free_cblocks(&tp->t_canq);
+ clist_free_cblocks(&tp->t_outq);
+ clist_free_cblocks(&tp->t_rawq);
+
+#if NSNP > 0
+ if (ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL)
+ snpdown((struct snoop *)tp->t_sc);
+#endif
tp->t_gen++;
+ tp->t_line = TTYDISC;
tp->t_pgrp = NULL;
tp->t_session = NULL;
tp->t_state = 0;
+ splx(s);
return (0);
}
@@ -197,10 +276,10 @@ ttyclose(tp)
}
/* Is 'c' a line delimiter ("break" character)? */
-#define TTBREAKC(c) \
- ((c) == '\n' || ((c) == cc[VEOF] || \
- (c) == cc[VEOL] || (c) == cc[VEOL2]) && (c) != _POSIX_VDISABLE)
-
+#define TTBREAKC(c, lflag) \
+ ((c) == '\n' || (((c) == cc[VEOF] || \
+ (c) == cc[VEOL] || ((c) == cc[VEOL2] && lflag & IEXTEN)) && \
+ (c) != _POSIX_VDISABLE))
/*
* Process input of a single character received on a tty.
@@ -210,8 +289,8 @@ ttyinput(c, tp)
register int c;
register struct tty *tp;
{
- register int iflag, lflag;
- register u_char *cc;
+ register tcflag_t iflag, lflag;
+ register cc_t *cc;
int i, err;
/*
@@ -232,26 +311,44 @@ ttyinput(c, tp)
}
++tk_nin;
+ /*
+ * Block further input iff:
+ * current input > threshold AND input is available to user program
+ * AND input flow control is enabled and not yet invoked.
+ * The 3 is slop for PARMRK.
+ */
+ iflag = tp->t_iflag;
+ if (tp->t_rawq.c_cc + tp->t_canq.c_cc > I_HIGH_WATER - 3 &&
+ (!ISSET(lflag, ICANON) || tp->t_canq.c_cc != 0) &&
+ (ISSET(tp->t_cflag, CRTS_IFLOW) || ISSET(iflag, IXOFF)) &&
+ !ISSET(tp->t_state, TS_TBLOCK))
+ ttyblock(tp);
+
/* Handle exceptional conditions (break, parity, framing). */
cc = tp->t_cc;
- iflag = tp->t_iflag;
- if (err = (ISSET(c, TTY_ERRORMASK))) {
+ err = (ISSET(c, TTY_ERRORMASK));
+ if (err) {
CLR(c, TTY_ERRORMASK);
- if (ISSET(err, TTY_FE) && !c) { /* Break. */
+ if (ISSET(err, TTY_BI)) {
if (ISSET(iflag, IGNBRK))
+ return (0);
+ if (ISSET(iflag, BRKINT)) {
+ ttyflush(tp, FREAD | FWRITE);
+ pgsignal(tp->t_pgrp, SIGINT, 1);
goto endcase;
- else if (ISSET(iflag, BRKINT) &&
- ISSET(lflag, ISIG) &&
- (cc[VINTR] != _POSIX_VDISABLE))
- c = cc[VINTR];
- else if (ISSET(iflag, PARMRK))
+ }
+ if (ISSET(iflag, PARMRK))
goto parmrk;
- } else if (ISSET(err, TTY_PE) &&
- ISSET(iflag, INPCK) || ISSET(err, TTY_FE)) {
+ } else if ((ISSET(err, TTY_PE) && ISSET(iflag, INPCK))
+ || ISSET(err, TTY_FE)) {
if (ISSET(iflag, IGNPAR))
- goto endcase;
+ return (0);
else if (ISSET(iflag, PARMRK)) {
-parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
+parmrk:
+ if (tp->t_rawq.c_cc + tp->t_canq.c_cc >
+ MAX_INPUT - 3)
+ goto input_overflow;
+ (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
(void)putc(0 | TTY_QUOTE, &tp->t_rawq);
(void)putc(c | TTY_QUOTE, &tp->t_rawq);
goto endcase;
@@ -259,11 +356,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
c = 0;
}
}
- /*
- * In tandem mode, check high water mark.
- */
- if (ISSET(iflag, IXOFF))
- ttyblock(tp);
+
if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP))
CLR(c, 0x80);
if (!ISSET(lflag, EXTPROC)) {
@@ -341,7 +434,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
#ifdef sun4c /* XXX */
(*tp->t_stop)(tp, 0);
#else
- (*cdevsw[major(tp->t_dev)].d_stop)(tp,
+ (*cdevsw[major(tp->t_dev)]->d_stop)(tp,
0);
#endif
return (0);
@@ -361,7 +454,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
*/
if (c == '\r') {
if (ISSET(iflag, IGNCR))
- goto endcase;
+ return (0);
else if (ISSET(iflag, ICRNL))
c = '\n';
} else if (c == '\n' && ISSET(iflag, INLCR))
@@ -403,8 +496,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
/*
* word erase (^W)
*/
- if (CCEQ(cc[VWERASE], c)) {
- int alt = ISSET(lflag, ALTWERASE);
+ if (CCEQ(cc[VWERASE], c) && ISSET(lflag, IEXTEN)) {
int ctype;
/*
@@ -436,21 +528,21 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
if (c == -1)
goto endcase;
} while (c != ' ' && c != '\t' &&
- (alt == 0 || ISALPHA(c) == ctype));
+ (!ISSET(lflag, ALTWERASE) || ISALPHA(c) == ctype));
(void)putc(c, &tp->t_rawq);
goto endcase;
}
/*
* reprint line (^R)
*/
- if (CCEQ(cc[VREPRINT], c)) {
+ if (CCEQ(cc[VREPRINT], c) && ISSET(lflag, IEXTEN)) {
ttyretype(tp);
goto endcase;
}
/*
* ^T - kernel info and generate SIGINFO
*/
- if (CCEQ(cc[VSTATUS], c)) {
+ if (CCEQ(cc[VSTATUS], c) && ISSET(lflag, IEXTEN)) {
if (ISSET(lflag, ISIG))
pgsignal(tp->t_pgrp, SIGINFO, 1);
if (!ISSET(lflag, NOKERNINFO))
@@ -461,14 +553,19 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
/*
* Check for input buffer overflow
*/
- if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= TTYHOG) {
+ if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= MAX_INPUT) {
+input_overflow:
if (ISSET(iflag, IMAXBEL)) {
if (tp->t_outq.c_cc < tp->t_hiwat)
(void)ttyoutput(CTRL('g'), tp);
- } else
- ttyflush(tp, FREAD | FWRITE);
+ }
goto endcase;
}
+
+ if ( c == 0377 && ISSET(iflag, PARMRK) && !ISSET(iflag, ISTRIP)
+ && ISSET(iflag, IGNBRK|IGNPAR) != (IGNBRK|IGNPAR))
+ (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
+
/*
* Put data char in q for user and
* wakeup on seeing a line delimiter.
@@ -479,7 +576,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
ttyecho(c, tp);
goto endcase;
}
- if (TTBREAKC(c)) {
+ if (TTBREAKC(c, lflag)) {
tp->t_rocount = 0;
catq(&tp->t_rawq, &tp->t_canq);
ttwakeup(tp);
@@ -498,7 +595,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq);
/*
* Place the cursor over the '^' of the ^D.
*/
- i = min(2, tp->t_column - i);
+ i = imin(2, tp->t_column - i);
while (i > 0) {
(void)ttyoutput('\b', tp);
i--;
@@ -525,13 +622,13 @@ startoutput:
* Returns < 0 if succeeds, otherwise returns char to resend.
* Must be recursive.
*/
-int
+static int
ttyoutput(c, tp)
register int c;
register struct tty *tp;
{
- register long oflag;
- register int notout, col, s;
+ register tcflag_t oflag;
+ register int col, s;
oflag = tp->t_oflag;
if (!ISSET(oflag, OPOST)) {
@@ -553,18 +650,15 @@ ttyoutput(c, tp)
if (c == '\t' &&
ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) {
c = 8 - (tp->t_column & 7);
- if (ISSET(tp->t_lflag, FLUSHO)) {
- notout = 0;
- } else {
+ if (!ISSET(tp->t_lflag, FLUSHO)) {
s = spltty(); /* Don't interrupt tabs. */
- notout = b_to_q(" ", c, &tp->t_outq);
- c -= notout;
+ c -= b_to_q(" ", c, &tp->t_outq);
tk_nout += c;
tp->t_outcc += c;
splx(s);
}
tp->t_column += c;
- return (notout ? '\t' : -1);
+ return (c ? -1 : '\t');
}
if (c == CEOT && ISSET(oflag, ONOEOT))
return (-1);
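
The `8 - (tp->t_column & 7)` expression in the OXTABS branch above is just the distance to the next 8-column tab stop. A trivial standalone check of that arithmetic:

#include <stdio.h>

/* Spaces needed to reach the next 8-column tab stop, as in ttyoutput(). */
static int
tab_fill(int column)
{
	return (8 - (column & 7));
}

int
main(void)
{
	/* Prints "8 3 1". */
	printf("%d %d %d\n", tab_fill(0), tab_fill(5), tab_fill(7));
	return (0);
}
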
@@ -616,12 +710,9 @@ ttyoutput(c, tp)
int
ttioctl(tp, cmd, data, flag)
register struct tty *tp;
- u_long cmd;
+ int cmd, flag;
void *data;
- int flag;
{
- extern struct tty *constty; /* Temporary virtual console. */
- extern int nlinesw;
register struct proc *p;
int s, error;
@@ -637,6 +728,7 @@ ttioctl(tp, cmd, data, flag)
#ifdef notdef
case TIOCSPGRP:
#endif
+ case TIOCSTAT:
case TIOCSTI:
case TIOCSWINSZ:
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
@@ -649,13 +741,16 @@ ttioctl(tp, cmd, data, flag)
case TIOCSETP:
case TIOCSLTC:
#endif
- while (isbackground(curproc, tp) &&
- p->p_pgrp->pg_jobc && (p->p_flag & P_PPWAIT) == 0 &&
+ while (isbackground(p, tp) &&
+ (p->p_flag & P_PPWAIT) == 0 &&
(p->p_sigignore & sigmask(SIGTTOU)) == 0 &&
(p->p_sigmask & sigmask(SIGTTOU)) == 0) {
+ if (p->p_pgrp->pg_jobc == 0)
+ return (EIO);
pgsignal(p->p_pgrp, SIGTTOU, 1);
- if (error = ttysleep(tp,
- &lbolt, TTOPRI | PCATCH, ttybg, 0))
+ error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, "ttybg1",
+ 0);
+ if (error)
return (error);
}
break;
@@ -673,7 +768,9 @@ ttioctl(tp, cmd, data, flag)
case FIONBIO: /* set/clear non-blocking i/o */
break; /* XXX: delete. */
case FIONREAD: /* get # bytes to read */
+ s = spltty();
*(int *)data = ttnread(tp);
+ splx(s);
break;
case TIOCEXCL: /* set exclusive use of tty */
s = spltty();
@@ -693,8 +790,7 @@ ttioctl(tp, cmd, data, flag)
case TIOCCONS: /* become virtual console */
if (*(int *)data) {
if (constty && constty != tp &&
- ISSET(constty->t_state, TS_CARR_ON | TS_ISOPEN) ==
- (TS_CARR_ON | TS_ISOPEN))
+ ISSET(constty->t_state, TS_CONNECTED))
return (EBUSY);
#ifndef UCONSOLE
if (error = suser(p->p_ucred, &p->p_acflag))
@@ -705,7 +801,8 @@ ttioctl(tp, cmd, data, flag)
constty = NULL;
break;
case TIOCDRAIN: /* wait till output drained */
- if (error = ttywait(tp))
+ error = ttywait(tp);
+ if (error)
return (error);
break;
case TIOCGETA: { /* get termios struct */
@@ -745,9 +842,12 @@ ttioctl(tp, cmd, data, flag)
case TIOCSETAF: { /* drn out, fls in, set */
register struct termios *t = (struct termios *)data;
+ if (t->c_ispeed < 0 || t->c_ospeed < 0)
+ return (EINVAL);
s = spltty();
if (cmd == TIOCSETAW || cmd == TIOCSETAF) {
- if (error = ttywait(tp)) {
+ error = ttywait(tp);
+ if (error) {
splx(s);
return (error);
}
@@ -761,35 +861,56 @@ ttioctl(tp, cmd, data, flag)
if (tp->t_param && (error = (*tp->t_param)(tp, t))) {
splx(s);
return (error);
- } else {
- if (!ISSET(tp->t_state, TS_CARR_ON) &&
- ISSET(tp->t_cflag, CLOCAL) &&
- !ISSET(t->c_cflag, CLOCAL)) {
- CLR(tp->t_state, TS_ISOPEN);
- SET(tp->t_state, TS_WOPEN);
- ttwakeup(tp);
- }
- tp->t_cflag = t->c_cflag;
- tp->t_ispeed = t->c_ispeed;
- tp->t_ospeed = t->c_ospeed;
}
+ if (ISSET(t->c_cflag, CLOCAL) &&
+ !ISSET(tp->t_cflag, CLOCAL)) {
+ /*
+ * XXX disconnections would be too hard to
+ * get rid of without this kludge. The only
+ * way to get rid of controlling terminals
+ * is to exit from the session leader.
+ */
+ CLR(tp->t_state, TS_ZOMBIE);
+
+ wakeup(TSA_CARR_ON(tp));
+ ttwakeup(tp);
+ ttwwakeup(tp);
+ }
+ if ((ISSET(tp->t_state, TS_CARR_ON) ||
+ ISSET(t->c_cflag, CLOCAL)) &&
+ !ISSET(tp->t_state, TS_ZOMBIE))
+ SET(tp->t_state, TS_CONNECTED);
+ else
+ CLR(tp->t_state, TS_CONNECTED);
+ tp->t_cflag = t->c_cflag;
+ tp->t_ispeed = t->c_ispeed;
+ tp->t_ospeed = t->c_ospeed;
ttsetwater(tp);
}
- if (cmd != TIOCSETAF) {
- if (ISSET(t->c_lflag, ICANON) !=
- ISSET(tp->t_lflag, ICANON))
- if (ISSET(t->c_lflag, ICANON)) {
- SET(tp->t_lflag, PENDIN);
- ttwakeup(tp);
- } else {
- struct clist tq;
-
+ if (ISSET(t->c_lflag, ICANON) != ISSET(tp->t_lflag, ICANON) &&
+ cmd != TIOCSETAF) {
+ if (ISSET(t->c_lflag, ICANON))
+ SET(tp->t_lflag, PENDIN);
+ else {
+ /*
+ * XXX we really shouldn't allow toggling
+ * ICANON while we're in a non-termios line
+ * discipline. Now we have to worry about
+ * panicking for a null queue.
+ */
+ if (tp->t_canq.c_cbreserved > 0 &&
+ tp->t_rawq.c_cbreserved > 0) {
catq(&tp->t_rawq, &tp->t_canq);
- tq = tp->t_rawq;
- tp->t_rawq = tp->t_canq;
- tp->t_canq = tq;
- CLR(tp->t_lflag, PENDIN);
+ /*
+ * XXX the queue limits may be
+ * different, so the old queue
+ * swapping method no longer works.
+ */
+ catq(&tp->t_canq, &tp->t_rawq);
}
+ CLR(tp->t_lflag, PENDIN);
+ }
+ ttwakeup(tp);
}
tp->t_iflag = t->c_iflag;
tp->t_oflag = t->c_oflag;
@@ -801,6 +922,9 @@ ttioctl(tp, cmd, data, flag)
else
CLR(t->c_lflag, EXTPROC);
tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN);
+ if (t->c_cc[VMIN] != tp->t_cc[VMIN] ||
+ t->c_cc[VTIME] != tp->t_cc[VTIME])
+ ttwakeup(tp);
bcopy(t->c_cc, tp->t_cc, sizeof(t->c_cc));
splx(s);
break;
@@ -840,7 +964,9 @@ ttioctl(tp, cmd, data, flag)
return (EPERM);
if (p->p_ucred->cr_uid && !isctty(p, tp))
return (EACCES);
+ s = spltty();
(*linesw[tp->t_line].l_rint)(*(u_char *)data, tp);
+ splx(s);
break;
case TIOCSTOP: /* stop output, like ^S */
s = spltty();
@@ -849,7 +975,7 @@ ttioctl(tp, cmd, data, flag)
#ifdef sun4c /* XXX */
(*tp->t_stop)(tp, 0);
#else
- (*cdevsw[major(tp->t_dev)].d_stop)(tp, 0);
+ (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0);
#endif
}
splx(s);
@@ -857,8 +983,8 @@ ttioctl(tp, cmd, data, flag)
case TIOCSCTTY: /* become controlling tty */
/* Session ctty vnode pointer set in vnode layer. */
if (!SESS_LEADER(p) ||
- (p->p_session->s_ttyvp || tp->t_session) &&
- (tp->t_session != p->p_session))
+ ((p->p_session->s_ttyvp || tp->t_session) &&
+ (tp->t_session != p->p_session)))
return (EPERM);
tp->t_session = p->p_session;
tp->t_pgrp = p->p_pgrp;
@@ -875,6 +1001,11 @@ ttioctl(tp, cmd, data, flag)
tp->t_pgrp = pgrp;
break;
}
+ case TIOCSTAT: /* simulate control-T */
+ s = spltty();
+ ttyinfo(tp);
+ splx(s);
+ break;
case TIOCSWINSZ: /* set window size */
if (bcmp((caddr_t)&tp->t_winsize, data,
sizeof (struct winsize))) {
@@ -882,6 +1013,17 @@ ttioctl(tp, cmd, data, flag)
pgsignal(tp->t_pgrp, SIGWINCH, 1);
}
break;
+ case TIOCSDRAINWAIT:
+ error = suser(p->p_ucred, &p->p_acflag);
+ if (error)
+ return (error);
+ tp->t_timeout = *(int *)data * hz;
+ wakeup(TSA_OCOMPLETE(tp));
+ wakeup(TSA_OLOWAT(tp));
+ break;
+ case TIOCGDRAINWAIT:
+ *(int *)data = tp->t_timeout / hz;
+ break;
default:
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
return (ttcompat(tp, cmd, data, flag));
@@ -893,27 +1035,27 @@ ttioctl(tp, cmd, data, flag)
}
int
-ttselect(device, rw, p)
- dev_t device;
+ttyselect(tp, rw, p)
+ struct tty *tp;
int rw;
struct proc *p;
{
- register struct tty *tp;
- int nread, s;
+ int s;
- tp = &cdevsw[major(device)].d_ttys[minor(device)];
+ if (tp == NULL)
+ return (ENXIO);
s = spltty();
switch (rw) {
case FREAD:
- nread = ttnread(tp);
- if (nread > 0 || !ISSET(tp->t_cflag, CLOCAL) &&
- !ISSET(tp->t_state, TS_CARR_ON))
+ if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE))
goto win;
selrecord(p, &tp->t_rsel);
break;
case FWRITE:
- if (tp->t_outq.c_cc <= tp->t_lowat) {
+ if ((tp->t_outq.c_cc <= tp->t_lowat &&
+ ISSET(tp->t_state, TS_CONNECTED))
+ || ISSET(tp->t_state, TS_ZOMBIE)) {
win: splx(s);
return (1);
}
@@ -924,6 +1066,22 @@ win: splx(s);
return (0);
}
+/*
+ * This is a wrapper for compatibility with the select vector used by
+ * cdevsw. It relies on a proper xxxdevtotty routine.
+ */
+int
+ttselect(dev, rw, p)
+ dev_t dev;
+ int rw;
+ struct proc *p;
+{
+ return ttyselect((*cdevsw[major(dev)]->d_devtotty)(dev), rw, p);
+}
+
+/*
+ * Must be called at spltty().
+ */
static int
ttnread(tp)
struct tty *tp;
@@ -933,8 +1091,11 @@ ttnread(tp)
if (ISSET(tp->t_lflag, PENDIN))
ttypend(tp);
nread = tp->t_canq.c_cc;
- if (!ISSET(tp->t_lflag, ICANON))
+ if (!ISSET(tp->t_lflag, ICANON)) {
nread += tp->t_rawq.c_cc;
+ if (nread < tp->t_cc[VMIN] && tp->t_cc[VTIME] == 0)
+ nread = 0;
+ }
return (nread);
}
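
The spltty() now wrapped around ttnread() in the FIONREAD case matches this "must be called at spltty()" rule. From user space the same pending-input count is visible through the standard FIONREAD ioctl; a minimal sketch (on BSD systems the define may come from <sys/filio.h>):

#include <sys/ioctl.h>
#include <stdio.h>
#include <unistd.h>

/* Print how many bytes a read() on stdin would find right now. */
int
main(void)
{
	int n;

	if (ioctl(STDIN_FILENO, FIONREAD, &n) == -1) {
		perror("ioctl(FIONREAD)");
		return (1);
	}
	printf("%d byte(s) pending\n", n);
	return (0);
}
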
@@ -950,14 +1111,24 @@ ttywait(tp)
error = 0;
s = spltty();
while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) &&
- (ISSET(tp->t_state, TS_CARR_ON) || ISSET(tp->t_cflag, CLOCAL))
- && tp->t_oproc) {
+ ISSET(tp->t_state, TS_CONNECTED) && tp->t_oproc) {
(*tp->t_oproc)(tp);
- SET(tp->t_state, TS_ASLEEP);
- if (error = ttysleep(tp,
- &tp->t_outq, TTOPRI | PCATCH, ttyout, 0))
+ if ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) &&
+ ISSET(tp->t_state, TS_CONNECTED)) {
+ SET(tp->t_state, TS_SO_OCOMPLETE);
+ error = ttysleep(tp, TSA_OCOMPLETE(tp),
+ TTOPRI | PCATCH, "ttywai",
+ tp->t_timeout);
+ if (error) {
+ if (error == EWOULDBLOCK)
+ error = EIO;
+ break;
+ }
+ } else
break;
}
+ if (!error && (tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)))
+ error = EIO;
splx(s);
return (error);
}
@@ -965,7 +1136,7 @@ ttywait(tp)
/*
 * Flush if the wait succeeds.
*/
-int
+static int
ttywflush(tp)
struct tty *tp;
{
@@ -987,24 +1158,66 @@ ttyflush(tp, rw)
register int s;
s = spltty();
+#if 0
+again:
+#endif
+ if (rw & FWRITE) {
+ FLUSHQ(&tp->t_outq);
+ CLR(tp->t_state, TS_TTSTOP);
+ }
+#ifdef sun4c /* XXX */
+ (*tp->t_stop)(tp, rw);
+#else
+ (*cdevsw[major(tp->t_dev)]->d_stop)(tp, rw);
+#endif
if (rw & FREAD) {
FLUSHQ(&tp->t_canq);
FLUSHQ(&tp->t_rawq);
+ CLR(tp->t_lflag, PENDIN);
tp->t_rocount = 0;
tp->t_rocol = 0;
CLR(tp->t_state, TS_LOCAL);
ttwakeup(tp);
+ if (ISSET(tp->t_state, TS_TBLOCK)) {
+ if (rw & FWRITE)
+ FLUSHQ(&tp->t_outq);
+ ttyunblock(tp);
+
+ /*
+ * Don't leave any state that might clobber the
+ * next line discipline (although we should do more
+ * to send the START char). Not clearing the state
+ * may have caused the "putc to a clist with no
+ * reserved cblocks" panic/printf.
+ */
+ CLR(tp->t_state, TS_TBLOCK);
+
+#if 0 /* forget it, sleeping isn't always safe and we don't know when it is */
+ if (ISSET(tp->t_iflag, IXOFF)) {
+ /*
+ * XXX wait a bit in the hope that the stop
+ * character (if any) will go out. Waiting
+ * isn't good since it allows races. This
+ * will be fixed when the stop character is
+ * put in a special queue. Don't bother with
+ * the checks in ttywait() since the timeout
+ * will save us.
+ */
+ SET(tp->t_state, TS_SO_OCOMPLETE);
+ ttysleep(tp, TSA_OCOMPLETE(tp), TTOPRI,
+ "ttyfls", hz / 10);
+ /*
+ * Don't try sending the stop character again.
+ */
+ CLR(tp->t_state, TS_TBLOCK);
+ goto again;
+ }
+#endif
+ }
}
if (rw & FWRITE) {
- CLR(tp->t_state, TS_TTSTOP);
-#ifdef sun4c /* XXX */
- (*tp->t_stop)(tp, rw);
-#else
- (*cdevsw[major(tp->t_dev)].d_stop)(tp, rw);
-#endif
FLUSHQ(&tp->t_outq);
- wakeup((caddr_t)&tp->t_outq);
- selwakeup(&tp->t_wsel);
+ ttwwakeup(tp);
}
splx(s);
}
@@ -1013,42 +1226,63 @@ ttyflush(tp, rw)
* Copy in the default termios characters.
*/
void
+termioschars(t)
+ struct termios *t;
+{
+
+ bcopy(ttydefchars, t->c_cc, sizeof t->c_cc);
+}
+
+/*
+ * Old interface.
+ */
+void
ttychars(tp)
struct tty *tp;
{
- bcopy(ttydefchars, tp->t_cc, sizeof(ttydefchars));
+ termioschars(&tp->t_termios);
}
/*
- * Send stop character on input overflow.
+ * Handle input high water. Send stop character for the IXOFF case. Turn
+ * on our input flow control bit and propagate the changes to the driver.
+ * XXX the stop character should be put in a special high priority queue.
*/
-static void
+void
ttyblock(tp)
- register struct tty *tp;
+ struct tty *tp;
{
- register int total;
- total = tp->t_rawq.c_cc + tp->t_canq.c_cc;
- if (tp->t_rawq.c_cc > TTYHOG) {
- ttyflush(tp, FREAD | FWRITE);
- CLR(tp->t_state, TS_TBLOCK);
- }
- /*
- * Block further input iff: current input > threshold
- * AND input is available to user program.
- */
- if (total >= TTYHOG / 2 &&
- !ISSET(tp->t_state, TS_TBLOCK) &&
- !ISSET(tp->t_lflag, ICANON) || tp->t_canq.c_cc > 0 &&
- tp->t_cc[VSTOP] != _POSIX_VDISABLE) {
- if (putc(tp->t_cc[VSTOP], &tp->t_outq) == 0) {
- SET(tp->t_state, TS_TBLOCK);
- ttstart(tp);
- }
- }
+ SET(tp->t_state, TS_TBLOCK);
+ if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTOP] != _POSIX_VDISABLE &&
+ putc(tp->t_cc[VSTOP], &tp->t_outq) != 0)
+ CLR(tp->t_state, TS_TBLOCK); /* try again later */
+ ttstart(tp);
+}
+
+/*
+ * Handle input low water. Send start character for the IXOFF case. Turn
+ * off our input flow control bit and propagate the changes to the driver.
+ * XXX the start character should be put in a special high priority queue.
+ */
+static void
+ttyunblock(tp)
+ struct tty *tp;
+{
+
+ CLR(tp->t_state, TS_TBLOCK);
+ if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTART] != _POSIX_VDISABLE &&
+ putc(tp->t_cc[VSTART], &tp->t_outq) != 0)
+ SET(tp->t_state, TS_TBLOCK); /* try again later */
+ ttstart(tp);
}
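
ttyblock()/ttyunblock() above form a classic two-watermark hysteresis: block once input crosses the high mark, release only after it falls to the low mark. A hypothetical standalone model of the policy (HIGH_WATER and LOW_WATER are made-up values standing in for I_HIGH_WATER/I_LOW_WATER):

#include <stdio.h>

#define HIGH_WATER	768
#define LOW_WATER	256

static int blocked;

static void
input_level_changed(int queued)
{
	if (!blocked && queued > HIGH_WATER - 3) {	/* 3 is PARMRK slop */
		blocked = 1;
		printf("send VSTOP, set TS_TBLOCK\n");
	} else if (blocked && queued <= LOW_WATER) {
		blocked = 0;
		printf("send VSTART, clear TS_TBLOCK\n");
	}
}

int
main(void)
{
	input_level_changed(800);	/* crosses high water: block */
	input_level_changed(500);	/* between marks: no change */
	input_level_changed(100);	/* reaches low water: unblock */
	return (0);
}
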
+#ifdef notyet
+/* Not used by any current (i386) drivers. */
+/*
+ * Restart after an inter-char delay.
+ */
void
ttrstrt(tp_arg)
void *tp_arg;
@@ -1068,6 +1302,7 @@ ttrstrt(tp_arg)
splx(s);
}
+#endif
int
ttstart(tp)
@@ -1088,10 +1323,8 @@ ttylclose(tp, flag)
int flag;
{
- if (flag & IO_NDELAY)
+ if (flag & FNONBLOCK || ttywflush(tp))
ttyflush(tp, FREAD | FWRITE);
- else
- ttywflush(tp);
return (0);
}
@@ -1106,19 +1339,23 @@ ttymodem(tp, flag)
int flag;
{
- if (!ISSET(tp->t_state, TS_WOPEN) && ISSET(tp->t_cflag, MDMBUF)) {
+ if (ISSET(tp->t_state, TS_CARR_ON) && ISSET(tp->t_cflag, MDMBUF)) {
/*
* MDMBUF: do flow control according to carrier flag
+ * XXX TS_CAR_OFLOW doesn't do anything yet. TS_TTSTOP
+ * works if IXON and IXANY are clear.
*/
if (flag) {
+ CLR(tp->t_state, TS_CAR_OFLOW);
CLR(tp->t_state, TS_TTSTOP);
ttstart(tp);
- } else if (!ISSET(tp->t_state, TS_TTSTOP)) {
+ } else if (!ISSET(tp->t_state, TS_CAR_OFLOW)) {
+ SET(tp->t_state, TS_CAR_OFLOW);
SET(tp->t_state, TS_TTSTOP);
#ifdef sun4c /* XXX */
(*tp->t_stop)(tp, 0);
#else
- (*cdevsw[major(tp->t_dev)].d_stop)(tp, 0);
+ (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0);
#endif
}
} else if (flag == 0) {
@@ -1128,6 +1365,8 @@ ttymodem(tp, flag)
CLR(tp->t_state, TS_CARR_ON);
if (ISSET(tp->t_state, TS_ISOPEN) &&
!ISSET(tp->t_cflag, CLOCAL)) {
+ SET(tp->t_state, TS_ZOMBIE);
+ CLR(tp->t_state, TS_CONNECTED);
if (tp->t_session && tp->t_session->s_leader)
psignal(tp->t_session->s_leader, SIGHUP);
ttyflush(tp, FREAD | FWRITE);
@@ -1138,30 +1377,11 @@ ttymodem(tp, flag)
* Carrier now on.
*/
SET(tp->t_state, TS_CARR_ON);
+ if (!ISSET(tp->t_state, TS_ZOMBIE))
+ SET(tp->t_state, TS_CONNECTED);
+ wakeup(TSA_CARR_ON(tp));
ttwakeup(tp);
- }
- return (1);
-}
-
-/*
- * Default modem control routine (for other line disciplines).
- * Return argument flag, to turn off device on carrier drop.
- */
-int
-nullmodem(tp, flag)
- register struct tty *tp;
- int flag;
-{
-
- if (flag)
- SET(tp->t_state, TS_CARR_ON);
- else {
- CLR(tp->t_state, TS_CARR_ON);
- if (!ISSET(tp->t_cflag, CLOCAL)) {
- if (tp->t_session && tp->t_session->s_leader)
- psignal(tp->t_session->s_leader, SIGHUP);
- return (0);
- }
+ ttwwakeup(tp);
}
return (1);
}
@@ -1170,18 +1390,25 @@ nullmodem(tp, flag)
 * Reinput pending characters after a state switch.
 * Call at spltty().
*/
-void
+static void
ttypend(tp)
register struct tty *tp;
{
struct clist tq;
- register c;
+ register int c;
CLR(tp->t_lflag, PENDIN);
SET(tp->t_state, TS_TYPEN);
+ /*
+ * XXX this assumes too much about clist internals. It may even
+ * fail if the cblock slush pool is empty. We can't allocate more
+ * cblocks here because we are called from an interrupt handler
+ * and clist_alloc_cblocks() can wait.
+ */
tq = tp->t_rawq;
- tp->t_rawq.c_cc = 0;
- tp->t_rawq.c_cf = tp->t_rawq.c_cl = 0;
+ bzero(&tp->t_rawq, sizeof tp->t_rawq);
+ tp->t_rawq.c_cbmax = tq.c_cbmax;
+ tp->t_rawq.c_cbreserved = tq.c_cbreserved;
while ((c = getc(&tq)) >= 0)
ttyinput(c, tp);
CLR(tp->t_state, TS_TYPEN);
@@ -1198,34 +1425,47 @@ ttread(tp, uio, flag)
{
register struct clist *qp;
register int c;
- register long lflag;
- register u_char *cc = tp->t_cc;
+ register tcflag_t lflag;
+ register cc_t *cc = tp->t_cc;
register struct proc *p = curproc;
int s, first, error = 0;
+ int has_stime = 0, last_cc = 0;
+ long slp = 0; /* XXX this should be renamed `timo'. */
-loop: lflag = tp->t_lflag;
+loop:
s = spltty();
+ lflag = tp->t_lflag;
/*
* take pending input first
*/
- if (ISSET(lflag, PENDIN))
+ if (ISSET(lflag, PENDIN)) {
ttypend(tp);
- splx(s);
+ splx(s); /* reduce latency */
+ s = spltty();
+ lflag = tp->t_lflag; /* XXX ttypend() clobbers it */
+ }
/*
* Hang process if it's in the background.
*/
if (isbackground(p, tp)) {
+ splx(s);
if ((p->p_sigignore & sigmask(SIGTTIN)) ||
(p->p_sigmask & sigmask(SIGTTIN)) ||
p->p_flag & P_PPWAIT || p->p_pgrp->pg_jobc == 0)
return (EIO);
pgsignal(p->p_pgrp, SIGTTIN, 1);
- if (error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, ttybg, 0))
+ error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg2", 0);
+ if (error)
return (error);
goto loop;
}
+ if (ISSET(tp->t_state, TS_ZOMBIE)) {
+ splx(s);
+ return (0); /* EOF */
+ }
+
/*
* If canonical, use the canonical queue,
* else use the raw queue.
@@ -1234,47 +1474,171 @@ loop: lflag = tp->t_lflag;
*/
qp = ISSET(lflag, ICANON) ? &tp->t_canq : &tp->t_rawq;
- /*
- * If there is no input, sleep on rawq
- * awaiting hardware receipt and notification.
- * If we have data, we don't need to check for carrier.
- */
- s = spltty();
- if (qp->c_cc <= 0) {
- int carrier;
-
- carrier = ISSET(tp->t_state, TS_CARR_ON) ||
- ISSET(tp->t_cflag, CLOCAL);
- if (!carrier && ISSET(tp->t_state, TS_ISOPEN)) {
+ if (flag & IO_NDELAY) {
+ if (qp->c_cc > 0)
+ goto read;
+ if (!ISSET(lflag, ICANON) && cc[VMIN] == 0) {
splx(s);
- return (0); /* EOF */
+ return (0);
}
- if (flag & IO_NDELAY) {
+ splx(s);
+ return (EWOULDBLOCK);
+ }
+ if (!ISSET(lflag, ICANON)) {
+ int m = cc[VMIN];
+ long t = cc[VTIME];
+ struct timeval stime, timecopy;
+ int x;
+
+ /*
+ * Check each of the four combinations.
+ * (m > 0 && t == 0) is the normal read case.
+ * It should be fairly efficient, so we check that and its
+ * companion case (m == 0 && t == 0) first.
+ * For the other two cases, we compute the target sleep time
+ * into slp.
+ */
+ if (t == 0) {
+ if (qp->c_cc < m)
+ goto sleep;
+ if (qp->c_cc > 0)
+ goto read;
+
+ /* m, t and qp->c_cc are all 0. 0 is enough input. */
splx(s);
- return (EWOULDBLOCK);
+ return (0);
+ }
+ t *= 100000; /* time in us */
+#define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 1000000 + \
+ ((t1).tv_usec - (t2).tv_usec))
+ if (m > 0) {
+ if (qp->c_cc <= 0)
+ goto sleep;
+ if (qp->c_cc >= m)
+ goto read;
+ gettime(&timecopy);
+ if (!has_stime) {
+ /* first character, start timer */
+ has_stime = 1;
+ stime = timecopy;
+ slp = t;
+ } else if (qp->c_cc > last_cc) {
+ /* got a character, restart timer */
+ stime = timecopy;
+ slp = t;
+ } else {
+ /* nothing, check expiration */
+ slp = t - diff(timecopy, stime);
+ if (slp <= 0)
+ goto read;
+ }
+ last_cc = qp->c_cc;
+ } else { /* m == 0 */
+ if (qp->c_cc > 0)
+ goto read;
+ gettime(&timecopy);
+ if (!has_stime) {
+ has_stime = 1;
+ stime = timecopy;
+ slp = t;
+ } else {
+ slp = t - diff(timecopy, stime);
+ if (slp <= 0) {
+ /* Timed out, but 0 is enough input. */
+ splx(s);
+ return (0);
+ }
+ }
}
- error = ttysleep(tp, &tp->t_rawq, TTIPRI | PCATCH,
- carrier ? ttyin : ttopen, 0);
+#undef diff
+ /*
+ * Rounding down may make us wake up just short
+ * of the target, so we round up.
+ * The formula is ceiling(slp * hz/1000000).
+ * 32-bit arithmetic is enough for hz < 169.
+ * XXX see hzto() for how to avoid overflow if hz
+ * is large (divide by `tick' and/or arrange to
+ * use hzto() if hz is large).
+ */
+ slp = (long) (((u_long)slp * hz) + 999999) / 1000000;
+ goto sleep;
+ }
+ if (qp->c_cc <= 0) {
+sleep:
+ /*
+ * There is no input, or not enough input and we can block.
+ */
+ error = ttysleep(tp, TSA_HUP_OR_INPUT(tp), TTIPRI | PCATCH,
+ ISSET(tp->t_state, TS_CONNECTED) ?
+ "ttyin" : "ttyhup", (int)slp);
splx(s);
- if (error)
+ if (error == EWOULDBLOCK)
+ error = 0;
+ else if (error)
return (error);
+ /*
+ * XXX what happens if another process eats some input
+ * while we are asleep (not just here)? It would be
+ * safest to detect changes and reset our state variables
+ * (has_stime and last_cc).
+ */
+ slp = 0;
goto loop;
}
+read:
splx(s);
-
/*
* Input present, check for input mapping and processing.
*/
first = 1;
- while ((c = getc(qp)) >= 0) {
+ if (ISSET(lflag, ICANON | ISIG))
+ goto slowcase;
+ for (;;) {
+ char ibuf[IBUFSIZ];
+ int icc;
+
+ icc = imin(uio->uio_resid, IBUFSIZ);
+ icc = q_to_b(qp, ibuf, icc);
+ if (icc <= 0) {
+ if (first)
+ goto loop;
+ break;
+ }
+ error = uiomove(ibuf, icc, uio);
+ /*
+ * XXX if there was an error then we should ungetc() the
+ * unmoved chars and reduce icc here.
+ */
+#if NSNP > 0
+ if (ISSET(tp->t_lflag, ECHO) &&
+ ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL)
+ snpin((struct snoop *)tp->t_sc, ibuf, icc);
+#endif
+ if (error)
+ break;
+ if (uio->uio_resid == 0)
+ break;
+ first = 0;
+ }
+ goto out;
+slowcase:
+ for (;;) {
+ c = getc(qp);
+ if (c < 0) {
+ if (first)
+ goto loop;
+ break;
+ }
/*
* delayed suspend (^Y)
*/
- if (CCEQ(cc[VDSUSP], c) && ISSET(lflag, ISIG)) {
+ if (CCEQ(cc[VDSUSP], c) &&
+ ISSET(lflag, IEXTEN | ISIG) == (IEXTEN | ISIG)) {
pgsignal(tp->t_pgrp, SIGTSTP, 1);
if (first) {
- if (error = ttysleep(tp,
- &lbolt, TTIPRI | PCATCH, ttybg, 0))
+ error = ttysleep(tp, &lbolt, TTIPRI | PCATCH,
+ "ttybg3", 0);
+ if (error)
break;
goto loop;
}
@@ -1290,30 +1654,39 @@ loop: lflag = tp->t_lflag;
*/
error = ureadc(c, uio);
if (error)
+ /* XXX should ungetc(c, qp). */
break;
+#if NSNP > 0
+ /*
+ * Only snoop directly on input in echo mode. Non-echoed
+ * input will be snooped later iff the application echoes it.
+ */
+ if (ISSET(tp->t_lflag, ECHO) &&
+ ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL)
+ snpinc((struct snoop *)tp->t_sc, (char)c);
+#endif
if (uio->uio_resid == 0)
break;
/*
* In canonical mode check for a "break character"
* marking the end of a "line of input".
*/
- if (ISSET(lflag, ICANON) && TTBREAKC(c))
+ if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag))
break;
first = 0;
}
+
+out:
/*
- * Look to unblock output now that (presumably)
+ * Look to unblock input now that (presumably)
* the input queue has gone down.
*/
s = spltty();
- if (ISSET(tp->t_state, TS_TBLOCK) && tp->t_rawq.c_cc < TTYHOG/5) {
- if (cc[VSTART] != _POSIX_VDISABLE &&
- putc(cc[VSTART], &tp->t_outq) == 0) {
- CLR(tp->t_state, TS_TBLOCK);
- ttstart(tp);
- }
- }
+ if (ISSET(tp->t_state, TS_TBLOCK) &&
+ tp->t_rawq.c_cc + tp->t_canq.c_cc <= I_LOW_WATER)
+ ttyunblock(tp);
splx(s);
+
return (error);
}
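
The four VMIN/VTIME combinations handled above are the standard termios non-canonical read semantics, so they can be exercised from user space. A minimal sketch, assuming a tty on stdin (error handling trimmed):

#include <stdio.h>
#include <termios.h>
#include <unistd.h>

/*
 * VMIN > 0, VTIME > 0: read() blocks for the first byte, then an
 * interbyte timer (VTIME, in tenths of a second) ends the read --
 * the same timer the code above folds into `slp'.
 */
int
main(void)
{
	struct termios t, saved;
	char buf[64];
	ssize_t n;

	tcgetattr(STDIN_FILENO, &t);
	saved = t;
	t.c_lflag &= ~(ICANON | ECHO);	/* non-canonical read */
	t.c_cc[VMIN] = 4;		/* want at least 4 bytes... */
	t.c_cc[VTIME] = 10;		/* ...or 1s of interbyte silence */
	tcsetattr(STDIN_FILENO, TCSANOW, &t);

	n = read(STDIN_FILENO, buf, sizeof buf);
	printf("got %zd byte(s)\n", n);

	tcsetattr(STDIN_FILENO, TCSANOW, &saved);
	return (0);
}
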
@@ -1334,17 +1707,17 @@ ttycheckoutq(tp, wait)
hiwat = tp->t_hiwat;
s = spltty();
oldsig = wait ? curproc->p_siglist : 0;
- if (tp->t_outq.c_cc > hiwat + 200)
+ if (tp->t_outq.c_cc > hiwat + OBUFSIZ + 100)
while (tp->t_outq.c_cc > hiwat) {
ttstart(tp);
+ if (tp->t_outq.c_cc <= hiwat)
+ break;
if (wait == 0 || curproc->p_siglist != oldsig) {
splx(s);
return (0);
}
- timeout((void (*)__P((void *)))wakeup,
- (void *)&tp->t_outq, hz);
- SET(tp->t_state, TS_ASLEEP);
- sleep((caddr_t)&tp->t_outq, PZERO - 1);
+ SET(tp->t_state, TS_SO_OLOWAT);
+ tsleep(TSA_OLOWAT(tp), PZERO - 1, "ttoutq", hz);
}
splx(s);
return (1);
@@ -1359,7 +1732,7 @@ ttwrite(tp, uio, flag)
register struct uio *uio;
int flag;
{
- register char *cp;
+ register char *cp = NULL;
register int cc, ce;
register struct proc *p;
int i, hiwat, cnt, error, s;
@@ -1371,24 +1744,24 @@ ttwrite(tp, uio, flag)
cc = 0;
loop:
s = spltty();
- if (!ISSET(tp->t_state, TS_CARR_ON) &&
- !ISSET(tp->t_cflag, CLOCAL)) {
- if (ISSET(tp->t_state, TS_ISOPEN)) {
- splx(s);
- return (EIO);
- } else if (flag & IO_NDELAY) {
+ if (ISSET(tp->t_state, TS_ZOMBIE)) {
+ splx(s);
+ if (uio->uio_resid == cnt)
+ error = EIO;
+ goto out;
+ }
+ if (!ISSET(tp->t_state, TS_CONNECTED)) {
+ if (flag & IO_NDELAY) {
splx(s);
error = EWOULDBLOCK;
goto out;
- } else {
- /* Sleep awaiting carrier. */
- error = ttysleep(tp,
- &tp->t_rawq, TTIPRI | PCATCH,ttopen, 0);
- splx(s);
- if (error)
- goto out;
- goto loop;
}
+ error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH,
+ "ttydcd", 0);
+ splx(s);
+ if (error)
+ goto out;
+ goto loop;
}
splx(s);
/*
@@ -1398,10 +1771,14 @@ loop:
if (isbackground(p, tp) &&
ISSET(tp->t_lflag, TOSTOP) && (p->p_flag & P_PPWAIT) == 0 &&
(p->p_sigignore & sigmask(SIGTTOU)) == 0 &&
- (p->p_sigmask & sigmask(SIGTTOU)) == 0 &&
- p->p_pgrp->pg_jobc) {
+ (p->p_sigmask & sigmask(SIGTTOU)) == 0) {
+ if (p->p_pgrp->pg_jobc == 0) {
+ error = EIO;
+ goto out;
+ }
pgsignal(p->p_pgrp, SIGTTOU, 1);
- if (error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, ttybg, 0))
+ error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg4", 0);
+ if (error)
goto out;
goto loop;
}
@@ -1422,13 +1799,17 @@ loop:
* leftover from last time.
*/
if (cc == 0) {
- cc = min(uio->uio_resid, OBUFSIZ);
+ cc = imin(uio->uio_resid, OBUFSIZ);
cp = obuf;
error = uiomove(cp, cc, uio);
if (error) {
cc = 0;
break;
}
+#if NSNP > 0
+ if (ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL)
+ snpin((struct snoop *)tp->t_sc, cp, cc);
+#endif
}
/*
* If nothing fancy need be done, grab those characters we
@@ -1444,7 +1825,7 @@ loop:
ce = cc;
else {
ce = cc - scanc((u_int)cc, (u_char *)cp,
- (u_char *)char_type, CCLASSMASK);
+ char_type, CCLASSMASK);
/*
* If ce is zero, then we're processing
* a special character through ttyoutput.
@@ -1454,9 +1835,15 @@ loop:
if (ttyoutput(*cp, tp) >= 0) {
/* No Clists, wait a bit. */
ttstart(tp);
- if (error = ttysleep(tp, &lbolt,
- TTOPRI | PCATCH, ttybuf, 0))
- break;
+ if (flag & IO_NDELAY) {
+ error = EWOULDBLOCK;
+ goto out;
+ }
+ error = ttysleep(tp, &lbolt,
+ TTOPRI|PCATCH,
+ "ttybf1", 0);
+ if (error)
+ goto out;
goto loop;
}
cp++;
@@ -1484,9 +1871,14 @@ loop:
if (i > 0) {
/* No Clists, wait a bit. */
ttstart(tp);
- if (error = ttysleep(tp,
- &lbolt, TTOPRI | PCATCH, ttybuf, 0))
- break;
+ if (flag & IO_NDELAY) {
+ error = EWOULDBLOCK;
+ goto out;
+ }
+ error = ttysleep(tp, &lbolt, TTOPRI | PCATCH,
+ "ttybf2", 0);
+ if (error)
+ goto out;
goto loop;
}
if (ISSET(tp->t_lflag, FLUSHO) ||
@@ -1520,9 +1912,12 @@ ovhiwat:
uio->uio_resid += cc;
return (uio->uio_resid == cnt ? EWOULDBLOCK : 0);
}
- SET(tp->t_state, TS_ASLEEP);
- error = ttysleep(tp, &tp->t_outq, TTOPRI | PCATCH, ttyout, 0);
+ SET(tp->t_state, TS_SO_OLOWAT);
+ error = ttysleep(tp, TSA_OLOWAT(tp), TTOPRI | PCATCH, "ttywri",
+ tp->t_timeout);
splx(s);
+ if (error == EWOULDBLOCK)
+ error = EIO;
if (error)
goto out;
goto loop;
@@ -1532,7 +1927,7 @@ ovhiwat:
* Rubout one character from the rawq of tp
* as cleanly as possible.
*/
-void
+static void
ttyrub(c, tp)
register int c;
register struct tty *tp;
@@ -1635,7 +2030,7 @@ ttyrubo(tp, cnt)
* Reprint the rawq line. Note, it is assumed that c_cc has already
* been checked.
*/
-void
+static void
ttyretype(tp)
register struct tty *tp;
{
@@ -1679,11 +2074,11 @@ ttyecho(c, tp)
if (!ISSET(tp->t_state, TS_CNTTB))
CLR(tp->t_lflag, FLUSHO);
if ((!ISSET(tp->t_lflag, ECHO) &&
- (!ISSET(tp->t_lflag, ECHONL) || c == '\n')) ||
+ (c != '\n' || !ISSET(tp->t_lflag, ECHONL))) ||
ISSET(tp->t_lflag, EXTPROC))
return;
if (ISSET(tp->t_lflag, ECHOCTL) &&
- (ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n' ||
+ ((ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n') ||
ISSET(c, TTY_CHARMASK) == 0177)) {
(void)ttyoutput('^', tp);
CLR(c, ~TTY_CHARMASK);
@@ -1703,10 +2098,33 @@ ttwakeup(tp)
register struct tty *tp;
{
- selwakeup(&tp->t_rsel);
+ if (tp->t_rsel.si_pid != 0)
+ selwakeup(&tp->t_rsel);
if (ISSET(tp->t_state, TS_ASYNC))
pgsignal(tp->t_pgrp, SIGIO, 1);
- wakeup((caddr_t)&tp->t_rawq);
+ wakeup(TSA_HUP_OR_INPUT(tp));
+}
+
+/*
+ * Wake up any writers on a tty.
+ */
+void
+ttwwakeup(tp)
+ register struct tty *tp;
+{
+
+ if (tp->t_wsel.si_pid != 0 && tp->t_outq.c_cc <= tp->t_lowat)
+ selwakeup(&tp->t_wsel);
+ if (ISSET(tp->t_state, TS_BUSY | TS_SO_OCOMPLETE) ==
+ TS_SO_OCOMPLETE && tp->t_outq.c_cc == 0) {
+ CLR(tp->t_state, TS_SO_OCOMPLETE);
+ wakeup(TSA_OCOMPLETE(tp));
+ }
+ if (ISSET(tp->t_state, TS_SO_OLOWAT) &&
+ tp->t_outq.c_cc <= tp->t_lowat) {
+ CLR(tp->t_state, TS_SO_OLOWAT);
+ wakeup(TSA_OLOWAT(tp));
+ }
}
/*
@@ -1786,15 +2204,15 @@ ttyinfo(tp)
/* Print user time. */
ttyprintf(tp, "%d.%02du ",
- utime.tv_sec, (utime.tv_usec + 5000) / 10000);
+ utime.tv_sec, utime.tv_usec / 10000);
/* Print system time. */
ttyprintf(tp, "%d.%02ds ",
- stime.tv_sec, (stime.tv_usec + 5000) / 10000);
+ stime.tv_sec, stime.tv_usec / 10000);
-#define pgtok(a) (((a) * NBPG) / 1024)
+#define pgtok(a) (((a) * PAGE_SIZE) / 1024)
/* Print percentage cpu, resident set size. */
- tmp = pick->p_pctcpu * 10000 + FSCALE / 2 >> FSHIFT;
+ tmp = (pick->p_pctcpu * 10000 + FSCALE / 2) >> FSHIFT;
ttyprintf(tp, "%d%% %dk\n",
tmp / 100,
pick->p_stat == SIDL || pick->p_stat == SZOMB ? 0 :
@@ -1891,8 +2309,7 @@ tputchar(c, tp)
register int s;
s = spltty();
- if (ISSET(tp->t_state,
- TS_CARR_ON | TS_ISOPEN) != (TS_CARR_ON | TS_ISOPEN)) {
+ if (!ISSET(tp->t_state, TS_CONNECTED)) {
splx(s);
return (-1);
}
@@ -1906,7 +2323,7 @@ tputchar(c, tp)
/*
* Sleep on chan, returning ERESTART if tty changed while we napped and
- * returning any errors (e.g. EINTR/ETIMEDOUT) reported by tsleep. If
+ * returning any errors (e.g. EINTR/EWOULDBLOCK) reported by tsleep. If
* the tty is revoked, restarting a pending call will redo validation done
* at the start of the call.
*/
@@ -1918,10 +2335,44 @@ ttysleep(tp, chan, pri, wmesg, timo)
char *wmesg;
{
int error;
- short gen;
+ int gen;
gen = tp->t_gen;
- if (error = tsleep(chan, pri, wmesg, timo))
+ error = tsleep(chan, pri, wmesg, timo);
+ if (error)
return (error);
return (tp->t_gen == gen ? 0 : ERESTART);
}
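
The t_gen comparison is a generation-count pattern: a sleeper detects that the tty was recycled while it slept and hands ERESTART back so the caller revalidates. A hypothetical standalone rendering (RES_RESTART is a made-up stand-in for ERESTART, and wait_event() for tsleep()):

#include <stdio.h>

#define RES_RESTART	1	/* stand-in for ERESTART */

struct res {
	int gen;		/* bumped whenever the object is recycled */
};

/* Stub for tsleep(); pretend the wait always succeeds. */
static int
wait_event(void *chan)
{
	(void)chan;
	return (0);
}

static int
res_sleep(struct res *r, void *chan)
{
	int gen, error;

	gen = r->gen;
	error = wait_event(chan);
	if (error)
		return (error);
	/* Object recycled while we slept: caller must revalidate. */
	return (r->gen == gen ? 0 : RES_RESTART);
}

int
main(void)
{
	struct res r = { 1 };

	printf("%d\n", res_sleep(&r, &r));	/* 0: same generation */
	return (0);
}
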
+
+#ifdef notyet
+/*
+ * XXX this is usable but not useful or used. Most tty drivers have
+ * ifdefs for using ttymalloc() but assume a different interface.
+ */
+/*
+ * Allocate a tty struct. Clists in the struct will be allocated by
+ * ttyopen().
+ */
+struct tty *
+ttymalloc()
+{
+ struct tty *tp;
+
+ tp = malloc(sizeof *tp, M_TTYS, M_WAITOK);
+ bzero(tp, sizeof *tp);
+ return (tp);
+}
+#endif
+
+#if 0 /* XXX not yet usable: session leader holds a ref (see kern_exit.c). */
+/*
+ * Free a tty struct. Clists in the struct should have been freed by
+ * ttyclose().
+ */
+void
+ttyfree(tp)
+ struct tty *tp;
+{
+ free(tp, M_TTYS);
+}
+#endif /* 0 */
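
The TIOCSDRAINWAIT/TIOCGDRAINWAIT pair added to ttioctl() above is FreeBSD-specific; where present it is driven with an ordinary ioctl. A sketch (assumes stdout is a tty; setting the timeout ordinarily requires root):

#include <sys/ioctl.h>
#include <stdio.h>
#include <unistd.h>

/* Query, then try to set, the output drain timeout (in seconds). */
int
main(void)
{
	int secs;

	if (ioctl(STDOUT_FILENO, TIOCGDRAINWAIT, &secs) == -1) {
		perror("TIOCGDRAINWAIT");
		return (1);
	}
	printf("drain wait: %d s\n", secs);

	secs = 60;
	if (ioctl(STDOUT_FILENO, TIOCSDRAINWAIT, &secs) == -1)
		perror("TIOCSDRAINWAIT");
	return (0);
}
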
diff --git a/sys/kern/tty_compat.c b/sys/kern/tty_compat.c
index ce95853..ed58c6a 100644
--- a/sys/kern/tty_compat.c
+++ b/sys/kern/tty_compat.c
@@ -30,28 +30,39 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)tty_compat.c 8.2 (Berkeley) 1/9/95
+ * @(#)tty_compat.c 8.1 (Berkeley) 6/10/93
+ * $Id: tty_compat.c,v 1.21 1997/02/22 09:39:24 peter Exp $
*/
-/*
+/*
* mapping routines for old line discipline (yuck)
*/
#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/ioctl.h>
+#include <sys/ioctl_compat.h>
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/termios.h>
#include <sys/file.h>
#include <sys/conf.h>
#include <sys/kernel.h>
+#include <sys/sysctl.h>
#include <sys/syslog.h>
-int ttydebug = 0;
+static int ttcompatgetflags __P((struct tty *tp));
+static void ttcompatsetflags __P((struct tty *tp, struct termios *t));
+static void ttcompatsetlflags __P((struct tty *tp, struct termios *t));
+static int ttcompatspeedtab __P((int speed, struct speedtab *table));
+
+static int ttydebug = 0;
+SYSCTL_INT(_debug, OID_AUTO, ttydebug, CTLFLAG_RW, &ttydebug, 0, "");
static struct speedtab compatspeeds[] = {
+#define MAX_SPEED 17
+ { 115200, 17 },
+ { 57600, 16 },
{ 38400, 15 },
{ 19200, 14 },
{ 9600, 13 },
@@ -70,78 +81,61 @@ static struct speedtab compatspeeds[] = {
{ 0, 0 },
{ -1, -1 },
};
-static int compatspcodes[16] = {
+static int compatspcodes[] = {
0, 50, 75, 110, 134, 150, 200, 300, 600, 1200,
- 1800, 2400, 4800, 9600, 19200, 38400,
+ 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200,
};
-/*ARGSUSED*/
-ttcompat(tp, com, data, flag)
+static int
+ttcompatspeedtab(speed, table)
+ int speed;
+ register struct speedtab *table;
+{
+ if (speed == 0)
+ return (0); /* hangup */
+ for ( ; table->sp_speed > 0; table++)
+ if (table->sp_speed <= speed) /* nearest one, rounded down */
+ return (table->sp_code);
+ return (1); /* 50, min and not hangup */
+}
+
+int
+ttsetcompat(tp, com, data, term)
register struct tty *tp;
- u_long com;
+ int *com;
caddr_t data;
- int flag;
+ struct termios *term;
{
-
- switch (com) {
- case TIOCGETP: {
- register struct sgttyb *sg = (struct sgttyb *)data;
- register u_char *cc = tp->t_cc;
- register speed;
-
- speed = ttspeedtab(tp->t_ospeed, compatspeeds);
- sg->sg_ospeed = (speed == -1) ? 15 : speed;
- if (tp->t_ispeed == 0)
- sg->sg_ispeed = sg->sg_ospeed;
- else {
- speed = ttspeedtab(tp->t_ispeed, compatspeeds);
- sg->sg_ispeed = (speed == -1) ? 15 : speed;
- }
- sg->sg_erase = cc[VERASE];
- sg->sg_kill = cc[VKILL];
- sg->sg_flags = ttcompatgetflags(tp);
- break;
- }
-
+ switch (*com) {
case TIOCSETP:
case TIOCSETN: {
register struct sgttyb *sg = (struct sgttyb *)data;
- struct termios term;
int speed;
- term = tp->t_termios;
- if ((speed = sg->sg_ispeed) > 15 || speed < 0)
- term.c_ispeed = speed;
+ if ((speed = sg->sg_ispeed) > MAX_SPEED || speed < 0)
+ return(EINVAL);
+ else if (speed != ttcompatspeedtab(tp->t_ispeed, compatspeeds))
+ term->c_ispeed = compatspcodes[speed];
else
- term.c_ispeed = compatspcodes[speed];
- if ((speed = sg->sg_ospeed) > 15 || speed < 0)
- term.c_ospeed = speed;
+ term->c_ispeed = tp->t_ispeed;
+ if ((speed = sg->sg_ospeed) > MAX_SPEED || speed < 0)
+ return(EINVAL);
+ else if (speed != ttcompatspeedtab(tp->t_ospeed, compatspeeds))
+ term->c_ospeed = compatspcodes[speed];
else
- term.c_ospeed = compatspcodes[speed];
- term.c_cc[VERASE] = sg->sg_erase;
- term.c_cc[VKILL] = sg->sg_kill;
- tp->t_flags = tp->t_flags&0xffff0000 | sg->sg_flags&0xffff;
- ttcompatsetflags(tp, &term);
- return (ttioctl(tp, com == TIOCSETP ? TIOCSETAF : TIOCSETA,
- &term, flag));
- }
-
- case TIOCGETC: {
- struct tchars *tc = (struct tchars *)data;
- register u_char *cc = tp->t_cc;
-
- tc->t_intrc = cc[VINTR];
- tc->t_quitc = cc[VQUIT];
- tc->t_startc = cc[VSTART];
- tc->t_stopc = cc[VSTOP];
- tc->t_eofc = cc[VEOF];
- tc->t_brkc = cc[VEOL];
+ term->c_ospeed = tp->t_ospeed;
+ term->c_cc[VERASE] = sg->sg_erase;
+ term->c_cc[VKILL] = sg->sg_kill;
+ tp->t_flags = (tp->t_flags&0xffff0000) | (sg->sg_flags&0xffff);
+ ttcompatsetflags(tp, term);
+ *com = (*com == TIOCSETP) ? TIOCSETAF : TIOCSETA;
break;
}
case TIOCSETC: {
struct tchars *tc = (struct tchars *)data;
- register u_char *cc = tp->t_cc;
+ register cc_t *cc;
+ cc = term->c_cc;
cc[VINTR] = tc->t_intrc;
cc[VQUIT] = tc->t_quitc;
cc[VSTART] = tc->t_startc;
@@ -150,23 +144,96 @@ ttcompat(tp, com, data, flag)
cc[VEOL] = tc->t_brkc;
if (tc->t_brkc == -1)
cc[VEOL2] = _POSIX_VDISABLE;
+ *com = TIOCSETA;
break;
}
case TIOCSLTC: {
struct ltchars *ltc = (struct ltchars *)data;
- register u_char *cc = tp->t_cc;
+ register cc_t *cc;
+ cc = term->c_cc;
cc[VSUSP] = ltc->t_suspc;
cc[VDSUSP] = ltc->t_dsuspc;
cc[VREPRINT] = ltc->t_rprntc;
cc[VDISCARD] = ltc->t_flushc;
cc[VWERASE] = ltc->t_werasc;
cc[VLNEXT] = ltc->t_lnextc;
+ *com = TIOCSETA;
+ break;
+ }
+ case TIOCLBIS:
+ case TIOCLBIC:
+ case TIOCLSET:
+ if (*com == TIOCLSET)
+ tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16;
+ else {
+ tp->t_flags =
+ (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff);
+ if (*com == TIOCLBIS)
+ tp->t_flags |= *(int *)data<<16;
+ else
+ tp->t_flags &= ~(*(int *)data<<16);
+ }
+ ttcompatsetlflags(tp, term);
+ *com = TIOCSETA;
+ break;
+ }
+ return 0;
+}
+
+/*ARGSUSED*/
+int
+ttcompat(tp, com, data, flag)
+ register struct tty *tp;
+ int com;
+ caddr_t data;
+ int flag;
+{
+ switch (com) {
+ case TIOCSETP:
+ case TIOCSETN:
+ case TIOCSETC:
+ case TIOCSLTC:
+ case TIOCLBIS:
+ case TIOCLBIC:
+ case TIOCLSET: {
+ struct termios term;
+ int error;
+
+ term = tp->t_termios;
+ if ((error = ttsetcompat(tp, &com, data, &term)) != 0)
+ return error;
+ return ttioctl(tp, com, &term, flag);
+ }
+ case TIOCGETP: {
+ register struct sgttyb *sg = (struct sgttyb *)data;
+ register cc_t *cc = tp->t_cc;
+
+ sg->sg_ospeed = ttcompatspeedtab(tp->t_ospeed, compatspeeds);
+ if (tp->t_ispeed == 0)
+ sg->sg_ispeed = sg->sg_ospeed;
+ else
+ sg->sg_ispeed = ttcompatspeedtab(tp->t_ispeed, compatspeeds);
+ sg->sg_erase = cc[VERASE];
+ sg->sg_kill = cc[VKILL];
+ sg->sg_flags = tp->t_flags = ttcompatgetflags(tp);
+ break;
+ }
+ case TIOCGETC: {
+ struct tchars *tc = (struct tchars *)data;
+ register cc_t *cc = tp->t_cc;
+
+ tc->t_intrc = cc[VINTR];
+ tc->t_quitc = cc[VQUIT];
+ tc->t_startc = cc[VSTART];
+ tc->t_stopc = cc[VSTOP];
+ tc->t_eofc = cc[VEOF];
+ tc->t_brkc = cc[VEOL];
break;
}
case TIOCGLTC: {
struct ltchars *ltc = (struct ltchars *)data;
- register u_char *cc = tp->t_cc;
+ register cc_t *cc = tp->t_cc;
ltc->t_suspc = cc[VSUSP];
ltc->t_dsuspc = cc[VDSUSP];
@@ -176,27 +243,11 @@ ttcompat(tp, com, data, flag)
ltc->t_lnextc = cc[VLNEXT];
break;
}
- case TIOCLBIS:
- case TIOCLBIC:
- case TIOCLSET: {
- struct termios term;
-
- term = tp->t_termios;
- if (com == TIOCLSET)
- tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16;
- else {
- tp->t_flags =
- (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff);
- if (com == TIOCLBIS)
- tp->t_flags |= *(int *)data<<16;
- else
- tp->t_flags &= ~(*(int *)data<<16);
- }
- ttcompatsetlflags(tp, &term);
- return (ttioctl(tp, TIOCSETA, &term, flag));
- }
case TIOCLGET:
- *(int *)data = ttcompatgetflags(tp)>>16;
+ tp->t_flags =
+ (ttcompatgetflags(tp) & 0xffff0000UL)
+ | (tp->t_flags & 0xffff);
+ *(int *)data = tp->t_flags>>16;
if (ttydebug)
printf("CLGET: returning %x\n", *(int *)data);
break;
@@ -208,7 +259,7 @@ ttcompat(tp, com, data, flag)
case OTIOCSETD: {
int ldisczero = 0;
- return (ttioctl(tp, TIOCSETD,
+ return (ttioctl(tp, TIOCSETD,
*(int *)data == 2 ? (caddr_t)&ldisczero : data, flag));
}
@@ -222,20 +273,26 @@ ttcompat(tp, com, data, flag)
return (0);
}
+static int
ttcompatgetflags(tp)
register struct tty *tp;
{
- register long iflag = tp->t_iflag;
- register long lflag = tp->t_lflag;
- register long oflag = tp->t_oflag;
- register long cflag = tp->t_cflag;
+ register tcflag_t iflag = tp->t_iflag;
+ register tcflag_t lflag = tp->t_lflag;
+ register tcflag_t oflag = tp->t_oflag;
+ register tcflag_t cflag = tp->t_cflag;
register flags = 0;
if (iflag&IXOFF)
flags |= TANDEM;
if (iflag&ICRNL || oflag&ONLCR)
flags |= CRMOD;
- if (cflag&PARENB) {
+ if ((cflag&CSIZE) == CS8) {
+ flags |= PASS8;
+ if (iflag&ISTRIP)
+ flags |= ANYP;
+ }
+ else if (cflag&PARENB) {
if (iflag&INPCK) {
if (cflag&PARODD)
flags |= ODDP;
@@ -243,20 +300,18 @@ ttcompatgetflags(tp)
flags |= EVENP;
} else
flags |= EVENP | ODDP;
- } else {
- if ((tp->t_flags&LITOUT) && !(oflag&OPOST))
- flags |= LITOUT;
- if (tp->t_flags&PASS8)
- flags |= PASS8;
}
-
- if ((lflag&ICANON) == 0) {
+
+ if ((lflag&ICANON) == 0) {
/* fudge */
- if (iflag&IXON || lflag&ISIG || lflag&IEXTEN || cflag&PARENB)
+ if (iflag&(INPCK|ISTRIP|IXON) || lflag&(IEXTEN|ISIG)
+ || cflag&(CSIZE|PARENB) != CS8)
flags |= CBREAK;
else
flags |= RAW;
}
+ if (!(flags&RAW) && !(oflag&OPOST) && cflag&(CSIZE|PARENB) == CS8)
+ flags |= LITOUT;
if (cflag&MDMBUF)
flags |= MDMBUF;
if ((cflag&HUPCL) == 0)
@@ -274,28 +329,28 @@ ttcompatgetflags(tp)
if ((iflag&IXANY) == 0)
flags |= DECCTQ;
flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH);
-if (ttydebug)
- printf("getflags: %x\n", flags);
+ if (ttydebug)
+ printf("getflags: %x\n", flags);
return (flags);
}
+static void
ttcompatsetflags(tp, t)
register struct tty *tp;
register struct termios *t;
{
register flags = tp->t_flags;
- register long iflag = t->c_iflag;
- register long oflag = t->c_oflag;
- register long lflag = t->c_lflag;
- register long cflag = t->c_cflag;
+ register tcflag_t iflag = t->c_iflag;
+ register tcflag_t oflag = t->c_oflag;
+ register tcflag_t lflag = t->c_lflag;
+ register tcflag_t cflag = t->c_cflag;
if (flags & RAW) {
- iflag &= IXOFF;
- oflag &= ~OPOST;
+ iflag = IGNBRK;
lflag &= ~(ECHOCTL|ISIG|ICANON|IEXTEN);
} else {
+ iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR);
iflag |= BRKINT|IXON|IMAXBEL;
- oflag |= OPOST;
lflag |= ISIG|IEXTEN|ECHOCTL; /* XXX was echoctl on ? */
if (flags & XTABS)
oflag |= OXTABS;
@@ -317,49 +372,59 @@ ttcompatsetflags(tp, t)
lflag |= ECHO;
else
lflag &= ~ECHO;
-
+
+ cflag &= ~(CSIZE|PARENB);
if (flags&(RAW|LITOUT|PASS8)) {
- cflag &= ~(CSIZE|PARENB);
cflag |= CS8;
- if ((flags&(RAW|PASS8)) == 0)
+ if (!(flags&(RAW|PASS8))
+ || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP))
iflag |= ISTRIP;
else
iflag &= ~ISTRIP;
+ if (flags&(RAW|LITOUT))
+ oflag &= ~OPOST;
+ else
+ oflag |= OPOST;
} else {
- cflag &= ~CSIZE;
cflag |= CS7|PARENB;
iflag |= ISTRIP;
+ oflag |= OPOST;
}
+ /* XXX don't set INPCK if RAW or PASS8? */
if ((flags&(EVENP|ODDP)) == EVENP) {
iflag |= INPCK;
cflag &= ~PARODD;
} else if ((flags&(EVENP|ODDP)) == ODDP) {
iflag |= INPCK;
cflag |= PARODD;
- } else
+ } else
iflag &= ~INPCK;
- if (flags&LITOUT)
- oflag &= ~OPOST; /* move earlier ? */
if (flags&TANDEM)
iflag |= IXOFF;
else
iflag &= ~IXOFF;
+ if ((flags&DECCTQ) == 0)
+ iflag |= IXANY;
+ else
+ iflag &= ~IXANY;
t->c_iflag = iflag;
t->c_oflag = oflag;
t->c_lflag = lflag;
t->c_cflag = cflag;
}
+static void
ttcompatsetlflags(tp, t)
register struct tty *tp;
register struct termios *t;
{
register flags = tp->t_flags;
- register long iflag = t->c_iflag;
- register long oflag = t->c_oflag;
- register long lflag = t->c_lflag;
- register long cflag = t->c_cflag;
+ register tcflag_t iflag = t->c_iflag;
+ register tcflag_t oflag = t->c_oflag;
+ register tcflag_t lflag = t->c_lflag;
+ register tcflag_t cflag = t->c_cflag;
+ iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR);
if (flags&CRTERA)
lflag |= ECHOE;
else
@@ -376,6 +441,10 @@ ttcompatsetlflags(tp, t)
lflag |= ECHOCTL;
else
lflag &= ~ECHOCTL;
+ if (flags&TANDEM)
+ iflag |= IXOFF;
+ else
+ iflag &= ~IXOFF;
if ((flags&DECCTQ) == 0)
iflag |= IXANY;
else
@@ -390,17 +459,30 @@ ttcompatsetlflags(tp, t)
cflag |= HUPCL;
lflag &= ~(TOSTOP|FLUSHO|PENDIN|NOFLSH);
lflag |= flags&(TOSTOP|FLUSHO|PENDIN|NOFLSH);
- if (flags&(LITOUT|PASS8)) {
- iflag &= ~ISTRIP;
- cflag &= ~(CSIZE|PARENB);
+
+ /*
+ * The next if-else statement is copied from above so don't bother
+ * checking it separately. We could avoid fiddlling with the
+ * checking it separately. We could avoid fiddling with the
+ * LITOUT bit or the PASS8 bit is being changed, but the delta of
+ * the change is not available here and skipping the RAW case would
+ * make the code different from above.
+ */
+ cflag &= ~(CSIZE|PARENB);
+ if (flags&(RAW|LITOUT|PASS8)) {
cflag |= CS8;
- if (flags&LITOUT)
- oflag &= ~OPOST;
- if ((flags&(PASS8|RAW)) == 0)
+ if (!(flags&(RAW|PASS8))
+ || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP))
iflag |= ISTRIP;
- } else if ((flags&RAW) == 0) {
- cflag &= ~CSIZE;
+ else
+ iflag &= ~ISTRIP;
+ if (flags&(RAW|LITOUT))
+ oflag &= ~OPOST;
+ else
+ oflag |= OPOST;
+ } else {
cflag |= CS7|PARENB;
+ iflag |= ISTRIP;
oflag |= OPOST;
}
t->c_iflag = iflag;
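
ttcompatspeedtab() rounds an arbitrary termios speed down to the nearest sgtty code, with 0 reserved for hangup and code 1 (50 baud) as the floor. A trimmed standalone rendering of the same lookup (table abbreviated):

#include <stdio.h>

struct spd { int speed, code; };

/* Abbreviated compatspeeds: descending, terminated by speed 0. */
static struct spd speeds[] = {
	{ 115200, 17 }, { 57600, 16 }, { 38400, 15 }, { 9600, 13 },
	{ 300, 7 }, { 50, 1 }, { 0, 0 },
};

static int
speed_to_code(int speed)
{
	struct spd *p;

	if (speed == 0)
		return (0);		/* hangup */
	for (p = speeds; p->speed > 0; p++)
		if (p->speed <= speed)	/* nearest one, rounded down */
			return (p->code);
	return (1);			/* 50 baud, the minimum */
}

int
main(void)
{
	/* Prints "17 13 0". */
	printf("%d %d %d\n", speed_to_code(115200), speed_to_code(20000),
	    speed_to_code(0));
	return (0);
}
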
diff --git a/sys/kern/tty_conf.c b/sys/kern/tty_conf.c
index 1453675..2e765c8 100644
--- a/sys/kern/tty_conf.c
+++ b/sys/kern/tty_conf.c
@@ -35,92 +35,174 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)tty_conf.c 8.5 (Berkeley) 1/9/95
+ * @(#)tty_conf.c 8.4 (Berkeley) 1/21/94
+ * $Id$
*/
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/buf.h>
-#include <sys/ioctl.h>
-#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/conf.h>
-#define ttynodisc ((int (*) __P((dev_t, struct tty *)))enodev)
-#define ttyerrclose ((int (*) __P((struct tty *, int flags)))enodev)
-#define ttyerrio ((int (*) __P((struct tty *, struct uio *, int)))enodev)
-#define ttyerrinput ((int (*) __P((int c, struct tty *)))enodev)
-#define ttyerrstart ((int (*) __P((struct tty *)))enodev)
-
-int nullioctl __P((struct tty *tp, u_long cmd, caddr_t data,
- int flag, struct proc *p));
-
-#include "tb.h"
-#if NTB > 0
-int tbopen __P((dev_t dev, struct tty *tp));
-int tbclose __P((struct tty *tp, int flags));
-int tbread __P((struct tty *, struct uio *, int flags));
-int tbioctl __P((struct tty *tp, u_long cmd, caddr_t data,
- int flag, struct proc *p));
-int tbinput __P((int c, struct tty *tp));
+#ifndef MAXLDISC
+#define MAXLDISC 8
#endif
-#include "sl.h"
-#if NSL > 0
-int slopen __P((dev_t dev, struct tty *tp));
-int slclose __P((struct tty *tp, int flags));
-int sltioctl __P((struct tty *tp, u_long cmd, caddr_t data,
- int flag, struct proc *p));
-int slinput __P((int c, struct tty *tp));
-int slstart __P((struct tty *tp));
+static l_open_t l_noopen;
+static l_close_t l_noclose;
+static l_ioctl_t l_nullioctl;
+static l_rint_t l_norint;
+static l_start_t l_nostart;
+
+/*
+ * XXX it probably doesn't matter what the entries other than the l_open
+ * entry are here. The l_nullioctl and ttymodem entries still look fishy.
+ * Reconsider the removal of nullmodem anyway. It was too much like
+ * ttymodem, but a completely null version might be useful.
+ */
+#define NODISC(n) \
+ { l_noopen, l_noclose, l_noread, l_nowrite, \
+ l_nullioctl, l_norint, l_nostart, ttymodem }
+
+struct linesw linesw[MAXLDISC] =
+{
+ /* 0- termios */
+ { ttyopen, ttylclose, ttread, ttwrite,
+ l_nullioctl, ttyinput, ttstart, ttymodem },
+ NODISC(1), /* 1- defunct */
+ /* 2- NTTYDISC */
+#ifdef COMPAT_43
+ { ttyopen, ttylclose, ttread, ttwrite,
+ l_nullioctl, ttyinput, ttstart, ttymodem },
+#else
+ NODISC(2),
#endif
+ NODISC(3), /* TABLDISC */
+ NODISC(4), /* SLIPDISC */
+ NODISC(5), /* PPPDISC */
+ NODISC(6), /* loadable */
+ NODISC(7), /* loadable */
+};
+int nlinesw = sizeof (linesw) / sizeof (linesw[0]);
+
+static struct linesw nodisc = NODISC(0);
-struct linesw linesw[] =
+#define LOADABLE_LDISC 6
+/*
+ * ldisc_register: Register a line discipline.
+ *
+ * discipline: Index for discipline to load, or LDISC_LOAD for us to choose.
+ * linesw_p: Pointer to the linesw entry to install.
+ *
+ * Returns: Index used or -1 on failure.
+ */
+int
+ldisc_register(discipline, linesw_p)
+ int discipline;
+ struct linesw *linesw_p;
{
- { ttyopen, ttylclose, ttread, ttwrite, nullioctl,
- ttyinput, ttstart, ttymodem }, /* 0- termios */
+ int slot = -1;
- { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
- ttyerrinput, ttyerrstart, nullmodem }, /* 1- defunct */
+ if (discipline == LDISC_LOAD) {
+ int i;
+ for (i = LOADABLE_LDISC; i < MAXLDISC; i++)
+ if (bcmp(linesw + i, &nodisc, sizeof(nodisc)) == 0) {
+ slot = i;
+ }
+ }
+ else if (discipline >= 0 && discipline < MAXLDISC) {
+ slot = discipline;
+ }
- { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
- ttyerrinput, ttyerrstart, nullmodem }, /* 2- defunct */
+ if (slot != -1 && linesw_p)
+ linesw[slot] = *linesw_p;
-#if NTB > 0
- { tbopen, tbclose, tbread, enodev, tbioctl,
- tbinput, ttstart, nullmodem }, /* 3- TABLDISC */
-#else
- { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
- ttyerrinput, ttyerrstart, nullmodem },
-#endif
+ return slot;
+}
-#if NSL > 0
- { slopen, slclose, ttyerrio, ttyerrio, sltioctl,
- slinput, slstart, nullmodem }, /* 4- SLIPDISC */
-#else
- { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl,
- ttyerrinput, ttyerrstart, nullmodem },
-#endif
-};
+/*
+ * ldisc_deregister: Deregister a line discipline obtained with
+ * ldisc_register. Can only deregister "loadable" ones now.
+ *
+ * discipline: Index for discipline to unload.
+ */
+void
+ldisc_deregister(discipline)
+ int discipline;
+{
+ if (discipline >= LOADABLE_LDISC && discipline < MAXLDISC) {
+ linesw[discipline] = nodisc;
+ }
+}
-int nlinesw = sizeof (linesw) / sizeof (linesw[0]);
+static int
+l_noopen(dev, tp)
+ dev_t dev;
+ struct tty *tp;
+{
+
+ return (ENODEV);
+}
+
+static int
+l_noclose(tp, flag)
+ struct tty *tp;
+ int flag;
+{
+
+ return (ENODEV);
+}
+
+int
+l_noread(tp, uio, flag)
+ struct tty *tp;
+ struct uio *uio;
+ int flag;
+{
+
+ return (ENODEV);
+}
+
+int
+l_nowrite(tp, uio, flag)
+ struct tty *tp;
+ struct uio *uio;
+ int flag;
+{
+
+ return (ENODEV);
+}
+
+static int
+l_norint(c, tp)
+ int c;
+ struct tty *tp;
+{
+
+ return (ENODEV);
+}
+
+static int
+l_nostart(tp)
+ struct tty *tp;
+{
+
+ return (ENODEV);
+}
/*
* Do nothing specific version of line
* discipline specific ioctl command.
*/
-/*ARGSUSED*/
-nullioctl(tp, cmd, data, flags, p)
+static int
+l_nullioctl(tp, cmd, data, flags, p)
struct tty *tp;
- u_long cmd;
+ int cmd;
char *data;
int flags;
struct proc *p;
{
-#ifdef lint
- tp = tp; data = data; flags = flags; p = p;
-#endif
return (-1);
}
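
The reworked linesw above is a fixed-size slot table: free slots hold ENODEV stubs, and a loadable slot is recognized by comparing its bytes against the stub entry. A hypothetical miniature of the registration scheme:

#include <stdio.h>
#include <string.h>

#define MAXSLOT		4
#define SLOT_LOAD	(-2)	/* "pick one for me", like LDISC_LOAD */

struct ops { int (*open)(void); };

static int noopen(void) { return (-1); }	/* ENODEV stand-in */

static struct ops nodisc = { noopen };
static struct ops table[MAXSLOT] = {
	{ noopen }, { noopen }, { noopen }, { noopen },
};

/* Install ops at a fixed slot, or at the first free one. */
static int
ops_register(int slot, struct ops *o)
{
	int i;

	if (slot == SLOT_LOAD)
		for (i = 0; i < MAXSLOT; i++)
			if (memcmp(&table[i], &nodisc, sizeof nodisc) == 0) {
				slot = i;
				break;
			}
	if (slot < 0 || slot >= MAXSLOT)
		return (-1);
	table[slot] = *o;
	return (slot);
}

int
main(void)
{
	struct ops mine = { noopen };

	printf("installed at slot %d\n", ops_register(SLOT_LOAD, &mine));
	return (0);
}
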
diff --git a/sys/kern/tty_cons.c b/sys/kern/tty_cons.c
new file mode 100644
index 0000000..1a56c85
--- /dev/null
+++ b/sys/kern/tty_cons.c
@@ -0,0 +1,353 @@
+/*
+ * Copyright (c) 1988 University of Utah.
+ * Copyright (c) 1991 The Regents of the University of California.
+ * All rights reserved.
+ *
+ * This code is derived from software contributed to Berkeley by
+ * the Systems Programming Group of the University of Utah Computer
+ * Science Department.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * from: @(#)cons.c 7.2 (Berkeley) 5/9/91
+ * $Id$
+ */
+
+#include <sys/param.h>
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
+#include <sys/systm.h>
+#include <sys/conf.h>
+#include <sys/kernel.h>
+#include <sys/reboot.h>
+#include <sys/sysctl.h>
+#include <sys/proc.h>
+#include <sys/tty.h>
+
+#include <machine/cpu.h>
+#include <machine/cons.h>
+
+/* XXX this should be config(8)ed. */
+#include "sc.h"
+#include "vt.h"
+#include "sio.h"
+static struct consdev constab[] = {
+#if NSC > 0
+ { sccnprobe, sccninit, sccngetc, sccncheckc, sccnputc },
+#endif
+#if NVT > 0
+ { pccnprobe, pccninit, pccngetc, pccncheckc, pccnputc },
+#endif
+#if NSIO > 0
+ { siocnprobe, siocninit, siocngetc, siocncheckc, siocnputc },
+#endif
+ { 0 },
+};
+
+static d_open_t cnopen;
+static d_close_t cnclose;
+static d_read_t cnread;
+static d_write_t cnwrite;
+static d_ioctl_t cnioctl;
+static d_select_t cnselect;
+
+#define CDEV_MAJOR 0
+static struct cdevsw cn_cdevsw =
+ { cnopen, cnclose, cnread, cnwrite, /*0*/
+ cnioctl, nullstop, nullreset, nodevtotty,/* console */
+ cnselect, nommap, NULL, "console", NULL, -1 };
+
+struct tty *constty = 0; /* virtual console output device */
+
+static dev_t cn_dev_t;
+SYSCTL_OPAQUE(_machdep, CPU_CONSDEV, consdev, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ &cn_dev_t, sizeof cn_dev_t, "T,dev_t", "");
+static int cn_mute;
+SYSCTL_INT(_kern, OID_AUTO, consmute, CTLFLAG_RW, &cn_mute, 0, "");
+
+int cons_unavail = 0; /* XXX:
+ * physical console not available for
+ * input (i.e., it is in graphics mode)
+ */
+
+static u_char cn_is_open; /* nonzero if logical console is open */
+static u_char cn_phys_is_open; /* nonzero if physical console is open */
+static d_close_t *cn_phys_close; /* physical device close function */
+static d_open_t *cn_phys_open; /* physical device open function */
+static struct consdev *cn_tab; /* physical console device info */
+static struct tty *cn_tp; /* physical console tty struct */
+#ifdef DEVFS
+void *cn_devfs_token; /* represents the devfs entry */
+#endif /* DEVFS */
+
+void
+cninit()
+{
+ struct consdev *best_cp, *cp;
+
+ /*
+ * Find the first console with the highest priority.
+ */
+ best_cp = NULL;
+ for (cp = constab; cp->cn_probe; cp++) {
+ (*cp->cn_probe)(cp);
+ if (cp->cn_pri > CN_DEAD &&
+ (best_cp == NULL || cp->cn_pri > best_cp->cn_pri))
+ best_cp = cp;
+ }
+
+ /*
+ * Check if we should mute the console (for security reasons, perhaps).
+ * It can be changed dynamically using the sysctl kern.consmute
+ * once we are up and going.
+ *
+ */
+ cn_mute = ((boothowto & (RB_MUTE
+ |RB_SINGLE
+ |RB_VERBOSE
+ |RB_ASKNAME
+ |RB_CONFIG)) == RB_MUTE);
+
+ /*
+ * If no console, give up.
+ */
+ if (best_cp == NULL) {
+ cn_tab = best_cp;
+ return;
+ }
+
+ /*
+ * Initialize console, then attach to it. This ordering allows
+ * debugging using the previous console, if any.
+ * XXX if there was a previous console, then its driver should
+ * be informed when we forget about it.
+ */
+ (*best_cp->cn_init)(best_cp);
+ cn_tab = best_cp;
+}
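
cninit() above is an arg-max over probe priorities: the first console reporting the highest non-dead priority wins. A reduction of that scan with hypothetical names and priorities:

#include <stdio.h>

/* Hypothetical probed consoles; priority 0 plays the role of CN_DEAD. */
struct cons { const char *name; int pri; };

int
main(void)
{
	struct cons tab[] = { { "sc", 2 }, { "vt", 0 }, { "sio", 3 } };
	struct cons *best = NULL;
	int i;

	/* Same rule as cninit(): first console with the highest priority. */
	for (i = 0; i < (int)(sizeof tab / sizeof tab[0]); i++)
		if (tab[i].pri > 0 && (best == NULL || tab[i].pri > best->pri))
			best = &tab[i];
	printf("console: %s\n", best ? best->name : "none");
	return (0);
}
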
+
+void
+cninit_finish()
+{
+ struct cdevsw *cdp;
+
+ if (cn_tab == NULL)
+ return;
+
+ /*
+ * Hook the open and close functions.
+ */
+ cdp = cdevsw[major(cn_tab->cn_dev)];
+ cn_phys_close = cdp->d_close;
+ cdp->d_close = cnclose;
+ cn_phys_open = cdp->d_open;
+ cdp->d_open = cnopen;
+ cn_tp = (*cdp->d_devtotty)(cn_tab->cn_dev);
+ cn_dev_t = cn_tp->t_dev;
+}
+
+static int
+cnopen(dev, flag, mode, p)
+ dev_t dev;
+ int flag, mode;
+ struct proc *p;
+{
+ dev_t cndev, physdev;
+ int retval;
+
+ if (cn_tab == NULL)
+ return (0);
+ cndev = cn_tab->cn_dev;
+ physdev = (major(dev) == major(cndev) ? dev : cndev);
+ retval = (*cn_phys_open)(physdev, flag, mode, p);
+ if (retval == 0) {
+ if (dev == cndev)
+ cn_phys_is_open = 1;
+ else if (physdev == cndev)
+ cn_is_open = 1;
+ }
+ return (retval);
+}
+
+static int
+cnclose(dev, flag, mode, p)
+ dev_t dev;
+ int flag, mode;
+ struct proc *p;
+{
+ dev_t cndev;
+
+ if (cn_tab == NULL)
+ return (0);
+ cndev = cn_tab->cn_dev;
+ if (dev == cndev) {
+ /* the physical device is about to be closed */
+ cn_phys_is_open = 0;
+ if (cn_is_open) {
+ if (cn_tp) {
+ /* perform a ttyhalfclose() */
+ /* reset session and proc group */
+ cn_tp->t_pgrp = NULL;
+ cn_tp->t_session = NULL;
+ }
+ return (0);
+ }
+ } else if (major(dev) != major(cndev)) {
+ /* the logical console is about to be closed */
+ cn_is_open = 0;
+ if (cn_phys_is_open)
+ return (0);
+ dev = cndev;
+ }
+ return ((*cn_phys_close)(dev, flag, mode, p));
+}
+
+static int
+cnread(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ if ((cn_tab == NULL) || cn_mute)
+ return (0);
+ dev = cn_tab->cn_dev;
+ return ((*cdevsw[major(dev)]->d_read)(dev, uio, flag));
+}
+
+static int
+cnwrite(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ if ((cn_tab == NULL) || cn_mute)
+ return (0);
+ if (constty)
+ dev = constty->t_dev;
+ else
+ dev = cn_tab->cn_dev;
+ return ((*cdevsw[major(dev)]->d_write)(dev, uio, flag));
+}
+
+static int
+cnioctl(dev, cmd, data, flag, p)
+ dev_t dev;
+ int cmd;
+ caddr_t data;
+ int flag;
+ struct proc *p;
+{
+ int error;
+
+ if ((cn_tab == NULL) || cn_mute)
+ return (0);
+ /*
+ * Superuser can always use this to wrest control of console
+ * output from the "virtual" console.
+ */
+ if (cmd == TIOCCONS && constty) {
+ error = suser(p->p_ucred, (u_short *) NULL);
+ if (error)
+ return (error);
+ constty = NULL;
+ return (0);
+ }
+ dev = cn_tab->cn_dev;
+ return ((*cdevsw[major(dev)]->d_ioctl)(dev, cmd, data, flag, p));
+}
+
+static int
+cnselect(dev, rw, p)
+ dev_t dev;
+ int rw;
+ struct proc *p;
+{
+ if ((cn_tab == NULL) || cn_mute)
+ return (1);
+
+ dev = cn_tab->cn_dev;
+
+ return ((*cdevsw[major(dev)]->d_select)(dev, rw, p));
+}
+
+int
+cngetc()
+{
+ int c;
+ if ((cn_tab == NULL) || cn_mute)
+ return (-1);
+ c = (*cn_tab->cn_getc)(cn_tab->cn_dev);
+ if (c == '\r') c = '\n'; /* console input is always ICRNL */
+ return (c);
+}
+
+int
+cncheckc()
+{
+ if ((cn_tab == NULL) || cn_mute)
+ return (-1);
+ return ((*cn_tab->cn_checkc)(cn_tab->cn_dev));
+}
+
+void
+cnputc(c)
+ register int c;
+{
+ if ((cn_tab == NULL) || cn_mute)
+ return;
+ if (c) {
+ if (c == '\n')
+ (*cn_tab->cn_putc)(cn_tab->cn_dev, '\r');
+ (*cn_tab->cn_putc)(cn_tab->cn_dev, c);
+ }
+}
+
+static int cn_devsw_installed = 0;
+
+static void
+cn_drvinit(void *unused)
+{
+ dev_t dev;
+
+ if( ! cn_devsw_installed ) {
+ dev = makedev(CDEV_MAJOR,0);
+ cdevsw_add(&dev,&cn_cdevsw,NULL);
+ cn_devsw_installed = 1;
+#ifdef DEVFS
+ cn_devfs_token = devfs_add_devswf(&cn_cdevsw, 0, DV_CHR,
+ UID_ROOT, GID_WHEEL, 0600,
+ "console");
+#endif
+ }
+}
+
+SYSINIT(cndev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,cn_drvinit,NULL)
+
+
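For illustration, the two sysctl variables exported above can be queried
from userland roughly as follows. This is a minimal sketch, not part of
the diff; it assumes the standard sysctlbyname(3) interface and uses the
"machdep.consdev" / "kern.consmute" names from the SYSCTL_* lines above:

	#include <sys/types.h>
	#include <sys/sysctl.h>
	#include <stdio.h>

	int
	main(void)
	{
		dev_t cons;
		int mute;
		size_t len;

		/* machdep.consdev: the device backing /dev/console. */
		len = sizeof(cons);
		if (sysctlbyname("machdep.consdev", &cons, &len, NULL, 0) == 0)
			printf("console device: major %d, minor %d\n",
			    major(cons), minor(cons));

		/* kern.consmute: nonzero while console output is muted. */
		len = sizeof(mute);
		if (sysctlbyname("kern.consmute", &mute, &len, NULL, 0) == 0)
			printf("console muted: %d\n", mute);
		return (0);
	}
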
diff --git a/sys/kern/tty_pty.c b/sys/kern/tty_pty.c
index 2c37984..ee0b653 100644
--- a/sys/kern/tty_pty.c
+++ b/sys/kern/tty_pty.c
@@ -31,6 +31,7 @@
* SUCH DAMAGE.
*
* @(#)tty_pty.c 8.4 (Berkeley) 2/20/95
+ * $Id: tty_pty.c,v 1.42 1997/03/23 03:36:28 bde Exp $
*/
/*
@@ -41,14 +42,53 @@
#include <sys/param.h>
#include <sys/systm.h>
-#include <sys/ioctl.h>
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+#include <sys/ioctl_compat.h>
+#endif
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/conf.h>
-#include <sys/file.h>
+#include <sys/fcntl.h>
#include <sys/uio.h>
#include <sys/kernel.h>
#include <sys/vnode.h>
+#include <sys/signalvar.h>
+
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
+
+#ifdef notyet
+static void ptyattach __P((int n));
+#endif
+static void ptsstart __P((struct tty *tp));
+static void ptcwakeup __P((struct tty *tp, int flag));
+
+static d_open_t ptsopen;
+static d_close_t ptsclose;
+static d_read_t ptsread;
+static d_write_t ptswrite;
+static d_ioctl_t ptyioctl;
+static d_stop_t ptsstop;
+static d_devtotty_t ptydevtotty;
+static d_open_t ptcopen;
+static d_close_t ptcclose;
+static d_read_t ptcread;
+static d_write_t ptcwrite;
+static d_select_t ptcselect;
+
+#define CDEV_MAJOR_S 5
+#define CDEV_MAJOR_C 6
+static struct cdevsw pts_cdevsw =
+ { ptsopen, ptsclose, ptsread, ptswrite, /*5*/
+ ptyioctl, ptsstop, nullreset, ptydevtotty,/* ttyp */
+ ttselect, nommap, NULL, "pts", NULL, -1 };
+
+static struct cdevsw ptc_cdevsw =
+ { ptcopen, ptcclose, ptcread, ptcwrite, /*6*/
+ ptyioctl, nullstop, nullreset, ptydevtotty,/* ptyp */
+ ptcselect, nommap, NULL, "ptc", NULL, -1 };
+
#if NPTY == 1
#undef NPTY
@@ -58,17 +98,17 @@
#define BUFSIZ 100 /* Chunk size iomoved to/from user */
/*
- * pts == /dev/tty[pqrs]?
- * ptc == /dev/pty[pqrs]?
+ * pts == /dev/tty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv]
+ * ptc == /dev/pty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv]
*/
-struct tty pt_tty[NPTY]; /* XXX */
-struct pt_ioctl {
+static struct tty pt_tty[NPTY]; /* XXX */
+static struct pt_ioctl {
int pt_flags;
struct selinfo pt_selr, pt_selw;
u_char pt_send;
u_char pt_ucntl;
} pt_ioctl[NPTY]; /* XXX */
-int npty = NPTY; /* for pstat -t */
+static int npty = NPTY; /* for pstat -t */
#define PF_PKT 0x08 /* packet mode */
#define PF_STOPPED 0x10 /* user told stopped */
@@ -76,18 +116,16 @@ int npty = NPTY; /* for pstat -t */
#define PF_NOSTOP 0x40
#define PF_UCNTL 0x80 /* user control mode */
-void ptsstop __P((struct tty *, int));
-
+#ifdef notyet
/*
* Establish n (or default if n is 1) ptys in the system.
*
* XXX cdevsw & pstat require the array `pty[]' to be an array
*/
-void
+static void
ptyattach(n)
int n;
{
-#ifdef notyet
char *mem;
register u_long ntb;
#define DEFAULT_NPTY 32
@@ -102,10 +140,11 @@ ptyattach(n)
mem = (char *)ALIGN(mem + ntb);
pt_ioctl = (struct pt_ioctl *)mem;
npty = n;
-#endif
}
+#endif
/*ARGSUSED*/
+static int
ptsopen(dev, flag, devtype, p)
dev_t dev;
int flag, devtype;
@@ -118,7 +157,6 @@ ptsopen(dev, flag, devtype, p)
return (ENXIO);
tp = &pt_tty[minor(dev)];
if ((tp->t_state & TS_ISOPEN) == 0) {
- tp->t_state |= TS_WOPEN;
ttychars(tp); /* Set up default chars */
tp->t_iflag = TTYDEF_IFLAG;
tp->t_oflag = TTYDEF_OFLAG;
@@ -129,20 +167,22 @@ ptsopen(dev, flag, devtype, p)
} else if (tp->t_state&TS_XCLUDE && p->p_ucred->cr_uid != 0)
return (EBUSY);
if (tp->t_oproc) /* Ctrlr still around. */
- tp->t_state |= TS_CARR_ON;
+ (void)(*linesw[tp->t_line].l_modem)(tp, 1);
while ((tp->t_state & TS_CARR_ON) == 0) {
- tp->t_state |= TS_WOPEN;
if (flag&FNONBLOCK)
break;
- if (error = ttysleep(tp, (caddr_t)&tp->t_rawq, TTIPRI | PCATCH,
- ttopen, 0))
+ error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH,
+ "ptsopn", 0);
+ if (error)
return (error);
}
error = (*linesw[tp->t_line].l_open)(dev, tp);
- ptcwakeup(tp, FREAD|FWRITE);
+ if (error == 0)
+ ptcwakeup(tp, FREAD|FWRITE);
return (error);
}
+static int
ptsclose(dev, flag, mode, p)
dev_t dev;
int flag, mode;
@@ -153,11 +193,12 @@ ptsclose(dev, flag, mode, p)
tp = &pt_tty[minor(dev)];
err = (*linesw[tp->t_line].l_close)(tp, flag);
- err |= ttyclose(tp);
- ptcwakeup(tp, FREAD|FWRITE);
+ ptsstop(tp, FREAD|FWRITE);
+ (void) ttyclose(tp);
return (err);
}
+static int
ptsread(dev, uio, flag)
dev_t dev;
struct uio *uio;
@@ -177,15 +218,17 @@ again:
p->p_flag & P_PPWAIT)
return (EIO);
pgsignal(p->p_pgrp, SIGTTIN, 1);
- if (error = ttysleep(tp, (caddr_t)&lbolt,
- TTIPRI | PCATCH, ttybg, 0))
+ error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ptsbg",
+ 0);
+ if (error)
return (error);
}
if (tp->t_canq.c_cc == 0) {
if (flag & IO_NDELAY)
return (EWOULDBLOCK);
- if (error = ttysleep(tp, (caddr_t)&tp->t_canq,
- TTIPRI | PCATCH, ttyin, 0))
+ error = ttysleep(tp, TSA_PTS_READ(tp), TTIPRI | PCATCH,
+ "ptsin", 0);
+ if (error)
return (error);
goto again;
}
@@ -210,6 +253,7 @@ again:
* Wakeups of controlling tty will happen
* indirectly, when tty driver calls ptsstart.
*/
+static int
ptswrite(dev, uio, flag)
dev_t dev;
struct uio *uio;
@@ -227,7 +271,7 @@ ptswrite(dev, uio, flag)
* Start output on pseudo-tty.
* Wake up process selecting or sleeping for input from controlling tty.
*/
-void
+static void
ptsstart(tp)
struct tty *tp;
{
@@ -242,6 +286,7 @@ ptsstart(tp)
ptcwakeup(tp, FREAD);
}
+static void
ptcwakeup(tp, flag)
struct tty *tp;
int flag;
@@ -250,23 +295,19 @@ ptcwakeup(tp, flag)
if (flag & FREAD) {
selwakeup(&pti->pt_selr);
- wakeup((caddr_t)&tp->t_outq.c_cf);
+ wakeup(TSA_PTC_READ(tp));
}
if (flag & FWRITE) {
selwakeup(&pti->pt_selw);
- wakeup((caddr_t)&tp->t_rawq.c_cf);
+ wakeup(TSA_PTC_WRITE(tp));
}
}
-/*ARGSUSED*/
-#ifdef __STDC__
-ptcopen(dev_t dev, int flag, int devtype, struct proc *p)
-#else
+static int
ptcopen(dev, flag, devtype, p)
dev_t dev;
int flag, devtype;
struct proc *p;
-#endif
{
register struct tty *tp;
struct pt_ioctl *pti;
@@ -289,19 +330,37 @@ ptcopen(dev, flag, devtype, p)
return (0);
}
-ptcclose(dev)
+static int
+ptcclose(dev, flags, fmt, p)
dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
{
register struct tty *tp;
tp = &pt_tty[minor(dev)];
(void)(*linesw[tp->t_line].l_modem)(tp, 0);
- tp->t_state &= ~TS_CARR_ON;
+
+ /*
+ * XXX MDMBUF makes no sense for ptys but would inhibit the above
+ * l_modem(). CLOCAL makes sense but isn't supported. Special
+ * l_modem()s that ignore carrier drop make no sense for ptys but
+ * may be in use because other parts of the line discipline make
+ * sense for ptys. Recover by doing everything that a normal
+ * ttymodem() would have done except for sending a SIGHUP.
+ */
+ if (tp->t_state & TS_ISOPEN) {
+ tp->t_state &= ~(TS_CARR_ON | TS_CONNECTED);
+ tp->t_state |= TS_ZOMBIE;
+ ttyflush(tp, FREAD | FWRITE);
+ }
+
tp->t_oproc = 0; /* mark closed */
- tp->t_session = 0;
return (0);
}
+static int
ptcread(dev, uio, flag)
dev_t dev;
struct uio *uio;
@@ -327,7 +386,8 @@ ptcread(dev, uio, flag)
if (pti->pt_send & TIOCPKT_IOCTL) {
cc = min(uio->uio_resid,
sizeof(tp->t_termios));
- uiomove(&tp->t_termios, cc, uio);
+ uiomove((caddr_t)&tp->t_termios, cc,
+ uio);
}
pti->pt_send = 0;
return (0);
@@ -342,12 +402,12 @@ ptcread(dev, uio, flag)
if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0)
break;
}
- if ((tp->t_state&TS_CARR_ON) == 0)
+ if ((tp->t_state & TS_CONNECTED) == 0)
return (0); /* EOF */
if (flag & IO_NDELAY)
return (EWOULDBLOCK);
- if (error = tsleep((caddr_t)&tp->t_outq.c_cf, TTIPRI | PCATCH,
- ttyin, 0))
+ error = tsleep(TSA_PTC_READ(tp), TTIPRI | PCATCH, "ptcin", 0);
+ if (error)
return (error);
}
if (pti->pt_flags & (PF_PKT|PF_UCNTL))
@@ -358,17 +418,11 @@ ptcread(dev, uio, flag)
break;
error = uiomove(buf, cc, uio);
}
- if (tp->t_outq.c_cc <= tp->t_lowat) {
- if (tp->t_state&TS_ASLEEP) {
- tp->t_state &= ~TS_ASLEEP;
- wakeup((caddr_t)&tp->t_outq);
- }
- selwakeup(&tp->t_wsel);
- }
+ ttwwakeup(tp);
return (error);
}
-void
+static void
ptsstop(tp, flush)
register struct tty *tp;
int flush;
@@ -392,6 +446,7 @@ ptsstop(tp, flush)
ptcwakeup(tp, flag);
}
+static int
ptcselect(dev, rw, p)
dev_t dev;
int rw;
@@ -401,7 +456,7 @@ ptcselect(dev, rw, p)
struct pt_ioctl *pti = &pt_ioctl[minor(dev)];
int s;
- if ((tp->t_state&TS_CARR_ON) == 0)
+ if ((tp->t_state & TS_CONNECTED) == 0)
return (1);
switch (rw) {
@@ -420,8 +475,8 @@ ptcselect(dev, rw, p)
case 0: /* exceptional */
if ((tp->t_state&TS_ISOPEN) &&
- (pti->pt_flags&PF_PKT && pti->pt_send ||
- pti->pt_flags&PF_UCNTL && pti->pt_ucntl))
+ ((pti->pt_flags&PF_PKT && pti->pt_send) ||
+ (pti->pt_flags&PF_UCNTL && pti->pt_ucntl)))
return (1);
selrecord(p, &pti->pt_selr);
break;
@@ -446,13 +501,14 @@ ptcselect(dev, rw, p)
return (0);
}
+static int
ptcwrite(dev, uio, flag)
dev_t dev;
register struct uio *uio;
int flag;
{
register struct tty *tp = &pt_tty[minor(dev)];
- register u_char *cp;
+ register u_char *cp = 0;
register int cc = 0;
u_char locbuf[BUFSIZ];
int cnt = 0;
@@ -465,7 +521,8 @@ again:
if (pti->pt_flags & PF_REMOTE) {
if (tp->t_canq.c_cc)
goto block;
- while (uio->uio_resid > 0 && tp->t_canq.c_cc < TTYHOG - 1) {
+ while ((uio->uio_resid > 0 || cc > 0) &&
+ tp->t_canq.c_cc < TTYHOG - 1) {
if (cc == 0) {
cc = min(uio->uio_resid, BUFSIZ);
cc = min(cc, TTYHOG - 1 - tp->t_canq.c_cc);
@@ -474,19 +531,34 @@ again:
if (error)
return (error);
/* check again for safety */
- if ((tp->t_state&TS_ISOPEN) == 0)
+ if ((tp->t_state & TS_ISOPEN) == 0) {
+ /* adjust as usual */
+ uio->uio_resid += cc;
return (EIO);
+ }
+ }
+ if (cc > 0) {
+ cc = b_to_q((char *)cp, cc, &tp->t_canq);
+ /*
+ * XXX we don't guarantee that the canq size
+ * is >= TTYHOG, so the above b_to_q() may
+ * leave some bytes uncopied. However, space
+ * is guaranteed for the null terminator if
+ * we don't fail here since (TTYHOG - 1) is
+ * not a multiple of CBSIZE.
+ */
+ if (cc > 0)
+ break;
}
- if (cc)
- (void) b_to_q((char *)cp, cc, &tp->t_canq);
- cc = 0;
}
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
(void) putc(0, &tp->t_canq);
ttwakeup(tp);
- wakeup((caddr_t)&tp->t_canq);
+ wakeup(TSA_PTS_READ(tp));
return (0);
}
- while (uio->uio_resid > 0) {
+ while (uio->uio_resid > 0 || cc > 0) {
if (cc == 0) {
cc = min(uio->uio_resid, BUFSIZ);
cp = locbuf;
@@ -494,13 +566,16 @@ again:
if (error)
return (error);
/* check again for safety */
- if ((tp->t_state&TS_ISOPEN) == 0)
+ if ((tp->t_state & TS_ISOPEN) == 0) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
return (EIO);
+ }
}
while (cc > 0) {
if ((tp->t_rawq.c_cc + tp->t_canq.c_cc) >= TTYHOG - 2 &&
(tp->t_canq.c_cc > 0 || !(tp->t_iflag&ICANON))) {
- wakeup((caddr_t)&tp->t_rawq);
+ wakeup(TSA_HUP_OR_INPUT(tp));
goto block;
}
(*linesw[tp->t_line].l_rint)(*cp++, tp);
@@ -513,10 +588,13 @@ again:
block:
/*
* Come here to wait for slave to open, for space
- * in outq, or space in rawq.
+ * in outq, or space in rawq, or an empty canq.
*/
- if ((tp->t_state&TS_CARR_ON) == 0)
+ if ((tp->t_state & TS_CONNECTED) == 0) {
+ /* adjust for data copied in but not written */
+ uio->uio_resid += cc;
return (EIO);
+ }
if (flag & IO_NDELAY) {
/* adjust for data copied in but not written */
uio->uio_resid += cc;
@@ -524,8 +602,8 @@ block:
return (EWOULDBLOCK);
return (0);
}
- if (error = tsleep((caddr_t)&tp->t_rawq.c_cf, TTOPRI | PCATCH,
- ttyout, 0)) {
+ error = tsleep(TSA_PTC_WRITE(tp), TTOPRI | PCATCH, "ptcout", 0);
+ if (error) {
/* adjust for data copied in but not written */
uio->uio_resid += cc;
return (error);
@@ -533,10 +611,21 @@ block:
goto again;
}
+static struct tty *
+ptydevtotty(dev)
+ dev_t dev;
+{
+ if (minor(dev) >= npty)
+ return (NULL);
+
+ return &pt_tty[minor(dev)];
+}
+
/*ARGSUSED*/
+static int
ptyioctl(dev, cmd, data, flag, p)
dev_t dev;
- u_long cmd;
+ int cmd;
caddr_t data;
int flag;
struct proc *p;
@@ -572,7 +661,7 @@ ptyioctl(dev, cmd, data, flag, p)
}
return(0);
} else
- if (cdevsw[major(dev)].d_open == ptcopen)
+ if (cdevsw[major(dev)]->d_open == ptcopen)
switch (cmd) {
case TIOCGPGRP:
@@ -610,7 +699,7 @@ ptyioctl(dev, cmd, data, flag, p)
return (0);
#ifdef COMPAT_43
- case TIOCSETP:
+ case TIOCSETP:
case TIOCSETN:
#endif
case TIOCSETD:
@@ -670,7 +759,7 @@ ptyioctl(dev, cmd, data, flag, p)
break;
}
}
- stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s'))
+ stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s'))
&& CCEQ(cc[VSTART], CTRL('q'));
if (pti->pt_flags & PF_NOSTOP) {
if (stop) {
@@ -689,3 +778,49 @@ ptyioctl(dev, cmd, data, flag, p)
}
return (error);
}
+
+static int ptc_devsw_installed = 0;
+#ifdef DEVFS
+#define MAXUNITS (8 * 32)
+static void *devfs_token_pts[MAXUNITS];
+static void *devfs_token_ptc[MAXUNITS];
+static const char jnames[] = "pqrsPQRS";
+#endif
+
+static void
+ptc_drvinit(void *unused)
+{
+#ifdef DEVFS
+ int i,j,k;
+#endif
+ dev_t dev;
+
+ if( ! ptc_devsw_installed ) {
+ dev = makedev(CDEV_MAJOR_S, 0);
+ cdevsw_add(&dev, &pts_cdevsw, NULL);
+ dev = makedev(CDEV_MAJOR_C, 0);
+ cdevsw_add(&dev, &ptc_cdevsw, NULL);
+ ptc_devsw_installed = 1;
+#ifdef DEVFS
+/*XXX*/
+#if NPTY > MAXUNITS
+#undef NPTY
+#define NPTY MAXUNITS
+#endif
+ for ( i = 0 ; i<NPTY ; i++ ) {
+ j = i / 32;
+ k = i % 32;
+ devfs_token_pts[i] =
+ devfs_add_devswf(&pts_cdevsw,i,
+ DV_CHR,0,0,0666,
+ "tty%c%n",jnames[j],k);
+ devfs_token_ptc[i] =
+ devfs_add_devswf(&ptc_cdevsw,i,
+ DV_CHR,0,0,0666,
+ "pty%c%n",jnames[j],k);
+ }
+#endif
+ }
+}
+
+SYSINIT(ptcdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR_C,ptc_drvinit,NULL)
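A master/slave pair created above can be exercised from userland roughly
as sketched below. This is an illustration only, not part of the diff:
/dev/ptyp0 and /dev/ttyp0 are assumptions based on the naming scheme
above, and what read(2) returns on the slave depends on its termios
settings (with the defaults, ICRNL maps the '\r' to '\n' and canonical
mode completes the line):

	#include <sys/types.h>
	#include <fcntl.h>
	#include <unistd.h>

	int
	main(void)
	{
		char buf[64];
		int master, slave;
		ssize_t n;

		/* Open the controller first so the slave sees carrier. */
		master = open("/dev/ptyp0", O_RDWR);
		slave = open("/dev/ttyp0", O_RDWR);
		if (master < 0 || slave < 0)
			return (1);

		write(master, "hello\r", 6);	/* injected as tty input */
		n = read(slave, buf, sizeof(buf));
		if (n > 0)
			write(STDOUT_FILENO, buf, (size_t)n);
		close(slave);
		close(master);
		return (0);
	}
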
diff --git a/sys/kern/tty_snoop.c b/sys/kern/tty_snoop.c
new file mode 100644
index 0000000..6e2bf5d
--- /dev/null
+++ b/sys/kern/tty_snoop.c
@@ -0,0 +1,548 @@
+/*
+ * Copyright (c) 1995 Ugen J.S.Antsilevich
+ *
+ * Redistribution and use in source forms, with and without modification,
+ * are permitted provided that this entire comment appears intact.
+ *
+ * Redistribution in binary form may occur without any restrictions.
+ * Obviously, it would be nice if you gave credit where credit is due
+ * but requiring it would be too onerous.
+ *
+ * This software is provided ``AS IS'' without any warranties of any kind.
+ *
+ * Snoop stuff.
+ */
+
+#include "snp.h"
+
+#if NSNP > 0
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/filio.h>
+#include <sys/ioctl_compat.h>	/* Oooh.. we need O/NTTYDISC */
+#include <sys/proc.h>
+#include <sys/tty.h>
+#include <sys/fcntl.h>
+#include <sys/conf.h>
+#include <sys/uio.h>
+#include <sys/kernel.h>
+#include <sys/malloc.h>
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
+
+#include <sys/snoop.h>
+
+static d_open_t snpopen;
+static d_close_t snpclose;
+static d_read_t snpread;
+static d_write_t snpwrite;
+static d_ioctl_t snpioctl;
+static d_select_t snpselect;
+
+#define CDEV_MAJOR 53
+static struct cdevsw snp_cdevsw =
+ { snpopen, snpclose, snpread, snpwrite, /*53*/
+ snpioctl, nostop, nullreset, nodevtotty,/* snoop */
+ snpselect, nommap, NULL, "snp", NULL, -1 };
+
+
+#ifndef MIN
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#endif
+
+static struct snoop snoopsw[NSNP];
+
+static struct tty *snpdevtotty __P((dev_t dev));
+static int snp_detach __P((struct snoop *snp));
+
+static struct tty *
+snpdevtotty (dev)
+ dev_t dev;
+{
+ struct cdevsw *cdp;
+ int maj;
+
+ maj = major(dev);
+ if ((u_int)maj >= nchrdev)
+ return (NULL);
+ cdp = cdevsw[maj];
+ if (cdp == NULL)
+ return (NULL);
+ return ((*cdp->d_devtotty)(dev));
+}
+
+#define SNP_INPUT_BUF	5	/* This is even more than we need; the
+				 * largest interactive-mode write is
+				 * 3 bytes, for function keys...
+				 */
+
+static int
+snpwrite(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ int unit = minor(dev), len, i, error;
+ struct snoop *snp = &snoopsw[unit];
+ struct tty *tp;
+ char c[SNP_INPUT_BUF];
+
+ if (snp->snp_tty == NULL)
+ return (EIO);
+
+ tp = snp->snp_tty;
+
+ if ((tp->t_sc == snp) && (tp->t_state & TS_SNOOP) &&
+ (tp->t_line == OTTYDISC || tp->t_line == NTTYDISC))
+ goto tty_input;
+
+ printf("Snoop: attempt to write to bad tty.\n");
+ return (EIO);
+
+tty_input:
+ if (!(tp->t_state & TS_ISOPEN))
+ return (EIO);
+
+ while (uio->uio_resid > 0) {
+		len = MIN(uio->uio_resid, SNP_INPUT_BUF);
+		if ((error = uiomove(c, len, uio)) != 0)
+			return (error);
+		for (i = 0; i < len; i++) {
+			if (ttyinput(c[i], tp))
+ return (EIO);
+ }
+ }
+ return 0;
+
+}
+
+
+static int
+snpread(dev, uio, flag)
+ dev_t dev;
+ struct uio *uio;
+ int flag;
+{
+ int unit = minor(dev), s;
+ struct snoop *snp = &snoopsw[unit];
+ int len, n, nblen, error = 0;
+ caddr_t from;
+ char *nbuf;
+
+#ifdef DIAGNOSTIC
+ if ((snp->snp_len + snp->snp_base) > snp->snp_blen)
+ panic("snoop buffer error");
+#endif
+
+ if (snp->snp_tty == NULL)
+ return (EIO);
+
+ snp->snp_flags &= ~SNOOP_RWAIT;
+
+ do {
+ if (snp->snp_len == 0) {
+ if (snp->snp_flags & SNOOP_NBIO) {
+ return EWOULDBLOCK;
+ }
+ snp->snp_flags |= SNOOP_RWAIT;
+ tsleep((caddr_t) snp, (PZERO + 1) | PCATCH, "snoopread", 0);
+ }
+ } while (snp->snp_len == 0);
+
+ n = snp->snp_len;
+
+ while (snp->snp_len > 0 && uio->uio_resid > 0 && error == 0) {
+ len = MIN(uio->uio_resid, snp->snp_len);
+ from = (caddr_t) (snp->snp_buf + snp->snp_base);
+ if (len == 0)
+ break;
+
+ error = uiomove(from, len, uio);
+ snp->snp_base += len;
+ snp->snp_len -= len;
+ }
+ if ((snp->snp_flags & SNOOP_OFLOW) && (n < snp->snp_len)) {
+ snp->snp_flags &= ~SNOOP_OFLOW;
+ }
+ s = spltty();
+ nblen = snp->snp_blen;
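+	/*
+	 * Opportunistically shrink the buffer: halve it while the
+	 * remaining data still fits and we stay at or above
+	 * SNOOP_MINLEN, so an idle snoop device does not pin a
+	 * large allocation.
+	 */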
+ if (((nblen / 2) >= SNOOP_MINLEN) && (nblen / 2) >= snp->snp_len) {
+ while (((nblen / 2) >= snp->snp_len) && ((nblen / 2) >= SNOOP_MINLEN))
+ nblen = nblen / 2;
+		if ((nbuf = malloc(nblen, M_TTYS, M_NOWAIT)) != NULL) {
+ bcopy(snp->snp_buf + snp->snp_base, nbuf, snp->snp_len);
+ free(snp->snp_buf, M_TTYS);
+ snp->snp_buf = nbuf;
+ snp->snp_blen = nblen;
+ snp->snp_base = 0;
+ }
+ }
+ splx(s);
+
+ return error;
+}
+
+int
+snpinc(snp, c)
+ struct snoop *snp;
+ char c;
+{
+ char buf[1];
+
+	buf[0] = c;
+	return (snpin(snp, buf, 1));
+}
+
+
+int
+snpin(snp, buf, n)
+ struct snoop *snp;
+ char *buf;
+ int n;
+{
+ int s_free, s_tail;
+ int s, len, nblen;
+ caddr_t from, to;
+ char *nbuf;
+
+
+ if (n == 0)
+ return 0;
+
+#ifdef DIAGNOSTIC
+ if (n < 0)
+ panic("bad snoop char count");
+
+ if (!(snp->snp_flags & SNOOP_OPEN)) {
+ printf("Snoop: data coming to closed device.\n");
+ return 0;
+ }
+#endif
+ if (snp->snp_flags & SNOOP_DOWN) {
+ printf("Snoop: more data to down interface.\n");
+ return 0;
+ }
+
+ if (snp->snp_flags & SNOOP_OFLOW) {
+ printf("Snoop: buffer overflow.\n");
+		/*
+		 * On overflow we just repeat the standard close
+		 * procedure... yes, this is a waste of space, but
+		 * the next read from the device will then fail, so
+		 * a caller who recalls he is snooping can retry...
+		 */
+
+ return (snpdown(snp));
+ }
+ s_tail = snp->snp_blen - (snp->snp_len + snp->snp_base);
+ s_free = snp->snp_blen - snp->snp_len;
+
+
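+	/*
+	 * If the new data does not fit, grow the buffer by doubling
+	 * it (up to SNOOP_MAXLEN) until it does; on failure, mark
+	 * the device overflowed and wake any sleeping reader.
+	 */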
+ if (n > s_free) {
+ s = spltty();
+ nblen = snp->snp_blen;
+ while ((n > s_free) && ((nblen * 2) <= SNOOP_MAXLEN)) {
+		nblen = nblen * 2;
+ s_free = nblen - (snp->snp_len + snp->snp_base);
+ }
+ if ((n <= s_free) && (nbuf = malloc(nblen, M_TTYS, M_NOWAIT))) {
+ bcopy(snp->snp_buf + snp->snp_base, nbuf, snp->snp_len);
+ free(snp->snp_buf, M_TTYS);
+ snp->snp_buf = nbuf;
+ snp->snp_blen = nblen;
+ snp->snp_base = 0;
+ } else {
+ snp->snp_flags |= SNOOP_OFLOW;
+ if (snp->snp_flags & SNOOP_RWAIT) {
+ snp->snp_flags &= ~SNOOP_RWAIT;
+ wakeup((caddr_t) snp);
+ }
+ splx(s);
+ return 0;
+ }
+ splx(s);
+ }
+ if (n > s_tail) {
+ from = (caddr_t) (snp->snp_buf + snp->snp_base);
+ to = (caddr_t) (snp->snp_buf);
+ len = snp->snp_len;
+ bcopy(from, to, len);
+ snp->snp_base = 0;
+ }
+ to = (caddr_t) (snp->snp_buf + snp->snp_base + snp->snp_len);
+ bcopy(buf, to, n);
+ snp->snp_len += n;
+
+ if (snp->snp_flags & SNOOP_RWAIT) {
+ snp->snp_flags &= ~SNOOP_RWAIT;
+ wakeup((caddr_t) snp);
+ }
+ selwakeup(&snp->snp_sel);
+ snp->snp_sel.si_pid = 0;
+
+ return n;
+}
+
+static int
+snpopen(dev, flag, mode, p)
+ dev_t dev;
+ int flag, mode;
+ struct proc *p;
+{
+ struct snoop *snp;
+ register int unit, error;
+
+ if (error = suser(p->p_ucred, &p->p_acflag))
+ return (error);
+
+ if ((unit = minor(dev)) >= NSNP)
+ return (ENXIO);
+
+ snp = &snoopsw[unit];
+
+ if (snp->snp_flags & SNOOP_OPEN)
+ return (ENXIO);
+
+	/*
+	 * We intentionally do not OR flags with SNOOP_OPEN, but set them,
+	 * so that all previous settings (especially SNOOP_OFLOW) are
+	 * cleared.
+	 */
+ snp->snp_flags = SNOOP_OPEN;
+
+ snp->snp_buf = malloc(SNOOP_MINLEN, M_TTYS, M_WAITOK);
+ snp->snp_blen = SNOOP_MINLEN;
+ snp->snp_base = 0;
+ snp->snp_len = 0;
+
+ /*
+ * snp_tty == NULL is for inactive snoop devices.
+ */
+ snp->snp_tty = NULL;
+ snp->snp_target = -1;
+ return (0);
+}
+
+
+static int
+snp_detach(snp)
+ struct snoop *snp;
+{
+ struct tty *tp;
+
+ snp->snp_base = 0;
+ snp->snp_len = 0;
+
+	/*
+	 * If the line discipline changed, we do not touch this pointer;
+	 * SLIP/PPP will change it anyway.
+	 */
+
+ if (snp->snp_tty == NULL)
+ goto detach_notty;
+
+ tp = snp->snp_tty;
+
+ if (tp && (tp->t_sc == snp) && (tp->t_state & TS_SNOOP) &&
+ (tp->t_line == OTTYDISC || tp->t_line == NTTYDISC)) {
+ tp->t_sc = NULL;
+ tp->t_state &= ~TS_SNOOP;
+ } else
+ printf("Snoop: bad attached tty data.\n");
+
+ snp->snp_tty = NULL;
+ snp->snp_target = -1;
+
+detach_notty:
+ selwakeup(&snp->snp_sel);
+ snp->snp_sel.si_pid = 0;
+
+ return (0);
+}
+
+static int
+snpclose(dev, flags, fmt, p)
+ dev_t dev;
+ int flags;
+ int fmt;
+ struct proc *p;
+{
+ register int unit = minor(dev);
+ struct snoop *snp = &snoopsw[unit];
+
+ snp->snp_blen = 0;
+ free(snp->snp_buf, M_TTYS);
+ snp->snp_flags &= ~SNOOP_OPEN;
+
+ return (snp_detach(snp));
+}
+
+int
+snpdown(snp)
+ struct snoop *snp;
+{
+ snp->snp_blen = SNOOP_MINLEN;
+ free(snp->snp_buf, M_TTYS);
+ snp->snp_buf = malloc(SNOOP_MINLEN, M_TTYS, M_WAITOK);
+ snp->snp_flags |= SNOOP_DOWN;
+
+ return (snp_detach(snp));
+}
+
+
+static int
+snpioctl(dev, cmd, data, flags, p)
+ dev_t dev;
+ int cmd;
+ caddr_t data;
+ int flags;
+ struct proc *p;
+{
+ int unit = minor(dev), s;
+ dev_t tdev;
+ struct snoop *snp = &snoopsw[unit];
+ struct tty *tp, *tpo;
+
+ switch (cmd) {
+ case SNPSTTY:
+ tdev = *((dev_t *) data);
+ if (tdev == -1)
+ return (snpdown(snp));
+
+ tp = snpdevtotty(tdev);
+ if (!tp)
+ return (EINVAL);
+
+ if ((tp->t_sc != (caddr_t) snp) && (tp->t_state & TS_SNOOP))
+ return (EBUSY);
+
+ if ((tp->t_line != OTTYDISC) && (tp->t_line != NTTYDISC))
+ return (EBUSY);
+
+ s = spltty();
+
+ if (snp->snp_target == -1) {
+ tpo = snp->snp_tty;
+ if (tpo)
+ tpo->t_state &= ~TS_SNOOP;
+ }
+
+ tp->t_sc = (caddr_t) snp;
+ tp->t_state |= TS_SNOOP;
+ snp->snp_tty = tp;
+ snp->snp_target = tdev;
+
+		/*
+		 * Clear the overflow and down flags -
+		 * we'll have a chance to pick them up again in the future :)))
+		 */
+ snp->snp_flags &= ~SNOOP_OFLOW;
+ snp->snp_flags &= ~SNOOP_DOWN;
+ splx(s);
+ break;
+
+ case SNPGTTY:
+		/*
+		 * We keep the snp_target field around specifically to make
+		 * SNPGTTY happy; without it we could not tell which device
+		 * major/minor the tty has.
+		 */
+ *((dev_t *) data) = snp->snp_target;
+ break;
+
+ case FIONBIO:
+ if (*(int *) data)
+ snp->snp_flags |= SNOOP_NBIO;
+ else
+ snp->snp_flags &= ~SNOOP_NBIO;
+ break;
+
+ case FIOASYNC:
+ if (*(int *) data)
+ snp->snp_flags |= SNOOP_ASYNC;
+ else
+ snp->snp_flags &= ~SNOOP_ASYNC;
+ break;
+
+ case FIONREAD:
+ s = spltty();
+ if (snp->snp_tty != NULL)
+ *(int *) data = snp->snp_len;
+ else
+ if (snp->snp_flags & SNOOP_DOWN) {
+ if (snp->snp_flags & SNOOP_OFLOW)
+ *(int *) data = SNP_OFLOW;
+ else
+ *(int *) data = SNP_TTYCLOSE;
+ } else {
+ *(int *) data = SNP_DETACH;
+ }
+ splx(s);
+ break;
+
+ default:
+ return (ENOTTY);
+ }
+ return (0);
+}
+
+
+static int
+snpselect(dev, rw, p)
+ dev_t dev;
+ int rw;
+ struct proc *p;
+{
+ int unit = minor(dev);
+ struct snoop *snp = &snoopsw[unit];
+
+ if (rw != FREAD)
+ return 1;
+
+ if (snp->snp_len > 0)
+ return 1;
+
+	/*
+	 * If the snoop device is down, we don't want to select() forever,
+	 * so we return 1.  The caller should check whether we are down via
+	 * the FIONREAD ioctl(), which reports the down state as a special
+	 * negative value.
+	 */
+ if (snp->snp_flags & SNOOP_DOWN)
+ return 1;
+
+ selrecord(p, &snp->snp_sel);
+ return 0;
+}
+
+#ifdef DEVFS
+static void *snp_devfs_token[NSNP];
+#endif
+static int snp_devsw_installed = 0;
+
+static void
+snp_drvinit(void *unused)
+{
+ dev_t dev;
+#ifdef DEVFS
+ int i;
+#endif
+
+ if( ! snp_devsw_installed ) {
+ dev = makedev(CDEV_MAJOR, 0);
+ cdevsw_add(&dev,&snp_cdevsw, NULL);
+ snp_devsw_installed = 1;
+#ifdef DEVFS
+ for ( i = 0 ; i < NSNP ; i++) {
+ snp_devfs_token[i] =
+ devfs_add_devswf(&snp_cdevsw, i, DV_CHR, 0, 0,
+ 0600, "snp%d", i);
+ }
+#endif
+ }
+}
+
+SYSINIT(snpdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,snp_drvinit,NULL)
+
+
+#endif
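
The ioctl protocol above can be driven from userland roughly as follows.
This is a sketch only, not part of the diff; it assumes /dev/snp0 exists
with the major number registered above, that /dev/ttyp0 is the tty to be
watched, and that <sys/snoop.h> provides SNPSTTY (snpopen() requires
root privileges):

	#include <sys/types.h>
	#include <sys/ioctl.h>
	#include <sys/stat.h>
	#include <sys/snoop.h>
	#include <fcntl.h>
	#include <unistd.h>

	int
	main(void)
	{
		char buf[1024];
		struct stat st;
		dev_t tdev;
		ssize_t n;
		int snp;

		if ((snp = open("/dev/snp0", O_RDWR)) < 0)
			return (1);
		if (stat("/dev/ttyp0", &st) < 0)
			return (1);
		tdev = st.st_rdev;
		/* Attach; a second snooper would get EBUSY. */
		if (ioctl(snp, SNPSTTY, &tdev) < 0)
			return (1);
		/* Drain whatever the snooped tty outputs. */
		while ((n = read(snp, buf, sizeof(buf))) > 0)
			write(STDOUT_FILENO, buf, (size_t)n);
		return (0);
	}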
diff --git a/sys/kern/tty_subr.c b/sys/kern/tty_subr.c
index fe8f000..d907b47 100644
--- a/sys/kern/tty_subr.c
+++ b/sys/kern/tty_subr.c
@@ -1,32 +1,21 @@
-/*-
- * Copyright (c) 1982, 1986, 1993
- * The Regents of the University of California. All rights reserved.
- * (c) UNIX System Laboratories, Inc.
- * All or some portions of this file are derived from material licensed
- * to the University of California by American Telephone and Telegraph
- * Co. or Unix System Laboratories, Inc. and are reproduced herein with
- * the permission of UNIX System Laboratories, Inc.
+/*
+ * Copyright (c) 1994, David Greenman
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
+ * notice unmodified, this list of conditions, and the following
+ * disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
*
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
* ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
* FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
* DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
* OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
@@ -35,125 +24,671 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * from: @(#)tty_subr.c 8.2 (Berkeley) 9/5/93
+ * $Id: tty_subr.c,v 1.26 1997/03/05 16:45:01 bde Exp $
+ */
+
+/*
+ * clist support routines
*/
#include <sys/param.h>
-#include <sys/ioctl.h>
+#include <sys/kernel.h>
+#include <sys/systm.h>
#include <sys/tty.h>
+#include <sys/clist.h>
+#include <sys/malloc.h>
-char cwaiting;
-struct cblock *cfree, *cfreelist;
-int cfreecount, nclist;
+static void clist_init __P((void *));
+SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL)
-void
-clist_init()
+static struct cblock *cfreelist = 0;
+int cfreecount = 0;
+static int cslushcount;
+static int ctotcount;
+
+#ifndef INITIAL_CBLOCKS
+#define INITIAL_CBLOCKS 50
+#endif
+
+static struct cblock *cblock_alloc __P((void));
+static void cblock_alloc_cblocks __P((int number));
+static void cblock_free __P((struct cblock *cblockp));
+static void cblock_free_cblocks __P((int number));
+
+#include "opt_ddb.h"
+#ifdef DDB
+#include <ddb/ddb.h>
+
+DB_SHOW_COMMAND(cbstat, cbstat)
{
+ printf(
+ "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n",
+ ctotcount * CBSIZE, ctotcount * CBSIZE - cfreecount, cfreecount,
+ cfreecount - cslushcount * CBSIZE, cslushcount * CBSIZE);
+}
+#endif /* DDB */
+/*
+ * Called from init_main.c
+ */
+/* ARGSUSED*/
+static void
+clist_init(dummy)
+ void *dummy;
+{
/*
- * Body deleted.
+ * Allocate an initial base set of cblocks as a 'slush'.
+ * We allocate non-slush cblocks with each initial ttyopen() and
+ * deallocate them with each ttyclose().
+ * We should adjust the slush allocation. This can't be done in
+ * the i/o routines because they are sometimes called from
+ * interrupt handlers when it may be unsafe to call malloc().
*/
- return;
+ cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS);
}
-getc(a1)
- struct clist *a1;
+/*
+ * Remove a cblock from the cfreelist queue and return a pointer
+ * to it.
+ */
+static inline struct cblock *
+cblock_alloc()
{
+ struct cblock *cblockp;
- /*
- * Body deleted.
- */
- return ((char)0);
+ cblockp = cfreelist;
+ if (cblockp == NULL)
+ panic("clist reservation botch");
+ cfreelist = cblockp->c_next;
+ cblockp->c_next = NULL;
+ cfreecount -= CBSIZE;
+ return (cblockp);
}
-q_to_b(a1, a2, a3)
- struct clist *a1;
- char *a2;
- int a3;
+/*
+ * Add a cblock to the cfreelist queue.
+ */
+static inline void
+cblock_free(cblockp)
+ struct cblock *cblockp;
{
+ if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1))
+ bzero(cblockp->c_quote, sizeof cblockp->c_quote);
+ cblockp->c_next = cfreelist;
+ cfreelist = cblockp;
+ cfreecount += CBSIZE;
+}
- /*
- * Body deleted.
- */
- return (0);
+/*
+ * Allocate some cblocks for the cfreelist queue.
+ */
+static void
+cblock_alloc_cblocks(number)
+ int number;
+{
+ int i;
+ struct cblock *cbp;
+
+ for (i = 0; i < number; ++i) {
+ cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT);
+ if (cbp == NULL) {
+ printf(
+"clist_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n");
+ cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK);
+ }
+ /*
+ * Freed cblocks have zero quotes and garbage elsewhere.
+ * Set the may-have-quote bit to force zeroing the quotes.
+ */
+ setbit(cbp->c_quote, CBQSIZE * NBBY - 1);
+ cblock_free(cbp);
+ }
+ ctotcount += number;
}
-ndqb(a1, a2)
- struct clist *a1;
- int a2;
+/*
+ * Set the cblock allocation policy for a clist.
+ * Must be called in process context at spltty().
+ */
+void
+clist_alloc_cblocks(clistp, ccmax, ccreserved)
+ struct clist *clistp;
+ int ccmax;
+ int ccreserved;
{
+ int dcbr;
/*
- * Body deleted.
+ * Allow for wasted space at the head.
*/
- return (0);
+ if (ccmax != 0)
+ ccmax += CBSIZE - 1;
+ if (ccreserved != 0)
+ ccreserved += CBSIZE - 1;
+
+ clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE;
+ dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved;
+ if (dcbr >= 0)
+ cblock_alloc_cblocks(dcbr);
+ else {
+ if (clistp->c_cbreserved + dcbr < clistp->c_cbcount)
+ dcbr = clistp->c_cbcount - clistp->c_cbreserved;
+ cblock_free_cblocks(-dcbr);
+ }
+ clistp->c_cbreserved += dcbr;
}
+/*
+ * Free some cblocks from the cfreelist queue back to the
+ * system malloc pool.
+ */
+static void
+cblock_free_cblocks(number)
+ int number;
+{
+ int i;
+
+ for (i = 0; i < number; ++i)
+ free(cblock_alloc(), M_TTYS);
+ ctotcount -= number;
+}
+
+/*
+ * Free the cblocks reserved for a clist.
+ * Must be called at spltty().
+ */
void
-ndflush(a1, a2)
- struct clist *a1;
- int a2;
+clist_free_cblocks(clistp)
+ struct clist *clistp;
{
+ if (clistp->c_cbcount != 0)
+ panic("freeing active clist cblocks");
+ cblock_free_cblocks(clistp->c_cbreserved);
+ clistp->c_cbmax = 0;
+ clistp->c_cbreserved = 0;
+}
- /*
- * Body deleted.
- */
- return;
+/*
+ * Get a character from the head of a clist.
+ */
+int
+getc(clistp)
+ struct clist *clistp;
+{
+ int chr = -1;
+ int s;
+ struct cblock *cblockp;
+
+ s = spltty();
+
+ /* If there are characters in the list, get one */
+ if (clistp->c_cc) {
+ cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND);
+ chr = (u_char)*clistp->c_cf;
+
+ /*
+ * If this char is quoted, set the flag.
+ */
+ if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info))
+ chr |= TTY_QUOTE;
+
+ /*
+ * Advance to next character.
+ */
+ clistp->c_cf++;
+ clistp->c_cc--;
+ /*
+ * If we have advanced the 'first' character pointer
+ * past the end of this cblock, advance to the next one.
+ * If there are no more characters, set the first and
+ * last pointers to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+ return (chr);
+}
+
+/*
+ * Copy 'amount' of chars, beginning at head of clist 'clistp' to
+ * destination linear buffer 'dest'. Return number of characters
+ * actually copied.
+ */
+int
+q_to_b(clistp, dest, amount)
+ struct clist *clistp;
+ char *dest;
+ int amount;
+{
+ struct cblock *cblockp;
+ struct cblock *cblockn;
+ char *dest_orig = dest;
+ int numc;
+ int s;
+
+ s = spltty();
+
+ while (clistp && amount && (clistp->c_cc > 0)) {
+ cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND);
+ cblockn = cblockp + 1; /* pointer arithmetic! */
+ numc = min(amount, (char *)cblockn - clistp->c_cf);
+ numc = min(numc, clistp->c_cc);
+ bcopy(clistp->c_cf, dest, numc);
+ amount -= numc;
+ clistp->c_cf += numc;
+ clistp->c_cc -= numc;
+ dest += numc;
+ /*
+ * If this cblock has been emptied, advance to the next
+ * one. If there are no more characters, set the first
+ * and last pointer to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+ return (dest - dest_orig);
}
-putc(a1, a2)
- char a1;
- struct clist *a2;
+/*
+ * Flush 'amount' of chars, beginning at head of clist 'clistp'.
+ */
+void
+ndflush(clistp, amount)
+ struct clist *clistp;
+ int amount;
{
+ struct cblock *cblockp;
+ struct cblock *cblockn;
+ int numc;
+ int s;
+
+ s = spltty();
+
+ while (amount && (clistp->c_cc > 0)) {
+ cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND);
+ cblockn = cblockp + 1; /* pointer arithmetic! */
+ numc = min(amount, (char *)cblockn - clistp->c_cf);
+ numc = min(numc, clistp->c_cc);
+ amount -= numc;
+ clistp->c_cf += numc;
+ clistp->c_cc -= numc;
+ /*
+ * If this cblock has been emptied, advance to the next
+ * one. If there are no more characters, set the first
+ * and last pointer to NULL. In either case, free the
+ * current cblock.
+ */
+ if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) {
+ if (clistp->c_cc > 0) {
+ clistp->c_cf = cblockp->c_next->c_info;
+ } else {
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ }
+ }
+
+ splx(s);
+}
+
+/*
+ * Add a character to the end of a clist.  Return -1 if no
+ * more cblocks are available, or 0 for success.
+ */
+int
+putc(chr, clistp)
+ int chr;
+ struct clist *clistp;
+{
+ struct cblock *cblockp;
+ int s;
+
+ s = spltty();
+
+ if (clistp->c_cl == NULL) {
+ if (clistp->c_cbreserved < 1) {
+ splx(s);
+ printf("putc to a clist with no reserved cblocks\n");
+ return (-1); /* nothing done */
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount = 1;
+ clistp->c_cf = clistp->c_cl = cblockp->c_info;
+ clistp->c_cc = 0;
+ } else {
+ cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND);
+ if (((long)clistp->c_cl & CROUND) == 0) {
+ struct cblock *prev = (cblockp - 1);
+
+ if (clistp->c_cbcount >= clistp->c_cbreserved) {
+ if (clistp->c_cbcount >= clistp->c_cbmax
+ || cslushcount <= 0) {
+ splx(s);
+ return (-1);
+ }
+ --cslushcount;
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount++;
+ prev->c_next = cblockp;
+ clistp->c_cl = cblockp->c_info;
+ }
+ }
/*
- * Body deleted.
+ * If this character is quoted, set the quote bit, if not, clear it.
*/
+ if (chr & TTY_QUOTE) {
+ setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info);
+ /*
+ * Use one of the spare quote bits to record that something
+ * may be quoted.
+ */
+ setbit(cblockp->c_quote, CBQSIZE * NBBY - 1);
+ } else
+ clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info);
+
+ *clistp->c_cl++ = chr;
+ clistp->c_cc++;
+
+ splx(s);
return (0);
}
-b_to_q(a1, a2, a3)
- char *a1;
- int a2;
- struct clist *a3;
+/*
+ * Copy data from linear buffer to clist chain. Return the
+ * number of characters not copied.
+ */
+int
+b_to_q(src, amount, clistp)
+ char *src;
+ int amount;
+ struct clist *clistp;
{
+ struct cblock *cblockp;
+ char *firstbyte, *lastbyte;
+ u_char startmask, endmask;
+ int startbit, endbit, num_between, numc;
+ int s;
/*
- * Body deleted.
+ * Avoid allocating an initial cblock and then not using it.
+	 * c_cc == 0 must imply c_cbcount == 0.
*/
- return (0);
+ if (amount <= 0)
+ return (amount);
+
+ s = spltty();
+
+ /*
+ * If there are no cblocks assigned to this clist yet,
+ * then get one.
+ */
+ if (clistp->c_cl == NULL) {
+ if (clistp->c_cbreserved < 1) {
+ splx(s);
+ printf("b_to_q to a clist with no reserved cblocks.\n");
+ return (amount); /* nothing done */
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount = 1;
+ clistp->c_cf = clistp->c_cl = cblockp->c_info;
+ clistp->c_cc = 0;
+ } else {
+ cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND);
+ }
+
+ while (amount) {
+ /*
+ * Get another cblock if needed.
+ */
+ if (((long)clistp->c_cl & CROUND) == 0) {
+ struct cblock *prev = cblockp - 1;
+
+ if (clistp->c_cbcount >= clistp->c_cbreserved) {
+ if (clistp->c_cbcount >= clistp->c_cbmax
+ || cslushcount <= 0) {
+ splx(s);
+ return (amount);
+ }
+ --cslushcount;
+ }
+ cblockp = cblock_alloc();
+ clistp->c_cbcount++;
+ prev->c_next = cblockp;
+ clistp->c_cl = cblockp->c_info;
+ }
+
+ /*
+ * Copy a chunk of the linear buffer up to the end
+ * of this cblock.
+ */
+ numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl);
+ bcopy(src, clistp->c_cl, numc);
+
+ /*
+ * Clear quote bits if they aren't known to be clear.
+		 * The following could probably be made into a separate
+ * "bitzero()" routine, but why bother?
+ */
+ if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) {
+ startbit = clistp->c_cl - (char *)cblockp->c_info;
+ endbit = startbit + numc - 1;
+
+ firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY);
+ lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY);
+
+ /*
+ * Calculate mask of bits to preserve in first and
+ * last bytes.
+ */
+ startmask = NBBY - (startbit % NBBY);
+ startmask = 0xff >> startmask;
+ endmask = (endbit % NBBY);
+ endmask = 0xff << (endmask + 1);
+
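+			/*
+			 * Worked example: clearing quote bits 3..13 gives
+			 * startmask = 0xff >> (8 - 3) = 0x07 (keep bits
+			 * 0-2 of the first byte) and endmask =
+			 * 0xff << ((13 % 8) + 1) = 0xc0 (keep bits 6-7
+			 * of the last byte).
+			 */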
+ if (firstbyte != lastbyte) {
+ *firstbyte &= startmask;
+ *lastbyte &= endmask;
+
+ num_between = lastbyte - firstbyte - 1;
+ if (num_between)
+ bzero(firstbyte + 1, num_between);
+ } else {
+ *firstbyte &= (startmask | endmask);
+ }
+ }
+
+ /*
+ * ...and update pointer for the next chunk.
+ */
+ src += numc;
+ clistp->c_cl += numc;
+ clistp->c_cc += numc;
+ amount -= numc;
+ /*
+ * If we go through the loop again, it's always
+ * for data in the next cblock, so by adding one (cblock),
+ * (which makes the pointer 1 beyond the end of this
+ * cblock) we prepare for the assignment of 'prev'
+ * above.
+ */
+ cblockp += 1;
+
+ }
+
+ splx(s);
+ return (amount);
}
+/*
+ * Get the next character in the clist. Store it at dst. Don't
+ * advance any clist pointers, but return a pointer to the next
+ * character position.
+ */
char *
-nextc(a1, a2, a3)
- struct clist *a1;
- char *a2;
- int *a3;
+nextc(clistp, cp, dst)
+ struct clist *clistp;
+ char *cp;
+ int *dst;
{
+ struct cblock *cblockp;
+ ++cp;
/*
- * Body deleted.
+ * See if the next character is beyond the end of
+ * the clist.
*/
- return ((char *)0);
+ if (clistp->c_cc && (cp != clistp->c_cl)) {
+ /*
+ * If the next character is beyond the end of this
+ * cblock, advance to the next cblock.
+ */
+ if (((long)cp & CROUND) == 0)
+ cp = ((struct cblock *)cp - 1)->c_next->c_info;
+ cblockp = (struct cblock *)((long)cp & ~CROUND);
+
+ /*
+ * Get the character. Set the quote flag if this character
+ * is quoted.
+ */
+ *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0);
+
+ return (cp);
+ }
+
+ return (NULL);
}
-unputc(a1)
- struct clist *a1;
+/*
+ * "Unput" a character from a clist.
+ */
+int
+unputc(clistp)
+ struct clist *clistp;
{
+ struct cblock *cblockp = 0, *cbp = 0;
+ int s;
+ int chr = -1;
+
+
+ s = spltty();
+
+ if (clistp->c_cc) {
+ --clistp->c_cc;
+ --clistp->c_cl;
+
+ chr = (u_char)*clistp->c_cl;
+
+ cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND);
+
+ /*
+ * Set quote flag if this character was quoted.
+ */
+ if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info))
+ chr |= TTY_QUOTE;
+
+ /*
+ * If all of the characters have been unput in this
+ * cblock, then find the previous one and free this
+ * one.
+ */
+ if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) {
+ cbp = (struct cblock *)((long)clistp->c_cf & ~CROUND);
+
+ while (cbp->c_next != cblockp)
+ cbp = cbp->c_next;
+
+ /*
+ * When the previous cblock is at the end, the 'last'
+ * pointer always points (invalidly) one past.
+ */
+ clistp->c_cl = (char *)(cbp+1);
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ cbp->c_next = NULL;
+ }
+ }
/*
- * Body deleted.
+ * If there are no more characters on the list, then
+ * free the last cblock.
*/
- return ((char)0);
+ if ((clistp->c_cc == 0) && clistp->c_cl) {
+ cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND);
+ cblock_free(cblockp);
+ if (--clistp->c_cbcount >= clistp->c_cbreserved)
+ ++cslushcount;
+ clistp->c_cf = clistp->c_cl = NULL;
+ }
+
+ splx(s);
+ return (chr);
}
+/*
+ * Move characters in source clist to destination clist,
+ * preserving quote bits.
+ */
void
-catq(a1, a2)
- struct clist *a1, *a2;
+catq(src_clistp, dest_clistp)
+ struct clist *src_clistp, *dest_clistp;
{
+ int chr, s;
+
+ s = spltty();
+ /*
+	 * If the destination clist is empty (has no cblocks attached),
+ * and there are no possible complications with the resource counters,
+ * then we simply assign the current clist to the destination.
+ */
+ if (!dest_clistp->c_cf
+ && src_clistp->c_cbcount <= src_clistp->c_cbmax
+ && src_clistp->c_cbcount <= dest_clistp->c_cbmax) {
+ dest_clistp->c_cf = src_clistp->c_cf;
+ dest_clistp->c_cl = src_clistp->c_cl;
+ src_clistp->c_cf = src_clistp->c_cl = NULL;
+
+ dest_clistp->c_cc = src_clistp->c_cc;
+ src_clistp->c_cc = 0;
+ dest_clistp->c_cbcount = src_clistp->c_cbcount;
+ src_clistp->c_cbcount = 0;
+
+ splx(s);
+ return;
+ }
+
+ splx(s);
/*
- * Body deleted.
+ * XXX This should probably be optimized to more than one
+ * character at a time.
*/
- return;
+ while ((chr = getc(src_clistp)) != -1)
+ putc(chr, dest_clistp);
}
diff --git a/sys/kern/tty_tb.c b/sys/kern/tty_tb.c
index 05a46ba..8f4c84c 100644
--- a/sys/kern/tty_tb.c
+++ b/sys/kern/tty_tb.c
@@ -30,7 +30,8 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)tty_tb.c 8.2 (Berkeley) 1/9/95
+ * @(#)tty_tb.c 8.1 (Berkeley) 6/10/93
+ * $Id$
*/
#include "tb.h"
@@ -310,9 +311,7 @@ poldecode(tc, cp, polpos)
/*ARGSUSED*/
tbioctl(tp, cmd, data, flag)
struct tty *tp;
- u_long cmd;
caddr_t data;
- int flag;
{
register struct tb *tbp = (struct tb *)tp->T_LINEP;
diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c
index d9dd1b4..be164d5 100644
--- a/sys/kern/tty_tty.c
+++ b/sys/kern/tty_tty.c
@@ -1,5 +1,5 @@
/*-
- * Copyright (c) 1982, 1986, 1991, 1993, 1995
+ * Copyright (c) 1982, 1986, 1991, 1993
* The Regents of the University of California. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
@@ -30,7 +30,8 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)tty_tty.c 8.4 (Berkeley) 5/14/95
+ * @(#)tty_tty.c 8.2 (Berkeley) 9/23/93
+ * $Id: tty_tty.c,v 1.15 1997/03/23 03:36:30 bde Exp $
*/
/*
@@ -39,15 +40,33 @@
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/conf.h>
-#include <sys/ioctl.h>
#include <sys/proc.h>
#include <sys/tty.h>
#include <sys/vnode.h>
-#include <sys/file.h>
+#include <sys/fcntl.h>
+#include <sys/kernel.h>
+#ifdef DEVFS
+#include <sys/devfsext.h>
+#endif /*DEVFS*/
+
+static d_open_t cttyopen;
+static d_read_t cttyread;
+static d_write_t cttywrite;
+static d_ioctl_t cttyioctl;
+static d_select_t cttyselect;
+
+#define CDEV_MAJOR 1
+/* Don't make static, fdesc_vnops uses this. */
+struct cdevsw ctty_cdevsw =
+ { cttyopen, nullclose, cttyread, cttywrite, /*1*/
+ cttyioctl, nullstop, nullreset, nodevtotty,/* tty */
+ cttyselect, nommap, NULL, "ctty", NULL, -1 };
+
#define cttyvp(p) ((p)->p_flag & P_CONTROLT ? (p)->p_session->s_ttyvp : NULL)
/*ARGSUSED*/
+static int
cttyopen(dev, flag, mode, p)
dev_t dev;
int flag, mode;
@@ -64,7 +83,7 @@ cttyopen(dev, flag, mode, p)
* Since group is tty and mode is 620 on most terminal lines
* and since sessions protect terminals from processes outside
* your session, this check is probably no longer necessary.
- * Since it inhibits setuid root programs that later switch
+ * Since it inhibits setuid root programs that later switch
* to another user from accessing /dev/tty, we have decided
* to delete this test. (mckusick 5/93)
*/
@@ -78,6 +97,7 @@ cttyopen(dev, flag, mode, p)
}
/*ARGSUSED*/
+static int
cttyread(dev, uio, flag)
dev_t dev;
struct uio *uio;
@@ -96,6 +116,7 @@ cttyread(dev, uio, flag)
}
/*ARGSUSED*/
+static int
cttywrite(dev, uio, flag)
dev_t dev;
struct uio *uio;
@@ -114,9 +135,10 @@ cttywrite(dev, uio, flag)
}
/*ARGSUSED*/
+static int
cttyioctl(dev, cmd, addr, flag, p)
dev_t dev;
- u_long cmd;
+ int cmd;
caddr_t addr;
int flag;
struct proc *p;
@@ -125,6 +147,8 @@ cttyioctl(dev, cmd, addr, flag, p)
if (ttyvp == NULL)
return (EIO);
+ if (cmd == TIOCSCTTY) /* don't allow controlling tty to be set */
+ return EINVAL; /* to controlling tty -- infinite recursion */
if (cmd == TIOCNOTTY) {
if (!SESS_LEADER(p)) {
p->p_flag &= ~P_CONTROLT;
@@ -136,6 +160,7 @@ cttyioctl(dev, cmd, addr, flag, p)
}
/*ARGSUSED*/
+static int
cttyselect(dev, flag, p)
dev_t dev;
int flag;
@@ -147,3 +172,27 @@ cttyselect(dev, flag, p)
return (1); /* try operation to get EOF/failure */
return (VOP_SELECT(ttyvp, flag, FREAD|FWRITE, NOCRED, p));
}
+
+static int ctty_devsw_installed = 0;
+#ifdef DEVFS
+static void *ctty_devfs_token;
+#endif
+
+static void
+ctty_drvinit(void *unused)
+{
+ dev_t dev;
+
+ if( ! ctty_devsw_installed ) {
+ dev = makedev(CDEV_MAJOR,0);
+ cdevsw_add(&dev,&ctty_cdevsw,NULL);
+ ctty_devsw_installed = 1;
+#ifdef DEVFS
+ ctty_devfs_token =
+ devfs_add_devswf(&ctty_cdevsw, 0, DV_CHR, 0, 0,
+ 0666, "tty");
+#endif
+ }
+}
+
+SYSINIT(cttydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,ctty_drvinit,NULL)
diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c
index 1c91f2a..a2c3477 100644
--- a/sys/kern/uipc_domain.c
+++ b/sys/kern/uipc_domain.c
@@ -30,7 +30,8 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)uipc_domain.c 8.3 (Berkeley) 2/14/95
+ * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93
+ * $Id$
*/
#include <sys/param.h>
@@ -38,69 +39,120 @@
#include <sys/protosw.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
-#include <sys/time.h>
#include <sys/kernel.h>
#include <sys/systm.h>
-#include <sys/proc.h>
-#include <vm/vm.h>
-#include <sys/sysctl.h>
-void pffasttimo __P((void *));
-void pfslowtimo __P((void *));
+/*
+ * System initialization
+ *
+ * Note: domain initialization wants to take place on a per-domain basis
+ * as a result of traversing a linker set.  Most likely, each domain
+ * wants to call a registration function rather than being handled here
+ * in domaininit().  Probably this will look like:
+ *
+ *	SYSINIT(unique, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY, domain_add, xxx)
+ *
+ * Where 'xxx' is replaced by the address of a parameter struct to be
+ * passed to the domain_add() function.
+ */
+
+static int x_save_spl;			/* used by kludge */
+static void kludge_splimp __P((void *));
+static void kludge_splx __P((void *));
+static void domaininit __P((void *));
+SYSINIT(splimp, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, kludge_splimp, &x_save_spl)
+SYSINIT(domain, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, domaininit, NULL)
+SYSINIT(splx, SI_SUB_PROTO_END, SI_ORDER_FIRST, kludge_splx, &x_save_spl)
+
+static void pffasttimo __P((void *));
+static void pfslowtimo __P((void *));
+
+struct domain *domains;
#define ADDDOMAIN(x) { \
- extern struct domain __CONCAT(x,domain); \
__CONCAT(x,domain.dom_next) = domains; \
domains = &__CONCAT(x,domain); \
}
-void
-domaininit()
+extern struct linker_set domain_set;
+
+/* ARGSUSED*/
+static void
+domaininit(dummy)
+ void *dummy;
{
- register struct domain *dp;
+ register struct domain *dp, **dpp;
register struct protosw *pr;
-#undef unix
-#ifndef lint
- ADDDOMAIN(unix);
- ADDDOMAIN(route);
-#ifdef INET
- ADDDOMAIN(inet);
-#endif
-#ifdef NS
- ADDDOMAIN(ns);
-#endif
-#ifdef ISO
- ADDDOMAIN(iso);
-#endif
-#ifdef CCITT
- ADDDOMAIN(ccitt);
-#endif
-#include "imp.h"
-#if NIMP > 0
- ADDDOMAIN(imp);
-#endif
+ /*
+ * NB - local domain is always present.
+ */
+ ADDDOMAIN(local);
+
+ for (dpp = (struct domain **)domain_set.ls_items; *dpp; dpp++) {
+ (**dpp).dom_next = domains;
+ domains = *dpp;
+ }
+
+/* - not in our sources
+#ifdef ISDN
+ ADDDOMAIN(isdn);
#endif
+*/
for (dp = domains; dp; dp = dp->dom_next) {
if (dp->dom_init)
(*dp->dom_init)();
- for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
+ for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++){
+#ifdef PRU_OLDSTYLE
+ /* See comments in uipc_socket2.c. */
+ if (pr->pr_usrreqs == 0 && pr->pr_ousrreq)
+ pr->pr_usrreqs = &pru_oldstyle;
+#endif
if (pr->pr_init)
(*pr->pr_init)();
+ }
}
-if (max_linkhdr < 16) /* XXX */
-max_linkhdr = 16;
+ if (max_linkhdr < 16) /* XXX */
+ max_linkhdr = 16;
max_hdr = max_linkhdr + max_protohdr;
max_datalen = MHLEN - max_hdr;
- timeout(pffasttimo, NULL, 1);
- timeout(pfslowtimo, NULL, 1);
+ timeout(pffasttimo, (void *)0, 1);
+ timeout(pfslowtimo, (void *)0, 1);
}
+
+/*
+ * The following two operations are kludge code. Most likely, they should
+ * be done as a "domainpreinit()" for the first function and then rolled
+ * in as the last act of "domaininit()" for the second.
+ *
+ * In point of fact, it is questionable why other initialization prior
+ * to this does not also take place at splimp by default.
+ */
+static void
+kludge_splimp(udata)
+ void *udata;
+{
+ int *savesplp = udata;
+
+ *savesplp = splimp();
+}
+
+static void
+kludge_splx(udata)
+ void *udata;
+{
+ int *savesplp = udata;
+
+	splx(*savesplp);
+}
+
+
+
struct protosw *
-pffindtype(family, type)
- int family, type;
+pffindtype(int family, int type)
{
register struct domain *dp;
register struct protosw *pr;
@@ -117,8 +169,7 @@ found:
}
struct protosw *
-pffindproto(family, protocol, type)
- int family, protocol, type;
+pffindproto(int family, int protocol, int type)
{
register struct domain *dp;
register struct protosw *pr;
@@ -142,44 +193,6 @@ found:
return (maybe);
}
-int
-net_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
- int *name;
- u_int namelen;
- void *oldp;
- size_t *oldlenp;
- void *newp;
- size_t newlen;
- struct proc *p;
-{
- register struct domain *dp;
- register struct protosw *pr;
- int family, protocol;
-
- /*
- * All sysctl names at this level are nonterminal;
- * next two components are protocol family and protocol number,
- * then at least one addition component.
- */
- if (namelen < 3)
- return (EISDIR); /* overloaded */
- family = name[0];
- protocol = name[1];
-
- if (family == 0)
- return (0);
- for (dp = domains; dp; dp = dp->dom_next)
- if (dp->dom_family == family)
- goto found;
- return (ENOPROTOOPT);
-found:
- for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
- if (pr->pr_protocol == protocol && pr->pr_sysctl)
- return ((*pr->pr_sysctl)(name + 2, namelen - 2,
- oldp, oldlenp, newp, newlen));
- return (ENOPROTOOPT);
-}
-
void
pfctlinput(cmd, sa)
int cmd;
@@ -191,10 +204,10 @@ pfctlinput(cmd, sa)
for (dp = domains; dp; dp = dp->dom_next)
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
if (pr->pr_ctlinput)
- (*pr->pr_ctlinput)(cmd, sa, (caddr_t)0);
+ (*pr->pr_ctlinput)(cmd, sa, (void *)0);
}
-void
+static void
pfslowtimo(arg)
void *arg;
{
@@ -205,10 +218,10 @@ pfslowtimo(arg)
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
if (pr->pr_slowtimo)
(*pr->pr_slowtimo)();
- timeout(pfslowtimo, NULL, hz/2);
+ timeout(pfslowtimo, (void *)0, hz/2);
}
-void
+static void
pffasttimo(arg)
void *arg;
{
@@ -219,5 +232,5 @@ pffasttimo(arg)
for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++)
if (pr->pr_fasttimo)
(*pr->pr_fasttimo)();
- timeout(pffasttimo, NULL, hz/5);
+ timeout(pffasttimo, (void *)0, hz/5);
}
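
A protocol family enters the linker-set walk above roughly as sketched
below. This is not part of the diff: the "foo" family and its empty
protosw range are placeholders, the positional initializer follows the
4.4BSD struct domain layout, and DATA_SET() is assumed to be the
linker-set registration macro from <sys/kernel.h>:

	#include <sys/param.h>
	#include <sys/kernel.h>
	#include <sys/protosw.h>
	#include <sys/domain.h>

	static struct protosw foosw[1];		/* hypothetical protocols */

	static struct domain foodomain = {
		AF_MAX,			/* placeholder family number */
		"foo",			/* dom_name */
		0,			/* dom_init */
		0, 0,			/* dom_externalize, dom_dispose */
		foosw,			/* dom_protosw */
		&foosw[1],		/* dom_protoswNPROTOSW */
	};

	/* domaininit() will find this entry in domain_set at boot. */
	DATA_SET(domain_set, foodomain);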
diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c
index 62abfd5..7032e44 100644
--- a/sys/kern/uipc_mbuf.c
+++ b/sys/kern/uipc_mbuf.c
@@ -30,35 +30,81 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95
+ * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94
+ * $Id: uipc_mbuf.c,v 1.28 1997/02/18 20:43:05 wollman Exp $
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/malloc.h>
-#include <sys/map.h>
#define MBTYPES
#include <sys/mbuf.h>
#include <sys/kernel.h>
+#include <sys/sysctl.h>
#include <sys/syslog.h>
#include <sys/domain.h>
#include <sys/protosw.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_extern.h>
-extern vm_map_t mb_map;
-struct mbuf *mbutl;
-char *mclrefcnt;
+static void mbinit __P((void *));
+SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL)
-void
-mbinit()
+struct mbuf *mbutl;
+char *mclrefcnt;
+struct mbstat mbstat;
+struct mbuf *mmbfree;
+union mcluster *mclfree;
+int max_linkhdr;
+int max_protohdr;
+int max_hdr;
+int max_datalen;
+
+SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW,
+ &max_linkhdr, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW,
+ &max_protohdr, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW,
+ &max_datalen, 0, "");
+SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, "");
+
+static void m_reclaim __P((void));
+
+/* "number of clusters of pages" */
+#define NCL_INIT 1
+
+#define NMB_INIT 16
+
+/* ARGSUSED*/
+static void
+mbinit(dummy)
+ void *dummy;
{
int s;
+	mmbfree = NULL;
+	mclfree = NULL;
+ mbstat.m_msize = MSIZE;
+ mbstat.m_mclbytes = MCLBYTES;
+ mbstat.m_minclsize = MINCLSIZE;
+ mbstat.m_mlen = MLEN;
+ mbstat.m_mhlen = MHLEN;
+
s = splimp();
- if (m_clalloc(max(4096/CLBYTES, 1), M_DONTWAIT) == 0)
+ if (m_mballoc(NMB_INIT, M_DONTWAIT) == 0)
goto bad;
+#if MCLBYTES <= PAGE_SIZE
+ if (m_clalloc(NCL_INIT, M_DONTWAIT) == 0)
+ goto bad;
+#else
+ /* It's OK to call contigmalloc in this context. */
+ if (m_clalloc(16, 0) == 0)
+ goto bad;
+#endif
splx(s);
return;
bad:
@@ -66,6 +112,80 @@ bad:
}
/*
+ * Allocate at least nmb mbufs and place on mbuf free list.
+ * Must be called at splimp.
+ */
+/* ARGSUSED */
+int
+m_mballoc(nmb, nowait)
+ register int nmb;
+ int nowait;
+{
+ register caddr_t p;
+ register int i;
+ int nbytes;
+
+	/*
+	 * Once we run out of map space, it will be impossible to get
+	 * any more (nothing is ever freed back to the map) (XXX which
+	 * is dumb).  However, this is not fatal: m_reclaim() may still
+	 * be able to free a substantial amount of space.
+	 */
+ if (mb_map_full)
+ return (0);
+
+ nbytes = round_page(nmb * MSIZE);
+ p = (caddr_t)kmem_malloc(mb_map, nbytes, M_NOWAIT);
+ if (p == 0 && !nowait) {
+ mbstat.m_wait++;
+ p = (caddr_t)kmem_malloc(mb_map, nbytes, M_WAITOK);
+ }
+
+ /*
+ * Either the map is now full, or this is nowait and there
+ * are no pages left.
+ */
+ if (p == NULL)
+ return (0);
+
+ nmb = nbytes / MSIZE;
+ for (i = 0; i < nmb; i++) {
+ ((struct mbuf *)p)->m_next = mmbfree;
+ mmbfree = (struct mbuf *)p;
+ p += MSIZE;
+ }
+ mbstat.m_mbufs += nmb;
+ return (1);
+}
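+
+/*
+ * Usage sketch (an assumption, not part of this change): m_mballoc()
+ * must be bracketed by splimp()/splx() in its caller, as mbinit()
+ * does above; example_prime_mbufs is a hypothetical name.
+ */
+#ifdef notdef
+static void
+example_prime_mbufs(void)
+{
+	int s;
+
+	s = splimp();
+	if (m_mballoc(32, M_DONTWAIT) == 0)
+		printf("mbuf pool not extended\n");
+	splx(s);
+}
+#endif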
+
+#if MCLBYTES > PAGE_SIZE
+static int i_want_my_mcl;
+
+static void
+kproc_mclalloc(void)
+{
+ int status;
+
+ while (1) {
+ tsleep(&i_want_my_mcl, PVM, "mclalloc", 0);
+
+ for (; i_want_my_mcl; i_want_my_mcl--) {
+ if (m_clalloc(1, 0) == 0)
+ printf("m_clalloc failed even in process context!\n");
+ }
+ }
+}
+
+static struct proc *mclallocproc;
+static struct kproc_desc mclalloc_kp = {
+ "mclalloc",
+ kproc_mclalloc,
+ &mclallocproc
+};
+SYSINIT_KT(mclallocproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start,
+ &mclalloc_kp);
+#endif
+
+/*
* Allocate some number of mbuf clusters
* and place on cluster free list.
* Must be called at splimp.
@@ -76,21 +196,45 @@ m_clalloc(ncl, nowait)
register int ncl;
int nowait;
{
- static int logged;
register caddr_t p;
register int i;
int npg;
- npg = ncl * CLSIZE;
- p = (caddr_t)kmem_malloc(mb_map, ctob(npg), !nowait);
+ /*
+ * Once we run out of map space, it will be impossible
+ * to get any more (nothing is ever freed back to the
+ * map).
+ */
+ if (mb_map_full) {
+ mbstat.m_drops++;
+ return (0);
+ }
+
+#if MCLBYTES > PAGE_SIZE
+ if (nowait) {
+ i_want_my_mcl += ncl;
+ wakeup(&i_want_my_mcl);
+ mbstat.m_wait++;
+ p = 0;
+ } else {
+ p = contigmalloc1(MCLBYTES * ncl, M_DEVBUF, M_WAITOK, 0ul,
+ ~0ul, PAGE_SIZE, 0, mb_map);
+ }
+#else
+ npg = ncl;
+ p = (caddr_t)kmem_malloc(mb_map, ctob(npg),
+ nowait ? M_NOWAIT : M_WAITOK);
+ ncl = ncl * PAGE_SIZE / MCLBYTES;
+#endif
+ /*
+ * Either the map is now full, or this is nowait and there
+ * are no pages left.
+ */
if (p == NULL) {
- if (logged == 0) {
- logged++;
- log(LOG_ERR, "mb_map full\n");
- }
+ mbstat.m_drops++;
return (0);
}
- ncl = ncl * CLBYTES / MCLBYTES;
+
for (i = 0; i < ncl; i++) {
((union mcluster *)p)->mcl_next = mclfree;
mclfree = (union mcluster *)p;
@@ -115,6 +259,10 @@ m_retry(i, t)
#define m_retry(i, t) (struct mbuf *)0
MGET(m, i, t);
#undef m_retry
+ if (m != NULL)
+ mbstat.m_wait++;
+ else
+ mbstat.m_drops++;
return (m);
}
@@ -131,10 +279,14 @@ m_retryhdr(i, t)
#define m_retryhdr(i, t) (struct mbuf *)0
MGETHDR(m, i, t);
#undef m_retryhdr
+ if (m != NULL)
+ mbstat.m_wait++;
+ else
+ mbstat.m_drops++;
return (m);
}
-void
+static void
m_reclaim()
{
register struct domain *dp;
@@ -207,7 +359,8 @@ m_freem(m)
return;
do {
MFREE(m, n);
- } while (m = n);
+ m = n;
+ } while (m);
}
/*
@@ -248,7 +401,7 @@ m_prepend(m, len, how)
* continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf.
* The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller.
*/
-int MCFail;
+#define MCFail (mbstat.m_mcfail)
struct mbuf *
m_copym(m, off0, len, wait)
@@ -296,7 +449,11 @@ m_copym(m, off0, len, wait)
n->m_len = min(len, m->m_len - off);
if (m->m_flags & M_EXT) {
n->m_data = m->m_data + off;
- mclrefcnt[mtocl(m->m_ext.ext_buf)]++;
+			if (!m->m_ext.ext_ref)
+ mclrefcnt[mtocl(m->m_ext.ext_buf)]++;
+ else
+ (*(m->m_ext.ext_ref))(m->m_ext.ext_buf,
+ m->m_ext.ext_size);
n->m_ext = m->m_ext;
n->m_flags |= M_EXT;
} else
@@ -318,6 +475,61 @@ nospace:
}
/*
+ * Copy an entire packet, including header (which must be present).
+ * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'.
+ */
+struct mbuf *
+m_copypacket(m, how)
+ struct mbuf *m;
+ int how;
+{
+ struct mbuf *top, *n, *o;
+
+ MGET(n, how, m->m_type);
+ top = n;
+ if (!n)
+ goto nospace;
+
+ M_COPY_PKTHDR(n, m);
+ n->m_len = m->m_len;
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data;
+ mclrefcnt[mtocl(m->m_ext.ext_buf)]++;
+ n->m_ext = m->m_ext;
+ n->m_flags |= M_EXT;
+ } else {
+ bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
+ }
+
+ m = m->m_next;
+ while (m) {
+ MGET(o, how, m->m_type);
+ if (!o)
+ goto nospace;
+
+ n->m_next = o;
+ n = n->m_next;
+
+ n->m_len = m->m_len;
+ if (m->m_flags & M_EXT) {
+ n->m_data = m->m_data;
+ mclrefcnt[mtocl(m->m_ext.ext_buf)]++;
+ n->m_ext = m->m_ext;
+ n->m_flags |= M_EXT;
+ } else {
+ bcopy(mtod(m, char *), mtod(n, char *), n->m_len);
+ }
+
+ m = m->m_next;
+ }
+ return top;
+nospace:
+ m_freem(top);
+ MCFail++;
+ return 0;
+}
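+
+/*
+ * Usage sketch (assumption): duplicating a packet for a tap before
+ * passing the original along.  Cluster data is shared by reference
+ * count, so the copy is cheap but must be treated as read-only;
+ * example_tap and example_deliver_to_tap are hypothetical names.
+ */
+#ifdef notdef
+static struct mbuf *
+example_tap(struct mbuf *m)
+{
+	struct mbuf *copy;
+
+	copy = m_copypacket(m, M_DONTWAIT);
+	if (copy != NULL)
+		example_deliver_to_tap(copy);	/* hypothetical consumer */
+	return (m);		/* original continues unchanged */
+}
+#endif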
+
+/*
* Copy data from an mbuf chain starting "off" bytes from the beginning,
* continuing for "len" bytes, into the indicated buffer.
*/
@@ -447,8 +659,8 @@ m_adj(mp, req_len)
}
count -= m->m_len;
}
- while (m = m->m_next)
- m->m_len = 0;
+		while (m->m_next)
+			(m = m->m_next)->m_len = 0;
}
}
@@ -460,7 +672,7 @@ m_adj(mp, req_len)
* If there is room, it will add up to max_protohdr-len extra bytes to the
* contiguous region in an attempt to avoid being called next time.
*/
-int MPFail;
+#define MPFail (mbstat.m_mpfail)
struct mbuf *
m_pullup(n, len)
@@ -573,7 +785,11 @@ extpacket:
if (m->m_flags & M_EXT) {
n->m_flags |= M_EXT;
n->m_ext = m->m_ext;
- mclrefcnt[mtocl(m->m_ext.ext_buf)]++;
+		if (!m->m_ext.ext_ref)
+ mclrefcnt[mtocl(m->m_ext.ext_buf)]++;
+ else
+ (*(m->m_ext.ext_ref))(m->m_ext.ext_buf,
+ m->m_ext.ext_size);
m->m_ext.ext_size = 0; /* For Accounting XXXXXX danger */
n->m_data = m->m_data + len;
} else {
@@ -593,7 +809,7 @@ m_devget(buf, totlen, off0, ifp, copy)
char *buf;
int totlen, off0;
struct ifnet *ifp;
- void (*copy)();
+ void (*copy) __P((char *from, caddr_t to, u_int len));
{
register struct mbuf *m;
struct mbuf *top = 0, **mp = &top;
@@ -604,12 +820,8 @@ m_devget(buf, totlen, off0, ifp, copy)
cp = buf;
epkt = cp + totlen;
if (off) {
- /*
- * If 'off' is non-zero, packet is trailer-encapsulated,
- * so we have to skip the type and length fields.
- */
- cp += off + 2 * sizeof(u_int16_t);
- totlen -= 2 * sizeof(u_int16_t);
+ cp += off + 2 * sizeof(u_short);
+ totlen -= 2 * sizeof(u_short);
}
MGETHDR(m, M_DONTWAIT, MT_DATA);
if (m == 0)
@@ -658,3 +870,56 @@ m_devget(buf, totlen, off0, ifp, copy)
}
return (top);
}
+
+/*
+ * Copy data from a buffer back into the indicated mbuf chain,
+ * starting "off" bytes from the beginning, extending the mbuf
+ * chain if necessary.
+ */
+void
+m_copyback(m0, off, len, cp)
+ struct mbuf *m0;
+ register int off;
+ register int len;
+ caddr_t cp;
+{
+ register int mlen;
+ register struct mbuf *m = m0, *n;
+ int totlen = 0;
+
+ if (m0 == 0)
+ return;
+ while (off > (mlen = m->m_len)) {
+ off -= mlen;
+ totlen += mlen;
+ if (m->m_next == 0) {
+ n = m_getclr(M_DONTWAIT, m->m_type);
+ if (n == 0)
+ goto out;
+ n->m_len = min(MLEN, len + off);
+ m->m_next = n;
+ }
+ m = m->m_next;
+ }
+ while (len > 0) {
+		mlen = min(m->m_len - off, len);
+ bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen);
+ cp += mlen;
+ len -= mlen;
+ mlen += off;
+ off = 0;
+ totlen += mlen;
+ if (len == 0)
+ break;
+ if (m->m_next == 0) {
+ n = m_get(M_DONTWAIT, m->m_type);
+ if (n == 0)
+ break;
+ n->m_len = min(MLEN, len);
+ m->m_next = n;
+ }
+ m = m->m_next;
+ }
+out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen))
+ m->m_pkthdr.len = totlen;
+}
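+
+/*
+ * Usage sketch (assumption): overwriting a two-byte field 20 bytes
+ * into a packet; the chain is extended with zero-filled mbufs if it
+ * is shorter than off + len.  example_patch is a hypothetical name.
+ */
+#ifdef notdef
+static void
+example_patch(struct mbuf *m)
+{
+	u_short val = htons(0x1234);
+
+	m_copyback(m, 20, sizeof(val), (caddr_t)&val);
+}
+#endif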
diff --git a/sys/kern/uipc_proto.c b/sys/kern/uipc_proto.c
index e89a84c..f652ce3 100644
--- a/sys/kern/uipc_proto.c
+++ b/sys/kern/uipc_proto.c
@@ -30,43 +30,47 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)uipc_proto.c 8.2 (Berkeley) 2/14/95
+ * @(#)uipc_proto.c 8.1 (Berkeley) 6/10/93
+ * $Id: uipc_proto.c,v 1.9 1997/02/22 09:39:27 peter Exp $
*/
#include <sys/param.h>
-#include <sys/socket.h>
-#include <sys/protosw.h>
+#include <sys/kernel.h>
#include <sys/domain.h>
#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/socket.h>
+#include <sys/sysctl.h>
+#include <sys/un.h>
+
+#include <net/raw_cb.h>
/*
- * Definitions of protocols supported in the UNIX domain.
+ * Definitions of protocols supported in the LOCAL domain.
*/
-int uipc_usrreq(), raw_usrreq();
-void raw_init(), raw_input(), raw_ctlinput();
-extern struct domain unixdomain; /* or at least forward */
-
-struct protosw unixsw[] = {
-{ SOCK_STREAM, &unixdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
+static struct protosw localsw[] = {
+{ SOCK_STREAM, &localdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS,
0, 0, 0, 0,
uipc_usrreq,
0, 0, 0, 0,
},
-{ SOCK_DGRAM, &unixdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS,
+{ SOCK_DGRAM, &localdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS,
0, 0, 0, 0,
uipc_usrreq,
0, 0, 0, 0,
},
{ 0, 0, 0, 0,
- raw_input, 0, raw_ctlinput, 0,
+ 0, 0, raw_ctlinput, 0,
raw_usrreq,
raw_init, 0, 0, 0,
}
};
-int unp_externalize(), unp_dispose();
+struct domain localdomain =
+ { AF_LOCAL, "local", 0, unp_externalize, unp_dispose,
+ localsw, &localsw[sizeof(localsw)/sizeof(localsw[0])] };
-struct domain unixdomain =
- { AF_UNIX, "unix", 0, unp_externalize, unp_dispose,
- unixsw, &unixsw[sizeof(unixsw)/sizeof(unixsw[0])] };
+SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain");
+SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM");
+SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM");
diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c
new file mode 100644
index 0000000..e19db0c
--- /dev/null
+++ b/sys/kern/uipc_sockbuf.c
@@ -0,0 +1,1018 @@
+/*
+ * Copyright (c) 1982, 1986, 1988, 1990, 1993
+ * The Regents of the University of California. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
+ * $Id: uipc_socket2.c,v 1.21 1997/02/19 19:15:43 wollman Exp $
+ */
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/proc.h>
+#include <sys/file.h>
+#include <sys/buf.h>
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/protosw.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+
+/*
+ * Primitive routines for operating on sockets and socket buffers
+ */
+
+u_long sb_max = SB_MAX; /* XXX should be static */
+
+static u_long sb_efficiency = 8; /* parameter for sbreserve() */
+
+/*
+ * Procedures to manipulate state flags of socket
+ * and do appropriate wakeups. Normal sequence from the
+ * active (originating) side is that soisconnecting() is
+ * called during processing of connect() call,
+ * resulting in an eventual call to soisconnected() if/when the
+ * connection is established. When the connection is torn down
+ * soisdisconnecting() is called during processing of disconnect() call,
+ * and soisdisconnected() is called when the connection to the peer
+ * is totally severed. The semantics of these routines are such that
+ * connectionless protocols can call soisconnected() and soisdisconnected()
+ * only, bypassing the in-progress calls when setting up a ``connection''
+ * takes no time.
+ *
+ * From the passive side, a socket is created with
+ * two queues of sockets: so_q0 for connections in progress
+ * and so_q for connections already made and awaiting user acceptance.
+ * As a protocol is preparing incoming connections, it creates a socket
+ * structure queued on so_q0 by calling sonewconn(). When the connection
+ * is established, soisconnected() is called, and transfers the
+ * socket structure to so_q, making it available to accept().
+ *
+ * If a socket is closed with sockets on either
+ * so_q0 or so_q, these sockets are dropped.
+ *
+ * If higher level protocols are implemented in
+ * the kernel, the wakeups done here will sometimes
+ * cause software-interrupt process scheduling.
+ */
+
+void
+soisconnecting(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= SS_ISCONNECTING;
+}
+
+void
+soisconnected(so)
+ register struct socket *so;
+{
+ register struct socket *head = so->so_head;
+
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
+ so->so_state |= SS_ISCONNECTED;
+ if (head && (so->so_state & SS_INCOMP)) {
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ so->so_state &= ~SS_INCOMP;
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ so->so_state |= SS_COMP;
+ sorwakeup(head);
+ wakeup((caddr_t)&head->so_timeo);
+ } else {
+ wakeup((caddr_t)&so->so_timeo);
+ sorwakeup(so);
+ sowwakeup(so);
+ }
+}
+
+void
+soisdisconnecting(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~SS_ISCONNECTING;
+ so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE);
+ wakeup((caddr_t)&so->so_timeo);
+ sowwakeup(so);
+ sorwakeup(so);
+}
+
+void
+soisdisconnected(so)
+ register struct socket *so;
+{
+
+ so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING);
+ so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE);
+ wakeup((caddr_t)&so->so_timeo);
+ sowwakeup(so);
+ sorwakeup(so);
+}
+
+/*
+ * Return a random connection that hasn't been serviced yet and
+ * is eligible for discard. There is a one in (qlen + 1) chance that
+ * we will return a null, saying that there are no droppable
+ * requests. In this case, the protocol specific code should drop
+ * the new request. This ensures fairness.
+ *
+ * This may be used in conjunction with protocol specific queue
+ * congestion routines.
+ */
+struct socket *
+sodropablereq(head)
+ register struct socket *head;
+{
+ register struct socket *so;
+ unsigned int i, j, qlen;
+
+ static int rnd;
+ static long old_mono_secs;
+ static unsigned int cur_cnt, old_cnt;
+
+ if ((i = (mono_time.tv_sec - old_mono_secs)) != 0) {
+ old_mono_secs = mono_time.tv_sec;
+ old_cnt = cur_cnt / i;
+ cur_cnt = 0;
+ }
+
+ so = TAILQ_FIRST(&head->so_incomp);
+ if (!so)
+ return (so);
+
+ qlen = head->so_incqlen;
+ if (++cur_cnt > qlen || old_cnt > qlen) {
+ rnd = (314159 * rnd + 66329) & 0xffff;
+ j = ((qlen + 1) * rnd) >> 16;
+
+ while (j-- && so)
+ so = TAILQ_NEXT(so, so_list);
+ }
+
+ return (so);
+}
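+
+/*
+ * Worked example of the arithmetic above (illustration only): with
+ * qlen == 3, rnd is roughly uniform over [0, 65536), so
+ * j = ((qlen + 1) * rnd) >> 16 is roughly uniform over {0, 1, 2, 3}.
+ * Values 0-2 select one of the three queued sockets; j == 3 walks
+ * off the end of the list and returns null, which is the
+ * one-in-(qlen + 1) "drop the new request instead" case.
+ */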
+
+/*
+ * When an attempt at a new connection is noted on a socket
+ * which accepts connections, sonewconn is called. If the
+ * connection is possible (subject to space constraints, etc.)
+ * then we allocate a new structure, properly linked into the
+ * data structure of the original socket, and return this.
+ * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED.
+ *
+ * Currently, sonewconn() is defined as sonewconn1() in socketvar.h
+ * to catch calls that are missing the (new) second parameter.
+ */
+struct socket *
+sonewconn1(head, connstatus)
+ register struct socket *head;
+ int connstatus;
+{
+ register struct socket *so;
+
+ if (head->so_qlen > 3 * head->so_qlimit / 2)
+ return ((struct socket *)0);
+ MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_DONTWAIT);
+ if (so == NULL)
+ return ((struct socket *)0);
+ bzero((caddr_t)so, sizeof(*so));
+ so->so_head = head;
+ so->so_type = head->so_type;
+ so->so_options = head->so_options &~ SO_ACCEPTCONN;
+ so->so_linger = head->so_linger;
+ so->so_state = head->so_state | SS_NOFDREF;
+ so->so_proto = head->so_proto;
+ so->so_timeo = head->so_timeo;
+ so->so_pgid = head->so_pgid;
+ (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat);
+
+ if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0)) {
+ (void) free((caddr_t)so, M_SOCKET);
+ return ((struct socket *)0);
+ }
+
+ if (connstatus) {
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ so->so_state |= SS_COMP;
+ } else {
+ TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
+ so->so_state |= SS_INCOMP;
+ head->so_incqlen++;
+ }
+ head->so_qlen++;
+ if (connstatus) {
+ sorwakeup(head);
+ wakeup((caddr_t)&head->so_timeo);
+ so->so_state |= connstatus;
+ }
+ return (so);
+}
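+
+/*
+ * Usage sketch (assumption): a protocol noticing an incoming
+ * connection request queues an embryonic socket with sonewconn(),
+ * then promotes it when its handshake completes; example_incoming
+ * is a hypothetical name.
+ */
+#ifdef notdef
+static void
+example_incoming(struct socket *head)
+{
+	struct socket *so;
+
+	so = sonewconn(head, 0);	/* lands on so_incomp */
+	if (so == NULL)
+		return;			/* queue full; drop the request */
+	/* ... protocol handshake runs ... */
+	soisconnected(so);	/* moves it to so_comp for accept() */
+}
+#endif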
+
+/*
+ * Socantsendmore indicates that no more data will be sent on the
+ * socket; it would normally be applied to a socket when the user
+ * informs the system that no more data is to be sent, by the protocol
+ * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data
+ * will be received, and will normally be applied to the socket by a
+ * protocol when it detects that the peer will send no more data.
+ * Data queued for reading in the socket may yet be read.
+ */
+
+void
+socantsendmore(so)
+ struct socket *so;
+{
+
+ so->so_state |= SS_CANTSENDMORE;
+ sowwakeup(so);
+}
+
+void
+socantrcvmore(so)
+ struct socket *so;
+{
+
+ so->so_state |= SS_CANTRCVMORE;
+ sorwakeup(so);
+}
+
+/*
+ * Wait for data to arrive at/drain from a socket buffer.
+ */
+int
+sbwait(sb)
+ struct sockbuf *sb;
+{
+
+ sb->sb_flags |= SB_WAIT;
+ return (tsleep((caddr_t)&sb->sb_cc,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
+ sb->sb_timeo));
+}
+
+/*
+ * Lock a sockbuf already known to be locked;
+ * return any error returned from sleep (EINTR).
+ */
+int
+sb_lock(sb)
+ register struct sockbuf *sb;
+{
+ int error;
+
+ while (sb->sb_flags & SB_LOCK) {
+ sb->sb_flags |= SB_WANT;
+ error = tsleep((caddr_t)&sb->sb_flags,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
+ "sblock", 0);
+ if (error)
+ return (error);
+ }
+ sb->sb_flags |= SB_LOCK;
+ return (0);
+}
+
+/*
+ * Wakeup processes waiting on a socket buffer.
+ * Do asynchronous notification via SIGIO
+ * if the socket has the SS_ASYNC flag set.
+ */
+void
+sowakeup(so, sb)
+ register struct socket *so;
+ register struct sockbuf *sb;
+{
+ struct proc *p;
+
+ selwakeup(&sb->sb_sel);
+ sb->sb_flags &= ~SB_SEL;
+ if (sb->sb_flags & SB_WAIT) {
+ sb->sb_flags &= ~SB_WAIT;
+ wakeup((caddr_t)&sb->sb_cc);
+ }
+ if (so->so_state & SS_ASYNC) {
+ if (so->so_pgid < 0)
+ gsignal(-so->so_pgid, SIGIO);
+ else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0)
+ psignal(p, SIGIO);
+ }
+}
+
+/*
+ * Socket buffer (struct sockbuf) utility routines.
+ *
+ * Each socket contains two socket buffers: one for sending data and
+ * one for receiving data. Each buffer contains a queue of mbufs,
+ * information about the number of mbufs and amount of data in the
+ * queue, and other fields allowing select() statements and notification
+ * on data availability to be implemented.
+ *
+ * Data stored in a socket buffer is maintained as a list of records.
+ * Each record is a list of mbufs chained together with the m_next
+ * field. Records are chained together with the m_nextpkt field. The upper
+ * level routine soreceive() expects the following conventions to be
+ * observed when placing information in the receive buffer:
+ *
+ * 1. If the protocol requires each message be preceded by the sender's
+ * name, then a record containing that name must be present before
+ * any associated data (mbuf's must be of type MT_SONAME).
+ * 2. If the protocol supports the exchange of ``access rights'' (really
+ * just additional data associated with the message), and there are
+ * ``rights'' to be received, then a record containing this data
+ * should be present (mbuf's must be of type MT_RIGHTS).
+ * 3. If a name or rights record exists, then it must be followed by
+ * a data record, perhaps of zero length.
+ *
+ * Before using a new socket structure it is first necessary to reserve
+ * buffer space to the socket, by calling sbreserve(). This should commit
+ * some of the available buffer space in the system buffer pool for the
+ * socket (currently, it does nothing but enforce limits). The space
+ * should be released by calling sbrelease() when the socket is destroyed.
+ */
+
+int
+soreserve(so, sndcc, rcvcc)
+ register struct socket *so;
+ u_long sndcc, rcvcc;
+{
+
+ if (sbreserve(&so->so_snd, sndcc) == 0)
+ goto bad;
+ if (sbreserve(&so->so_rcv, rcvcc) == 0)
+ goto bad2;
+ if (so->so_rcv.sb_lowat == 0)
+ so->so_rcv.sb_lowat = 1;
+ if (so->so_snd.sb_lowat == 0)
+ so->so_snd.sb_lowat = MCLBYTES;
+ if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat)
+ so->so_snd.sb_lowat = so->so_snd.sb_hiwat;
+ return (0);
+bad2:
+ sbrelease(&so->so_snd);
+bad:
+ return (ENOBUFS);
+}
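+
+/*
+ * Usage sketch (assumption): a protocol attach routine reserving
+ * send and receive space; the 8192-byte high-water marks and the
+ * example_attach name are hypothetical.
+ */
+#ifdef notdef
+static int
+example_attach(struct socket *so)
+{
+	return (soreserve(so, 8192, 8192));	/* snd, rcv high water */
+}
+#endif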
+
+/*
+ * Allot mbufs to a sockbuf.
+ * Attempt to scale mbmax so that mbcnt doesn't become limiting
+ * if buffering efficiency is near the normal case.
+ */
+int
+sbreserve(sb, cc)
+ struct sockbuf *sb;
+ u_long cc;
+{
+ if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
+ return (0);
+ sb->sb_hiwat = cc;
+ sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
+ if (sb->sb_lowat > sb->sb_hiwat)
+ sb->sb_lowat = sb->sb_hiwat;
+ return (1);
+}
+
+/*
+ * Free mbufs held by a socket, and reserved mbuf space.
+ */
+void
+sbrelease(sb)
+ struct sockbuf *sb;
+{
+
+ sbflush(sb);
+ sb->sb_hiwat = sb->sb_mbmax = 0;
+}
+
+/*
+ * Routines to add and remove
+ * data from an mbuf queue.
+ *
+ * The routines sbappend() or sbappendrecord() are normally called to
+ * append new mbufs to a socket buffer, after checking that adequate
+ * space is available, comparing the function sbspace() with the amount
+ * of data to be added. sbappendrecord() differs from sbappend() in
+ * that data supplied is treated as the beginning of a new record.
+ * To place a sender's address, optional access rights, and data in a
+ * socket receive buffer, sbappendaddr() should be used. To place
+ * access rights and data in a socket receive buffer, sbappendrights()
+ * should be used. In either case, the new data begins a new record.
+ * Note that unlike sbappend() and sbappendrecord(), these routines check
+ * for the caller that there will be enough space to store the data.
+ * Each fails if there is not enough space, or if it cannot find mbufs
+ * to store additional information in.
+ *
+ * Reliable protocols may use the socket send buffer to hold data
+ * awaiting acknowledgement. Data is normally copied from a socket
+ * send buffer in a protocol with m_copy for output to a peer,
+ * and then removing the data from the socket buffer with sbdrop()
+ * or sbdroprecord() when the data is acknowledged by the peer.
+ */
+
+/*
+ * Append mbuf chain m to the last record in the
+ * socket buffer sb. The additional space associated
+ * the mbuf chain is recorded in sb. Empty mbufs are
+ * discarded and mbufs are compacted where possible.
+ */
+void
+sbappend(sb, m)
+ struct sockbuf *sb;
+ struct mbuf *m;
+{
+ register struct mbuf *n;
+
+ if (m == 0)
+ return;
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ do {
+ if (n->m_flags & M_EOR) {
+ sbappendrecord(sb, m); /* XXXXXX!!!! */
+ return;
+ }
+ } while (n->m_next && (n = n->m_next));
+ }
+ sbcompress(sb, m, n);
+}
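+
+/*
+ * Usage sketch (assumption): a protocol input routine appending a
+ * received chain (with a packet header) after checking sbspace(),
+ * then waking any reader; example_input is a hypothetical name.
+ */
+#ifdef notdef
+static void
+example_input(struct socket *so, struct mbuf *m)
+{
+	if (sbspace(&so->so_rcv) < m->m_pkthdr.len) {
+		m_freem(m);		/* no room; drop */
+		return;
+	}
+	sbappend(&so->so_rcv, m);
+	sorwakeup(so);
+}
+#endif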
+
+#ifdef SOCKBUF_DEBUG
+void
+sbcheck(sb)
+ register struct sockbuf *sb;
+{
+ register struct mbuf *m;
+ register int len = 0, mbcnt = 0;
+
+ for (m = sb->sb_mb; m; m = m->m_next) {
+ len += m->m_len;
+ mbcnt += MSIZE;
+ if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
+ mbcnt += m->m_ext.ext_size;
+ if (m->m_nextpkt)
+ panic("sbcheck nextpkt");
+ }
+ if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) {
+ printf("cc %d != %d || mbcnt %d != %d\n", len, sb->sb_cc,
+ mbcnt, sb->sb_mbcnt);
+ panic("sbcheck");
+ }
+}
+#endif
+
+/*
+ * As above, except the mbuf chain
+ * begins a new record.
+ */
+void
+sbappendrecord(sb, m0)
+ register struct sockbuf *sb;
+ register struct mbuf *m0;
+{
+ register struct mbuf *m;
+
+ if (m0 == 0)
+ return;
+ m = sb->sb_mb;
+ if (m)
+ while (m->m_nextpkt)
+ m = m->m_nextpkt;
+ /*
+ * Put the first mbuf on the queue.
+ * Note this permits zero length records.
+ */
+ sballoc(sb, m0);
+ if (m)
+ m->m_nextpkt = m0;
+ else
+ sb->sb_mb = m0;
+ m = m0->m_next;
+ m0->m_next = 0;
+ if (m && (m0->m_flags & M_EOR)) {
+ m0->m_flags &= ~M_EOR;
+ m->m_flags |= M_EOR;
+ }
+ sbcompress(sb, m, m0);
+}
+
+/*
+ * As above except that OOB data
+ * is inserted at the beginning of the sockbuf,
+ * but after any other OOB data.
+ */
+void
+sbinsertoob(sb, m0)
+ register struct sockbuf *sb;
+ register struct mbuf *m0;
+{
+ register struct mbuf *m;
+ register struct mbuf **mp;
+
+ if (m0 == 0)
+ return;
+ for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) {
+ m = *mp;
+ again:
+ switch (m->m_type) {
+
+ case MT_OOBDATA:
+ continue; /* WANT next train */
+
+ case MT_CONTROL:
+ m = m->m_next;
+ if (m)
+ goto again; /* inspect THIS train further */
+ }
+ break;
+ }
+ /*
+ * Put the first mbuf on the queue.
+ * Note this permits zero length records.
+ */
+ sballoc(sb, m0);
+ m0->m_nextpkt = *mp;
+ *mp = m0;
+ m = m0->m_next;
+ m0->m_next = 0;
+ if (m && (m0->m_flags & M_EOR)) {
+ m0->m_flags &= ~M_EOR;
+ m->m_flags |= M_EOR;
+ }
+ sbcompress(sb, m, m0);
+}
+
+/*
+ * Append address and data, and optionally, control (ancillary) data
+ * to the receive queue of a socket. If present,
+ * m0 must include a packet header with total length.
+ * Returns 0 if no space in sockbuf or insufficient mbufs.
+ */
+int
+sbappendaddr(sb, asa, m0, control)
+ register struct sockbuf *sb;
+ struct sockaddr *asa;
+ struct mbuf *m0, *control;
+{
+ register struct mbuf *m, *n;
+ int space = asa->sa_len;
+
+	if (m0 && (m0->m_flags & M_PKTHDR) == 0)
+		panic("sbappendaddr");
+ if (m0)
+ space += m0->m_pkthdr.len;
+ for (n = control; n; n = n->m_next) {
+ space += n->m_len;
+ if (n->m_next == 0) /* keep pointer to last control buf */
+ break;
+ }
+ if (space > sbspace(sb))
+ return (0);
+ if (asa->sa_len > MLEN)
+ return (0);
+ MGET(m, M_DONTWAIT, MT_SONAME);
+ if (m == 0)
+ return (0);
+ m->m_len = asa->sa_len;
+ bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len);
+ if (n)
+ n->m_next = m0; /* concatenate data to control */
+ else
+ control = m0;
+ m->m_next = control;
+ for (n = m; n; n = n->m_next)
+ sballoc(sb, n);
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ n->m_nextpkt = m;
+ } else
+ sb->sb_mb = m;
+ return (1);
+}
+
+int
+sbappendcontrol(sb, m0, control)
+ struct sockbuf *sb;
+ struct mbuf *control, *m0;
+{
+ register struct mbuf *m, *n;
+ int space = 0;
+
+ if (control == 0)
+ panic("sbappendcontrol");
+ for (m = control; ; m = m->m_next) {
+ space += m->m_len;
+ if (m->m_next == 0)
+ break;
+ }
+ n = m; /* save pointer to last control buffer */
+ for (m = m0; m; m = m->m_next)
+ space += m->m_len;
+ if (space > sbspace(sb))
+ return (0);
+ n->m_next = m0; /* concatenate data to control */
+ for (m = control; m; m = m->m_next)
+ sballoc(sb, m);
+ n = sb->sb_mb;
+ if (n) {
+ while (n->m_nextpkt)
+ n = n->m_nextpkt;
+ n->m_nextpkt = control;
+ } else
+ sb->sb_mb = control;
+ return (1);
+}
+
+/*
+ * Compress mbuf chain m into the socket
+ * buffer sb following mbuf n. If n
+ * is null, the buffer is presumed empty.
+ */
+void
+sbcompress(sb, m, n)
+ register struct sockbuf *sb;
+ register struct mbuf *m, *n;
+{
+ register int eor = 0;
+ register struct mbuf *o;
+
+ while (m) {
+ eor |= m->m_flags & M_EOR;
+ if (m->m_len == 0 &&
+ (eor == 0 ||
+ (((o = m->m_next) || (o = n)) &&
+ o->m_type == m->m_type))) {
+ m = m_free(m);
+ continue;
+ }
+ if (n && (n->m_flags & (M_EXT | M_EOR)) == 0 &&
+ (n->m_data + n->m_len + m->m_len) < &n->m_dat[MLEN] &&
+ n->m_type == m->m_type) {
+ bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len,
+ (unsigned)m->m_len);
+ n->m_len += m->m_len;
+ sb->sb_cc += m->m_len;
+ m = m_free(m);
+ continue;
+ }
+ if (n)
+ n->m_next = m;
+ else
+ sb->sb_mb = m;
+ sballoc(sb, m);
+ n = m;
+ m->m_flags &= ~M_EOR;
+ m = m->m_next;
+ n->m_next = 0;
+ }
+ if (eor) {
+ if (n)
+ n->m_flags |= eor;
+ else
+ printf("semi-panic: sbcompress\n");
+ }
+}
+
+/*
+ * Free all mbufs in a sockbuf.
+ * Check that all resources are reclaimed.
+ */
+void
+sbflush(sb)
+ register struct sockbuf *sb;
+{
+
+ if (sb->sb_flags & SB_LOCK)
+ panic("sbflush");
+ while (sb->sb_mbcnt)
+ sbdrop(sb, (int)sb->sb_cc);
+ if (sb->sb_cc || sb->sb_mb)
+ panic("sbflush 2");
+}
+
+/*
+ * Drop data from (the front of) a sockbuf.
+ */
+void
+sbdrop(sb, len)
+ register struct sockbuf *sb;
+ register int len;
+{
+ register struct mbuf *m, *mn;
+ struct mbuf *next;
+
+ next = (m = sb->sb_mb) ? m->m_nextpkt : 0;
+ while (len > 0) {
+ if (m == 0) {
+ if (next == 0)
+ panic("sbdrop");
+ m = next;
+ next = m->m_nextpkt;
+ continue;
+ }
+ if (m->m_len > len) {
+ m->m_len -= len;
+ m->m_data += len;
+ sb->sb_cc -= len;
+ break;
+ }
+ len -= m->m_len;
+ sbfree(sb, m);
+ MFREE(m, mn);
+ m = mn;
+ }
+ while (m && m->m_len == 0) {
+ sbfree(sb, m);
+ MFREE(m, mn);
+ m = mn;
+ }
+ if (m) {
+ sb->sb_mb = m;
+ m->m_nextpkt = next;
+ } else
+ sb->sb_mb = next;
+}
+
+/*
+ * Drop a record off the front of a sockbuf
+ * and move the next record to the front.
+ */
+void
+sbdroprecord(sb)
+ register struct sockbuf *sb;
+{
+ register struct mbuf *m, *mn;
+
+ m = sb->sb_mb;
+ if (m) {
+ sb->sb_mb = m->m_nextpkt;
+ do {
+ sbfree(sb, m);
+ MFREE(m, mn);
+ m = mn;
+ } while (m);
+ }
+}
+
+/*
+ * Create a "control" mbuf containing the specified data
+ * with the specified type for presentation on a socket buffer.
+ */
+struct mbuf *
+sbcreatecontrol(p, size, type, level)
+ caddr_t p;
+ register int size;
+ int type, level;
+{
+ register struct cmsghdr *cp;
+ struct mbuf *m;
+
+ if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
+ return ((struct mbuf *) NULL);
+ cp = mtod(m, struct cmsghdr *);
+ /* XXX check size? */
+ (void)memcpy(CMSG_DATA(cp), p, size);
+ size += sizeof(*cp);
+ m->m_len = size;
+ cp->cmsg_len = size;
+ cp->cmsg_level = level;
+ cp->cmsg_type = type;
+ return (m);
+}
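+
+/*
+ * Usage sketch (assumption): building the kind of control record the
+ * SO_TIMESTAMP option would deliver alongside a datagram;
+ * example_timestamp is a hypothetical name.
+ */
+#ifdef notdef
+static struct mbuf *
+example_timestamp(struct timeval *tv)
+{
+	return (sbcreatecontrol((caddr_t)tv, sizeof(*tv),
+	    SCM_TIMESTAMP, SOL_SOCKET));
+}
+#endif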
+
+#ifdef PRU_OLDSTYLE
+/*
+ * The following routines mediate between the old-style `pr_usrreq'
+ * protocol implementations and the new-style `struct pr_usrreqs'
+ * calling convention.
+ */
+
+/* syntactic sugar */
+#define nomb (struct mbuf *)0
+
+static int
+old_abort(struct socket *so)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_ABORT, nomb, nomb, nomb);
+}
+
+static int
+old_accept(struct socket *so, struct mbuf *nam)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_ACCEPT, nomb, nam, nomb);
+}
+
+static int
+old_attach(struct socket *so, int proto)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_ATTACH, nomb,
+ (struct mbuf *)proto, /* XXX */
+ nomb);
+}
+
+static int
+old_bind(struct socket *so, struct mbuf *nam)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_BIND, nomb, nam, nomb);
+}
+
+static int
+old_connect(struct socket *so, struct mbuf *nam)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_CONNECT, nomb, nam, nomb);
+}
+
+static int
+old_connect2(struct socket *so1, struct socket *so2)
+{
+ return so1->so_proto->pr_ousrreq(so1, PRU_CONNECT2, nomb,
+ (struct mbuf *)so2, nomb);
+}
+
+static int
+old_control(struct socket *so, int cmd, caddr_t data, struct ifnet *ifp)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_CONTROL, (struct mbuf *)cmd,
+ (struct mbuf *)data,
+ (struct mbuf *)ifp);
+}
+
+static int
+old_detach(struct socket *so)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_DETACH, nomb, nomb, nomb);
+}
+
+static int
+old_disconnect(struct socket *so)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_DISCONNECT, nomb, nomb, nomb);
+}
+
+static int
+old_listen(struct socket *so)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_LISTEN, nomb, nomb, nomb);
+}
+
+static int
+old_peeraddr(struct socket *so, struct mbuf *nam)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_PEERADDR, nomb, nam, nomb);
+}
+
+static int
+old_rcvd(struct socket *so, int flags)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_RCVD, nomb,
+ (struct mbuf *)flags, /* XXX */
+ nomb);
+}
+
+static int
+old_rcvoob(struct socket *so, struct mbuf *m, int flags)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_RCVOOB, m,
+ (struct mbuf *)flags, /* XXX */
+ nomb);
+}
+
+static int
+old_send(struct socket *so, int flags, struct mbuf *m, struct mbuf *addr,
+ struct mbuf *control)
+{
+ int req;
+
+ if (flags & PRUS_OOB) {
+ req = PRU_SENDOOB;
+	} else if (flags & PRUS_EOF) {
+ req = PRU_SEND_EOF;
+ } else {
+ req = PRU_SEND;
+ }
+ return so->so_proto->pr_ousrreq(so, req, m, addr, control);
+}
+
+static int
+old_sense(struct socket *so, struct stat *sb)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_SENSE, (struct mbuf *)sb,
+ nomb, nomb);
+}
+
+static int
+old_shutdown(struct socket *so)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_SHUTDOWN, nomb, nomb, nomb);
+}
+
+static int
+old_sockaddr(struct socket *so, struct mbuf *nam)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_SOCKADDR, nomb, nam, nomb);
+}
+
+struct pr_usrreqs pru_oldstyle = {
+ old_abort, old_accept, old_attach, old_bind, old_connect,
+ old_connect2, old_control, old_detach, old_disconnect,
+ old_listen, old_peeraddr, old_rcvd, old_rcvoob, old_send,
+ old_sense, old_shutdown, old_sockaddr
+};
+
+#endif /* PRU_OLDSTYLE */
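+
+/*
+ * Example (assumption): a protocol that has not yet been converted
+ * keeps its old-style pr_usrreq entry point and sets its protosw's
+ * pr_usrreqs pointer to &pru_oldstyle; each new-style pru_* call is
+ * then translated back into the corresponding PRU_* request above.
+ */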
+
+/*
+ * Some routines that return EOPNOTSUPP for entry points that are not
+ * supported by a protocol. Fill in as needed.
+ */
+int
+pru_accept_notsupp(struct socket *so, struct mbuf *nam)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect2_notsupp(struct socket *so1, struct socket *so2)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_listen_notsupp(struct socket *so)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvd_notsupp(struct socket *so, int flags)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
+{
+ return EOPNOTSUPP;
+}
+
+/*
+ * This isn't really a ``null'' operation, but it's the default one
+ * and doesn't do anything destructive.
+ */
+int
+pru_sense_null(struct socket *so, struct stat *sb)
+{
+ sb->st_blksize = so->so_snd.sb_hiwat;
+ return 0;
+}
+
+/*
+ * Here is the definition of some of the basic objects in the kern.ipc
+ * branch of the MIB.
+ */
+SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
+
+/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
+static int dummy;
+SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
+
+SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, &sb_max, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
+ &sb_efficiency, 0, "");
diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c
index a9c5453..9f70207 100644
--- a/sys/kern/uipc_socket.c
+++ b/sys/kern/uipc_socket.c
@@ -30,13 +30,15 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95
+ * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94
+ * $Id: uipc_socket.c,v 1.24 1997/02/24 20:30:56 wollman Exp $
*/
#include <sys/param.h>
+#include <sys/queue.h>
#include <sys/systm.h>
#include <sys/proc.h>
-#include <sys/file.h>
+#include <sys/fcntl.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/domain.h>
@@ -45,6 +47,12 @@
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/resourcevar.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
+
+static int somaxconn = SOMAXCONN;
+SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn,
+ 0, "");
/*
* Socket operation routines.
@@ -55,13 +63,13 @@
*/
/*ARGSUSED*/
int
-socreate(dom, aso, type, proto)
+socreate(dom, aso, type, proto, p)
int dom;
struct socket **aso;
register int type;
int proto;
+ struct proc *p;
{
- struct proc *p = curproc; /* XXX */
register struct protosw *prp;
register struct socket *so;
register int error;
@@ -70,18 +78,19 @@ socreate(dom, aso, type, proto)
prp = pffindproto(dom, proto, type);
else
prp = pffindtype(dom, type);
- if (prp == 0 || prp->pr_usrreq == 0)
+ if (prp == 0 || prp->pr_usrreqs == 0)
return (EPROTONOSUPPORT);
if (prp->pr_type != type)
return (EPROTOTYPE);
MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT);
bzero((caddr_t)so, sizeof(*so));
+ TAILQ_INIT(&so->so_incomp);
+ TAILQ_INIT(&so->so_comp);
so->so_type = type;
if (p->p_ucred->cr_uid == 0)
so->so_state = SS_PRIV;
so->so_proto = prp;
- error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0,
- (struct mbuf *)(long)proto, (struct mbuf *)0);
+ error = (*prp->pr_usrreqs->pru_attach)(so, proto);
if (error) {
so->so_state |= SS_NOFDREF;
sofree(so);
@@ -99,9 +108,7 @@ sobind(so, nam)
int s = splnet();
int error;
- error =
- (*so->so_proto->pr_usrreq)(so, PRU_BIND,
- (struct mbuf *)0, nam, (struct mbuf *)0);
+ error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam);
splx(s);
return (error);
}
@@ -113,33 +120,40 @@ solisten(so, backlog)
{
int s = splnet(), error;
- error =
- (*so->so_proto->pr_usrreq)(so, PRU_LISTEN,
- (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
+ error = (*so->so_proto->pr_usrreqs->pru_listen)(so);
if (error) {
splx(s);
return (error);
}
- if (so->so_q == 0)
+ if (so->so_comp.tqh_first == NULL)
so->so_options |= SO_ACCEPTCONN;
- if (backlog < 0)
- backlog = 0;
- so->so_qlimit = min(backlog, SOMAXCONN);
+ if (backlog < 0 || backlog > somaxconn)
+ backlog = somaxconn;
+ so->so_qlimit = backlog;
splx(s);
return (0);
}
-int
+void
sofree(so)
register struct socket *so;
{
+ struct socket *head = so->so_head;
if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0)
return;
- if (so->so_head) {
- if (!soqremque(so, 0) && !soqremque(so, 1))
- panic("sofree dq");
- so->so_head = 0;
+ if (head != NULL) {
+ if (so->so_state & SS_INCOMP) {
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ } else if (so->so_state & SS_COMP) {
+ TAILQ_REMOVE(&head->so_comp, so, so_list);
+ } else {
+ panic("sofree: not queued");
+ }
+ head->so_qlen--;
+ so->so_state &= ~(SS_INCOMP|SS_COMP);
+ so->so_head = NULL;
}
sbrelease(&so->so_snd);
sorflush(so);
@@ -159,10 +173,16 @@ soclose(so)
int error = 0;
if (so->so_options & SO_ACCEPTCONN) {
- while (so->so_q0)
- (void) soabort(so->so_q0);
- while (so->so_q)
- (void) soabort(so->so_q);
+ struct socket *sp, *sonext;
+
+ for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) {
+ sonext = sp->so_list.tqe_next;
+ (void) soabort(sp);
+ }
+ for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) {
+ sonext = sp->so_list.tqe_next;
+ (void) soabort(sp);
+ }
}
if (so->so_pcb == 0)
goto discard;
@@ -176,17 +196,17 @@ soclose(so)
if ((so->so_state & SS_ISDISCONNECTING) &&
(so->so_state & SS_NBIO))
goto drop;
- while (so->so_state & SS_ISCONNECTED)
- if (error = tsleep((caddr_t)&so->so_timeo,
- PSOCK | PCATCH, netcls, so->so_linger * hz))
+ while (so->so_state & SS_ISCONNECTED) {
+ error = tsleep((caddr_t)&so->so_timeo,
+ PSOCK | PCATCH, "soclos", so->so_linger);
+ if (error)
break;
+ }
}
}
drop:
if (so->so_pcb) {
- int error2 =
- (*so->so_proto->pr_usrreq)(so, PRU_DETACH,
- (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
+ int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so);
if (error == 0)
error = error2;
}
@@ -207,9 +227,7 @@ soabort(so)
struct socket *so;
{
- return (
- (*so->so_proto->pr_usrreq)(so, PRU_ABORT,
- (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0));
+ return (*so->so_proto->pr_usrreqs->pru_abort)(so);
}
int
@@ -223,8 +241,7 @@ soaccept(so, nam)
if ((so->so_state & SS_NOFDREF) == 0)
panic("soaccept: !NOFDREF");
so->so_state &= ~SS_NOFDREF;
- error = (*so->so_proto->pr_usrreq)(so, PRU_ACCEPT,
- (struct mbuf *)0, nam, (struct mbuf *)0);
+ error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam);
splx(s);
return (error);
}
@@ -251,8 +268,7 @@ soconnect(so, nam)
(error = sodisconnect(so))))
error = EISCONN;
else
- error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT,
- (struct mbuf *)0, nam, (struct mbuf *)0);
+ error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam);
splx(s);
return (error);
}
@@ -265,8 +281,7 @@ soconnect2(so1, so2)
int s = splnet();
int error;
- error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2,
- (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0);
+ error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2);
splx(s);
return (error);
}
@@ -286,8 +301,7 @@ sodisconnect(so)
error = EALREADY;
goto bad;
}
- error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT,
- (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0);
+ error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so);
bad:
splx(s);
return (error);
@@ -349,7 +363,8 @@ sosend(so, addr, uio, top, control, flags)
#define snderr(errno) { error = errno; splx(s); goto release; }
restart:
- if (error = sblock(&so->so_snd, SBLOCKWAIT(flags)))
+ error = sblock(&so->so_snd, SBLOCKWAIT(flags));
+ if (error)
goto out;
do {
s = splnet();
@@ -358,17 +373,25 @@ restart:
if (so->so_error)
snderr(so->so_error);
if ((so->so_state & SS_ISCONNECTED) == 0) {
- if (so->so_proto->pr_flags & PR_CONNREQUIRED) {
+ /*
+			 * `sendto' and `sendmsg' are allowed on a connection-
+ * based socket if it supports implied connect.
+ * Return ENOTCONN if not connected and no address is
+ * supplied.
+ */
+ if ((so->so_proto->pr_flags & PR_CONNREQUIRED) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) {
if ((so->so_state & SS_ISCONFIRMING) == 0 &&
!(resid == 0 && clen != 0))
snderr(ENOTCONN);
} else if (addr == 0)
- snderr(EDESTADDRREQ);
+ snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ?
+ ENOTCONN : EDESTADDRREQ);
}
space = sbspace(&so->so_snd);
if (flags & MSG_OOB)
space += 1024;
- if (atomic && resid > so->so_snd.sb_hiwat ||
+ if ((atomic && resid > so->so_snd.sb_hiwat) ||
clen > so->so_snd.sb_hiwat)
snderr(EMSGSIZE);
if (space < resid + clen && uio &&
@@ -403,25 +426,15 @@ restart:
MGET(m, M_WAIT, MT_DATA);
mlen = MLEN;
}
- if (resid >= MINCLSIZE && space >= MCLBYTES) {
+ if (resid >= MINCLSIZE) {
MCLGET(m, M_WAIT);
if ((m->m_flags & M_EXT) == 0)
goto nopages;
mlen = MCLBYTES;
-#ifdef MAPPED_MBUFS
- len = min(MCLBYTES, resid);
-#else
- if (atomic && top == 0) {
- len = min(MCLBYTES - max_hdr, resid);
- m->m_data += max_hdr;
- } else
- len = min(MCLBYTES, resid);
-#endif
- space -= MCLBYTES;
+ len = min(min(mlen, resid), space);
} else {
nopages:
len = min(min(mlen, resid), space);
- space -= len;
/*
* For datagram protocols, leave room
* for protocol headers in first mbuf.
@@ -429,6 +442,7 @@ nopages:
if (atomic && top == 0 && len < mlen)
MH_ALIGN(m, len);
}
+ space -= len;
error = uiomove(mtod(m, caddr_t), (int)len, uio);
resid = uio->uio_resid;
m->m_len = len;
@@ -446,8 +460,17 @@ nopages:
if (dontroute)
so->so_options |= SO_DONTROUTE;
s = splnet(); /* XXX */
- error = (*so->so_proto->pr_usrreq)(so,
- (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND,
+ error = (*so->so_proto->pr_usrreqs->pru_send)(so,
+ (flags & MSG_OOB) ? PRUS_OOB :
+ /*
+		    * If the user set MSG_EOF, the protocol
+		    * understands this flag, and there is nothing
+		    * left to send, then use PRU_SEND_EOF instead of PRU_SEND.
+ */
+ ((flags & MSG_EOF) &&
+ (so->so_proto->pr_flags & PR_IMPLOPCL) &&
+ (resid <= 0)) ?
+ PRUS_EOF : 0,
top, addr, control);
splx(s);
if (dontroute)
@@ -500,7 +523,7 @@ soreceive(so, paddr, uio, mp0, controlp, flagsp)
register int flags, len, error, s, offset;
struct protosw *pr = so->so_proto;
struct mbuf *nextrecord;
- int moff, type;
+ int moff, type = 0;
int orig_resid = uio->uio_resid;
mp = mp0;
@@ -514,8 +537,7 @@ soreceive(so, paddr, uio, mp0, controlp, flagsp)
flags = 0;
if (flags & MSG_OOB) {
m = m_get(M_WAIT, MT_DATA);
- error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m,
- (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0);
+ error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK);
if (error)
goto bad;
do {
@@ -531,11 +553,11 @@ bad:
if (mp)
*mp = (struct mbuf *)0;
if (so->so_state & SS_ISCONFIRMING && uio->uio_resid)
- (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
- (struct mbuf *)0, (struct mbuf *)0);
+ (*pr->pr_usrreqs->pru_rcvd)(so, 0);
restart:
- if (error = sblock(&so->so_rcv, SBLOCKWAIT(flags)))
+ error = sblock(&so->so_rcv, SBLOCKWAIT(flags));
+ if (error)
return (error);
s = splnet();
@@ -545,17 +567,17 @@ restart:
* (subject to any timeout) if:
* 1. the current count is less than the low water mark, or
* 2. MSG_WAITALL is set, and it is possible to do the entire
- * receive operation at once if we block (resid <= hiwat), or
- * 3. MSG_DONTWAIT is not set.
+ * receive operation at once if we block (resid <= hiwat).
+ * 3. MSG_DONTWAIT is not set.
* If MSG_WAITALL is set but resid is larger than the receive buffer,
* we have to do the receive in sections, and thus risk returning
* a short count if a timeout or signal occurs after we start.
*/
- if (m == 0 || ((flags & MSG_DONTWAIT) == 0 &&
+ if (m == 0 || (((flags & MSG_DONTWAIT) == 0 &&
so->so_rcv.sb_cc < uio->uio_resid) &&
(so->so_rcv.sb_cc < so->so_rcv.sb_lowat ||
((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) &&
- m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0) {
+ m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) {
#ifdef DIAGNOSTIC
if (m == 0 && so->so_rcv.sb_cc)
panic("receive 1");
@@ -687,6 +709,8 @@ dontblock:
splx(s);
error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio);
s = splnet();
+ if (error)
+ goto release;
} else
uio->uio_resid -= len;
if (len == m->m_len - moff) {
@@ -753,7 +777,8 @@ dontblock:
splx(s);
return (0);
}
- if (m = so->so_rcv.sb_mb)
+ m = so->so_rcv.sb_mb;
+ if (m)
nextrecord = m->m_nextpkt;
}
}
@@ -767,9 +792,7 @@ dontblock:
if (m == 0)
so->so_rcv.sb_mb = nextrecord;
if (pr->pr_flags & PR_WANTRCVD && so->so_pcb)
- (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0,
- (struct mbuf *)(long)flags, (struct mbuf *)0,
- (struct mbuf *)0);
+ (*pr->pr_usrreqs->pru_rcvd)(so, flags);
}
if (orig_resid == uio->uio_resid && orig_resid &&
(flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) {
@@ -777,7 +800,7 @@ dontblock:
splx(s);
goto restart;
}
-
+
if (flagsp)
*flagsp |= flags;
release:
@@ -797,8 +820,7 @@ soshutdown(so, how)
if (how & FREAD)
sorflush(so);
if (how & FWRITE)
- return ((*pr->pr_usrreq)(so, PRU_SHUTDOWN,
- (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0));
+ return ((*pr->pr_usrreqs->pru_shutdown)(so));
return (0);
}
@@ -857,6 +879,7 @@ sosetopt(so, level, optname, m0)
case SO_REUSEADDR:
case SO_REUSEPORT:
case SO_OOBINLINE:
+ case SO_TIMESTAMP:
if (m == NULL || m->m_len < sizeof (int)) {
error = EINVAL;
goto bad;
@@ -907,7 +930,7 @@ sosetopt(so, level, optname, m0)
goto bad;
}
tv = mtod(m, struct timeval *);
- if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) {
+ if (tv->tv_sec > SHRT_MAX / hz - hz) {
error = EDOM;
goto bad;
}
@@ -925,6 +948,11 @@ sosetopt(so, level, optname, m0)
break;
}
+ case SO_PRIVSTATE:
+ /* we don't care what the parameter is... */
+ so->so_state &= ~SS_PRIV;
+ break;
+
default:
error = ENOPROTOOPT;
break;
@@ -976,9 +1004,14 @@ sogetopt(so, level, optname, mp)
case SO_REUSEPORT:
case SO_BROADCAST:
case SO_OOBINLINE:
+ case SO_TIMESTAMP:
*mtod(m, int *) = so->so_options & optname;
break;
+ case SO_PRIVSTATE:
+ *mtod(m, int *) = so->so_state & SS_PRIV;
+ break;
+
case SO_TYPE:
*mtod(m, int *) = so->so_type;
break;
diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c
index 865108a..e19db0c 100644
--- a/sys/kern/uipc_socket2.c
+++ b/sys/kern/uipc_socket2.c
@@ -30,30 +30,32 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95
+ * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93
+ * $Id: uipc_socket2.c,v 1.21 1997/02/19 19:15:43 wollman Exp $
*/
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/kernel.h>
#include <sys/proc.h>
#include <sys/file.h>
#include <sys/buf.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
+#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
+#include <sys/signalvar.h>
+#include <sys/sysctl.h>
/*
* Primitive routines for operating on sockets and socket buffers
*/
-/* strings for sleep message: */
-char netio[] = "netio";
-char netcon[] = "netcon";
-char netcls[] = "netcls";
+u_long sb_max = SB_MAX; /* XXX should be static */
-u_long sb_max = SB_MAX; /* patchable */
+static u_long sb_efficiency = 8; /* parameter for sbreserve() */
/*
* Procedures to manipulate state flags of socket
@@ -76,7 +78,7 @@ u_long sb_max = SB_MAX; /* patchable */
* structure queued on so_q0 by calling sonewconn(). When the connection
* is established, soisconnected() is called, and transfers the
* socket structure to so_q, making it available to accept().
- *
+ *
* If a socket is closed with sockets on either
* so_q0 or so_q, these sockets are dropped.
*
@@ -102,8 +104,12 @@ soisconnected(so)
so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING);
so->so_state |= SS_ISCONNECTED;
- if (head && soqremque(so, 0)) {
- soqinsque(head, so, 1);
+ if (head && (so->so_state & SS_INCOMP)) {
+ TAILQ_REMOVE(&head->so_incomp, so, so_list);
+ head->so_incqlen--;
+ so->so_state &= ~SS_INCOMP;
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ so->so_state |= SS_COMP;
sorwakeup(head);
wakeup((caddr_t)&head->so_timeo);
} else {
@@ -138,6 +144,49 @@ soisdisconnected(so)
}
/*
+ * Return a random connection that hasn't been serviced yet and
+ * is eligible for discard. There is a one in (qlen + 1) chance that
+ * we will return a null, saying that there are no droppable
+ * requests. In this case, the protocol specific code should drop
+ * the new request. This ensures fairness.
+ *
+ * This may be used in conjunction with protocol specific queue
+ * congestion routines.
+ */
+struct socket *
+sodropablereq(head)
+ register struct socket *head;
+{
+ register struct socket *so;
+ unsigned int i, j, qlen;
+
+ static int rnd;
+ static long old_mono_secs;
+ static unsigned int cur_cnt, old_cnt;
+
+ if ((i = (mono_time.tv_sec - old_mono_secs)) != 0) {
+ old_mono_secs = mono_time.tv_sec;
+ old_cnt = cur_cnt / i;
+ cur_cnt = 0;
+ }
+
+ so = TAILQ_FIRST(&head->so_incomp);
+ if (!so)
+ return (so);
+
+ qlen = head->so_incqlen;
+ if (++cur_cnt > qlen || old_cnt > qlen) {
+ rnd = (314159 * rnd + 66329) & 0xffff;
+ j = ((qlen + 1) * rnd) >> 16;
+
+ while (j-- && so)
+ so = TAILQ_NEXT(so, so_list);
+ }
+
+ return (so);
+}
+
+/*
* When an attempt at a new connection is noted on a socket
* which accepts connections, sonewconn is called. If the
* connection is possible (subject to space constraints, etc.)
@@ -154,14 +203,14 @@ sonewconn1(head, connstatus)
int connstatus;
{
register struct socket *so;
- int soqueue = connstatus ? 1 : 0;
- if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2)
+ if (head->so_qlen > 3 * head->so_qlimit / 2)
return ((struct socket *)0);
MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_DONTWAIT);
- if (so == NULL)
+ if (so == NULL)
return ((struct socket *)0);
bzero((caddr_t)so, sizeof(*so));
+ so->so_head = head;
so->so_type = head->so_type;
so->so_options = head->so_options &~ SO_ACCEPTCONN;
so->so_linger = head->so_linger;
@@ -170,13 +219,21 @@ sonewconn1(head, connstatus)
so->so_timeo = head->so_timeo;
so->so_pgid = head->so_pgid;
(void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat);
- soqinsque(head, so, soqueue);
- if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH,
- (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)) {
- (void) soqremque(so, soqueue);
+
+ if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0)) {
(void) free((caddr_t)so, M_SOCKET);
return ((struct socket *)0);
}
+
+ if (connstatus) {
+ TAILQ_INSERT_TAIL(&head->so_comp, so, so_list);
+ so->so_state |= SS_COMP;
+ } else {
+ TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list);
+ so->so_state |= SS_INCOMP;
+ head->so_incqlen++;
+ }
+ head->so_qlen++;
if (connstatus) {
sorwakeup(head);
wakeup((caddr_t)&head->so_timeo);
@@ -185,57 +242,6 @@ sonewconn1(head, connstatus)
return (so);
}
-void
-soqinsque(head, so, q)
- register struct socket *head, *so;
- int q;
-{
-
- register struct socket **prev;
- so->so_head = head;
- if (q == 0) {
- head->so_q0len++;
- so->so_q0 = 0;
- for (prev = &(head->so_q0); *prev; )
- prev = &((*prev)->so_q0);
- } else {
- head->so_qlen++;
- so->so_q = 0;
- for (prev = &(head->so_q); *prev; )
- prev = &((*prev)->so_q);
- }
- *prev = so;
-}
-
-int
-soqremque(so, q)
- register struct socket *so;
- int q;
-{
- register struct socket *head, *prev, *next;
-
- head = so->so_head;
- prev = head;
- for (;;) {
- next = q ? prev->so_q : prev->so_q0;
- if (next == so)
- break;
- if (next == 0)
- return (0);
- prev = next;
- }
- if (q == 0) {
- prev->so_q0 = next->so_q0;
- head->so_q0len--;
- } else {
- prev->so_q = next->so_q;
- head->so_qlen--;
- }
- next->so_q0 = next->so_q = 0;
- next->so_head = 0;
- return (1);
-}
-
/*
* Socantsendmore indicates that no more data will be sent on the
* socket; it would normally be applied to a socket when the user
@@ -274,11 +280,11 @@ sbwait(sb)
sb->sb_flags |= SB_WAIT;
return (tsleep((caddr_t)&sb->sb_cc,
- (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, netio,
+ (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait",
sb->sb_timeo));
}
-/*
+/*
* Lock a sockbuf already known to be locked;
* return any error returned from sleep (EINTR).
*/
@@ -290,9 +296,10 @@ sb_lock(sb)
while (sb->sb_flags & SB_LOCK) {
sb->sb_flags |= SB_WANT;
- if (error = tsleep((caddr_t)&sb->sb_flags,
+ error = tsleep((caddr_t)&sb->sb_flags,
(sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH,
- netio, 0))
+ "sblock", 0);
+ if (error)
return (error);
}
sb->sb_flags |= SB_LOCK;
@@ -390,11 +397,10 @@ sbreserve(sb, cc)
struct sockbuf *sb;
u_long cc;
{
-
- if (cc > sb_max * MCLBYTES / (MSIZE + MCLBYTES))
+ if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES))
return (0);
sb->sb_hiwat = cc;
- sb->sb_mbmax = min(cc * 2, sb_max);
+ sb->sb_mbmax = min(cc * sb_efficiency, sb_max);
if (sb->sb_lowat > sb->sb_hiwat)
sb->sb_lowat = sb->sb_hiwat;
return (1);
@@ -452,7 +458,8 @@ sbappend(sb, m)
if (m == 0)
return;
- if (n = sb->sb_mb) {
+ n = sb->sb_mb;
+ if (n) {
while (n->m_nextpkt)
n = n->m_nextpkt;
do {
@@ -476,7 +483,7 @@ sbcheck(sb)
for (m = sb->sb_mb; m; m = m->m_next) {
len += m->m_len;
mbcnt += MSIZE;
- if (m->m_flags & M_EXT)
+ if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */
mbcnt += m->m_ext.ext_size;
if (m->m_nextpkt)
panic("sbcheck nextpkt");
@@ -502,7 +509,8 @@ sbappendrecord(sb, m0)
if (m0 == 0)
return;
- if (m = sb->sb_mb)
+ m = sb->sb_mb;
+ if (m)
while (m->m_nextpkt)
m = m->m_nextpkt;
/*
@@ -538,7 +546,8 @@ sbinsertoob(sb, m0)
if (m0 == 0)
return;
- for (mp = &sb->sb_mb; m = *mp; mp = &((*mp)->m_nextpkt)) {
+ for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) {
+ m = *mp;
again:
switch (m->m_type) {
@@ -546,7 +555,8 @@ sbinsertoob(sb, m0)
continue; /* WANT next train */
case MT_CONTROL:
- if (m = m->m_next)
+ m = m->m_next;
+ if (m)
goto again; /* inspect THIS train further */
}
break;
@@ -607,7 +617,8 @@ panic("sbappendaddr");
m->m_next = control;
for (n = m; n; n = n->m_next)
sballoc(sb, n);
- if (n = sb->sb_mb) {
+ n = sb->sb_mb;
+ if (n) {
while (n->m_nextpkt)
n = n->m_nextpkt;
n->m_nextpkt = m;
@@ -619,7 +630,7 @@ panic("sbappendaddr");
int
sbappendcontrol(sb, m0, control)
struct sockbuf *sb;
- struct mbuf *m0, *control;
+ struct mbuf *control, *m0;
{
register struct mbuf *m, *n;
int space = 0;
@@ -639,7 +650,8 @@ sbappendcontrol(sb, m0, control)
n->m_next = m0; /* concatenate data to control */
for (m = control; m; m = m->m_next)
sballoc(sb, m);
- if (n = sb->sb_mb) {
+ n = sb->sb_mb;
+ if (n) {
while (n->m_nextpkt)
n = n->m_nextpkt;
n->m_nextpkt = control;
@@ -774,6 +786,233 @@ sbdroprecord(sb)
do {
sbfree(sb, m);
MFREE(m, mn);
- } while (m = mn);
+ m = mn;
+ } while (m);
}
}
+
+/*
+ * Create a "control" mbuf containing the specified data
+ * with the specified type for presentation on a socket buffer.
+ */
+struct mbuf *
+sbcreatecontrol(p, size, type, level)
+ caddr_t p;
+ register int size;
+ int type, level;
+{
+ register struct cmsghdr *cp;
+ struct mbuf *m;
+
+ if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL)
+ return ((struct mbuf *) NULL);
+ cp = mtod(m, struct cmsghdr *);
+ /* XXX check size? */
+ (void)memcpy(CMSG_DATA(cp), p, size);
+ size += sizeof(*cp);
+ m->m_len = size;
+ cp->cmsg_len = size;
+ cp->cmsg_level = level;
+ cp->cmsg_type = type;
+ return (m);
+}
+
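
sbcreatecontrol() packs a payload behind a cmsghdr in a single MT_CONTROL mbuf, the same layout user programs build with the CMSG_* macros (note the kernel code above uses the unaligned size + sizeof(*cp), predating the alignment-aware macros). A hedged userland sketch of constructing the equivalent header with the standard socket API:

    #include <sys/socket.h>
    #include <string.h>
    #include <stdio.h>

    int
    main(void)
    {
        union {
            char           space[CMSG_SPACE(sizeof(int))];
            struct cmsghdr hdr;     /* forces correct alignment */
        } u;
        struct cmsghdr *cp = &u.hdr;
        int payload = 42;           /* e.g., a file descriptor */

        cp->cmsg_len = CMSG_LEN(sizeof(payload));
        cp->cmsg_level = SOL_SOCKET;
        cp->cmsg_type = SCM_RIGHTS;
        memcpy(CMSG_DATA(cp), &payload, sizeof(payload));

        printf("control message of %u bytes built\n", (unsigned)cp->cmsg_len);
        return (0);
    }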
+#ifdef PRU_OLDSTYLE
+/*
+ * The following routines mediate between the old-style `pr_usrreq'
+ * protocol implementations and the new-style `struct pr_usrreqs'
+ * calling convention.
+ */
+
+/* syntactic sugar */
+#define nomb (struct mbuf *)0
+
+static int
+old_abort(struct socket *so)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_ABORT, nomb, nomb, nomb);
+}
+
+static int
+old_accept(struct socket *so, struct mbuf *nam)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_ACCEPT, nomb, nam, nomb);
+}
+
+static int
+old_attach(struct socket *so, int proto)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_ATTACH, nomb,
+ (struct mbuf *)proto, /* XXX */
+ nomb);
+}
+
+static int
+old_bind(struct socket *so, struct mbuf *nam)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_BIND, nomb, nam, nomb);
+}
+
+static int
+old_connect(struct socket *so, struct mbuf *nam)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_CONNECT, nomb, nam, nomb);
+}
+
+static int
+old_connect2(struct socket *so1, struct socket *so2)
+{
+ return so1->so_proto->pr_ousrreq(so1, PRU_CONNECT2, nomb,
+ (struct mbuf *)so2, nomb);
+}
+
+static int
+old_control(struct socket *so, int cmd, caddr_t data, struct ifnet *ifp)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_CONTROL, (struct mbuf *)cmd,
+ (struct mbuf *)data,
+ (struct mbuf *)ifp);
+}
+
+static int
+old_detach(struct socket *so)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_DETACH, nomb, nomb, nomb);
+}
+
+static int
+old_disconnect(struct socket *so)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_DISCONNECT, nomb, nomb, nomb);
+}
+
+static int
+old_listen(struct socket *so)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_LISTEN, nomb, nomb, nomb);
+}
+
+static int
+old_peeraddr(struct socket *so, struct mbuf *nam)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_PEERADDR, nomb, nam, nomb);
+}
+
+static int
+old_rcvd(struct socket *so, int flags)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_RCVD, nomb,
+ (struct mbuf *)flags, /* XXX */
+ nomb);
+}
+
+static int
+old_rcvoob(struct socket *so, struct mbuf *m, int flags)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_RCVOOB, m,
+ (struct mbuf *)flags, /* XXX */
+ nomb);
+}
+
+static int
+old_send(struct socket *so, int flags, struct mbuf *m, struct mbuf *addr,
+ struct mbuf *control)
+{
+ int req;
+
+ if (flags & PRUS_OOB) {
+ req = PRU_SENDOOB;
+ } else if (flags & PRUS_EOF) {
+ req = PRU_SEND_EOF;
+ } else {
+ req = PRU_SEND;
+ }
+ return so->so_proto->pr_ousrreq(so, req, m, addr, control);
+}
+
+static int
+old_sense(struct socket *so, struct stat *sb)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_SENSE, (struct mbuf *)sb,
+ nomb, nomb);
+}
+
+static int
+old_shutdown(struct socket *so)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_SHUTDOWN, nomb, nomb, nomb);
+}
+
+static int
+old_sockaddr(struct socket *so, struct mbuf *nam)
+{
+ return so->so_proto->pr_ousrreq(so, PRU_SOCKADDR, nomb, nam, nomb);
+}
+
+struct pr_usrreqs pru_oldstyle = {
+ old_abort, old_accept, old_attach, old_bind, old_connect,
+ old_connect2, old_control, old_detach, old_disconnect,
+ old_listen, old_peeraddr, old_rcvd, old_rcvoob, old_send,
+ old_sense, old_shutdown, old_sockaddr
+};
+
+#endif /* PRU_OLDSTYLE */
+
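
The pru_oldstyle table above is a classic adapter: each typed, new-style entry point re-encodes its arguments into the old pr_usrreq(so, req, m, nam, control) convention, so protocols not yet converted to struct pr_usrreqs keep working unchanged. The pattern boiled down to a standalone C sketch (all names invented):

    #include <stdio.h>

    /* Old-style: one dispatcher taking a request code and untyped args. */
    enum req { REQ_OPEN, REQ_CLOSE };

    static int
    legacy_dispatch(int req, void *arg)
    {
        switch (req) {
        case REQ_OPEN:
            return (printf("legacy open(%s)\n", (char *)arg) < 0);
        case REQ_CLOSE:
            return (printf("legacy close()\n") < 0);
        }
        return (-1);
    }

    /* New-style: a table of typed entry points, like struct pr_usrreqs. */
    struct ops {
        int (*open)(char *name);
        int (*close)(void);
    };

    /* Shims re-encode typed arguments into the old convention... */
    static int
    shim_open(char *name)
    {
        return (legacy_dispatch(REQ_OPEN, name));
    }

    static int
    shim_close(void)
    {
        return (legacy_dispatch(REQ_CLOSE, NULL));
    }

    /* ...so callers can use the new table even for legacy code. */
    static struct ops legacy_ops = { shim_open, shim_close };

    int
    main(void)
    {
        legacy_ops.open("demo");
        legacy_ops.close();
        return (0);
    }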
+/*
+ * Some routines that return EOPNOTSUPP for entry points that are not
+ * supported by a protocol. Fill in as needed.
+ */
+int
+pru_accept_notsupp(struct socket *so, struct mbuf *nam)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_connect2_notsupp(struct socket *so1, struct socket *so2)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_listen_notsupp(struct socket *so)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvd_notsupp(struct socket *so, int flags)
+{
+ return EOPNOTSUPP;
+}
+
+int
+pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags)
+{
+ return EOPNOTSUPP;
+}
+
+/*
+ * This isn't really a ``null'' operation, but it's the default one
+ * and doesn't do anything destructive.
+ */
+int
+pru_sense_null(struct socket *so, struct stat *sb)
+{
+ sb->st_blksize = so->so_snd.sb_hiwat;
+ return 0;
+}
+
+/*
+ * Here is the definition of some of the basic objects in the kern.ipc
+ * branch of the MIB.
+ */
+SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC");
+
+/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */
+static int dummy;
+SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, "");
+
+SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, &sb_max, 0, "");
+SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW,
+ &sb_efficiency, 0, "");
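
These declarations put the socket-buffer limits under a new kern.ipc sysctl node (with a placeholder keeping the old kern.maxsockbuf slot occupied). From user space the value is reachable with the same MIB constants through sysctl(3); a hedged sketch, assuming CTL_KERN/KERN_IPC/KIPC_MAXSOCKBUF are exposed via <sys/sysctl.h> as the diff suggests:

    #include <sys/types.h>
    #include <sys/sysctl.h>
    #include <stdio.h>

    int
    main(void)
    {
        int mib[3] = { CTL_KERN, KERN_IPC, KIPC_MAXSOCKBUF };
        int maxsockbuf;
        size_t len = sizeof(maxsockbuf);

        if (sysctl(mib, 3, &maxsockbuf, &len, NULL, 0) == -1) {
            perror("sysctl");
            return (1);
        }
        printf("kern.ipc.maxsockbuf = %d\n", maxsockbuf);
        return (0);
    }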
diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c
index 800434c..e3aca30 100644
--- a/sys/kern/uipc_syscalls.c
+++ b/sys/kern/uipc_syscalls.c
@@ -30,26 +30,43 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)uipc_syscalls.c 8.6 (Berkeley) 2/14/95
+ * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94
+ * $Id: uipc_syscalls.c,v 1.22 1997/02/22 09:39:29 peter Exp $
*/
+#include "opt_ktrace.h"
+
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/proc.h>
+#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/buf.h>
#include <sys/malloc.h>
#include <sys/mbuf.h>
#include <sys/protosw.h>
+#include <sys/stat.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
+#include <sys/signalvar.h>
+#include <sys/un.h>
#ifdef KTRACE
#include <sys/ktrace.h>
#endif
-#include <sys/mount.h>
-#include <sys/syscallargs.h>
+extern int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags,
+ int *retsize));
+extern int recvit __P((struct proc *p, int s, struct msghdr *mp,
+ caddr_t namelenp, int *retsize));
+
+static int accept1 __P((struct proc *p, struct accept_args *uap, int *retval,
+ int compat));
+static int getsockname1 __P((struct proc *p, struct getsockname_args *uap,
+ int *retval, int compat));
+static int getpeername1 __P((struct proc *p, struct getpeername_args *uap,
+ int *retval, int compat));
/*
* System call interface to the socket abstraction.
@@ -64,24 +81,25 @@ int
socket(p, uap, retval)
struct proc *p;
register struct socket_args /* {
- syscallarg(int) domain;
- syscallarg(int) type;
- syscallarg(int) protocol;
+ int domain;
+ int type;
+ int protocol;
} */ *uap;
- register_t *retval;
+ int *retval;
{
struct filedesc *fdp = p->p_fd;
struct socket *so;
struct file *fp;
int fd, error;
- if (error = falloc(p, &fp, &fd))
+ error = falloc(p, &fp, &fd);
+ if (error)
return (error);
fp->f_flag = FREAD|FWRITE;
fp->f_type = DTYPE_SOCKET;
fp->f_ops = &socketops;
- if (error = socreate(SCARG(uap, domain), &so, SCARG(uap, type),
- SCARG(uap, protocol))) {
+ error = socreate(uap->domain, &so, uap->type, uap->protocol, p);
+ if (error) {
fdp->fd_ofiles[fd] = 0;
ffree(fp);
} else {
@@ -96,20 +114,21 @@ int
bind(p, uap, retval)
struct proc *p;
register struct bind_args /* {
- syscallarg(int) s;
- syscallarg(caddr_t) name;
- syscallarg(int) namelen;
+ int s;
+ caddr_t name;
+ int namelen;
} */ *uap;
- register_t *retval;
+ int *retval;
{
struct file *fp;
struct mbuf *nam;
int error;
- if (error = getsock(p->p_fd, SCARG(uap, s), &fp))
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
return (error);
- if (error = sockargs(&nam, SCARG(uap, name), SCARG(uap, namelen),
- MT_SONAME))
+ error = sockargs(&nam, uap->name, uap->namelen, MT_SONAME);
+ if (error)
return (error);
error = sobind((struct socket *)fp->f_data, nam);
m_freem(nam);
@@ -121,159 +140,161 @@ int
listen(p, uap, retval)
struct proc *p;
register struct listen_args /* {
- syscallarg(int) s;
- syscallarg(int) backlog;
+ int s;
+ int backlog;
} */ *uap;
- register_t *retval;
+ int *retval;
{
struct file *fp;
int error;
- if (error = getsock(p->p_fd, SCARG(uap, s), &fp))
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
return (error);
- return (solisten((struct socket *)fp->f_data, SCARG(uap, backlog)));
+ return (solisten((struct socket *)fp->f_data, uap->backlog));
}
-#ifdef COMPAT_OLDSOCK
-int
-accept(p, uap, retval)
- struct proc *p;
- struct accept_args /* {
- syscallarg(int) s;
- syscallarg(caddr_t) name;
- syscallarg(int *) anamelen;
- } */ *uap;
- register_t *retval;
-{
-
- return (accept1(p, uap, retval, 0));
-}
-
-int
-compat_43_accept(p, uap, retval)
- struct proc *p;
- struct accept_args /* {
- syscallarg(int) s;
- syscallarg(caddr_t) name;
- syscallarg(int *) anamelen;
- } */ *uap;
- register_t *retval;
-{
-
- return (accept1(p, uap, retval, 1));
-}
-#else /* COMPAT_OLDSOCK */
-
-#define accept1 accept
-#endif
-
-int
-accept1(p, uap, retval, compat_43)
+static int
+accept1(p, uap, retval, compat)
struct proc *p;
register struct accept_args /* {
- syscallarg(int) s;
- syscallarg(caddr_t) name;
- syscallarg(int *) anamelen;
+ int s;
+ caddr_t name;
+ int *anamelen;
} */ *uap;
- register_t *retval;
- int compat_43;
+ int *retval;
+ int compat;
{
struct file *fp;
struct mbuf *nam;
- int namelen, error, s, tmpfd;
- register struct socket *so;
-
- if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, anamelen),
- (caddr_t)&namelen, sizeof (namelen))))
- return (error);
- if (error = getsock(p->p_fd, SCARG(uap, s), &fp))
+ int namelen, error, s;
+ struct socket *head, *so;
+ short fflag; /* type must match fp->f_flag */
+
+ if (uap->name) {
+ error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen,
+ sizeof (namelen));
+ if (error)
+ return (error);
+ }
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
return (error);
s = splnet();
- so = (struct socket *)fp->f_data;
- if ((so->so_options & SO_ACCEPTCONN) == 0) {
+ head = (struct socket *)fp->f_data;
+ if ((head->so_options & SO_ACCEPTCONN) == 0) {
splx(s);
return (EINVAL);
}
- if ((so->so_state & SS_NBIO) && so->so_qlen == 0) {
+ if ((head->so_state & SS_NBIO) && head->so_comp.tqh_first == NULL) {
splx(s);
return (EWOULDBLOCK);
}
- while (so->so_qlen == 0 && so->so_error == 0) {
- if (so->so_state & SS_CANTRCVMORE) {
- so->so_error = ECONNABORTED;
+ while (head->so_comp.tqh_first == NULL && head->so_error == 0) {
+ if (head->so_state & SS_CANTRCVMORE) {
+ head->so_error = ECONNABORTED;
break;
}
- if (error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
- netcon, 0)) {
+ error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH,
+ "accept", 0);
+ if (error) {
splx(s);
return (error);
}
}
- if (so->so_error) {
- error = so->so_error;
- so->so_error = 0;
+ if (head->so_error) {
+ error = head->so_error;
+ head->so_error = 0;
splx(s);
return (error);
}
- if (error = falloc(p, &fp, &tmpfd)) {
+ fflag = fp->f_flag;
+ error = falloc(p, &fp, retval);
+ if (error) {
splx(s);
return (error);
}
- *retval = tmpfd;
- { struct socket *aso = so->so_q;
- if (soqremque(aso, 1) == 0)
- panic("accept");
- so = aso;
- }
+
+ so = head->so_comp.tqh_first;
+ if (so == NULL)
+ panic("accept: nothing queued");
+ TAILQ_REMOVE(&head->so_comp, so, so_list);
+ so->so_state &= ~SS_COMP;
+ so->so_head = NULL;
+ head->so_qlen--;
+
fp->f_type = DTYPE_SOCKET;
- fp->f_flag = FREAD|FWRITE;
+ fp->f_flag = fflag;
fp->f_ops = &socketops;
fp->f_data = (caddr_t)so;
nam = m_get(M_WAIT, MT_SONAME);
(void) soaccept(so, nam);
- if (SCARG(uap, name)) {
+ if (uap->name) {
#ifdef COMPAT_OLDSOCK
- if (compat_43)
+ if (compat)
mtod(nam, struct osockaddr *)->sa_family =
mtod(nam, struct sockaddr *)->sa_family;
#endif
if (namelen > nam->m_len)
namelen = nam->m_len;
/* SHOULD COPY OUT A CHAIN HERE */
- if ((error = copyout(mtod(nam, caddr_t),
- (caddr_t)SCARG(uap, name), (u_int)namelen)) == 0)
+ error = copyout(mtod(nam, caddr_t), (caddr_t)uap->name,
+ (u_int)namelen);
+ if (!error)
error = copyout((caddr_t)&namelen,
- (caddr_t)SCARG(uap, anamelen),
- sizeof (*SCARG(uap, anamelen)));
+ (caddr_t)uap->anamelen, sizeof (*uap->anamelen));
}
m_freem(nam);
splx(s);
return (error);
}
+int
+accept(p, uap, retval)
+ struct proc *p;
+ struct accept_args *uap;
+ int *retval;
+{
+
+ return (accept1(p, uap, retval, 0));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+oaccept(p, uap, retval)
+ struct proc *p;
+ struct accept_args *uap;
+ int *retval;
+{
+
+ return (accept1(p, uap, retval, 1));
+}
+#endif /* COMPAT_OLDSOCK */
+
/* ARGSUSED */
int
connect(p, uap, retval)
struct proc *p;
register struct connect_args /* {
- syscallarg(int) s;
- syscallarg(caddr_t) name;
- syscallarg(int) namelen;
+ int s;
+ caddr_t name;
+ int namelen;
} */ *uap;
- register_t *retval;
+ int *retval;
{
struct file *fp;
register struct socket *so;
struct mbuf *nam;
int error, s;
- if (error = getsock(p->p_fd, SCARG(uap, s), &fp))
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
return (error);
so = (struct socket *)fp->f_data;
if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING))
return (EALREADY);
- if (error = sockargs(&nam, SCARG(uap, name), SCARG(uap, namelen),
- MT_SONAME))
+ error = sockargs(&nam, uap->name, uap->namelen, MT_SONAME);
+ if (error)
return (error);
error = soconnect(so, nam);
if (error)
@@ -283,10 +304,12 @@ connect(p, uap, retval)
return (EINPROGRESS);
}
s = splnet();
- while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0)
- if (error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
- netcon, 0))
+ while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) {
+ error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH,
+ "connec", 0);
+ if (error)
break;
+ }
if (error == 0) {
error = so->so_error;
so->so_error = 0;
@@ -304,51 +327,56 @@ int
socketpair(p, uap, retval)
struct proc *p;
register struct socketpair_args /* {
- syscallarg(int) domain;
- syscallarg(int) type;
- syscallarg(int) protocol;
- syscallarg(int *) rsv;
+ int domain;
+ int type;
+ int protocol;
+ int *rsv;
} */ *uap;
- register_t *retval;
+ int retval[];
{
register struct filedesc *fdp = p->p_fd;
struct file *fp1, *fp2;
struct socket *so1, *so2;
int fd, error, sv[2];
- if (error = socreate(SCARG(uap, domain), &so1, SCARG(uap, type),
- SCARG(uap, protocol)))
+ error = socreate(uap->domain, &so1, uap->type, uap->protocol, p);
+ if (error)
return (error);
- if (error = socreate(SCARG(uap, domain), &so2, SCARG(uap, type),
- SCARG(uap, protocol)))
+ error = socreate(uap->domain, &so2, uap->type, uap->protocol, p);
+ if (error)
goto free1;
- if (error = falloc(p, &fp1, &fd))
+ error = falloc(p, &fp1, &fd);
+ if (error)
goto free2;
sv[0] = fd;
fp1->f_flag = FREAD|FWRITE;
fp1->f_type = DTYPE_SOCKET;
fp1->f_ops = &socketops;
fp1->f_data = (caddr_t)so1;
- if (error = falloc(p, &fp2, &fd))
+ error = falloc(p, &fp2, &fd);
+ if (error)
goto free3;
fp2->f_flag = FREAD|FWRITE;
fp2->f_type = DTYPE_SOCKET;
fp2->f_ops = &socketops;
fp2->f_data = (caddr_t)so2;
sv[1] = fd;
- if (error = soconnect2(so1, so2))
+ error = soconnect2(so1, so2);
+ if (error)
goto free4;
- if (SCARG(uap, type) == SOCK_DGRAM) {
+ if (uap->type == SOCK_DGRAM) {
/*
* Datagram socket connection is asymmetric.
*/
- if (error = soconnect2(so2, so1))
+ error = soconnect2(so2, so1);
+ if (error)
goto free4;
}
- error = copyout((caddr_t)sv, (caddr_t)SCARG(uap, rsv),
- 2 * sizeof (int));
+ error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int));
+#if 0 /* old pipe(2) syscall compatibility, unused these days */
retval[0] = sv[0]; /* XXX ??? */
retval[1] = sv[1]; /* XXX ??? */
+#endif
return (error);
free4:
ffree(fp2);
@@ -364,145 +392,11 @@ free1:
}
int
-sendto(p, uap, retval)
- struct proc *p;
- register struct sendto_args /* {
- syscallarg(int) s;
- syscallarg(caddr_t) buf;
- syscallarg(size_t) len;
- syscallarg(int) flags;
- syscallarg(caddr_t) to;
- syscallarg(int) tolen;
- } */ *uap;
- register_t *retval;
-{
- struct msghdr msg;
- struct iovec aiov;
-
- msg.msg_name = SCARG(uap, to);
- msg.msg_namelen = SCARG(uap, tolen);
- msg.msg_iov = &aiov;
- msg.msg_iovlen = 1;
- msg.msg_control = 0;
-#ifdef COMPAT_OLDSOCK
- msg.msg_flags = 0;
-#endif
- aiov.iov_base = SCARG(uap, buf);
- aiov.iov_len = SCARG(uap, len);
- return (sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval));
-}
-
-#ifdef COMPAT_OLDSOCK
-int
-compat_43_send(p, uap, retval)
- struct proc *p;
- register struct compat_43_send_args /* {
- syscallarg(int) s;
- syscallarg(caddr_t) buf;
- syscallarg(int) len;
- syscallarg(int) flags;
- } */ *uap;
- register_t *retval;
-{
- struct msghdr msg;
- struct iovec aiov;
-
- msg.msg_name = 0;
- msg.msg_namelen = 0;
- msg.msg_iov = &aiov;
- msg.msg_iovlen = 1;
- aiov.iov_base = SCARG(uap, buf);
- aiov.iov_len = SCARG(uap, len);
- msg.msg_control = 0;
- msg.msg_flags = 0;
- return (sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval));
-}
-
-#define MSG_COMPAT 0x8000
-int
-compat_43_sendmsg(p, uap, retval)
- struct proc *p;
- register struct compat_43_sendmsg_args /* {
- syscallarg(int) s;
- syscallarg(caddr_t) msg;
- syscallarg(int) flags;
- } */ *uap;
- register_t *retval;
-{
- struct msghdr msg;
- struct iovec aiov[UIO_SMALLIOV], *iov;
- int error;
-
- if (error = copyin(SCARG(uap, msg), (caddr_t)&msg,
- sizeof (struct omsghdr)))
- return (error);
- if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
- if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
- return (EMSGSIZE);
- MALLOC(iov, struct iovec *,
- sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
- M_WAITOK);
- } else
- iov = aiov;
- if (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
- (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))
- goto done;
- msg.msg_flags = MSG_COMPAT;
- msg.msg_iov = iov;
- error = sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval);
-done:
- if (iov != aiov)
- FREE(iov, M_IOV);
- return (error);
-}
-#endif
-
-int
-sendmsg(p, uap, retval)
- struct proc *p;
- register struct sendmsg_args /* {
- syscallarg(int) s;
- syscallarg(caddr_t) msg;
- syscallarg(int) flags;
- } */ *uap;
- register_t *retval;
-{
- struct msghdr msg;
- struct iovec aiov[UIO_SMALLIOV], *iov;
- int error;
-
- if (error = copyin(SCARG(uap, msg), (caddr_t)&msg, sizeof (msg)))
- return (error);
- if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
- if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
- return (EMSGSIZE);
- MALLOC(iov, struct iovec *,
- sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
- M_WAITOK);
- } else
- iov = aiov;
- if (msg.msg_iovlen &&
- (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
- (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
- goto done;
- msg.msg_iov = iov;
-#ifdef COMPAT_OLDSOCK
- msg.msg_flags = 0;
-#endif
- error = sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval);
-done:
- if (iov != aiov)
- FREE(iov, M_IOV);
- return (error);
-}
-
-int
sendit(p, s, mp, flags, retsize)
register struct proc *p;
int s;
register struct msghdr *mp;
- int flags;
- register_t *retsize;
+ int flags, *retsize;
{
struct file *fp;
struct uio auio;
@@ -513,8 +407,9 @@ sendit(p, s, mp, flags, retsize)
#ifdef KTRACE
struct iovec *ktriov = NULL;
#endif
-
- if (error = getsock(p->p_fd, s, &fp))
+
+ error = getsock(p->p_fd, s, &fp);
+ if (error)
return (error);
auio.uio_iov = mp->msg_iov;
auio.uio_iovcnt = mp->msg_iovlen;
@@ -525,13 +420,12 @@ sendit(p, s, mp, flags, retsize)
auio.uio_resid = 0;
iov = mp->msg_iov;
for (i = 0; i < mp->msg_iovlen; i++, iov++) {
- if (auio.uio_resid + iov->iov_len < auio.uio_resid)
+ if ((auio.uio_resid += iov->iov_len) < 0)
return (EINVAL);
- auio.uio_resid += iov->iov_len;
}
if (mp->msg_name) {
- if (error = sockargs(&to, mp->msg_name, mp->msg_namelen,
- MT_SONAME))
+ error = sockargs(&to, mp->msg_name, mp->msg_namelen, MT_SONAME);
+ if (error)
return (error);
} else
to = 0;
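
The iov-length validation in sendit() (and in recvit() below) changed shape: instead of testing for wraparound after each addition, the length is accumulated into the signed uio_resid and rejected as soon as it goes negative. Either way the point is to refuse iovec arrays whose total length overflows. A standalone sketch of the same guard with explicit wide arithmetic (illustrative types, not the kernel's):

    #include <limits.h>
    #include <stdio.h>

    /* Sum iovec-style lengths, rejecting any total that would overflow
     * a signed int, in the spirit of the uio_resid check (sketch only). */
    static int
    total_len(const unsigned long *lens, int n, int *out)
    {
        long long total = 0;
        int i;

        for (i = 0; i < n; i++) {
            total += (long long)lens[i];
            if (total > INT_MAX)
                return (-1);    /* the kernel returns EINVAL */
        }
        *out = (int)total;
        return (0);
    }

    int
    main(void)
    {
        unsigned long ok[2] = { 1024, 2048 };
        unsigned long bad[2] = { (unsigned long)INT_MAX, 16 };
        int t;

        printf("ok total: %d\n", total_len(ok, 2, &t) == 0 ? t : -1);
        printf("overflow rejected: %s\n", total_len(bad, 2, &t) ? "yes" : "no");
        return (0);
    }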
@@ -544,8 +438,9 @@ sendit(p, s, mp, flags, retsize)
error = EINVAL;
goto bad;
}
- if (error = sockargs(&control, mp->msg_control,
- mp->msg_controllen, MT_CONTROL))
+ error = sockargs(&control, mp->msg_control,
+ mp->msg_controllen, MT_CONTROL);
+ if (error)
goto bad;
#ifdef COMPAT_OLDSOCK
if (mp->msg_flags == MSG_COMPAT) {
@@ -574,8 +469,9 @@ sendit(p, s, mp, flags, retsize)
}
#endif
len = auio.uio_resid;
- if (error = sosend((struct socket *)fp->f_data, to, &auio,
- (struct mbuf *)0, control, flags)) {
+ error = sosend((struct socket *)fp->f_data, to, &auio,
+ (struct mbuf *)0, control, flags);
+ if (error) {
if (auio.uio_resid != len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
@@ -598,71 +494,46 @@ bad:
return (error);
}
-#ifdef COMPAT_OLDSOCK
int
-compat_43_recvfrom(p, uap, retval)
- struct proc *p;
- struct recvfrom_args /* {
- syscallarg(int) s;
- syscallarg(caddr_t) buf;
- syscallarg(size_t) len;
- syscallarg(int) flags;
- syscallarg(caddr_t) from;
- syscallarg(int *) fromlenaddr;
- } */ *uap;
- register_t *retval;
-{
-
- SCARG(uap, flags) |= MSG_COMPAT;
- return (recvfrom(p, uap, retval));
-}
-#endif
-
-int
-recvfrom(p, uap, retval)
+sendto(p, uap, retval)
struct proc *p;
- register struct recvfrom_args /* {
- syscallarg(int) s;
- syscallarg(caddr_t) buf;
- syscallarg(size_t) len;
- syscallarg(int) flags;
- syscallarg(caddr_t) from;
- syscallarg(int *) fromlenaddr;
+ register struct sendto_args /* {
+ int s;
+ caddr_t buf;
+ size_t len;
+ int flags;
+ caddr_t to;
+ int tolen;
} */ *uap;
- register_t *retval;
+ int *retval;
{
struct msghdr msg;
struct iovec aiov;
- int error;
- if (SCARG(uap, fromlenaddr)) {
- if (error = copyin((caddr_t)SCARG(uap, fromlenaddr),
- (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen)))
- return (error);
- } else
- msg.msg_namelen = 0;
- msg.msg_name = SCARG(uap, from);
+ msg.msg_name = uap->to;
+ msg.msg_namelen = uap->tolen;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
- aiov.iov_base = SCARG(uap, buf);
- aiov.iov_len = SCARG(uap, len);
msg.msg_control = 0;
- msg.msg_flags = SCARG(uap, flags);
- return (recvit(p, SCARG(uap, s), &msg,
- (caddr_t)SCARG(uap, fromlenaddr), retval));
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags = 0;
+#endif
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ return (sendit(p, uap->s, &msg, uap->flags, retval));
}
#ifdef COMPAT_OLDSOCK
int
-compat_43_recv(p, uap, retval)
+osend(p, uap, retval)
struct proc *p;
- register struct compat_43_recv_args /* {
- syscallarg(int) s;
- syscallarg(caddr_t) buf;
- syscallarg(int) len;
- syscallarg(int) flags;
+ register struct osend_args /* {
+ int s;
+ caddr_t buf;
+ int len;
+ int flags;
} */ *uap;
- register_t *retval;
+ int *retval;
{
struct msghdr msg;
struct iovec aiov;
@@ -671,34 +542,29 @@ compat_43_recv(p, uap, retval)
msg.msg_namelen = 0;
msg.msg_iov = &aiov;
msg.msg_iovlen = 1;
- aiov.iov_base = SCARG(uap, buf);
- aiov.iov_len = SCARG(uap, len);
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
msg.msg_control = 0;
- msg.msg_flags = SCARG(uap, flags);
- return (recvit(p, SCARG(uap, s), &msg, (caddr_t)0, retval));
+ msg.msg_flags = 0;
+ return (sendit(p, uap->s, &msg, uap->flags, retval));
}
-/*
- * Old recvmsg. This code takes advantage of the fact that the old msghdr
- * overlays the new one, missing only the flags, and with the (old) access
- * rights where the control fields are now.
- */
int
-compat_43_recvmsg(p, uap, retval)
+osendmsg(p, uap, retval)
struct proc *p;
- register struct compat_43_recvmsg_args /* {
- syscallarg(int) s;
- syscallarg(struct omsghdr *) msg;
- syscallarg(int) flags;
+ register struct osendmsg_args /* {
+ int s;
+ caddr_t msg;
+ int flags;
} */ *uap;
- register_t *retval;
+ int *retval;
{
struct msghdr msg;
struct iovec aiov[UIO_SMALLIOV], *iov;
int error;
- if (error = copyin((caddr_t)SCARG(uap, msg), (caddr_t)&msg,
- sizeof (struct omsghdr)))
+ error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr));
+ if (error)
return (error);
if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
@@ -708,17 +574,13 @@ compat_43_recvmsg(p, uap, retval)
M_WAITOK);
} else
iov = aiov;
- msg.msg_flags = SCARG(uap, flags) | MSG_COMPAT;
- if (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
- (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))
+ error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
+ (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
+ if (error)
goto done;
+ msg.msg_flags = MSG_COMPAT;
msg.msg_iov = iov;
- error = recvit(p, SCARG(uap, s), &msg,
- (caddr_t)&SCARG(uap, msg)->msg_namelen, retval);
-
- if (msg.msg_controllen && error == 0)
- error = copyout((caddr_t)&msg.msg_controllen,
- (caddr_t)&SCARG(uap, msg)->msg_accrightslen, sizeof (int));
+ error = sendit(p, uap->s, &msg, uap->flags, retval);
done:
if (iov != aiov)
FREE(iov, M_IOV);
@@ -727,21 +589,21 @@ done:
#endif
int
-recvmsg(p, uap, retval)
+sendmsg(p, uap, retval)
struct proc *p;
- register struct recvmsg_args /* {
- syscallarg(int) s;
- syscallarg(struct msghdr *) msg;
- syscallarg(int) flags;
+ register struct sendmsg_args /* {
+ int s;
+ caddr_t msg;
+ int flags;
} */ *uap;
- register_t *retval;
+ int *retval;
{
struct msghdr msg;
- struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
- register int error;
+ struct iovec aiov[UIO_SMALLIOV], *iov;
+ int error;
- if (error = copyin((caddr_t)SCARG(uap, msg), (caddr_t)&msg,
- sizeof (msg)))
+ error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg));
+ if (error)
return (error);
if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
@@ -751,21 +613,15 @@ recvmsg(p, uap, retval)
M_WAITOK);
} else
iov = aiov;
+ if (msg.msg_iovlen &&
+ (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
+ (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))))
+ goto done;
+ msg.msg_iov = iov;
#ifdef COMPAT_OLDSOCK
- msg.msg_flags = SCARG(uap, flags) &~ MSG_COMPAT;
-#else
- msg.msg_flags = SCARG(uap, flags);
+ msg.msg_flags = 0;
#endif
- uiov = msg.msg_iov;
- msg.msg_iov = iov;
- if (error = copyin((caddr_t)uiov, (caddr_t)iov,
- (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))
- goto done;
- if ((error = recvit(p, SCARG(uap, s), &msg, (caddr_t)0, retval)) == 0) {
- msg.msg_iov = uiov;
- error = copyout((caddr_t)&msg, (caddr_t)SCARG(uap, msg),
- sizeof(msg));
- }
+ error = sendit(p, uap->s, &msg, uap->flags, retval);
done:
if (iov != aiov)
FREE(iov, M_IOV);
@@ -778,19 +634,21 @@ recvit(p, s, mp, namelenp, retsize)
int s;
register struct msghdr *mp;
caddr_t namelenp;
- register_t *retsize;
+ int *retsize;
{
struct file *fp;
struct uio auio;
register struct iovec *iov;
register int i;
int len, error;
- struct mbuf *from = 0, *control = 0;
+ struct mbuf *m, *from = 0, *control = 0;
+ caddr_t ctlbuf;
#ifdef KTRACE
struct iovec *ktriov = NULL;
#endif
-
- if (error = getsock(p->p_fd, s, &fp))
+
+ error = getsock(p->p_fd, s, &fp);
+ if (error)
return (error);
auio.uio_iov = mp->msg_iov;
auio.uio_iovcnt = mp->msg_iovlen;
@@ -801,9 +659,8 @@ recvit(p, s, mp, namelenp, retsize)
auio.uio_resid = 0;
iov = mp->msg_iov;
for (i = 0; i < mp->msg_iovlen; i++, iov++) {
- if (auio.uio_resid + iov->iov_len < auio.uio_resid)
+ if ((auio.uio_resid += iov->iov_len) < 0)
return (EINVAL);
- auio.uio_resid += iov->iov_len;
}
#ifdef KTRACE
if (KTRPOINT(p, KTR_GENIO)) {
@@ -814,9 +671,10 @@ recvit(p, s, mp, namelenp, retsize)
}
#endif
len = auio.uio_resid;
- if (error = soreceive((struct socket *)fp->f_data, &from, &auio,
+ error = soreceive((struct socket *)fp->f_data, &from, &auio,
(struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0,
- &mp->msg_flags)) {
+ &mp->msg_flags);
+ if (error) {
if (auio.uio_resid != len && (error == ERESTART ||
error == EINTR || error == EWOULDBLOCK))
error = 0;
@@ -845,8 +703,9 @@ recvit(p, s, mp, namelenp, retsize)
if (len > from->m_len)
len = from->m_len;
/* else if len < from->m_len ??? */
- if (error = copyout(mtod(from, caddr_t),
- (caddr_t)mp->msg_name, (unsigned)len))
+ error = copyout(mtod(from, caddr_t),
+ (caddr_t)mp->msg_name, (unsigned)len);
+ if (error)
goto out;
}
mp->msg_namelen = len;
@@ -882,17 +741,29 @@ recvit(p, s, mp, namelenp, retsize)
}
#endif
len = mp->msg_controllen;
- if (len <= 0 || control == 0)
- len = 0;
- else {
- if (len >= control->m_len)
- len = control->m_len;
- else
+ m = control;
+ mp->msg_controllen = 0;
+ ctlbuf = (caddr_t) mp->msg_control;
+
+ while (m && len > 0) {
+ unsigned int tocopy;
+
+ if (len >= m->m_len)
+ tocopy = m->m_len;
+ else {
mp->msg_flags |= MSG_CTRUNC;
- error = copyout((caddr_t)mtod(control, caddr_t),
- (caddr_t)mp->msg_control, (unsigned)len);
+ tocopy = len;
+ }
+
+ error = copyout(mtod(m, caddr_t), ctlbuf, tocopy);
+ if (error)
+ goto out;
+
+ ctlbuf += tocopy;
+ len -= tocopy;
+ m = m->m_next;
}
- mp->msg_controllen = len;
+ mp->msg_controllen = ctlbuf - mp->msg_control;
}
out:
if (from)
@@ -902,22 +773,193 @@ out:
return (error);
}
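
recvit() previously copied out at most one mbuf's worth of control data; the loop above walks the whole control chain, copying each mbuf until either the chain or the caller's msg_controllen runs out, and raising MSG_CTRUNC on truncation. The same walk over a linked chain of buffers, as a self-contained sketch (struct chunk stands in for an mbuf; this version also flags a leftover chain as truncation):

    #include <stdio.h>
    #include <string.h>

    struct chunk {                  /* stands in for an mbuf */
        const char   *data;
        int          len;
        struct chunk *next;
    };

    /* Copy a chain into dst, at most cap bytes; return bytes copied and
     * set *truncated when the chain did not fit (cf. MSG_CTRUNC). */
    static int
    copy_chain(const struct chunk *m, char *dst, int cap, int *truncated)
    {
        int copied = 0, tocopy;

        *truncated = 0;
        while (m != NULL && cap > 0) {
            tocopy = m->len;
            if (tocopy > cap) {
                tocopy = cap;
                *truncated = 1;
            }
            memcpy(dst + copied, m->data, tocopy);
            copied += tocopy;
            cap -= tocopy;
            m = m->next;
        }
        if (m != NULL)
            *truncated = 1;
        return (copied);
    }

    int
    main(void)
    {
        struct chunk c2 = { "world", 5, NULL };
        struct chunk c1 = { "hello ", 6, &c2 };
        char buf[8];
        int trunc, n = copy_chain(&c1, buf, sizeof(buf), &trunc);

        printf("copied %d bytes%s\n", n, trunc ? " (truncated)" : "");
        return (0);
    }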
+int
+recvfrom(p, uap, retval)
+ struct proc *p;
+ register struct recvfrom_args /* {
+ int s;
+ caddr_t buf;
+ size_t len;
+ int flags;
+ caddr_t from;
+ int *fromlenaddr;
+ } */ *uap;
+ int *retval;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+ int error;
+
+ if (uap->fromlenaddr) {
+ error = copyin((caddr_t)uap->fromlenaddr,
+ (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen));
+ if (error)
+ return (error);
+ } else
+ msg.msg_namelen = 0;
+ msg.msg_name = uap->from;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = uap->flags;
+ return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr, retval));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+orecvfrom(p, uap, retval)
+ struct proc *p;
+ struct recvfrom_args *uap;
+ int *retval;
+{
+
+ uap->flags |= MSG_COMPAT;
+ return (recvfrom(p, uap, retval));
+}
+#endif
+
+#ifdef COMPAT_OLDSOCK
+int
+orecv(p, uap, retval)
+ struct proc *p;
+ register struct orecv_args /* {
+ int s;
+ caddr_t buf;
+ int len;
+ int flags;
+ } */ *uap;
+ int *retval;
+{
+ struct msghdr msg;
+ struct iovec aiov;
+
+ msg.msg_name = 0;
+ msg.msg_namelen = 0;
+ msg.msg_iov = &aiov;
+ msg.msg_iovlen = 1;
+ aiov.iov_base = uap->buf;
+ aiov.iov_len = uap->len;
+ msg.msg_control = 0;
+ msg.msg_flags = uap->flags;
+ return (recvit(p, uap->s, &msg, (caddr_t)0, retval));
+}
+
+/*
+ * Old recvmsg. This code takes advantage of the fact that the old msghdr
+ * overlays the new one, missing only the flags, and with the (old) access
+ * rights where the control fields are now.
+ */
+int
+orecvmsg(p, uap, retval)
+ struct proc *p;
+ register struct orecvmsg_args /* {
+ int s;
+ struct omsghdr *msg;
+ int flags;
+ } */ *uap;
+ int *retval;
+{
+ struct msghdr msg;
+ struct iovec aiov[UIO_SMALLIOV], *iov;
+ int error;
+
+ error = copyin((caddr_t)uap->msg, (caddr_t)&msg,
+ sizeof (struct omsghdr));
+ if (error)
+ return (error);
+ if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
+ if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
+ return (EMSGSIZE);
+ MALLOC(iov, struct iovec *,
+ sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
+ M_WAITOK);
+ } else
+ iov = aiov;
+ msg.msg_flags = uap->flags | MSG_COMPAT;
+ error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov,
+ (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
+ if (error)
+ goto done;
+ msg.msg_iov = iov;
+ error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen, retval);
+
+ if (msg.msg_controllen && error == 0)
+ error = copyout((caddr_t)&msg.msg_controllen,
+ (caddr_t)&uap->msg->msg_accrightslen, sizeof (int));
+done:
+ if (iov != aiov)
+ FREE(iov, M_IOV);
+ return (error);
+}
+#endif
+
+int
+recvmsg(p, uap, retval)
+ struct proc *p;
+ register struct recvmsg_args /* {
+ int s;
+ struct msghdr *msg;
+ int flags;
+ } */ *uap;
+ int *retval;
+{
+ struct msghdr msg;
+ struct iovec aiov[UIO_SMALLIOV], *uiov, *iov;
+ register int error;
+
+ error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg));
+ if (error)
+ return (error);
+ if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) {
+ if ((u_int)msg.msg_iovlen >= UIO_MAXIOV)
+ return (EMSGSIZE);
+ MALLOC(iov, struct iovec *,
+ sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV,
+ M_WAITOK);
+ } else
+ iov = aiov;
+#ifdef COMPAT_OLDSOCK
+ msg.msg_flags = uap->flags &~ MSG_COMPAT;
+#else
+ msg.msg_flags = uap->flags;
+#endif
+ uiov = msg.msg_iov;
+ msg.msg_iov = iov;
+ error = copyin((caddr_t)uiov, (caddr_t)iov,
+ (unsigned)(msg.msg_iovlen * sizeof (struct iovec)));
+ if (error)
+ goto done;
+ error = recvit(p, uap->s, &msg, (caddr_t)0, retval);
+ if (!error) {
+ msg.msg_iov = uiov;
+ error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg));
+ }
+done:
+ if (iov != aiov)
+ FREE(iov, M_IOV);
+ return (error);
+}
+
/* ARGSUSED */
int
shutdown(p, uap, retval)
struct proc *p;
register struct shutdown_args /* {
- syscallarg(int) s;
- syscallarg(int) how;
+ int s;
+ int how;
} */ *uap;
- register_t *retval;
+ int *retval;
{
struct file *fp;
int error;
- if (error = getsock(p->p_fd, SCARG(uap, s), &fp))
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
return (error);
- return (soshutdown((struct socket *)fp->f_data, SCARG(uap, how)));
+ return (soshutdown((struct socket *)fp->f_data, uap->how));
}
/* ARGSUSED */
@@ -925,35 +967,36 @@ int
setsockopt(p, uap, retval)
struct proc *p;
register struct setsockopt_args /* {
- syscallarg(int) s;
- syscallarg(int) level;
- syscallarg(int) name;
- syscallarg(caddr_t) val;
- syscallarg(int) valsize;
+ int s;
+ int level;
+ int name;
+ caddr_t val;
+ int valsize;
} */ *uap;
- register_t *retval;
+ int *retval;
{
struct file *fp;
struct mbuf *m = NULL;
int error;
- if (error = getsock(p->p_fd, SCARG(uap, s), &fp))
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
return (error);
- if (SCARG(uap, valsize) > MLEN)
+ if (uap->valsize > MLEN)
return (EINVAL);
- if (SCARG(uap, val)) {
+ if (uap->val) {
m = m_get(M_WAIT, MT_SOOPTS);
if (m == NULL)
return (ENOBUFS);
- if (error = copyin(SCARG(uap, val), mtod(m, caddr_t),
- (u_int)SCARG(uap, valsize))) {
+ error = copyin(uap->val, mtod(m, caddr_t), (u_int)uap->valsize);
+ if (error) {
(void) m_free(m);
return (error);
}
- m->m_len = SCARG(uap, valsize);
+ m->m_len = uap->valsize;
}
- return (sosetopt((struct socket *)fp->f_data, SCARG(uap, level),
- SCARG(uap, name), m));
+ return (sosetopt((struct socket *)fp->f_data, uap->level,
+ uap->name, m));
}
/* ARGSUSED */
@@ -961,73 +1004,88 @@ int
getsockopt(p, uap, retval)
struct proc *p;
register struct getsockopt_args /* {
- syscallarg(int) s;
- syscallarg(int) level;
- syscallarg(int) name;
- syscallarg(caddr_t) val;
- syscallarg(int *) avalsize;
+ int s;
+ int level;
+ int name;
+ caddr_t val;
+ int *avalsize;
} */ *uap;
- register_t *retval;
+ int *retval;
{
struct file *fp;
- struct mbuf *m = NULL;
- int valsize, error;
+ struct mbuf *m = NULL, *m0;
+ int op, i, valsize, error;
- if (error = getsock(p->p_fd, SCARG(uap, s), &fp))
+ error = getsock(p->p_fd, uap->s, &fp);
+ if (error)
return (error);
- if (SCARG(uap, val)) {
- if (error = copyin((caddr_t)SCARG(uap, avalsize),
- (caddr_t)&valsize, sizeof (valsize)))
+ if (uap->val) {
+ error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize,
+ sizeof (valsize));
+ if (error)
return (error);
} else
valsize = 0;
- if ((error = sogetopt((struct socket *)fp->f_data, SCARG(uap, level),
- SCARG(uap, name), &m)) == 0 && SCARG(uap, val) && valsize &&
- m != NULL) {
- if (valsize > m->m_len)
- valsize = m->m_len;
- error = copyout(mtod(m, caddr_t), SCARG(uap, val),
- (u_int)valsize);
+ if ((error = sogetopt((struct socket *)fp->f_data, uap->level,
+ uap->name, &m)) == 0 && uap->val && valsize && m != NULL) {
+ op = 0;
+ while (m && !error && op < valsize) {
+ i = min(m->m_len, (valsize - op));
+ error = copyout(mtod(m, caddr_t), uap->val, (u_int)i);
+ op += i;
+ uap->val += i;
+ m0 = m;
+ MFREE(m0, m);
+ }
+ valsize = op;
if (error == 0)
error = copyout((caddr_t)&valsize,
- (caddr_t)SCARG(uap, avalsize), sizeof (valsize));
+ (caddr_t)uap->avalsize, sizeof (valsize));
}
if (m != NULL)
(void) m_free(m);
return (error);
}
+#ifdef OLD_PIPE
/* ARGSUSED */
int
pipe(p, uap, retval)
struct proc *p;
- void *uap;
- register_t *retval;
+ struct pipe_args /* {
+ int dummy;
+ } */ *uap;
+ int retval[];
{
register struct filedesc *fdp = p->p_fd;
struct file *rf, *wf;
struct socket *rso, *wso;
int fd, error;
- if (error = socreate(AF_UNIX, &rso, SOCK_STREAM, 0))
+ error = socreate(AF_UNIX, &rso, SOCK_STREAM, 0, p);
+ if (error)
return (error);
- if (error = socreate(AF_UNIX, &wso, SOCK_STREAM, 0))
+ error = socreate(AF_UNIX, &wso, SOCK_STREAM, 0, p);
+ if (error)
goto free1;
- if (error = falloc(p, &rf, &fd))
+ error = falloc(p, &rf, &fd);
+ if (error)
goto free2;
retval[0] = fd;
- rf->f_flag = FREAD;
+ rf->f_flag = FREAD | FWRITE;
rf->f_type = DTYPE_SOCKET;
rf->f_ops = &socketops;
rf->f_data = (caddr_t)rso;
- if (error = falloc(p, &wf, &fd))
+ error = falloc(p, &wf, &fd);
+ if (error)
goto free3;
- wf->f_flag = FWRITE;
+ wf->f_flag = FREAD | FWRITE;
wf->f_type = DTYPE_SOCKET;
wf->f_ops = &socketops;
wf->f_data = (caddr_t)wso;
retval[1] = fd;
- if (error = unp_connect2(wso, rso))
+ error = unp_connect2(wso, rso);
+ if (error)
goto free4;
return (0);
free4:
@@ -1042,170 +1100,153 @@ free1:
(void)soclose(rso);
return (error);
}
-
+#endif
/*
* Get socket name.
*/
-#ifdef COMPAT_OLDSOCK
-int
-getsockname(p, uap, retval)
- struct proc *p;
- struct getsockname_args /* {
- syscallarg(int) fdes;
- syscallarg(caddr_t) asa;
- syscallarg(int *) alen;
- } */ *uap;
- register_t *retval;
-{
-
- return (getsockname1(p, uap, retval, 0));
-}
-
-int
-compat_43_getsockname(p, uap, retval)
- struct proc *p;
- struct getsockname_args /* {
- syscallarg(int) fdes;
- syscallarg(caddr_t) asa;
- syscallarg(int *) alen;
- } */ *uap;
- register_t *retval;
-{
-
- return (getsockname1(p, uap, retval, 1));
-}
-#else /* COMPAT_OLDSOCK */
-
-#define getsockname1 getsockname
-#endif
-
/* ARGSUSED */
-int
-getsockname1(p, uap, retval, compat_43)
+static int
+getsockname1(p, uap, retval, compat)
struct proc *p;
register struct getsockname_args /* {
- syscallarg(int) fdes;
- syscallarg(caddr_t) asa;
- syscallarg(int *) alen;
+ int fdes;
+ caddr_t asa;
+ int *alen;
} */ *uap;
- register_t *retval;
- int compat_43;
+ int *retval;
+ int compat;
{
struct file *fp;
register struct socket *so;
struct mbuf *m;
int len, error;
- if (error = getsock(p->p_fd, SCARG(uap, fdes), &fp))
+ error = getsock(p->p_fd, uap->fdes, &fp);
+ if (error)
return (error);
- if (error = copyin((caddr_t)SCARG(uap, alen), (caddr_t)&len,
- sizeof (len)))
+ error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
+ if (error)
return (error);
so = (struct socket *)fp->f_data;
m = m_getclr(M_WAIT, MT_SONAME);
if (m == NULL)
return (ENOBUFS);
- if (error = (*so->so_proto->pr_usrreq)(so, PRU_SOCKADDR, 0, m, 0))
+ error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, m);
+ if (error)
goto bad;
if (len > m->m_len)
len = m->m_len;
#ifdef COMPAT_OLDSOCK
- if (compat_43)
+ if (compat)
mtod(m, struct osockaddr *)->sa_family =
mtod(m, struct sockaddr *)->sa_family;
#endif
- error = copyout(mtod(m, caddr_t), (caddr_t)SCARG(uap, asa), (u_int)len);
+ error = copyout(mtod(m, caddr_t), (caddr_t)uap->asa, (u_int)len);
if (error == 0)
- error = copyout((caddr_t)&len, (caddr_t)SCARG(uap, alen),
+ error = copyout((caddr_t)&len, (caddr_t)uap->alen,
sizeof (len));
bad:
m_freem(m);
return (error);
}
-/*
- * Get name of peer for connected socket.
- */
-#ifdef COMPAT_OLDSOCK
int
-getpeername(p, uap, retval)
+getsockname(p, uap, retval)
struct proc *p;
- struct getpeername_args /* {
- syscallarg(int) fdes;
- syscallarg(caddr_t) asa;
- syscallarg(int *) alen;
- } */ *uap;
- register_t *retval;
+ struct getsockname_args *uap;
+ int *retval;
{
- return (getpeername1(p, uap, retval, 0));
+ return (getsockname1(p, uap, retval, 0));
}
+#ifdef COMPAT_OLDSOCK
int
-compat_43_getpeername(p, uap, retval)
+ogetsockname(p, uap, retval)
struct proc *p;
- struct getpeername_args /* {
- syscallarg(int) fdes;
- syscallarg(caddr_t) asa;
- syscallarg(int *) alen;
- } */ *uap;
- register_t *retval;
+ struct getsockname_args *uap;
+ int *retval;
{
- return (getpeername1(p, uap, retval, 1));
+ return (getsockname1(p, uap, retval, 1));
}
-#else /* COMPAT_OLDSOCK */
-
-#define getpeername1 getpeername
-#endif
+#endif /* COMPAT_OLDSOCK */
+/*
+ * Get name of peer for connected socket.
+ */
/* ARGSUSED */
-int
-getpeername1(p, uap, retval, compat_43)
+static int
+getpeername1(p, uap, retval, compat)
struct proc *p;
register struct getpeername_args /* {
- syscallarg(int) fdes;
- syscallarg(caddr_t) asa;
- syscallarg(int *) alen;
+ int fdes;
+ caddr_t asa;
+ int *alen;
} */ *uap;
- register_t *retval;
- int compat_43;
+ int *retval;
+ int compat;
{
struct file *fp;
register struct socket *so;
struct mbuf *m;
int len, error;
- if (error = getsock(p->p_fd, SCARG(uap, fdes), &fp))
+ error = getsock(p->p_fd, uap->fdes, &fp);
+ if (error)
return (error);
so = (struct socket *)fp->f_data;
if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0)
return (ENOTCONN);
- if (error =
- copyin((caddr_t)SCARG(uap, alen), (caddr_t)&len, sizeof (len)))
+ error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len));
+ if (error)
return (error);
m = m_getclr(M_WAIT, MT_SONAME);
if (m == NULL)
return (ENOBUFS);
- if (error = (*so->so_proto->pr_usrreq)(so, PRU_PEERADDR, 0, m, 0))
+ error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, m);
+ if (error)
goto bad;
if (len > m->m_len)
len = m->m_len;
#ifdef COMPAT_OLDSOCK
- if (compat_43)
+ if (compat)
mtod(m, struct osockaddr *)->sa_family =
mtod(m, struct sockaddr *)->sa_family;
#endif
- if (error =
- copyout(mtod(m, caddr_t), (caddr_t)SCARG(uap, asa), (u_int)len))
+ error = copyout(mtod(m, caddr_t), (caddr_t)uap->asa, (u_int)len);
+ if (error)
goto bad;
- error = copyout((caddr_t)&len, (caddr_t)SCARG(uap, alen), sizeof (len));
+ error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len));
bad:
m_freem(m);
return (error);
}
int
+getpeername(p, uap, retval)
+ struct proc *p;
+ struct getpeername_args *uap;
+ int *retval;
+{
+
+ return (getpeername1(p, uap, retval, 0));
+}
+
+#ifdef COMPAT_OLDSOCK
+int
+ogetpeername(p, uap, retval)
+ struct proc *p;
+ struct ogetpeername_args *uap;
+ int *retval;
+{
+
+ /* XXX uap should have type `getpeername_args *' to begin with. */
+ return (getpeername1(p, (struct getpeername_args *)uap, retval, 1));
+}
+#endif /* COMPAT_OLDSOCK */
+
+int
sockargs(mp, buf, buflen, type)
struct mbuf **mp;
caddr_t buf;
@@ -1228,21 +1269,21 @@ sockargs(mp, buf, buflen, type)
return (ENOBUFS);
m->m_len = buflen;
error = copyin(buf, mtod(m, caddr_t), (u_int)buflen);
- if (error) {
+ if (error)
(void) m_free(m);
- return (error);
- }
- *mp = m;
- if (type == MT_SONAME) {
- sa = mtod(m, struct sockaddr *);
+ else {
+ *mp = m;
+ if (type == MT_SONAME) {
+ sa = mtod(m, struct sockaddr *);
#if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN
- if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
- sa->sa_family = sa->sa_len;
+ if (sa->sa_family == 0 && sa->sa_len < AF_MAX)
+ sa->sa_family = sa->sa_len;
#endif
- sa->sa_len = buflen;
+ sa->sa_len = buflen;
+ }
}
- return (0);
+ return (error);
}
int
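
The COMPAT_OLDSOCK fixup in sockargs() deals with binaries built before sa_len existed: the old struct osockaddr began with a 16-bit sa_family, so on a little-endian machine (hence the BYTE_ORDER guard) the family's low byte lands where sa_len now lives and sa_family reads as zero. A userland sketch of that byte-level fixup (layouts abbreviated; the AF_MAX value is an assumption):

    #include <stdio.h>
    #include <string.h>

    #define AF_MAX 38               /* assumed; varies by system */

    struct osockaddr {              /* pre-4.3-Reno layout */
        unsigned short sa_family;
        char           sa_data[14];
    };

    struct nsockaddr {              /* modern layout with sa_len */
        unsigned char  sa_len;
        unsigned char  sa_family;
        char           sa_data[14];
    };

    int
    main(void)
    {
        struct osockaddr osa;
        struct nsockaddr nsa;

        memset(&osa, 0, sizeof(osa));
        osa.sa_family = 1;          /* AF_UNIX / AF_LOCAL */

        /* Reinterpret the old bytes as the new layout, as the kernel
         * does when an old binary passes in an osockaddr.  On a
         * little-endian machine the family's low byte lands in sa_len. */
        memcpy(&nsa, &osa, sizeof(nsa));
        if (nsa.sa_family == 0 && nsa.sa_len < AF_MAX)
            nsa.sa_family = nsa.sa_len;     /* the sockargs() fixup */
        nsa.sa_len = sizeof(nsa);           /* then overwritten with buflen */

        printf("family=%u len=%u\n", nsa.sa_family, nsa.sa_len);
        return (0);
    }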
diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c
index c6bcbfd..0a47414 100644
--- a/sys/kern/uipc_usrreq.c
+++ b/sys/kern/uipc_usrreq.c
@@ -30,24 +30,29 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95
+ * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94
+ * $Id: uipc_usrreq.c,v 1.21 1997/03/21 16:12:32 wpaul Exp $
*/
#include <sys/param.h>
+#include <sys/queue.h>
#include <sys/systm.h>
-#include <sys/proc.h>
-#include <sys/filedesc.h>
+#include <sys/kernel.h>
#include <sys/domain.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/filedesc.h>
+#include <sys/mbuf.h>
+#include <sys/namei.h>
+#include <sys/proc.h>
#include <sys/protosw.h>
#include <sys/socket.h>
#include <sys/socketvar.h>
-#include <sys/unpcb.h>
+#include <sys/stat.h>
+#include <sys/sysctl.h>
#include <sys/un.h>
-#include <sys/namei.h>
+#include <sys/unpcb.h>
#include <sys/vnode.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-#include <sys/mbuf.h>
/*
* Unix communications domain.
@@ -57,8 +62,22 @@
* rethink name space problems
* need a proper out-of-band
*/
-struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX };
-ino_t unp_ino; /* prototype for fake inode numbers */
+static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL };
+static ino_t unp_ino; /* prototype for fake inode numbers */
+
+static int unp_attach __P((struct socket *));
+static void unp_detach __P((struct unpcb *));
+static int unp_bind __P((struct unpcb *,struct mbuf *, struct proc *));
+static int unp_connect __P((struct socket *,struct mbuf *, struct proc *));
+static void unp_disconnect __P((struct unpcb *));
+static void unp_shutdown __P((struct unpcb *));
+static void unp_drop __P((struct unpcb *, int));
+static void unp_gc __P((void));
+static void unp_scan __P((struct mbuf *, void (*)(struct file *)));
+static void unp_mark __P((struct file *));
+static void unp_discard __P((struct file *));
+static int unp_internalize __P((struct mbuf *, struct proc *));
+
/*ARGSUSED*/
int
@@ -170,6 +189,7 @@ uipc_usrreq(so, req, m, nam, control)
break;
case PRU_SEND:
+ case PRU_SEND_EOF:
if (control && (error = unp_internalize(control, p)))
break;
switch (so->so_type) {
@@ -210,6 +230,22 @@ uipc_usrreq(so, req, m, nam, control)
case SOCK_STREAM:
#define rcv (&so2->so_rcv)
#define snd (&so->so_snd)
+ /* Connect if not connected yet. */
+ /*
+ * Note: A better implementation would complain
+ * if not equal to the peer's address.
+ */
+ if ((so->so_state & SS_ISCONNECTED) == 0) {
+ if (nam) {
+ error = unp_connect(so, nam, p);
+ if (error)
+ break; /* XXX */
+ } else {
+ error = ENOTCONN;
+ break;
+ }
+ }
+
if (so->so_state & SS_CANTSENDMORE) {
error = EPIPE;
break;
@@ -241,6 +277,14 @@ uipc_usrreq(so, req, m, nam, control)
default:
panic("uipc 4");
}
+ /*
+ * SEND_EOF is equivalent to a SEND followed by
+ * a SHUTDOWN.
+ */
+ if (req == PRU_SEND_EOF) {
+ socantsendmore(so);
+ unp_shutdown(unp);
+ }
break;
case PRU_ABORT:
@@ -306,22 +350,34 @@ release:
* and don't really want to reserve the sendspace. Their recvspace should
* be large enough for at least one max-size datagram plus address.
*/
-#define PIPSIZ 4096
-u_long unpst_sendspace = PIPSIZ;
-u_long unpst_recvspace = PIPSIZ;
-u_long unpdg_sendspace = 2*1024; /* really max datagram size */
-u_long unpdg_recvspace = 4*1024;
-
-int unp_rights; /* file descriptors in flight */
-
-int
+#ifndef PIPSIZ
+#define PIPSIZ 8192
+#endif
+static u_long unpst_sendspace = PIPSIZ;
+static u_long unpst_recvspace = PIPSIZ;
+static u_long unpdg_sendspace = 2*1024; /* really max datagram size */
+static u_long unpdg_recvspace = 4*1024;
+
+static int unp_rights; /* file descriptors in flight */
+
+SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW,
+ &unpst_sendspace, 0, "");
+SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpst_recvspace, 0, "");
+SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW,
+ &unpdg_sendspace, 0, "");
+SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW,
+ &unpdg_recvspace, 0, "");
+SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, "");
+
+static int
unp_attach(so)
struct socket *so;
{
register struct mbuf *m;
register struct unpcb *unp;
int error;
-
+
if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) {
switch (so->so_type) {
@@ -348,11 +404,11 @@ unp_attach(so)
return (0);
}
-void
+static void
unp_detach(unp)
register struct unpcb *unp;
{
-
+
if (unp->unp_vnode) {
unp->unp_vnode->v_socket = 0;
vrele(unp->unp_vnode);
@@ -364,8 +420,6 @@ unp_detach(unp)
unp_drop(unp->unp_refs, ECONNRESET);
soisdisconnected(unp->unp_socket);
unp->unp_socket->so_pcb = 0;
- m_freem(unp->unp_addr);
- (void) m_free(dtom(unp));
if (unp_rights) {
/*
* Normally the receive buffer is flushed later,
@@ -377,9 +431,11 @@ unp_detach(unp)
sorflush(unp->unp_socket);
unp_gc();
}
+ m_freem(unp->unp_addr);
+ (void) m_free(dtom(unp));
}
-int
+static int
unp_bind(unp, nam, p)
struct unpcb *unp;
struct mbuf *nam;
@@ -401,7 +457,8 @@ unp_bind(unp, nam, p)
} else
*(mtod(nam, caddr_t) + nam->m_len) = 0;
/* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */
- if (error = namei(&nd))
+ error = namei(&nd);
+ if (error)
return (error);
vp = nd.ni_vp;
if (vp != NULL) {
@@ -427,7 +484,7 @@ unp_bind(unp, nam, p)
return (0);
}
-int
+static int
unp_connect(so, nam, p)
struct socket *so;
struct mbuf *nam;
@@ -446,14 +503,16 @@ unp_connect(so, nam, p)
return (EMSGSIZE);
} else
*(mtod(nam, caddr_t) + nam->m_len) = 0;
- if (error = namei(&nd))
+ error = namei(&nd);
+ if (error)
return (error);
vp = nd.ni_vp;
if (vp->v_type != VSOCK) {
error = ENOTSOCK;
goto bad;
}
- if (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p))
+ error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p);
+ if (error)
goto bad;
so2 = vp->v_socket;
if (so2 == 0) {
@@ -515,7 +574,7 @@ unp_connect2(so, so2)
return (0);
}
-void
+static void
unp_disconnect(unp)
struct unpcb *unp;
{
@@ -562,7 +621,7 @@ unp_abort(unp)
}
#endif
-void
+static void
unp_shutdown(unp)
struct unpcb *unp;
{
@@ -573,7 +632,7 @@ unp_shutdown(unp)
socantrcvmore(so);
}
-void
+static void
unp_drop(unp, errno)
struct unpcb *unp;
int errno;
@@ -591,6 +650,7 @@ unp_drop(unp, errno)
}
#ifdef notdef
+void
unp_drain()
{
@@ -609,6 +669,9 @@ unp_externalize(rights)
int newfds = (cm->cmsg_len - sizeof(*cm)) / sizeof (int);
int f;
+ /*
+ * If the new FDs will not fit, free them all.
+ */
if (!fdavail(p, newfds)) {
for (i = 0; i < newfds; i++) {
fp = *rp;
@@ -617,6 +680,12 @@ unp_externalize(rights)
}
return (EMSGSIZE);
}
+ /*
+ * Now change each pointer to an fd in the global table into
+ * an integer that is the index of the local fd table entry
+ * we set up to point to the global one being transferred.
+ * XXX this assumes a pointer and an int are the same size!
+ */
for (i = 0; i < newfds; i++) {
if (fdalloc(p, 0, &f))
panic("unp_externalize");
@@ -629,7 +698,11 @@ unp_externalize(rights)
return (0);
}
-int
+#ifndef MIN
+#define MIN(a,b) (((a)<(b))?(a):(b))
+#endif
+
+static int
unp_internalize(control, p)
struct mbuf *control;
struct proc *p;
@@ -639,12 +712,34 @@ unp_internalize(control, p)
register struct file **rp;
register struct file *fp;
register int i, fd;
+ register struct cmsgcred *cmcred;
int oldfds;
- if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET ||
- cm->cmsg_len != control->m_len)
+ if ((cm->cmsg_type != SCM_RIGHTS && cm->cmsg_type != SCM_CREDS) ||
+ cm->cmsg_level != SOL_SOCKET || cm->cmsg_len != control->m_len)
return (EINVAL);
+
+ /*
+ * Fill in credential information.
+ */
+ if (cm->cmsg_type == SCM_CREDS) {
+ cmcred = (struct cmsgcred *)(cm + 1);
+ cmcred->cmcred_pid = p->p_pid;
+ cmcred->cmcred_uid = p->p_cred->p_ruid;
+ cmcred->cmcred_gid = p->p_cred->p_rgid;
+ cmcred->cmcred_euid = p->p_ucred->cr_uid;
+ cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups,
+ CMGROUP_MAX);
+ for (i = 0; i < cmcred->cmcred_ngroups; i++)
+ cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i];
+ return(0);
+ }
+
oldfds = (cm->cmsg_len - sizeof (*cm)) / sizeof (int);
+ /*
+ * Check that all the FDs passed in refer to legal, open files.
+ * If not, reject the entire operation.
+ */
rp = (struct file **)(cm + 1);
for (i = 0; i < oldfds; i++) {
fd = *(int *)rp++;
@@ -652,6 +747,11 @@ unp_internalize(control, p)
fdp->fd_ofiles[fd] == NULL)
return (EBADF);
}
+ /*
+ * Now replace the integer FDs with pointers to
+ * the associated global file table entry.
+ * XXX this assumes a pointer and an int are the same size!
+ */
rp = (struct file **)(cm + 1);
for (i = 0; i < oldfds; i++) {
fp = fdp->fd_ofiles[*(int *)rp];
@@ -663,10 +763,9 @@ unp_internalize(control, p)
return (0);
}
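
unp_internalize() now understands SCM_CREDS control messages and fills the struct cmsgcred with the sender's real and effective credentials itself, so the receiver gets kernel-verified identity rather than whatever the sender claims. A hedged sketch of the sending side using the FreeBSD-style API this diff implies (error handling trimmed; field names taken from the code above):

    #include <sys/types.h>
    #include <sys/socket.h>
    #include <sys/uio.h>
    #include <string.h>

    /* Send one byte with an SCM_CREDS control message attached; the kernel
     * overwrites the cmsgcred contents with the sender's real credentials. */
    int
    send_with_creds(int sock)
    {
        union {
            char           space[CMSG_SPACE(sizeof(struct cmsgcred))];
            struct cmsghdr hdr;     /* forces alignment */
        } u;
        struct msghdr msg;
        struct iovec iov;
        char byte = 0;

        memset(&u, 0, sizeof(u));
        memset(&msg, 0, sizeof(msg));
        iov.iov_base = &byte;
        iov.iov_len = 1;
        msg.msg_iov = &iov;
        msg.msg_iovlen = 1;
        msg.msg_control = u.space;
        msg.msg_controllen = sizeof(u.space);

        u.hdr.cmsg_len = CMSG_LEN(sizeof(struct cmsgcred));
        u.hdr.cmsg_level = SOL_SOCKET;
        u.hdr.cmsg_type = SCM_CREDS;
        /* CMSG_DATA(&u.hdr) is left blank: the kernel fills it in. */

        return ((int)sendmsg(sock, &msg, 0));
    }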
-int unp_defer, unp_gcing;
-extern struct domain unixdomain;
+static int unp_defer, unp_gcing;
-void
+static void
unp_gc()
{
register struct file *fp, *nextfp;
@@ -678,26 +777,56 @@ unp_gc()
return;
unp_gcing = 1;
unp_defer = 0;
+ /*
+ * Before going through all this, mark all FDs as
+ * NOT deferred and NOT externally accessible.
+ */
for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next)
fp->f_flag &= ~(FMARK|FDEFER);
do {
for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) {
+ /*
+ * If the file is not open, skip it
+ */
if (fp->f_count == 0)
continue;
+ /*
+ * If we already marked it as 'defer' in a
+ * previous pass, try to process it this time
+ * and un-mark it.
+ */
if (fp->f_flag & FDEFER) {
fp->f_flag &= ~FDEFER;
unp_defer--;
} else {
+ /*
+ * If it's not deferred, check whether it's
+ * already marked; if so, skip it.
+ */
if (fp->f_flag & FMARK)
continue;
+ /*
+ * If all references are from messages
+ * in transit, skip it: it's not
+ * externally accessible.
+ */
if (fp->f_count == fp->f_msgcount)
continue;
+ /*
+ * If it got this far then it must be
+ * externally accessible.
+ */
fp->f_flag |= FMARK;
}
+ /*
+ * Either it was deferred, or it is externally
+ * accessible and not already marked.
+ * Now check whether it is possibly one of OUR sockets.
+ */
if (fp->f_type != DTYPE_SOCKET ||
(so = (struct socket *)fp->f_data) == 0)
continue;
- if (so->so_proto->pr_domain != &unixdomain ||
+ if (so->so_proto->pr_domain != &localdomain ||
(so->so_proto->pr_flags&PR_RIGHTS) == 0)
continue;
#ifdef notdef
@@ -716,6 +845,13 @@ unp_gc()
goto restart;
}
#endif
+ /*
+ * OK, it's one of our sockets and it IS externally
+ * accessible (or was deferred). Now look to see
+ * whether it holds any file descriptors in its
+ * message buffers. Follow those links and mark them
+ * as accessible too.
+ */
unp_scan(so->so_rcv.sb_mb, unp_mark);
}
} while (unp_defer);
@@ -762,18 +898,30 @@ unp_gc()
for (nunref = 0, fp = filehead.lh_first, fpp = extra_ref; fp != 0;
fp = nextfp) {
nextfp = fp->f_list.le_next;
+ /*
+ * If it's not open, skip it
+ */
if (fp->f_count == 0)
continue;
+ /*
+ * If all refs are from msgs and it's not marked accessible,
+ * then it must be referenced from some unreachable cycle
+ * of (shut-down) FDs, so include it in our
+ * list of FDs to remove.
+ */
if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) {
*fpp++ = fp;
nunref++;
fp->f_count++;
}
}
+ /*
+ * For each FD on our hit list, do the following two things:
+ */
for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
sorflush((struct socket *)(*fpp)->f_data);
for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp)
- closef(*fpp, (struct proc *)NULL);
+ closef(*fpp, (struct proc *) NULL);
free((caddr_t)extra_ref, M_FILE);
unp_gcing = 0;
}
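
unp_gc() is a mark-and-sweep collector for descriptors in flight over Unix-domain sockets: files reachable from a process fd table are marked, marks propagate through socket buffers that carry passed descriptors (the FDEFER re-scan), and files whose only remaining references sit in unreachable in-flight messages, such as a cycle of sockets passed into each other, are flushed and closed. The algorithm in miniature (data structures invented; the deferral bookkeeping is folded into a simple fixed-point loop):

    #include <stdio.h>

    #define NFILES 4

    struct file {
        int refs;       /* total references (cf. f_count) */
        int msgrefs;    /* references from in-flight msgs (cf. f_msgcount) */
        int mark;       /* reachable? (cf. FMARK) */
        int carries;    /* index of a file held in our buffers, or -1 */
    };

    int
    main(void)
    {
        /* f0 is open in a process and carries f1 in a message;
         * f2 and f3 reference each other only through messages (a cycle). */
        struct file f[NFILES] = {
            { 2, 1, 0,  1 },    /* externally reachable */
            { 1, 1, 0, -1 },    /* reachable only via f0 */
            { 1, 1, 0,  3 },    /* cycle: f2 carries f3 ... */
            { 1, 1, 0,  2 },    /* ... and f3 carries f2 */
        };
        int i, changed;

        /* Mark: externally accessible files, then propagate marks
         * through the descriptors they carry until nothing changes. */
        for (i = 0; i < NFILES; i++)
            if (f[i].refs > f[i].msgrefs)
                f[i].mark = 1;
        do {
            changed = 0;
            for (i = 0; i < NFILES; i++)
                if (f[i].mark && f[i].carries >= 0 &&
                    !f[f[i].carries].mark) {
                    f[f[i].carries].mark = 1;
                    changed = 1;
                }
        } while (changed);

        /* Sweep: unmarked files live only in unreachable messages. */
        for (i = 0; i < NFILES; i++)
            if (!f[i].mark)
                printf("would close file %d (garbage cycle)\n", i);
        return (0);
    }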
@@ -787,7 +935,7 @@ unp_dispose(m)
unp_scan(m, unp_discard);
}
-void
+static void
unp_scan(m0, op)
register struct mbuf *m0;
void (*op) __P((struct file *));
@@ -817,7 +965,7 @@ unp_scan(m0, op)
}
}
-void
+static void
unp_mark(fp)
struct file *fp;
{
@@ -828,7 +976,7 @@ unp_mark(fp)
fp->f_flag |= (FMARK|FDEFER);
}
-void
+static void
unp_discard(fp)
struct file *fp;
{
diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c
index ec5c962..494a53d 100644
--- a/sys/kern/vfs_bio.c
+++ b/sys/kern/vfs_bio.c
@@ -1,181 +1,377 @@
-/*-
- * Copyright (c) 1982, 1986, 1989, 1993
- * The Regents of the University of California. All rights reserved.
- * (c) UNIX System Laboratories, Inc.
- * All or some portions of this file are derived from material licensed
- * to the University of California by American Telephone and Telegraph
- * Co. or Unix System Laboratories, Inc. and are reproduced herein with
- * the permission of UNIX System Laboratories, Inc.
+/*
+ * Copyright (c) 1994 John S. Dyson
+ * All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
* are met:
* 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
+ * notice immediately at the beginning of the file, without modification,
+ * this list of conditions, and the following disclaimer.
* 2. Redistributions in binary form must reproduce the above copyright
* notice, this list of conditions and the following disclaimer in the
* documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- * must display the following acknowledgement:
- * This product includes software developed by the University of
- * California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- * may be used to endorse or promote products derived from this software
- * without specific prior written permission.
+ * 3. Absolutely no warranty of function or purpose is made by the author
+ * John S. Dyson.
+ * 4. This work was done expressly for inclusion into FreeBSD. Other use
+ * is allowed if this notation is included.
+ * 5. Modifications may be freely made to this file if the above conditions
+ * are met.
*
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * $Id$
+ */
+
+/*
+ * This file contains a new buffer I/O scheme implementing a coherent
+ * VM object and buffer cache scheme. Pains have been taken to make
+ * sure that the performance degradation associated with schemes such
+ * as this is not realized.
*
- * from: @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94
+ * Author: John S. Dyson
+ *	Significant help during the development and debugging phases
+ *	was provided by David Greenman, also of the FreeBSD core team.
*/
+#include "opt_bounce.h"
+
+#define VMIO
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/sysproto.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
#include <sys/proc.h>
-#include <sys/buf.h>
#include <sys/vnode.h>
+#include <sys/vmmeter.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_kern.h>
+#include <vm/vm_pageout.h>
+#include <vm/vm_page.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/lock.h>
+#include <vm/vm_map.h>
+#include <sys/buf.h>
#include <sys/mount.h>
-#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
+
+#include <miscfs/specfs/specdev.h>
+
+static void vfs_update __P((void));
+static struct proc *updateproc;
+static struct kproc_desc up_kp = {
+ "update",
+ vfs_update,
+ &updateproc
+};
+SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp)
+
+struct buf *buf; /* buffer header pool */
+struct swqueue bswlist;
+
+int count_lock_queue __P((void));
+static void vm_hold_free_pages(struct buf * bp, vm_offset_t from,
+ vm_offset_t to);
+static void vm_hold_load_pages(struct buf * bp, vm_offset_t from,
+ vm_offset_t to);
+static void vfs_clean_pages(struct buf * bp);
+static void vfs_setdirty(struct buf *bp);
+static void vfs_vmio_release(struct buf *bp);
+
+int needsbuffer;
/*
- * Definitions for the buffer hash lists.
+ * Internal update daemon, process 3
+ * The variable vfs_update_wakeup allows for internal syncs.
*/
-#define BUFHASH(dvp, lbn) \
- (&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash])
-LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash;
-u_long bufhash;
+int vfs_update_wakeup;
+
/*
- * Insq/Remq for the buffer hash lists.
+ * buffers base kva
*/
-#define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash)
-#define bremhash(bp) LIST_REMOVE(bp, b_hash)
/*
- * Definitions for the buffer free lists.
+ * bogus page -- for I/O to/from partially complete buffers.
+ * This is a temporary solution to the problem, but it is not
+ * really that bad.  It would be better to split the buffer
+ * for input in the case of buffers partially already in memory,
+ * but the code is intricate enough already.
*/
-#define BQUEUES 4 /* number of free buffer queues */
+vm_page_t bogus_page;
+static vm_offset_t bogus_offset;
-#define BQ_LOCKED 0 /* super-blocks &c */
-#define BQ_LRU 1 /* lru, useful buffers */
-#define BQ_AGE 2 /* rubbish */
-#define BQ_EMPTY 3 /* buffer headers with no memory */
+static int bufspace, maxbufspace, vmiospace, maxvmiobufspace,
+ bufmallocspace, maxbufmallocspace;
-TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES];
-int needbuffer;
+static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash;
+static struct bqueues bufqueues[BUFFER_QUEUES];
-/*
- * Insq/Remq for the buffer free lists.
- */
-#define binsheadfree(bp, dp) TAILQ_INSERT_HEAD(dp, bp, b_freelist)
-#define binstailfree(bp, dp) TAILQ_INSERT_TAIL(dp, bp, b_freelist)
+extern int vm_swap_size;
-void
-bremfree(bp)
- struct buf *bp;
-{
- struct bqueues *dp = NULL;
-
- /*
- * We only calculate the head of the freelist when removing
- * the last element of the list as that is the only time that
- * it is needed (e.g. to reset the tail pointer).
- *
- * NB: This makes an assumption about how tailq's are implemented.
- */
- if (bp->b_freelist.tqe_next == NULL) {
- for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
- if (dp->tqh_last == &bp->b_freelist.tqe_next)
- break;
- if (dp == &bufqueues[BQUEUES])
- panic("bremfree: lost tail");
- }
- TAILQ_REMOVE(dp, bp, b_freelist);
-}
+#define BUF_MAXUSE 16
/*
- * Initialize buffers and hash links for buffers.
+ * Initialize buffer headers and related structures.
*/
void
bufinit()
{
- register struct buf *bp;
- struct bqueues *dp;
- register int i;
- int base, residual;
-
- for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++)
- TAILQ_INIT(dp);
- bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash);
- base = bufpages / nbuf;
- residual = bufpages % nbuf;
+ struct buf *bp;
+ int i;
+
+ TAILQ_INIT(&bswlist);
+ LIST_INIT(&invalhash);
+
+ /* first, make a null hash table */
+ for (i = 0; i < BUFHSZ; i++)
+ LIST_INIT(&bufhashtbl[i]);
+
+ /* next, make a null set of free lists */
+ for (i = 0; i < BUFFER_QUEUES; i++)
+ TAILQ_INIT(&bufqueues[i]);
+
+ /* finally, initialize each buffer header and stick on empty q */
for (i = 0; i < nbuf; i++) {
bp = &buf[i];
- bzero((char *)bp, sizeof *bp);
+ bzero(bp, sizeof *bp);
+ bp->b_flags = B_INVAL; /* we're just an empty header */
bp->b_dev = NODEV;
bp->b_rcred = NOCRED;
bp->b_wcred = NOCRED;
+ bp->b_qindex = QUEUE_EMPTY;
bp->b_vnbufs.le_next = NOLIST;
- bp->b_data = buffers + i * MAXBSIZE;
- if (i < residual)
- bp->b_bufsize = (base + 1) * CLBYTES;
- else
- bp->b_bufsize = base * CLBYTES;
- bp->b_flags = B_INVAL;
- dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY];
- binsheadfree(bp, dp);
- binshash(bp, &invalhash);
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+ LIST_INSERT_HEAD(&invalhash, bp, b_hash);
}
+/*
+ * maxbufspace is currently calculated assuming all filesystem blocks
+ * are 8K.  If you happen to use a 16K filesystem, the size of the buffer
+ * cache is still the same as it would be for an 8K filesystem.  This
+ * keeps the size of the buffer cache "in check" for big-block filesystems.
+ */
+ maxbufspace = (nbuf + 8) * DFLTBSIZE;
+/*
+ * Reserve 1/3 of the buffer space for metadata (VDIR), which might not be VMIO'ed.
+ */
+ maxvmiobufspace = 2 * maxbufspace / 3;
+/*
+ * Limit the amount of malloc memory since it is wired permanently into
+ * the kernel space. Even though this is accounted for in the buffer
+ * allocation, we don't want the malloced region to grow uncontrolled.
+ * The malloc scheme improves memory utilization significantly on average
+ * (small) directories.
+ */
+ maxbufmallocspace = maxbufspace / 20;
+
+ bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE);
+ bogus_page = vm_page_alloc(kernel_object,
+ ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
+ VM_ALLOC_NORMAL);
+
}
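For concreteness, the three limits above work out as follows for a
hypothetical configuration; nbuf = 1024 and an 8K DFLTBSIZE are assumed
values for illustration, not taken from this change:

#include <stdio.h>

int
main(void)
{
	int nbuf = 1024;		/* assumed buffer count         */
	int dfltbsize = 8192;		/* assumed default block size   */
	int maxbufspace = (nbuf + 8) * dfltbsize;
	int maxvmiobufspace = 2 * maxbufspace / 3;
	int maxbufmallocspace = maxbufspace / 20;

	printf("maxbufspace       = %d\n", maxbufspace);	/* 8454144 */
	printf("maxvmiobufspace   = %d\n", maxvmiobufspace);	/* 5636096 */
	printf("maxbufmallocspace = %d\n", maxbufmallocspace);	/* 422707  */
	return (0);
}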
-bread(a1, a2, a3, a4, a5)
- struct vnode *a1;
- daddr_t a2;
- int a3;
- struct ucred *a4;
- struct buf **a5;
+/*
+ * Free the kva allocation for a buffer
+ * Must be called only at splbio or higher,
+ * as this is the only locking for buffer_map.
+ */
+static void
+bfreekva(struct buf * bp)
{
+ if (bp->b_kvasize == 0)
+ return;
+
+ vm_map_delete(buffer_map,
+ (vm_offset_t) bp->b_kvabase,
+ (vm_offset_t) bp->b_kvabase + bp->b_kvasize);
+
+ bp->b_kvasize = 0;
- /*
- * Body deleted.
- */
- return (EIO);
}
-breadn(a1, a2, a3, a4, a5, a6, a7, a8)
- struct vnode *a1;
- daddr_t a2; int a3;
- daddr_t a4[]; int a5[];
- int a6;
- struct ucred *a7;
- struct buf **a8;
+/*
+ * remove the buffer from the appropriate free list
+ */
+void
+bremfree(struct buf * bp)
{
+ int s = splbio();
- /*
- * Body deleted.
- */
- return (EIO);
+ if (bp->b_qindex != QUEUE_NONE) {
+ TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist);
+ bp->b_qindex = QUEUE_NONE;
+ } else {
+ panic("bremfree: removing a buffer when not on a queue");
+ }
+ splx(s);
}
-bwrite(a1)
- struct buf *a1;
+/*
+ * Get a buffer with the specified data. Look in the cache first.
+ */
+int
+bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred,
+ struct buf ** bpp)
{
+ struct buf *bp;
+
+ bp = getblk(vp, blkno, size, 0, 0);
+ *bpp = bp;
+
+ /* if not found in cache, do some I/O */
+ if ((bp->b_flags & B_CACHE) == 0) {
+ if (curproc != NULL)
+ curproc->p_stats->p_ru.ru_inblock++;
+ bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ if (bp->b_rcred == NOCRED) {
+ if (cred != NOCRED)
+ crhold(cred);
+ bp->b_rcred = cred;
+ }
+ vfs_busy_pages(bp, 0);
+ VOP_STRATEGY(bp);
+ return (biowait(bp));
+ }
+ return (0);
+}
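For reference, the usual calling pattern for the rewritten bread() is
unchanged from the old interface; a sketch with placeholder names (kernel
context, vnode assumed locked):

	struct buf *bp;
	int error;

	error = bread(vp, lbn, bsize, NOCRED, &bp);
	if (error) {
		brelse(bp);	/* a buffer is returned even on error */
		return (error);
	}
	/* ... consume the block through bp->b_data ... */
	brelse(bp);		/* or bqrelse(bp), the cheaper release
				 * path added by this change */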
+
+/*
+ * Operates like bread, but also starts asynchronous I/O on
+ * read-ahead blocks.
+ */
+int
+breadn(struct vnode * vp, daddr_t blkno, int size,
+ daddr_t * rablkno, int *rabsize,
+ int cnt, struct ucred * cred, struct buf ** bpp)
+{
+ struct buf *bp, *rabp;
+ int i;
+ int rv = 0, readwait = 0;
+
+ *bpp = bp = getblk(vp, blkno, size, 0, 0);
+
+ /* if not found in cache, do some I/O */
+ if ((bp->b_flags & B_CACHE) == 0) {
+ if (curproc != NULL)
+ curproc->p_stats->p_ru.ru_inblock++;
+ bp->b_flags |= B_READ;
+ bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ if (bp->b_rcred == NOCRED) {
+ if (cred != NOCRED)
+ crhold(cred);
+ bp->b_rcred = cred;
+ }
+ vfs_busy_pages(bp, 0);
+ VOP_STRATEGY(bp);
+ ++readwait;
+ }
+ for (i = 0; i < cnt; i++, rablkno++, rabsize++) {
+ if (inmem(vp, *rablkno))
+ continue;
+ rabp = getblk(vp, *rablkno, *rabsize, 0, 0);
+
+ if ((rabp->b_flags & B_CACHE) == 0) {
+ if (curproc != NULL)
+ curproc->p_stats->p_ru.ru_inblock++;
+ rabp->b_flags |= B_READ | B_ASYNC;
+ rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL);
+ if (rabp->b_rcred == NOCRED) {
+ if (cred != NOCRED)
+ crhold(cred);
+ rabp->b_rcred = cred;
+ }
+ vfs_busy_pages(rabp, 0);
+ VOP_STRATEGY(rabp);
+ } else {
+ brelse(rabp);
+ }
+ }
+
+ if (readwait) {
+ rv = biowait(bp);
+ }
+ return (rv);
+}
+
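A sketch of a breadn() call that starts read-ahead on the next two logical
blocks (placeholder names; only the first block is waited on):

	daddr_t rablks[2] = { lbn + 1, lbn + 2 };
	int rasizes[2] = { bsize, bsize };
	struct buf *bp;
	int error;

	error = breadn(vp, lbn, bsize, rablks, rasizes, 2, NOCRED, &bp);
	if (error) {
		brelse(bp);
		return (error);
	}
	/* the read-ahead buffers complete (and are released)
	 * asynchronously via biodone() */
	brelse(bp);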
+/*
+ * Write, release buffer on completion. (Done by iodone
+ * if async.)
+ */
+int
+bwrite(struct buf * bp)
+{
+ int oldflags = bp->b_flags;
+
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return (0);
+ }
+ if (!(bp->b_flags & B_BUSY))
+ panic("bwrite: buffer is not busy???");
+
+ bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
+ bp->b_flags |= B_WRITEINPROG;
+
+ if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) {
+ reassignbuf(bp, bp->b_vp);
+ }
+
+ bp->b_vp->v_numoutput++;
+ vfs_busy_pages(bp, 1);
+ if (curproc != NULL)
+ curproc->p_stats->p_ru.ru_oublock++;
+ VOP_STRATEGY(bp);
/*
- * Body deleted.
+ * Handle ordered writes here.
+ * If the write was originally flagged as ordered,
+ * then we check to see if it was converted to async.
+ * If it was converted to async, and is done now, then
+ * we release the buffer. Otherwise we clear the
+ * ordered flag because it is not needed anymore.
+ *
+ * Note that biodone has been modified so that it does
+ * not release ordered buffers. This allows us to have
+ * a chance to determine whether or not the driver
+ * has set the async flag in the strategy routine. Otherwise
+ * if biodone was not modified, then the buffer may have been
+ * reused before we have had a chance to check the flag.
*/
- return (EIO);
+
+ if ((oldflags & B_ORDERED) == B_ORDERED) {
+ int s;
+ s = splbio();
+ if (bp->b_flags & B_ASYNC) {
+ if ((bp->b_flags & B_DONE)) {
+ if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
+ brelse(bp);
+ else
+ bqrelse(bp);
+ }
+ splx(s);
+ return (0);
+ } else {
+ bp->b_flags &= ~B_ORDERED;
+ }
+ splx(s);
+ }
+
+ if ((oldflags & B_ASYNC) == 0) {
+ int rtval = biowait(bp);
+
+ if (oldflags & B_DELWRI) {
+ reassignbuf(bp, bp->b_vp);
+ }
+ brelse(bp);
+ return (rtval);
+ }
+ return (0);
}
int
@@ -185,155 +381,1566 @@ vn_bwrite(ap)
return (bwrite(ap->a_bp));
}
-bdwrite(a1)
- struct buf *a1;
+/*
+ * Delayed write. (Buffer is marked dirty).
+ */
+void
+bdwrite(struct buf * bp)
{
+ if ((bp->b_flags & B_BUSY) == 0) {
+ panic("bdwrite: buffer is not busy");
+ }
+ if (bp->b_flags & B_INVAL) {
+ brelse(bp);
+ return;
+ }
+ if (bp->b_flags & B_TAPE) {
+ bawrite(bp);
+ return;
+ }
+ bp->b_flags &= ~(B_READ|B_RELBUF);
+ if ((bp->b_flags & B_DELWRI) == 0) {
+ bp->b_flags |= B_DONE | B_DELWRI;
+ reassignbuf(bp, bp->b_vp);
+ }
+
/*
- * Body deleted.
+	 * This bmap keeps the system from needing to do the bmap later,
+	 * perhaps when the system is attempting to do a sync.  Since it
+	 * is likely that the indirect block -- or whatever other data
+	 * structure the filesystem needs -- is still in memory now, it
+	 * is a good time to do this.  Note also that if the pageout
+	 * daemon is requesting a sync, there might not be enough memory
+	 * to do the bmap then, so doing it here is important.
*/
- return;
-}
+	if (bp->b_lblkno == bp->b_blkno) {
+ VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL);
+ }
-bawrite(a1)
- struct buf *a1;
-{
+ /*
+ * Set the *dirty* buffer range based upon the VM system dirty pages.
+ */
+ vfs_setdirty(bp);
/*
- * Body deleted.
+	 * We need to do this here to satisfy the vnode_pager and the
+	 * pageout daemon, so that they think the pages have been
+	 * "cleaned".  Note that since the pages are in a delayed-write
+	 * buffer, the VFS layer will see that the pages get written
+	 * out on the next sync, or perhaps the cluster will be completed.
*/
+ vfs_clean_pages(bp);
+ bqrelse(bp);
return;
}
-brelse(a1)
- struct buf *a1;
+/*
+ * Asynchronous write.
+ * Start output on a buffer, but do not wait for it to complete.
+ * The buffer is released when the output completes.
+ */
+void
+bawrite(struct buf * bp)
+{
+ bp->b_flags |= B_ASYNC;
+ (void) VOP_BWRITE(bp);
+}
+
+/*
+ * Ordered write.
+ * Start output on a buffer, but only wait for it to complete if the
+ * output device cannot guarantee ordering in some other way. Devices
+ * that can perform asynchronous ordered writes will set the B_ASYNC
+ * flag in their strategy routine.
+ * The buffer is released when the output completes.
+ */
+int
+bowrite(struct buf * bp)
{
+ bp->b_flags |= B_ORDERED;
+ return (VOP_BWRITE(bp));
+}
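The four write entry points now in place differ only in when the caller gets
control back and when the data reaches disk. Each line below is an
alternative fate for a busy buffer the caller owns (a summary sketch, not
code from this change):

	error = bwrite(bp);	/* synchronous: waits for the I/O,
				 * then releases bp                  */
	bdwrite(bp);		/* delayed: marks bp dirty and
				 * releases it; written out later    */
	bawrite(bp);		/* asynchronous: starts the I/O now;
				 * biodone() releases bp             */
	error = bowrite(bp);	/* ordered: async when the driver can
				 * preserve ordering, else it waits  */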
+
+/*
+ * Release a buffer.
+ */
+void
+brelse(struct buf * bp)
+{
+ int s;
+
+ if (bp->b_flags & B_CLUSTER) {
+ relpbuf(bp);
+ return;
+ }
+	s = splbio();
+
+ /* anyone need this block? */
+ if (bp->b_flags & B_WANTED) {
+ bp->b_flags &= ~(B_WANTED | B_AGE);
+ wakeup(bp);
+ }
+
+ if (bp->b_flags & B_LOCKED)
+ bp->b_flags &= ~B_ERROR;
+
+ if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) ||
+ (bp->b_bufsize <= 0)) {
+ bp->b_flags |= B_INVAL;
+ bp->b_flags &= ~(B_DELWRI | B_CACHE);
+ if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) {
+ if (bp->b_bufsize)
+ allocbuf(bp, 0);
+ brelvp(bp);
+ }
+ }
/*
- * Body deleted.
+	 * VMIO buffer rundown.  It is not strictly necessary to keep a VMIO buffer
+ * constituted, so the B_INVAL flag is used to *invalidate* the buffer,
+ * but the VM object is kept around. The B_NOCACHE flag is used to
+ * invalidate the pages in the VM object.
*/
- return;
+ if (bp->b_flags & B_VMIO) {
+ vm_ooffset_t foff;
+ vm_object_t obj;
+ int i, resid;
+ vm_page_t m;
+ struct vnode *vp;
+ int iototal = bp->b_bufsize;
+
+ vp = bp->b_vp;
+ if (!vp)
+ panic("brelse: missing vp");
+
+ if (bp->b_npages) {
+ vm_pindex_t poff;
+ obj = (vm_object_t) vp->v_object;
+ if (vp->v_type == VBLK)
+ foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT;
+ else
+ foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
+ poff = OFF_TO_IDX(foff);
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ if (m == bogus_page) {
+ m = vm_page_lookup(obj, poff + i);
+ if (!m) {
+ panic("brelse: page missing\n");
+ }
+ bp->b_pages[i] = m;
+ pmap_qenter(trunc_page(bp->b_data),
+ bp->b_pages, bp->b_npages);
+ }
+ resid = IDX_TO_OFF(m->pindex+1) - foff;
+ if (resid > iototal)
+ resid = iototal;
+ if (resid > 0) {
+ /*
+ * Don't invalidate the page if the local machine has already
+ * modified it. This is the lesser of two evils, and should
+ * be fixed.
+ */
+ if (bp->b_flags & (B_NOCACHE | B_ERROR)) {
+ vm_page_test_dirty(m);
+ if (m->dirty == 0) {
+ vm_page_set_invalid(m, (vm_offset_t) foff, resid);
+ if (m->valid == 0)
+ vm_page_protect(m, VM_PROT_NONE);
+ }
+ }
+ if (resid >= PAGE_SIZE) {
+ if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) {
+ bp->b_flags |= B_INVAL;
+ }
+ } else {
+ if (!vm_page_is_valid(m,
+ (((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) {
+ bp->b_flags |= B_INVAL;
+ }
+ }
+ }
+ foff += resid;
+ iototal -= resid;
+ }
+ }
+ if (bp->b_flags & (B_INVAL | B_RELBUF))
+ vfs_vmio_release(bp);
+ }
+ if (bp->b_qindex != QUEUE_NONE)
+ panic("brelse: free buffer onto another queue???");
+
+ /* enqueue */
+ /* buffers with no memory */
+ if (bp->b_bufsize == 0) {
+ bp->b_qindex = QUEUE_EMPTY;
+ TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist);
+ LIST_REMOVE(bp, b_hash);
+ LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+ bp->b_dev = NODEV;
+ /*
+ * Get rid of the kva allocation *now*
+ */
+ bfreekva(bp);
+ if (needsbuffer) {
+ wakeup(&needsbuffer);
+ needsbuffer=0;
+ }
+ /* buffers with junk contents */
+ } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) {
+ bp->b_qindex = QUEUE_AGE;
+ TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist);
+ LIST_REMOVE(bp, b_hash);
+ LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+ bp->b_dev = NODEV;
+ if (needsbuffer) {
+ wakeup(&needsbuffer);
+ needsbuffer=0;
+ }
+ /* buffers that are locked */
+ } else if (bp->b_flags & B_LOCKED) {
+ bp->b_qindex = QUEUE_LOCKED;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
+ /* buffers with stale but valid contents */
+ } else if (bp->b_flags & B_AGE) {
+ bp->b_qindex = QUEUE_AGE;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist);
+ if (needsbuffer) {
+ wakeup(&needsbuffer);
+ needsbuffer=0;
+ }
+	/* buffers with valid and quite potentially reusable contents */
+ } else {
+ bp->b_qindex = QUEUE_LRU;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ if (needsbuffer) {
+ wakeup(&needsbuffer);
+ needsbuffer=0;
+ }
+ }
+
+ /* unlock */
+ bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
+ B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+ splx(s);
}
+/*
+ * Quick release of a buffer whose contents remain valid: unlike
+ * brelse(), this skips the invalidation and VMIO rundown work.
+ */
+void
+bqrelse(struct buf * bp)
+{
+ int s;
+
+ s = splbio();
+
+ /* anyone need this block? */
+ if (bp->b_flags & B_WANTED) {
+ bp->b_flags &= ~(B_WANTED | B_AGE);
+ wakeup(bp);
+ }
+
+ if (bp->b_qindex != QUEUE_NONE)
+ panic("bqrelse: free buffer onto another queue???");
+
+ if (bp->b_flags & B_LOCKED) {
+ bp->b_flags &= ~B_ERROR;
+ bp->b_qindex = QUEUE_LOCKED;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist);
+		/* all other buffers go to the LRU queue */
+ } else {
+ bp->b_qindex = QUEUE_LRU;
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ if (needsbuffer) {
+ wakeup(&needsbuffer);
+ needsbuffer=0;
+ }
+ }
+
+ /* unlock */
+ bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY |
+ B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF);
+ splx(s);
+}
+
+static void
+vfs_vmio_release(struct buf * bp)
+{
+ int i;
+ vm_page_t m;
+
+ for (i = 0; i < bp->b_npages; i++) {
+ m = bp->b_pages[i];
+ bp->b_pages[i] = NULL;
+ vm_page_unwire(m);
+ /*
+ * We don't mess with busy pages, it is
+ * the responsibility of the process that
+ * busied the pages to deal with them.
+ */
+ if ((m->flags & PG_BUSY) || (m->busy != 0))
+ continue;
+
+ if (m->wire_count == 0) {
+
+ if (m->flags & PG_WANTED) {
+ m->flags &= ~PG_WANTED;
+ wakeup(m);
+ }
+
+ /*
+ * If this is an async free -- we cannot place
+ * pages onto the cache queue, so our policy for
+ * such buffers is to avoid the cache queue, and
+ * only modify the active queue or free queue.
+ */
+ if ((bp->b_flags & B_ASYNC) == 0) {
+
+ /*
+ * In the case of sync buffer frees, we can do pretty much
+ * anything to any of the memory queues. Specifically,
+ * the cache queue is free to be modified.
+ */
+ if (m->valid) {
+ if(m->dirty == 0)
+ vm_page_test_dirty(m);
+ /*
+				 * This keeps pressure off of process memory.
+ */
+ if ((vm_swap_size == 0) ||
+ (cnt.v_free_count < cnt.v_free_min)) {
+ if ((m->dirty == 0) &&
+ (m->hold_count == 0))
+ vm_page_cache(m);
+ else
+ vm_page_deactivate(m);
+ }
+ } else if (m->hold_count == 0) {
+ vm_page_protect(m, VM_PROT_NONE);
+ vm_page_free(m);
+ }
+ } else {
+ /*
+ * If async, then at least we clear the
+ * act_count.
+ */
+ m->act_count = 0;
+ }
+ }
+ }
+ bufspace -= bp->b_bufsize;
+ vmiospace -= bp->b_bufsize;
+ pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
+ bp->b_npages = 0;
+ bp->b_bufsize = 0;
+ bp->b_flags &= ~B_VMIO;
+ if (bp->b_vp)
+ brelvp(bp);
+}
+
+/*
+ * Check to see if a block is currently memory resident (hash lookup
+ * only; unlike incore() below, no spl protection is taken here).
+ */
struct buf *
-incore(a1, a2)
- struct vnode *a1;
- daddr_t a2;
+gbincore(struct vnode * vp, daddr_t blkno)
{
+ struct buf *bp;
+ struct bufhashhdr *bh;
+ bh = BUFHASH(vp, blkno);
+ bp = bh->lh_first;
+
+ /* Search hash chain */
+ while (bp != NULL) {
+ /* hit */
+ if (bp->b_vp == vp && bp->b_lblkno == blkno &&
+ (bp->b_flags & B_INVAL) == 0) {
+ break;
+ }
+ bp = bp->b_hash.le_next;
+ }
+ return (bp);
+}
+
+/*
+ * This routine implements clustered async writes for
+ * clearing out B_DELWRI buffers.  This is much better
+ * than the old way of writing only one buffer at a time.
+ */
+int
+vfs_bio_awrite(struct buf * bp)
+{
+ int i;
+ daddr_t lblkno = bp->b_lblkno;
+ struct vnode *vp = bp->b_vp;
+ int s;
+ int ncl;
+ struct buf *bpa;
+ int nwritten;
+
+ s = splbio();
/*
- * Body deleted.
+	 * Right now we support clustered writing only to regular files.
*/
- return (0);
+ if ((vp->v_type == VREG) &&
+ (vp->v_mount != 0) && /* Only on nodes that have the size info */
+ (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) {
+ int size;
+ int maxcl;
+
+ size = vp->v_mount->mnt_stat.f_iosize;
+ maxcl = MAXPHYS / size;
+
+ for (i = 1; i < maxcl; i++) {
+ if ((bpa = gbincore(vp, lblkno + i)) &&
+ ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) ==
+ (B_DELWRI | B_CLUSTEROK)) &&
+ (bpa->b_bufsize == size)) {
+ if ((bpa->b_blkno == bpa->b_lblkno) ||
+ (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT)))
+ break;
+ } else {
+ break;
+ }
+ }
+ ncl = i;
+ /*
+ * this is a possible cluster write
+ */
+ if (ncl != 1) {
+ nwritten = cluster_wbuild(vp, size, lblkno, ncl);
+ splx(s);
+ return nwritten;
+ }
+ }
+ bremfree(bp);
+ splx(s);
+ /*
+ * default (old) behavior, writing out only one block
+ */
+ bp->b_flags |= B_BUSY | B_ASYNC;
+ nwritten = bp->b_bufsize;
+ (void) VOP_BWRITE(bp);
+ return nwritten;
}
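The scan above only extends a cluster while each candidate buffer sits
exactly i * size bytes past bp on disk. A standalone check of that address
arithmetic (an 8K block size and 512-byte sectors are assumed here):

#include <stdio.h>

#define DEV_BSHIFT 9			/* 512-byte disk sectors (assumed) */

int
main(void)
{
	int size = 8192;		/* assumed filesystem block size */
	long bp_blkno = 1000;		/* first buffer's disk address   */

	/* disk addresses the scan requires of candidates 1..3 */
	for (int i = 1; i <= 3; i++)
		printf("candidate %d must sit at sector %ld\n",
		    i, bp_blkno + ((long)(i * size) >> DEV_BSHIFT));
	return (0);			/* prints 1016, 1032, 1048 */
}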
-struct buf *
-getblk(a1, a2, a3, a4, a5)
- struct vnode *a1;
- daddr_t a2;
- int a3, a4, a5;
+
+/*
+ * Find a buffer header which is available for use.
+ */
+static struct buf *
+getnewbuf(int slpflag, int slptimeo, int size, int maxsize)
{
+ struct buf *bp;
+ int nbyteswritten = 0;
+ vm_offset_t addr;
+
+start:
+ if (bufspace >= maxbufspace)
+ goto trytofreespace;
+
+ /* can we constitute a new buffer? */
+ if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) {
+ if (bp->b_qindex != QUEUE_EMPTY)
+ panic("getnewbuf: inconsistent EMPTY queue, qindex=%d",
+ bp->b_qindex);
+ bp->b_flags |= B_BUSY;
+ bremfree(bp);
+ goto fillbuf;
+ }
+trytofreespace:
+ /*
+	 * We keep the file I/O from hogging metadata I/O.
+ * This is desirable because file data is cached in the
+ * VM/Buffer cache even if a buffer is freed.
+ */
+ if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) {
+ if (bp->b_qindex != QUEUE_AGE)
+ panic("getnewbuf: inconsistent AGE queue, qindex=%d",
+ bp->b_qindex);
+ } else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) {
+ if (bp->b_qindex != QUEUE_LRU)
+ panic("getnewbuf: inconsistent LRU queue, qindex=%d",
+ bp->b_qindex);
+ }
+ if (!bp) {
+ /* wait for a free buffer of any kind */
+ needsbuffer = 1;
+ tsleep(&needsbuffer,
+ (PRIBIO + 1) | slpflag, "newbuf", slptimeo);
+ return (0);
+ }
+
+#if defined(DIAGNOSTIC)
+ if (bp->b_flags & B_BUSY) {
+ panic("getnewbuf: busy buffer on free list\n");
+ }
+#endif
/*
- * Body deleted.
+ * We are fairly aggressive about freeing VMIO buffers, but since
+ * the buffering is intact without buffer headers, there is not
+ * much loss. We gain by maintaining non-VMIOed metadata in buffers.
*/
- return ((struct buf *)0);
+ if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) {
+ if ((bp->b_flags & B_VMIO) == 0 ||
+ (vmiospace < maxvmiobufspace)) {
+ --bp->b_usecount;
+ TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) {
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ goto start;
+ }
+ TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist);
+ }
+ }
+
+ /* if we are a delayed write, convert to an async write */
+ if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) {
+ nbyteswritten += vfs_bio_awrite(bp);
+ if (!slpflag && !slptimeo) {
+ return (0);
+ }
+ goto start;
+ }
+
+ if (bp->b_flags & B_WANTED) {
+ bp->b_flags &= ~B_WANTED;
+ wakeup(bp);
+ }
+ bremfree(bp);
+ bp->b_flags |= B_BUSY;
+
+ if (bp->b_flags & B_VMIO) {
+ bp->b_flags &= ~B_ASYNC;
+ vfs_vmio_release(bp);
+ }
+
+ if (bp->b_vp)
+ brelvp(bp);
+
+fillbuf:
+ /* we are not free, nor do we contain interesting data */
+ if (bp->b_rcred != NOCRED) {
+ crfree(bp->b_rcred);
+ bp->b_rcred = NOCRED;
+ }
+ if (bp->b_wcred != NOCRED) {
+ crfree(bp->b_wcred);
+ bp->b_wcred = NOCRED;
+ }
+
+ LIST_REMOVE(bp, b_hash);
+ LIST_INSERT_HEAD(&invalhash, bp, b_hash);
+ if (bp->b_bufsize) {
+ allocbuf(bp, 0);
+ }
+ bp->b_flags = B_BUSY;
+ bp->b_dev = NODEV;
+ bp->b_vp = NULL;
+ bp->b_blkno = bp->b_lblkno = 0;
+ bp->b_iodone = 0;
+ bp->b_error = 0;
+ bp->b_resid = 0;
+ bp->b_bcount = 0;
+ bp->b_npages = 0;
+ bp->b_dirtyoff = bp->b_dirtyend = 0;
+ bp->b_validoff = bp->b_validend = 0;
+ bp->b_usecount = 4;
+
+ maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK;
+
+ /*
+ * we assume that buffer_map is not at address 0
+ */
+ addr = 0;
+ if (maxsize != bp->b_kvasize) {
+ bfreekva(bp);
+
+ /*
+ * See if we have buffer kva space
+ */
+ if (vm_map_findspace(buffer_map,
+ vm_map_min(buffer_map), maxsize, &addr)) {
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ goto trytofreespace;
+ }
+ }
+
+ /*
+	 * See if we have exceeded our buffer space allocation; if so,
+	 * invalidate this buffer and keep trying to free space.
+ */
+ if (bufspace >= (maxbufspace + nbyteswritten)) {
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ goto trytofreespace;
+ }
+
+ /*
+ * create a map entry for the buffer -- in essence
+ * reserving the kva space.
+ */
+ if (addr) {
+ vm_map_insert(buffer_map, NULL, 0,
+ addr, addr + maxsize,
+ VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT);
+
+ bp->b_kvabase = (caddr_t) addr;
+ bp->b_kvasize = maxsize;
+ }
+ bp->b_data = bp->b_kvabase;
+
+ return (bp);
}
+/*
+ * Check to see if a block is currently memory resident.
+ */
struct buf *
-geteblk(a1)
- int a1;
+incore(struct vnode * vp, daddr_t blkno)
{
+ struct buf *bp;
- /*
- * Body deleted.
- */
- return ((struct buf *)0);
+ int s = splbio();
+ bp = gbincore(vp, blkno);
+ splx(s);
+ return (bp);
}
-allocbuf(a1, a2)
- struct buf *a1;
- int a2;
+/*
+ * Returns true if no I/O is needed to access the
+ * associated VM object. This is like incore except
+ * it also hunts around in the VM system for the data.
+ */
+
+int
+inmem(struct vnode * vp, daddr_t blkno)
{
+ vm_object_t obj;
+ vm_offset_t toff, tinc;
+ vm_page_t m;
+ vm_ooffset_t off;
+
+ if (incore(vp, blkno))
+ return 1;
+ if (vp->v_mount == NULL)
+ return 0;
+ if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0)
+ return 0;
+
+ obj = vp->v_object;
+ tinc = PAGE_SIZE;
+ if (tinc > vp->v_mount->mnt_stat.f_iosize)
+ tinc = vp->v_mount->mnt_stat.f_iosize;
+ off = blkno * vp->v_mount->mnt_stat.f_iosize;
+
+ for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) {
+
+ m = vm_page_lookup(obj, OFF_TO_IDX(off + toff));
+ if (!m)
+ return 0;
+ if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0)
+ return 0;
+ }
+ return 1;
+}
+/*
+ * Set the dirty range for the buffer -- for NFS -- if the
+ * file is mapped and pages have been written to, let it know.
+ * We want the entire range of the buffer to be marked dirty if
+ * any of the pages have been written to, for consistency
+ * with the b_validoff, b_validend set in the NFS write
+ * code and used by the NFS read code.
+ */
+static void
+vfs_setdirty(struct buf *bp)
+{
+ int i;
+ vm_object_t object;
+ vm_offset_t boffset, offset;
/*
- * Body deleted.
+ * We qualify the scan for modified pages on whether the
+ * object has been flushed yet. The OBJ_WRITEABLE flag
+ * is not cleared simply by protecting pages off.
*/
- return (0);
+ if ((bp->b_flags & B_VMIO) &&
+ ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) {
+ /*
+ * test the pages to see if they have been modified directly
+ * by users through the VM system.
+ */
+ for (i = 0; i < bp->b_npages; i++)
+ vm_page_test_dirty(bp->b_pages[i]);
+
+ /*
+ * scan forwards for the first page modified
+ */
+ for (i = 0; i < bp->b_npages; i++) {
+ if (bp->b_pages[i]->dirty) {
+ break;
+ }
+ }
+ boffset = (i << PAGE_SHIFT);
+ if (boffset < bp->b_dirtyoff) {
+ bp->b_dirtyoff = boffset;
+ }
+
+ /*
+ * scan backwards for the last page modified
+ */
+ for (i = bp->b_npages - 1; i >= 0; --i) {
+ if (bp->b_pages[i]->dirty) {
+ break;
+ }
+ }
+ boffset = (i + 1);
+ offset = boffset + bp->b_pages[0]->pindex;
+ if (offset >= object->size)
+ boffset = object->size - bp->b_pages[0]->pindex;
+ if (bp->b_dirtyend < (boffset << PAGE_SHIFT))
+ bp->b_dirtyend = (boffset << PAGE_SHIFT);
+ }
}
+/*
+ * Get a block given a specified block and offset into a file/device.
+ */
struct buf *
-getnewbuf(a1, a2)
- int a1, a2;
+getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo)
{
+ struct buf *bp;
+ int s;
+ struct bufhashhdr *bh;
+ int maxsize;
- /*
- * Body deleted.
- */
- return ((struct buf *)0);
+ if (vp->v_mount) {
+ maxsize = vp->v_mount->mnt_stat.f_iosize;
+ /*
+ * This happens on mount points.
+ */
+ if (maxsize < size)
+ maxsize = size;
+ } else {
+ maxsize = size;
+ }
+
+ if (size > MAXBSIZE)
+ panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE);
+
+ s = splbio();
+loop:
+ if ((bp = gbincore(vp, blkno))) {
+ if (bp->b_flags & B_BUSY) {
+ bp->b_flags |= B_WANTED;
+ if (bp->b_usecount < BUF_MAXUSE)
+ ++bp->b_usecount;
+ if (!tsleep(bp,
+ (PRIBIO + 1) | slpflag, "getblk", slptimeo))
+ goto loop;
+
+ splx(s);
+ return (struct buf *) NULL;
+ }
+ bp->b_flags |= B_BUSY | B_CACHE;
+ bremfree(bp);
+
+ /*
+		 * check for size inconsistencies (note that they shouldn't
+		 * happen, but do when filesystems don't handle size changes
+		 * correctly.)
+ * We are conservative on metadata and don't just extend the buffer
+ * but write and re-constitute it.
+ */
+
+ if (bp->b_bcount != size) {
+ if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) {
+ allocbuf(bp, size);
+ } else {
+ bp->b_flags |= B_NOCACHE;
+ VOP_BWRITE(bp);
+ goto loop;
+ }
+ }
+
+ if (bp->b_usecount < BUF_MAXUSE)
+ ++bp->b_usecount;
+ splx(s);
+ return (bp);
+ } else {
+ vm_object_t obj;
+
+ if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == 0) {
+ if (slpflag || slptimeo) {
+ splx(s);
+ return NULL;
+ }
+ goto loop;
+ }
+
+ /*
+ * This code is used to make sure that a buffer is not
+ * created while the getnewbuf routine is blocked.
+ * Normally the vnode is locked so this isn't a problem.
+ * VBLK type I/O requests, however, don't lock the vnode.
+ */
+ if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) {
+ bp->b_flags |= B_INVAL;
+ brelse(bp);
+ goto loop;
+ }
+
+ /*
+ * Insert the buffer into the hash, so that it can
+ * be found by incore.
+ */
+ bp->b_blkno = bp->b_lblkno = blkno;
+ bgetvp(vp, bp);
+ LIST_REMOVE(bp, b_hash);
+ bh = BUFHASH(vp, blkno);
+ LIST_INSERT_HEAD(bh, bp, b_hash);
+
+ if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) {
+ bp->b_flags |= (B_VMIO | B_CACHE);
+#if defined(VFS_BIO_DEBUG)
+ if (vp->v_type != VREG && vp->v_type != VBLK)
+ printf("getblk: vmioing file type %d???\n", vp->v_type);
+#endif
+ } else {
+ bp->b_flags &= ~B_VMIO;
+ }
+ splx(s);
+
+ allocbuf(bp, size);
+#ifdef PC98
+ /*
+	 * 1024-byte/sector support
+ */
+#define B_XXX2 0x8000000
+ if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2;
+#endif
+ return (bp);
+ }
}
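A sketch of the common pattern for producing a block without reading it
first, e.g. when the caller will overwrite the whole block (placeholder
names; the vnode is assumed locked):

	struct buf *bp;

	bp = getblk(vp, lbn, bsize, 0, 0);	/* with slpflag and
						 * slptimeo both 0, this
						 * sleeps rather than fail */
	if ((bp->b_flags & B_CACHE) == 0)
		vfs_bio_clrbuf(bp);		/* zero-fill; no read */
	/* ... fill bp->b_data with the new contents ... */
	bdwrite(bp);				/* mark dirty; write later */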
-biowait(a1)
- struct buf *a1;
+/*
+ * Get an empty, disassociated buffer of given size.
+ */
+struct buf *
+geteblk(int size)
{
+ struct buf *bp;
+ int s;
- /*
- * Body deleted.
- */
- return (EIO);
+ s = splbio();
+ while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0);
+ splx(s);
+ allocbuf(bp, size);
+ bp->b_flags |= B_INVAL;
+ return (bp);
}
+
+/*
+ * This code constitutes the buffer memory from either anonymous system
+ * memory (in the case of non-VMIO operations) or from an associated
+ * VM object (in the case of VMIO operations).
+ *
+ * Note that this code is tricky, and has many complications to resolve
+ * deadlock or inconsistent data situations.  Tread lightly!!!
+ *
+ * Modify the length of a buffer's underlying buffer storage without
+ * destroying information (unless, of course the buffer is shrinking).
+ */
+int
+allocbuf(struct buf * bp, int size)
+{
+
+ int s;
+ int newbsize, mbsize;
+ int i;
+
+ if (!(bp->b_flags & B_BUSY))
+ panic("allocbuf: buffer not busy");
+
+ if (bp->b_kvasize < size)
+ panic("allocbuf: buffer too small");
+
+ if ((bp->b_flags & B_VMIO) == 0) {
+ caddr_t origbuf;
+ int origbufsize;
+ /*
+ * Just get anonymous memory from the kernel
+ */
+ mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+#if !defined(NO_B_MALLOC)
+ if (bp->b_flags & B_MALLOC)
+ newbsize = mbsize;
+ else
+#endif
+ newbsize = round_page(size);
+
+ if (newbsize < bp->b_bufsize) {
+#if !defined(NO_B_MALLOC)
+ /*
+ * malloced buffers are not shrunk
+ */
+ if (bp->b_flags & B_MALLOC) {
+ if (newbsize) {
+ bp->b_bcount = size;
+ } else {
+ free(bp->b_data, M_BIOBUF);
+ bufspace -= bp->b_bufsize;
+ bufmallocspace -= bp->b_bufsize;
+ bp->b_data = bp->b_kvabase;
+ bp->b_bufsize = 0;
+ bp->b_bcount = 0;
+ bp->b_flags &= ~B_MALLOC;
+ }
+ return 1;
+ }
+#endif
+ vm_hold_free_pages(
+ bp,
+ (vm_offset_t) bp->b_data + newbsize,
+ (vm_offset_t) bp->b_data + bp->b_bufsize);
+ } else if (newbsize > bp->b_bufsize) {
+#if !defined(NO_B_MALLOC)
+ /*
+			 * We only use malloced memory on the first allocation,
+			 * and revert to page-allocated memory when the buffer grows.
+ */
+ if ( (bufmallocspace < maxbufmallocspace) &&
+ (bp->b_bufsize == 0) &&
+ (mbsize <= PAGE_SIZE/2)) {
+
+ bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK);
+ bp->b_bufsize = mbsize;
+ bp->b_bcount = size;
+ bp->b_flags |= B_MALLOC;
+ bufspace += mbsize;
+ bufmallocspace += mbsize;
+ return 1;
+ }
+#endif
+ origbuf = NULL;
+ origbufsize = 0;
+#if !defined(NO_B_MALLOC)
+ /*
+			 * If the buffer is growing on its other-than-first allocation,
+ * then we revert to the page-allocation scheme.
+ */
+ if (bp->b_flags & B_MALLOC) {
+ origbuf = bp->b_data;
+ origbufsize = bp->b_bufsize;
+ bp->b_data = bp->b_kvabase;
+ bufspace -= bp->b_bufsize;
+ bufmallocspace -= bp->b_bufsize;
+ bp->b_bufsize = 0;
+ bp->b_flags &= ~B_MALLOC;
+ newbsize = round_page(newbsize);
+ }
+#endif
+ vm_hold_load_pages(
+ bp,
+ (vm_offset_t) bp->b_data + bp->b_bufsize,
+ (vm_offset_t) bp->b_data + newbsize);
+#if !defined(NO_B_MALLOC)
+ if (origbuf) {
+ bcopy(origbuf, bp->b_data, origbufsize);
+ free(origbuf, M_BIOBUF);
+ }
+#endif
+ }
+ } else {
+ vm_page_t m;
+ int desiredpages;
+
+ newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1);
+ desiredpages = (round_page(newbsize) >> PAGE_SHIFT);
+
+#if !defined(NO_B_MALLOC)
+ if (bp->b_flags & B_MALLOC)
+ panic("allocbuf: VMIO buffer can't be malloced");
+#endif
+
+ if (newbsize < bp->b_bufsize) {
+ if (desiredpages < bp->b_npages) {
+ for (i = desiredpages; i < bp->b_npages; i++) {
+ /*
+ * the page is not freed here -- it
+ * is the responsibility of vnode_pager_setsize
+ */
+ m = bp->b_pages[i];
+#if defined(DIAGNOSTIC)
+ if (m == bogus_page)
+ panic("allocbuf: bogus page found");
+#endif
+ s = splvm();
+ while ((m->flags & PG_BUSY) || (m->busy != 0)) {
+ m->flags |= PG_WANTED;
+ tsleep(m, PVM, "biodep", 0);
+ }
+ splx(s);
+
+ bp->b_pages[i] = NULL;
+ vm_page_unwire(m);
+ }
+ pmap_qremove((vm_offset_t) trunc_page(bp->b_data) +
+ (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages));
+ bp->b_npages = desiredpages;
+ }
+ } else if (newbsize > bp->b_bufsize) {
+ vm_object_t obj;
+ vm_offset_t tinc, toff;
+ vm_ooffset_t off;
+ vm_pindex_t objoff;
+ int pageindex, curbpnpages;
+ struct vnode *vp;
+ int bsize;
+
+ vp = bp->b_vp;
+
+ if (vp->v_type == VBLK)
+ bsize = DEV_BSIZE;
+ else
+ bsize = vp->v_mount->mnt_stat.f_iosize;
+
+ if (bp->b_npages < desiredpages) {
+ obj = vp->v_object;
+ tinc = PAGE_SIZE;
+ if (tinc > bsize)
+ tinc = bsize;
+ off = (vm_ooffset_t) bp->b_lblkno * bsize;
+ curbpnpages = bp->b_npages;
+ doretry:
+ bp->b_flags |= B_CACHE;
+ for (toff = 0; toff < newbsize; toff += tinc) {
+ int bytesinpage;
+
+ pageindex = toff >> PAGE_SHIFT;
+ objoff = OFF_TO_IDX(off + toff);
+ if (pageindex < curbpnpages) {
+
+ m = bp->b_pages[pageindex];
+#ifdef VFS_BIO_DIAG
+ if (m->pindex != objoff)
+ panic("allocbuf: page changed offset??!!!?");
+#endif
+ bytesinpage = tinc;
+ if (tinc > (newbsize - toff))
+ bytesinpage = newbsize - toff;
+ if ((bp->b_flags & B_CACHE) &&
+ !vm_page_is_valid(m,
+ (vm_offset_t) ((toff + off) & PAGE_MASK),
+ bytesinpage)) {
+ bp->b_flags &= ~B_CACHE;
+ }
+ continue;
+ }
+ m = vm_page_lookup(obj, objoff);
+ if (!m) {
+ m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL);
+ if (!m) {
+ VM_WAIT;
+ goto doretry;
+ }
+ /*
+ * Normally it is unwise to clear PG_BUSY without
+ * PAGE_WAKEUP -- but it is okay here, as there is
+ * no chance for blocking between here and vm_page_alloc
+ */
+ m->flags &= ~PG_BUSY;
+ vm_page_wire(m);
+ bp->b_flags &= ~B_CACHE;
+ } else if (m->flags & PG_BUSY) {
+ s = splvm();
+ if (m->flags & PG_BUSY) {
+ m->flags |= PG_WANTED;
+ tsleep(m, PVM, "pgtblk", 0);
+ }
+ splx(s);
+ goto doretry;
+ } else {
+ if ((curproc != pageproc) &&
+ ((m->queue - m->pc) == PQ_CACHE) &&
+ ((cnt.v_free_count + cnt.v_cache_count) <
+ (cnt.v_free_min + cnt.v_cache_min))) {
+ pagedaemon_wakeup();
+ }
+ bytesinpage = tinc;
+ if (tinc > (newbsize - toff))
+ bytesinpage = newbsize - toff;
+ if ((bp->b_flags & B_CACHE) &&
+ !vm_page_is_valid(m,
+ (vm_offset_t) ((toff + off) & PAGE_MASK),
+ bytesinpage)) {
+ bp->b_flags &= ~B_CACHE;
+ }
+ vm_page_wire(m);
+ }
+ bp->b_pages[pageindex] = m;
+ curbpnpages = pageindex + 1;
+ }
+ bp->b_data = (caddr_t) trunc_page(bp->b_data);
+ bp->b_npages = curbpnpages;
+ pmap_qenter((vm_offset_t) bp->b_data,
+ bp->b_pages, bp->b_npages);
+ ((vm_offset_t) bp->b_data) |= off & PAGE_MASK;
+ }
+ }
+ }
+ if (bp->b_flags & B_VMIO)
+ vmiospace += bp->b_bufsize;
+ bufspace += (newbsize - bp->b_bufsize);
+ bp->b_bufsize = newbsize;
+ bp->b_bcount = size;
+ return 1;
+}
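The B_MALLOC policy above means a small first allocation is malloc-backed,
and anything larger (or any growth) is backed by wired pages; an
illustrative sequence, assuming a PAGE_SIZE of 4096:

	allocbuf(bp, 512);	/* first allocation and 512 <= PAGE_SIZE/2:
				 * bp->b_data comes from malloc(), B_MALLOC set */
	allocbuf(bp, 8192);	/* growth past the threshold: old contents are
				 * copied out and bp reverts to wired pages */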
+
+/*
+ * Wait for buffer I/O completion, returning error status.
+ */
+int
+biowait(register struct buf * bp)
+{
+ int s;
+
+ s = splbio();
+ while ((bp->b_flags & B_DONE) == 0)
+ tsleep(bp, PRIBIO, "biowait", 0);
+ splx(s);
+ if (bp->b_flags & B_EINTR) {
+ bp->b_flags &= ~B_EINTR;
+ return (EINTR);
+ }
+ if (bp->b_flags & B_ERROR) {
+ return (bp->b_error ? bp->b_error : EIO);
+ } else {
+ return (0);
+ }
+}
+
+/*
+ * Finish I/O on a buffer, calling an optional function.
+ * This is usually called from interrupt level, so process blocking
+ * is not *a good idea*.
+ */
void
-biodone(a1)
- struct buf *a1;
+biodone(register struct buf * bp)
{
+ int s;
+
+ s = splbio();
+ if (!(bp->b_flags & B_BUSY))
+ panic("biodone: buffer not busy");
+
+ if (bp->b_flags & B_DONE) {
+ splx(s);
+ printf("biodone: buffer already done\n");
+ return;
+ }
+ bp->b_flags |= B_DONE;
+
+ if ((bp->b_flags & B_READ) == 0) {
+ vwakeup(bp);
+ }
+#ifdef BOUNCE_BUFFERS
+ if (bp->b_flags & B_BOUNCE)
+ vm_bounce_free(bp);
+#endif
+
+ /* call optional completion function if requested */
+ if (bp->b_flags & B_CALL) {
+ bp->b_flags &= ~B_CALL;
+ (*bp->b_iodone) (bp);
+ splx(s);
+ return;
+ }
+ if (bp->b_flags & B_VMIO) {
+ int i, resid;
+ vm_ooffset_t foff;
+ vm_page_t m;
+ vm_object_t obj;
+ int iosize;
+ struct vnode *vp = bp->b_vp;
+
+ if (vp->v_type == VBLK)
+ foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
+ else
+ foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
+ obj = vp->v_object;
+ if (!obj) {
+ panic("biodone: no object");
+ }
+#if defined(VFS_BIO_DEBUG)
+ if (obj->paging_in_progress < bp->b_npages) {
+ printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n",
+ obj->paging_in_progress, bp->b_npages);
+ }
+#endif
+ iosize = bp->b_bufsize;
+ for (i = 0; i < bp->b_npages; i++) {
+ int bogusflag = 0;
+ m = bp->b_pages[i];
+ if (m == bogus_page) {
+ bogusflag = 1;
+ m = vm_page_lookup(obj, OFF_TO_IDX(foff));
+ if (!m) {
+#if defined(VFS_BIO_DEBUG)
+ printf("biodone: page disappeared\n");
+#endif
+ --obj->paging_in_progress;
+ continue;
+ }
+ bp->b_pages[i] = m;
+ pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
+ }
+#if defined(VFS_BIO_DEBUG)
+ if (OFF_TO_IDX(foff) != m->pindex) {
+ printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex);
+ }
+#endif
+ resid = IDX_TO_OFF(m->pindex + 1) - foff;
+ if (resid > iosize)
+ resid = iosize;
+ /*
+ * In the write case, the valid and clean bits are
+ * already changed correctly, so we only need to do this
+ * here in the read case.
+ */
+ if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) {
+ vm_page_set_validclean(m,
+ (vm_offset_t) (foff & PAGE_MASK), resid);
+ }
+ /*
+			 * When debugging new filesystems or buffer I/O methods, this
+			 * is the most common error that pops up.  If you see this, you
+ * have not set the page busy flag correctly!!!
+ */
+ if (m->busy == 0) {
+ printf("biodone: page busy < 0, "
+ "pindex: %d, foff: 0x(%x,%x), "
+ "resid: %d, index: %d\n",
+ (int) m->pindex, (int)(foff >> 32),
+ (int) foff & 0xffffffff, resid, i);
+ if (vp->v_type != VBLK)
+ printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n",
+ bp->b_vp->v_mount->mnt_stat.f_iosize,
+ (int) bp->b_lblkno,
+ bp->b_flags, bp->b_npages);
+ else
+ printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n",
+ (int) bp->b_lblkno,
+ bp->b_flags, bp->b_npages);
+ printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n",
+ m->valid, m->dirty, m->wire_count);
+ panic("biodone: page busy < 0\n");
+ }
+ --m->busy;
+ if ((m->busy == 0) && (m->flags & PG_WANTED)) {
+ m->flags &= ~PG_WANTED;
+ wakeup(m);
+ }
+ --obj->paging_in_progress;
+ foff += resid;
+ iosize -= resid;
+ }
+ if (obj && obj->paging_in_progress == 0 &&
+ (obj->flags & OBJ_PIPWNT)) {
+ obj->flags &= ~OBJ_PIPWNT;
+ wakeup(obj);
+ }
+ }
/*
- * Body deleted.
+	 * For asynchronous completions, release the buffer now.  brelse()
+	 * checks for B_WANTED and will do the wakeup there if necessary --
+	 * so there is no need to do a wakeup here in the async case.
*/
- return;
+
+ if (bp->b_flags & B_ASYNC) {
+ if ((bp->b_flags & B_ORDERED) == 0) {
+ if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0)
+ brelse(bp);
+ else
+ bqrelse(bp);
+ }
+ } else {
+ bp->b_flags &= ~B_WANTED;
+ wakeup(bp);
+ }
+ splx(s);
}
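A sketch of the B_CALL hook handled above: a caller can attach a completion
function instead of sleeping in biowait(). The function name is a
placeholder, not part of this change:

static void
my_iodone(struct buf *bp)
{
	/* Called from biodone(), possibly at interrupt level, so it
	 * must not sleep; it is responsible for releasing bp. */
	brelse(bp);
}

/* ... inside some I/O-issuing routine: */
	bp->b_flags |= B_CALL | B_ASYNC;
	bp->b_iodone = my_iodone;
	VOP_STRATEGY(bp);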
int
count_lock_queue()
{
+ int count;
+ struct buf *bp;
- /*
- * Body deleted.
- */
- return (0);
+ count = 0;
+ for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]);
+ bp != NULL;
+ bp = TAILQ_NEXT(bp, b_freelist))
+ count++;
+ return (count);
+}
+
+int vfs_update_interval = 30;
+
+static void
+vfs_update()
+{
+ while (1) {
+ tsleep(&vfs_update_wakeup, PUSER, "update",
+ hz * vfs_update_interval);
+ vfs_update_wakeup = 0;
+ sync(curproc, NULL, NULL);
+ }
}
-#ifdef DIAGNOSTIC
+static int
+sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS
+{
+ int error = sysctl_handle_int(oidp,
+ oidp->oid_arg1, oidp->oid_arg2, req);
+ if (!error)
+ wakeup(&vfs_update_wakeup);
+ return error;
+}
+
+SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW,
+ &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", "");
+
+
/*
- * Print out statistics on the current allocation of the buffer pool.
- * Can be enabled to print out on every ``sync'' by setting "syncprt"
- * in vfs_syscalls.c using sysctl.
+ * This routine is called in lieu of iodone in the case of
+ * incomplete I/O. This keeps the busy status for pages
+ * consistent.
*/
void
-vfs_bufstats()
+vfs_unbusy_pages(struct buf * bp)
{
- int s, i, j, count;
- register struct buf *bp;
- register struct bqueues *dp;
- int counts[MAXBSIZE/CLBYTES+1];
- static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" };
-
- for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) {
- count = 0;
- for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
- counts[j] = 0;
- s = splbio();
- for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) {
- counts[bp->b_bufsize/CLBYTES]++;
- count++;
+ int i;
+
+ if (bp->b_flags & B_VMIO) {
+ struct vnode *vp = bp->b_vp;
+ vm_object_t obj = vp->v_object;
+ vm_ooffset_t foff;
+
+ foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
+
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m = bp->b_pages[i];
+
+ if (m == bogus_page) {
+ m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i);
+ if (!m) {
+ panic("vfs_unbusy_pages: page missing\n");
+ }
+ bp->b_pages[i] = m;
+ pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
+ }
+ --obj->paging_in_progress;
+ --m->busy;
+ if ((m->busy == 0) && (m->flags & PG_WANTED)) {
+ m->flags &= ~PG_WANTED;
+ wakeup(m);
+ }
+ }
+ if (obj->paging_in_progress == 0 &&
+ (obj->flags & OBJ_PIPWNT)) {
+ obj->flags &= ~OBJ_PIPWNT;
+ wakeup(obj);
+ }
+ }
+}
+
+/*
+ * This routine is called before a device strategy routine.
+ * It is used to tell the VM system that paging I/O is in
+ * progress, and treat the pages associated with the buffer
+ * almost as being PG_BUSY. Also the object paging_in_progress
+ * flag is handled to make sure that the object doesn't become
+ * inconsistent.
+ */
+void
+vfs_busy_pages(struct buf * bp, int clear_modify)
+{
+ int i;
+
+ if (bp->b_flags & B_VMIO) {
+ vm_object_t obj = bp->b_vp->v_object;
+ vm_ooffset_t foff;
+ int iocount = bp->b_bufsize;
+
+ if (bp->b_vp->v_type == VBLK)
+ foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
+ else
+ foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
+ vfs_setdirty(bp);
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m = bp->b_pages[i];
+ int resid = IDX_TO_OFF(m->pindex + 1) - foff;
+
+ if (resid > iocount)
+ resid = iocount;
+ if ((bp->b_flags & B_CLUSTER) == 0) {
+ obj->paging_in_progress++;
+ m->busy++;
+ }
+ vm_page_protect(m, VM_PROT_NONE);
+ if (clear_modify) {
+ vm_page_set_validclean(m,
+ (vm_offset_t) (foff & PAGE_MASK), resid);
+ } else if (bp->b_bcount >= PAGE_SIZE) {
+ if (m->valid && (bp->b_flags & B_CACHE) == 0) {
+ bp->b_pages[i] = bogus_page;
+ pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages);
+ }
+ }
+ foff += resid;
+ iocount -= resid;
+ }
+ }
+}
+
+/*
+ * Tell the VM system that the pages associated with this buffer
+ * are clean. This is used for delayed writes where the data is
+ * going to go to disk eventually without additional VM intervention.
+ */
+void
+vfs_clean_pages(struct buf * bp)
+{
+ int i;
+
+ if (bp->b_flags & B_VMIO) {
+ vm_ooffset_t foff;
+ int iocount = bp->b_bufsize;
+
+ if (bp->b_vp->v_type == VBLK)
+ foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno;
+ else
+ foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno;
+
+ for (i = 0; i < bp->b_npages; i++) {
+ vm_page_t m = bp->b_pages[i];
+ int resid = IDX_TO_OFF(m->pindex + 1) - foff;
+
+ if (resid > iocount)
+ resid = iocount;
+ if (resid > 0) {
+ vm_page_set_validclean(m,
+ ((vm_offset_t) foff & PAGE_MASK), resid);
+ }
+ foff += resid;
+ iocount -= resid;
+ }
+ }
+}
+
+/*
+ * Clear a buffer: for VMIO buffers, zero only the parts that are not
+ * already valid; otherwise zero the whole buffer.
+ */
+void
+vfs_bio_clrbuf(struct buf *bp)
+{
+	int i;
+
+	if (bp->b_flags & B_VMIO) {
+		if ((bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) {
+			int mask;
+
+			mask = 0;
+			for (i = 0; i < bp->b_bufsize; i += DEV_BSIZE)
+				mask |= (1 << (i / DEV_BSIZE));
+			if (bp->b_pages[0]->valid != mask) {
+				bzero(bp->b_data, bp->b_bufsize);
+			}
+			bp->b_pages[0]->valid = mask;
+			bp->b_resid = 0;
+			return;
+		}
+		for (i = 0; i < bp->b_npages; i++) {
+			if (bp->b_pages[i]->valid == VM_PAGE_BITS_ALL)
+				continue;
+			if (bp->b_pages[i]->valid == 0) {
+				if ((bp->b_pages[i]->flags & PG_ZERO) == 0) {
+					bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE);
+				}
+			} else {
+				int j;
+
+				for (j = 0; j < PAGE_SIZE / DEV_BSIZE; j++) {
+					if ((bp->b_pages[i]->valid & (1 << j)) == 0)
+						bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE);
+				}
+			}
+			/* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */
+		}
+		bp->b_resid = 0;
+	} else {
+		clrbuf(bp);
+	}
+}
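The single-page branch above builds a validity mask with one bit per
DEV_BSIZE chunk of the buffer. A standalone check of that computation
(a 512-byte DEV_BSIZE is assumed):

#include <stdio.h>

#define DEV_BSIZE 512			/* assumed sector size */

int
main(void)
{
	int b_bufsize = 3072;		/* sub-page buffer: 6 sectors */
	int mask = 0;

	for (int i = 0; i < b_bufsize; i += DEV_BSIZE)
		mask |= 1 << (i / DEV_BSIZE);
	printf("valid mask = 0x%x\n", mask);	/* 0x3f: six 512-byte
						 * chunks marked valid */
	return (0);
}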
+
+/*
+ * vm_hold_load_pages and vm_hold_free_pages get pages into
+ * a buffer's address space.  The pages are anonymous and are
+ * not associated with a file object.
+ */
+void
+vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
+{
+ vm_offset_t pg;
+ vm_page_t p;
+ int index;
+
+ to = round_page(to);
+ from = round_page(from);
+ index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
+
+ for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
+
+tryagain:
+
+ p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT),
+ VM_ALLOC_NORMAL);
+ if (!p) {
+ VM_WAIT;
+ goto tryagain;
+ }
+ vm_page_wire(p);
+ pmap_kenter(pg, VM_PAGE_TO_PHYS(p));
+ bp->b_pages[index] = p;
+ PAGE_WAKEUP(p);
+ }
+ bp->b_npages = to >> PAGE_SHIFT;
+}
+
+void
+vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to)
+{
+ vm_offset_t pg;
+ vm_page_t p;
+ int index;
+
+ from = round_page(from);
+ to = round_page(to);
+ index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT;
+
+ for (pg = from; pg < to; pg += PAGE_SIZE, index++) {
+ p = bp->b_pages[index];
+ if (p && (index < bp->b_npages)) {
+ if (p->busy) {
+ printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n",
+ bp->b_blkno, bp->b_lblkno);
+ }
+ bp->b_pages[index] = NULL;
+ pmap_kremove(pg);
+ vm_page_unwire(p);
+ vm_page_free(p);
}
- splx(s);
- printf("%s: total-%d", bname[i], count);
- for (j = 0; j <= MAXBSIZE/CLBYTES; j++)
- if (counts[j] != 0)
- printf(", %d-%d", j * CLBYTES, counts[j]);
- printf("\n");
}
+ bp->b_npages = from >> PAGE_SHIFT;
}
-#endif /* DIAGNOSTIC */
diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c
index c20966b..ef0f222 100644
--- a/sys/kern/vfs_cache.c
+++ b/sys/kern/vfs_cache.c
@@ -33,13 +33,14 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * from: vfs_cache.c,v 1.11 1995/03/12 02:01:20 phk Exp $
- *
* @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95
+ * $Id: vfs_cache.c,v 1.23 1997/02/22 09:39:31 peter Exp $
*/
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/sysctl.h>
#include <sys/time.h>
#include <sys/mount.h>
#include <sys/vnode.h>
@@ -47,6 +48,8 @@
#include <sys/errno.h>
#include <sys/malloc.h>
+#define MAXVNODEUSE 32
+
/*
* Name caching works as follows:
*
@@ -72,14 +75,24 @@
 * Structures associated with name caching.
*/
#define NCHHASH(dvp, cnp) \
- (&nchashtbl[((dvp)->v_id + (cnp)->cn_hash) & nchash])
-LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */
-u_long nchash; /* size of hash table - 1 */
-long numcache; /* number of cache entries allocated */
-TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */
+ (&nchashtbl[((dvp)->v_id + (cnp)->cn_hash) % nchash])
+static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */
+static u_long nchash; /* size of hash table */
+static u_long numcache; /* number of cache entries allocated */
+static TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */
struct nchstats nchstats; /* cache effectiveness statistics */
-int doingcache = 1; /* 1 => enable the cache */
+static int doingcache = 1; /* 1 => enable the cache */
+SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, "");
+
+#ifdef NCH_STATISTICS
+u_long nchnbr;
+#define NCHNBR(ncp) (ncp)->nc_nbr = ++nchnbr;
+#define NCHHIT(ncp) (ncp)->nc_hits++
+#else
+#define NCHNBR(ncp)
+#define NCHHIT(ncp)
+#endif
/*
* Delete an entry from its hash list and move it to the front
@@ -100,13 +113,14 @@ int doingcache = 1; /* 1 => enable the cache */
if (ncp->nc_lru.tqe_next != 0) { \
TAILQ_REMOVE(&nclruhead, ncp, nc_lru); \
TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); \
+ NCHNBR(ncp); \
} \
}
/*
- * Lookup an entry in the cache
+ * Lookup an entry in the cache
*
- * We don't do this if the segment name is long, simply so the cache
+ * We don't do this if the segment name is long, simply so the cache
* can avoid holding long names (which would either waste space, or
* add greatly to the complexity).
*
@@ -160,18 +174,22 @@ cache_lookup(dvp, vpp, cnp)
return (0);
}
+ NCHHIT(ncp);
+
/* We don't want to have an entry, so dump it */
if ((cnp->cn_flags & MAKEENTRY) == 0) {
nchstats.ncs_badhits++;
PURGE(ncp);
return (0);
- }
+ }
/* We found a "positive" match, return the vnode */
if (ncp->nc_vp) {
nchstats.ncs_goodhits++;
TOUCH(ncp);
*vpp = ncp->nc_vp;
+ if ((*vpp)->v_usage < MAXVNODEUSE)
+ (*vpp)->v_usage++;
return (-1);
}
@@ -207,10 +225,10 @@ cache_enter(dvp, vp, cnp)
if (!doingcache)
return;
-#ifdef DIAGNOSTIC
- if (cnp->cn_namelen > NCHNAMLEN)
- panic("cache_enter: name too long");
-#endif
+ if (cnp->cn_namelen > NCHNAMLEN) {
+		printf("cache_enter: name too long\n");
+ return;
+ }
/*
* We allocate a new entry if we are less than the maximum
@@ -244,9 +262,11 @@ cache_enter(dvp, vp, cnp)
* otherwise unused.
*/
ncp->nc_vp = vp;
- if (vp)
+ if (vp) {
ncp->nc_vpid = vp->v_id;
- else
+ if (vp->v_usage < MAXVNODEUSE)
+ ++vp->v_usage;
+ } else
ncp->nc_vpid = cnp->cn_flags & ISWHITEOUT;
ncp->nc_dvp = dvp;
ncp->nc_dvpid = dvp->v_id;
@@ -265,14 +285,14 @@ nchinit()
{
TAILQ_INIT(&nclruhead);
- nchashtbl = hashinit(desiredvnodes, M_CACHE, &nchash);
+ nchashtbl = phashinit(desiredvnodes, M_CACHE, &nchash);
}
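The switch from hashinit() to phashinit() here is why NCHHASH above now
reduces with "%" instead of "&": the old table was a power of two addressed
through a size-minus-one mask (the old comment called nchash "size of hash
table - 1"), while the new one is a plain element count, used with modulo.
A standalone illustration of why masking cannot be used with such a count:

#include <stdio.h>

int
main(void)
{
	unsigned nchash = 127;		/* e.g. a prime bucket count */

	for (unsigned h = 120; h < 130; h++)
		printf("h=%u  %%->%u  &->%u\n", h, h % nchash, h & nchash);
	/* "&" leaves buckets unused and even yields index 127,
	 * out of range for a table with slots 0..126 */
	return (0);
}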
/*
- * Invalidate a all entries to particular vnode.
- *
- * We actually just increment the v_id, that will do it. The entries will
- * be purged by lookup as they get found. If the v_id wraps around, we
+ * Invalidate all entries to particular vnode.
+ *
+ * We actually just increment the v_id, that will do it. The stale entries
+ * will be purged by lookup as they get found. If the v_id wraps around, we
* need to ditch the entire cache, to avoid confusion. No valid vnode will
* ever have (v_id == 0).
*/
@@ -282,11 +302,12 @@ cache_purge(vp)
{
struct namecache *ncp;
struct nchashhead *ncpp;
+ static u_long nextvnodeid;
vp->v_id = ++nextvnodeid;
if (nextvnodeid != 0)
return;
- for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
+ for (ncpp = &nchashtbl[nchash - 1]; ncpp >= nchashtbl; ncpp--) {
while (ncp = ncpp->lh_first)
PURGE(ncp);
}
@@ -297,7 +318,7 @@ cache_purge(vp)
* Flush all entries referencing a particular filesystem.
*
* Since we need to check it anyway, we will flush all the invalid
- * entriess at the same time.
+ * entries at the same time.
*/
void
cache_purgevfs(mp)
@@ -307,7 +328,7 @@ cache_purgevfs(mp)
struct namecache *ncp, *nnp;
/* Scan hash tables for applicable entries */
- for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) {
+ for (ncpp = &nchashtbl[nchash - 1]; ncpp >= nchashtbl; ncpp--) {
for (ncp = ncpp->lh_first; ncp != 0; ncp = nnp) {
nnp = ncp->nc_hash.le_next;
if (ncp->nc_dvpid != ncp->nc_dvp->v_id ||
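Worth noting before moving on: the purge scheme in the new vfs_cache.c is lazy. cache_purge() does not walk the hash chains; it just gives the vnode a fresh v_id, and cache_lookup() discards any entry whose recorded id no longer matches. Only on the rare wrap of the id counter is the whole table flushed. A small user-space sketch of the idea, assuming illustrative types (vnode_s, entry):

struct vnode_s {
        unsigned long   v_id;           /* bumped on every purge */
};

struct entry {
        unsigned long   e_vpid;         /* v_id recorded at cache_enter time */
        struct vnode_s  *e_vp;
};

/* Invalidate every cached entry for vp in O(1): just change its id. */
static void
purge(struct vnode_s *vp, unsigned long *nextid)
{
        vp->v_id = ++(*nextid);
        /* on wrap to 0 the real code ditches the entire cache instead */
}

/* A hit found by lookup is trusted only while the recorded id matches. */
static int
entry_valid(const struct entry *e)
{
        return (e->e_vpid == e->e_vp->v_id);
}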
diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c
index e01d24f..b00da1f 100644
--- a/sys/kern/vfs_cluster.c
+++ b/sys/kern/vfs_cluster.c
@@ -1,6 +1,8 @@
/*-
* Copyright (c) 1993
* The Regents of the University of California. All rights reserved.
+ * Modifications/enhancements:
+ * Copyright (c) 1995 John S. Dyson. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -30,233 +32,281 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95
+ * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94
+ * $Id: vfs_cluster.c,v 1.42 1997/02/22 09:39:31 peter Exp $
*/
#include <sys/param.h>
+#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/buf.h>
#include <sys/vnode.h>
#include <sys/mount.h>
-#include <sys/trace.h>
#include <sys/malloc.h>
#include <sys/resourcevar.h>
-#include <libkern/libkern.h>
+#include <sys/vmmeter.h>
+#include <miscfs/specfs/specdev.h>
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_prot.h>
+#include <vm/vm_object.h>
+#include <vm/vm_page.h>
+
+#if defined(CLUSTERDEBUG)
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
+static int rcluster = 0;
+SYSCTL_INT(_debug, 14, rcluster, CTLFLAG_RW, &rcluster, 0, "");
+#endif
-/*
- * Local declarations
- */
-struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t,
- daddr_t, long, int));
-struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *,
- daddr_t, daddr_t, long, int, long));
-void cluster_wbuild __P((struct vnode *, struct buf *, long,
- daddr_t, int, daddr_t));
-struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *));
+#ifdef notyet_block_reallocation_enabled
+#ifdef DEBUG
+#include <sys/sysctl.h>
+#include <sys/kernel.h>
-#ifdef DIAGNOSTIC
-/*
- * Set to 1 if reads of block zero should cause readahead to be done.
- * Set to 0 treats a read of block zero as a non-sequential read.
- *
- * Setting to one assumes that most reads of block zero of files are due to
- * sequential passes over the files (e.g. cat, sum) where additional blocks
- * will soon be needed. Setting to zero assumes that the majority are
- * surgical strikes to get particular info (e.g. size, file) where readahead
- * blocks will not be used and, in fact, push out other potentially useful
- * blocks from the cache. The former seems intuitive, but some quick tests
- * showed that the latter performed better from a system-wide point of view.
- */
-int doclusterraz = 0;
-#define ISSEQREAD(vp, blk) \
- (((blk) != 0 || doclusterraz) && \
- ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
+static int doreallocblks = 0;
+SYSCTL_INT(_debug, 13, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, "");
#else
-#define ISSEQREAD(vp, blk) \
- ((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr))
+#define doreallocblks 0
#endif
+#endif /* notyet_block_reallocation_enabled */
+
+#ifdef notyet_block_reallocation_enabled
+static struct cluster_save *
+ cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp));
+#endif
+static struct buf *
+ cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn,
+ daddr_t blkno, long size, int run, struct buf *fbp));
+
+extern vm_page_t bogus_page;
/*
- * This replaces bread. If this is a bread at the beginning of a file and
- * lastr is 0, we assume this is the first read and we'll read up to two
- * blocks if they are sequential. After that, we'll do regular read ahead
- * in clustered chunks.
- *
- * There are 4 or 5 cases depending on how you count:
- * Desired block is in the cache:
- * 1 Not sequential access (0 I/Os).
- * 2 Access is sequential, do read-ahead (1 ASYNC).
- * Desired block is not in cache:
- * 3 Not sequential access (1 SYNC).
- * 4 Sequential access, next block is contiguous (1 SYNC).
- * 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC)
- *
- * There are potentially two buffers that require I/O.
- * bp is the block requested.
- * rbp is the read-ahead block.
- * If either is NULL, then you don't have to do the I/O.
+ * Maximum number of blocks for read-ahead.
*/
-cluster_read(vp, filesize, lblkno, size, cred, bpp)
+#define MAXRA 32
+
+/*
+ * This replaces bread.
+ */
+int
+cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp)
struct vnode *vp;
u_quad_t filesize;
daddr_t lblkno;
long size;
struct ucred *cred;
+ long totread;
+ int seqcount;
struct buf **bpp;
{
- struct buf *bp, *rbp;
- daddr_t blkno, ioblkno;
- long flags;
- int error, num_ra, alreadyincore;
-
-#ifdef DIAGNOSTIC
- if (size == 0)
- panic("cluster_read: size = 0");
-#endif
+ struct buf *bp, *rbp, *reqbp;
+ daddr_t blkno, rablkno, origblkno;
+ int error, num_ra;
+ int i;
+ int maxra, racluster;
+ long origtotread;
error = 0;
- flags = B_READ;
- *bpp = bp = getblk(vp, lblkno, size, 0, 0);
- if (bp->b_flags & B_CACHE) {
- /*
- * Desired block is in cache; do any readahead ASYNC.
- * Case 1, 2.
- */
- trace(TR_BREADHIT, pack(vp, size), lblkno);
- flags |= B_ASYNC;
- ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1);
- alreadyincore = incore(vp, ioblkno) != NULL;
- bp = NULL;
- } else {
- /* Block wasn't in cache, case 3, 4, 5. */
- trace(TR_BREADMISS, pack(vp, size), lblkno);
- bp->b_flags |= B_READ;
- ioblkno = lblkno;
- alreadyincore = 0;
- curproc->p_stats->p_ru.ru_inblock++; /* XXX */
- }
+
/*
- * XXX
- * Replace 1 with a window size based on some permutation of
- * maxcontig and rot_delay. This will let you figure out how
- * many blocks you should read-ahead (case 2, 4, 5).
- *
- * If the access isn't sequential, reset the window to 1.
- * Note that a read to the same block is considered sequential.
- * This catches the case where the file is being read sequentially,
- * but at smaller than the filesystem block size.
+ * Try to limit the amount of read-ahead by a few
+ * ad-hoc parameters. This needs work!!!
*/
- rbp = NULL;
- if (!ISSEQREAD(vp, lblkno)) {
- vp->v_ralen = 0;
- vp->v_maxra = lblkno;
- } else if ((ioblkno + 1) * size <= filesize && !alreadyincore &&
- !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) &&
- blkno != -1) {
- /*
- * Reading sequentially, and the next block is not in the
- * cache. We are going to try reading ahead.
- */
- if (num_ra) {
- /*
- * If our desired readahead block had been read
- * in a previous readahead but is no longer in
- * core, then we may be reading ahead too far
- * or are not using our readahead very rapidly.
- * In this case we scale back the window.
- */
- if (!alreadyincore && ioblkno <= vp->v_maxra)
- vp->v_ralen = max(vp->v_ralen >> 1, 1);
- /*
- * There are more sequential blocks than our current
- * window allows, scale up. Ideally we want to get
- * in sync with the filesystem maxcontig value.
- */
- else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr)
- vp->v_ralen = vp->v_ralen ?
- min(num_ra, vp->v_ralen << 1) : 1;
+ racluster = MAXPHYS/size;
+ maxra = 2 * racluster + (totread / size);
+ if (maxra > MAXRA)
+ maxra = MAXRA;
+ if (maxra > nbuf/8)
+ maxra = nbuf/8;
- if (num_ra > vp->v_ralen)
- num_ra = vp->v_ralen;
- }
+ /*
+ * get the requested block
+ */
+ *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0);
+ origblkno = lblkno;
+ origtotread = totread;
- if (num_ra) /* case 2, 4 */
- rbp = cluster_rbuild(vp, filesize,
- bp, ioblkno, blkno, size, num_ra, flags);
- else if (ioblkno == lblkno) {
- bp->b_blkno = blkno;
- /* Case 5: check how many blocks to read ahead */
- ++ioblkno;
- if ((ioblkno + 1) * size > filesize ||
- incore(vp, ioblkno) || (error = VOP_BMAP(vp,
- ioblkno, NULL, &blkno, &num_ra)) || blkno == -1)
- goto skip_readahead;
+ /*
+ * if it is in the cache, then check to see if the reads have been
+ * sequential. If they have, then try some read-ahead, otherwise
+ * back off on prospective read-aheads.
+ */
+ if (bp->b_flags & B_CACHE) {
+ if (!seqcount) {
+ return 0;
+ } else if ((bp->b_flags & B_RAM) == 0) {
+ return 0;
+ } else {
+ int s;
+ struct buf *tbp;
+ bp->b_flags &= ~B_RAM;
/*
- * Adjust readahead as above.
- * Don't check alreadyincore, we know it is 0 from
- * the previous conditional.
+ * We do the spl here so that there is no window
+ * between the incore and the b_usecount increment
+ * below. We opt to keep the spl out of the loop
+ * for efficiency.
*/
- if (num_ra) {
- if (ioblkno <= vp->v_maxra)
- vp->v_ralen = max(vp->v_ralen >> 1, 1);
- else if (num_ra > vp->v_ralen &&
- lblkno != vp->v_lastr)
- vp->v_ralen = vp->v_ralen ?
- min(num_ra,vp->v_ralen<<1) : 1;
- if (num_ra > vp->v_ralen)
- num_ra = vp->v_ralen;
+ s = splbio();
+ for (i = 1; i < maxra; i++) {
+
+ if (!(tbp = incore(vp, lblkno+i))) {
+ break;
+ }
+
+ /*
+ * Set another read-ahead mark so we know to check
+ * again.
+ */
+ if (((i % racluster) == (racluster - 1)) ||
+ (i == (maxra - 1)))
+ tbp->b_flags |= B_RAM;
+
+#if 0
+ if (tbp->b_usecount == 0) {
+ /*
+ * Make sure that the soon-to-be used readaheads
+ * are still there. The getblk/bqrelse pair will
+ * boost the priority of the buffer.
+ */
+ tbp = getblk(vp, lblkno+i, size, 0, 0);
+ bqrelse(tbp);
+ }
+#endif
}
- flags |= B_ASYNC;
- if (num_ra)
- rbp = cluster_rbuild(vp, filesize,
- NULL, ioblkno, blkno, size, num_ra, flags);
- else {
- rbp = getblk(vp, ioblkno, size, 0, 0);
- rbp->b_flags |= flags;
- rbp->b_blkno = blkno;
+ splx(s);
+ if (i >= maxra) {
+ return 0;
}
+ lblkno += i;
+ }
+ reqbp = bp = NULL;
+ } else {
+ u_quad_t firstread;
+ firstread = (u_quad_t) lblkno * size;
+ if (firstread + totread > filesize)
+ totread = filesize - firstread;
+ if (totread > size) {
+ int nblks = 0;
+ int ncontigafter;
+ while (totread > 0) {
+ nblks++;
+ totread -= size;
+ }
+ if (nblks == 1)
+ goto single_block_read;
+ if (nblks > racluster)
+ nblks = racluster;
+
+ error = VOP_BMAP(vp, lblkno, NULL,
+ &blkno, &ncontigafter, NULL);
+ if (error)
+ goto single_block_read;
+ if (blkno == -1)
+ goto single_block_read;
+ if (ncontigafter == 0)
+ goto single_block_read;
+ if (ncontigafter + 1 < nblks)
+ nblks = ncontigafter + 1;
+
+ bp = cluster_rbuild(vp, filesize, lblkno,
+ blkno, size, nblks, bp);
+ lblkno += nblks;
} else {
- /* case 2; read ahead single block */
- rbp = getblk(vp, ioblkno, size, 0, 0);
- rbp->b_flags |= flags;
- rbp->b_blkno = blkno;
+single_block_read:
+ /*
+ * if it isn't in the cache, then get a chunk from
+ * disk if sequential, otherwise just get the block.
+ */
+ bp->b_flags |= B_READ | B_RAM;
+ lblkno += 1;
}
+ }
- if (rbp == bp) /* case 4 */
- rbp = NULL;
- else if (rbp) { /* case 2, 5 */
- trace(TR_BREADMISSRA,
- pack(vp, (num_ra + 1) * size), ioblkno);
- curproc->p_stats->p_ru.ru_inblock++; /* XXX */
+ /*
+ * if we have been doing sequential I/O, then do some read-ahead
+ */
+ rbp = NULL;
+ /* if (seqcount && (lblkno < (origblkno + maxra))) { */
+ if (seqcount && (lblkno < (origblkno + seqcount))) {
+ /*
+ * we now build the read-ahead buffer if it is desirable.
+ */
+ if (((u_quad_t)(lblkno + 1) * size) <= filesize &&
+ !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) &&
+ blkno != -1) {
+ int nblksread;
+ int ntoread = num_ra + 1;
+ nblksread = (origtotread + size - 1) / size;
+ if (seqcount < nblksread)
+ seqcount = nblksread;
+ if (seqcount < ntoread)
+ ntoread = seqcount;
+ if (num_ra) {
+ rbp = cluster_rbuild(vp, filesize, lblkno,
+ blkno, size, ntoread, NULL);
+ } else {
+ rbp = getblk(vp, lblkno, size, 0, 0);
+ rbp->b_flags |= B_READ | B_ASYNC | B_RAM;
+ rbp->b_blkno = blkno;
+ }
}
}
- /* XXX Kirk, do we need to make sure the bp has creds? */
-skip_readahead:
- if (bp)
- if (bp->b_flags & (B_DONE | B_DELWRI))
+ /*
+ * handle the synchronous read
+ */
+ if (bp) {
+ if (bp->b_flags & (B_DONE | B_DELWRI)) {
panic("cluster_read: DONE bp");
- else
+ } else {
+#if defined(CLUSTERDEBUG)
+ if (rcluster)
+ printf("S(%d,%d,%d) ",
+ bp->b_lblkno, bp->b_bcount, seqcount);
+#endif
+ if ((bp->b_flags & B_CLUSTER) == 0)
+ vfs_busy_pages(bp, 0);
error = VOP_STRATEGY(bp);
-
- if (rbp)
- if (error || rbp->b_flags & (B_DONE | B_DELWRI)) {
+ curproc->p_stats->p_ru.ru_inblock++;
+ }
+ }
+ /*
+ * and if we have read-aheads, do them too
+ */
+ if (rbp) {
+ if (error) {
rbp->b_flags &= ~(B_ASYNC | B_READ);
brelse(rbp);
- } else
- (void) VOP_STRATEGY(rbp);
+ } else if (rbp->b_flags & B_CACHE) {
+ rbp->b_flags &= ~(B_ASYNC | B_READ);
+ bqrelse(rbp);
+ } else {
+#if defined(CLUSTERDEBUG)
+ if (rcluster) {
+ if (bp)
+ printf("A+(%d,%d,%d,%d) ",
+ rbp->b_lblkno, rbp->b_bcount,
+ rbp->b_lblkno - origblkno,
+ seqcount);
+ else
+ printf("A(%d,%d,%d,%d) ",
+ rbp->b_lblkno, rbp->b_bcount,
+ rbp->b_lblkno - origblkno,
+ seqcount);
+ }
+#endif
- /*
- * Recalculate our maximum readahead
- */
- if (rbp == NULL)
- rbp = bp;
- if (rbp)
- vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1;
-
- if (bp)
- return(biowait(bp));
- return(error);
+ if ((rbp->b_flags & B_CLUSTER) == 0)
+ vfs_busy_pages(rbp, 0);
+ (void) VOP_STRATEGY(rbp);
+ curproc->p_stats->p_ru.ru_inblock++;
+ }
+ }
+ if (reqbp)
+ return (biowait(reqbp));
+ else
+ return (error);
}
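The sizing logic at the top of the new cluster_read() clamps the read-ahead window three ways: to two full clusters plus the caller's expected transfer (totread), to the compile-time MAXRA cap, and to an eighth of the buffer pool. The same arithmetic as a standalone sketch (the wrapper function and parameter passing are illustrative; the expressions mirror the code above):

#define MAXRA   32                      /* maximum read-ahead, in blocks */

static int
readahead_limit(long size, long totread, int nbuf, long maxphys)
{
        int racluster = maxphys / size; /* blocks in one full cluster */
        int maxra = 2 * racluster + totread / size;

        if (maxra > MAXRA)
                maxra = MAXRA;
        if (maxra > nbuf / 8)           /* don't monopolize the buffer pool */
                maxra = nbuf / 8;
        return (maxra);
}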
/*
@@ -264,145 +314,139 @@ skip_readahead:
* read ahead. We will read as many blocks as possible sequentially
* and then parcel them up into logical blocks in the buffer hash table.
*/
-struct buf *
-cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags)
+static struct buf *
+cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp)
struct vnode *vp;
u_quad_t filesize;
- struct buf *bp;
daddr_t lbn;
daddr_t blkno;
long size;
int run;
- long flags;
+ struct buf *fbp;
{
- struct cluster_save *b_save;
- struct buf *tbp;
+ struct buf *bp, *tbp;
daddr_t bn;
- int i, inc;
+ int i, inc, j;
#ifdef DIAGNOSTIC
if (size != vp->v_mount->mnt_stat.f_iosize)
panic("cluster_rbuild: size %d != filesize %d\n",
- size, vp->v_mount->mnt_stat.f_iosize);
+ size, vp->v_mount->mnt_stat.f_iosize);
#endif
- if (size * (lbn + run + 1) > filesize)
+ /*
+ * avoid a division
+ */
+ while ((u_quad_t) size * (lbn + run) > filesize) {
--run;
- if (run == 0) {
- if (!bp) {
- bp = getblk(vp, lbn, size, 0, 0);
- bp->b_blkno = blkno;
- bp->b_flags |= flags;
- }
- return(bp);
}
- bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1);
- if (bp->b_flags & (B_DONE | B_DELWRI))
- return (bp);
+ if (fbp) {
+ tbp = fbp;
+ tbp->b_flags |= B_READ;
+ } else {
+ tbp = getblk(vp, lbn, size, 0, 0);
+ if (tbp->b_flags & B_CACHE)
+ return tbp;
+ tbp->b_flags |= B_ASYNC | B_READ | B_RAM;
+ }
+
+ tbp->b_blkno = blkno;
+ if ((tbp->b_flags & B_MALLOC) ||
+ ((tbp->b_flags & B_VMIO) == 0) || (run <= 1))
+ return tbp;
- b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save),
- M_SEGMENT, M_WAITOK);
- b_save->bs_bufsize = b_save->bs_bcount = size;
- b_save->bs_nchildren = 0;
- b_save->bs_children = (struct buf **)(b_save + 1);
- b_save->bs_saveaddr = bp->b_saveaddr;
- bp->b_saveaddr = (caddr_t) b_save;
+ bp = trypbuf();
+ if (bp == 0)
+ return tbp;
+
+ (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
+ bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO;
+ bp->b_iodone = cluster_callback;
+ bp->b_blkno = blkno;
+ bp->b_lblkno = lbn;
+ pbgetvp(vp, bp);
+
+ TAILQ_INIT(&bp->b_cluster.cluster_head);
+
+ bp->b_bcount = 0;
+ bp->b_bufsize = 0;
+ bp->b_npages = 0;
inc = btodb(size);
- for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) {
- /*
- * A component of the cluster is already in core,
- * terminate the cluster early.
- */
- if (incore(vp, lbn + i))
- break;
- tbp = getblk(vp, lbn + i, 0, 0, 0);
- /*
- * getblk may return some memory in the buffer if there were
- * no empty buffers to shed it to. If there is currently
- * memory in the buffer, we move it down size bytes to make
- * room for the valid pages that cluster_callback will insert.
- * We do this now so we don't have to do it at interrupt time
- * in the callback routine.
- */
- if (tbp->b_bufsize != 0) {
- caddr_t bdata = (char *)tbp->b_data;
+ for (bn = blkno, i = 0; i < run; ++i, bn += inc) {
+ if (i != 0) {
+ if ((bp->b_npages * PAGE_SIZE) +
+ round_page(size) > MAXPHYS)
+ break;
- /*
- * No room in the buffer to add another page,
- * terminate the cluster early.
- */
- if (tbp->b_bufsize + size > MAXBSIZE) {
-#ifdef DIAGNOSTIC
- if (tbp->b_bufsize != MAXBSIZE)
- panic("cluster_rbuild: too much memory");
-#endif
- brelse(tbp);
+ if (incore(vp, lbn + i))
break;
+
+ tbp = getblk(vp, lbn + i, size, 0, 0);
+
+ if ((tbp->b_flags & B_CACHE) ||
+ (tbp->b_flags & B_VMIO) == 0) {
+ bqrelse(tbp);
+ break;
+ }
+
+ for (j = 0; j < tbp->b_npages; j++) {
+ if (tbp->b_pages[j]->valid) {
+ break;
+ }
}
- if (tbp->b_bufsize > size) {
+
+ if (j != tbp->b_npages) {
/*
- * XXX if the source and destination regions
- * overlap we have to copy backward to avoid
- * clobbering any valid pages (i.e. pagemove
- * implementations typically can't handle
- * overlap).
+ * force buffer to be re-constituted later
*/
- bdata += tbp->b_bufsize;
- while (bdata > (char *)tbp->b_data) {
- bdata -= CLBYTES;
- pagemove(bdata, bdata + size, CLBYTES);
- }
- } else
- pagemove(bdata, bdata + size, tbp->b_bufsize);
+ tbp->b_flags |= B_RELBUF;
+ brelse(tbp);
+ break;
+ }
+
+ if ((fbp && (i == 1)) || (i == (run - 1)))
+ tbp->b_flags |= B_RAM;
+ tbp->b_flags |= B_READ | B_ASYNC;
+ if (tbp->b_blkno == tbp->b_lblkno) {
+ tbp->b_blkno = bn;
+ } else if (tbp->b_blkno != bn) {
+ brelse(tbp);
+ break;
+ }
}
- tbp->b_blkno = bn;
- tbp->b_flags |= flags | B_READ | B_ASYNC;
- ++b_save->bs_nchildren;
- b_save->bs_children[i - 1] = tbp;
- }
- /*
- * The cluster may have been terminated early, adjust the cluster
- * buffer size accordingly. If no cluster could be formed,
- * deallocate the cluster save info.
- */
- if (i <= run) {
- if (i == 1) {
- bp->b_saveaddr = b_save->bs_saveaddr;
- bp->b_flags &= ~B_CALL;
- bp->b_iodone = NULL;
- free(b_save, M_SEGMENT);
+ TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
+ tbp, b_cluster.cluster_entry);
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ vm_page_t m;
+ m = tbp->b_pages[j];
+ ++m->busy;
+ ++m->object->paging_in_progress;
+ if ((bp->b_npages == 0) ||
+ (bp->b_pages[bp->b_npages-1] != m)) {
+ bp->b_pages[bp->b_npages] = m;
+ bp->b_npages++;
+ }
+ if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL)
+ tbp->b_pages[j] = bogus_page;
}
- allocbuf(bp, size * i);
+ bp->b_bcount += tbp->b_bcount;
+ bp->b_bufsize += tbp->b_bufsize;
}
- return(bp);
-}
-/*
- * Either get a new buffer or grow the existing one.
- */
-struct buf *
-cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run)
- struct vnode *vp;
- struct buf *bp;
- long flags;
- daddr_t blkno;
- daddr_t lblkno;
- long size;
- int run;
-{
- if (!bp) {
- bp = getblk(vp, lblkno, size, 0, 0);
- if (bp->b_flags & (B_DONE | B_DELWRI)) {
- bp->b_blkno = blkno;
- return(bp);
- }
+ for (j = 0; j < bp->b_npages; j++) {
+ if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) ==
+ VM_PAGE_BITS_ALL)
+ bp->b_pages[j] = bogus_page;
}
- allocbuf(bp, run * size);
- bp->b_blkno = blkno;
- bp->b_iodone = cluster_callback;
- bp->b_flags |= flags | B_CALL;
- return(bp);
+ if (bp->b_bufsize > bp->b_kvasize)
+ panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)\n",
+ bp->b_bufsize, bp->b_kvasize);
+ bp->b_kvasize = bp->b_bufsize;
+
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *)bp->b_pages, bp->b_npages);
+ return (bp);
}
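One small point in cluster_rbuild() above: the run length is first trimmed so the cluster never extends past end of file, and the "avoid a division" comment refers to doing this with a multiply-and-compare loop instead of a 64-bit divide. The equivalent trimming as a standalone sketch:

/* Trim 'run' so that blocks [lbn, lbn + run) all lie within the file. */
static int
trim_run(unsigned long long filesize, long size, long lbn, int run)
{
        while ((unsigned long long)size * (lbn + run) > filesize)
                --run;
        return (run);
}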
/*
@@ -415,10 +459,7 @@ void
cluster_callback(bp)
struct buf *bp;
{
- struct cluster_save *b_save;
- struct buf **bpp, *tbp;
- long bsize;
- caddr_t cp;
+ struct buf *nbp, *tbp;
int error = 0;
/*
@@ -427,46 +468,21 @@ cluster_callback(bp)
if (bp->b_flags & B_ERROR)
error = bp->b_error;
- b_save = (struct cluster_save *)(bp->b_saveaddr);
- bp->b_saveaddr = b_save->bs_saveaddr;
-
- bsize = b_save->bs_bufsize;
- cp = (char *)bp->b_data + bsize;
+ pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages);
/*
* Move memory from the large cluster buffer into the component
* buffers and mark IO as done on these.
*/
- for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) {
- tbp = *bpp;
- pagemove(cp, tbp->b_data, bsize);
- tbp->b_bufsize += bsize;
- tbp->b_bcount = bsize;
+ for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head);
+ tbp; tbp = nbp) {
+ nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry);
if (error) {
tbp->b_flags |= B_ERROR;
tbp->b_error = error;
}
biodone(tbp);
- bp->b_bufsize -= bsize;
- cp += bsize;
- }
- /*
- * If there was excess memory in the cluster buffer,
- * slide it up adjacent to the remaining valid data.
- */
- if (bp->b_bufsize != bsize) {
- if (bp->b_bufsize < bsize)
- panic("cluster_callback: too little memory");
- pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize);
- }
- bp->b_bcount = bsize;
- bp->b_iodone = NULL;
- free(b_save, M_SEGMENT);
- if (bp->b_flags & B_ASYNC)
- brelse(bp);
- else {
- bp->b_flags &= ~B_WANTED;
- wakeup((caddr_t)bp);
}
+ relpbuf(bp);
}
/*
@@ -481,38 +497,53 @@ cluster_callback(bp)
*/
void
cluster_write(bp, filesize)
- struct buf *bp;
+ struct buf *bp;
u_quad_t filesize;
{
- struct vnode *vp;
- daddr_t lbn;
- int maxclen, cursize;
+ struct vnode *vp;
+ daddr_t lbn;
+ int maxclen, cursize;
+ int lblocksize;
+ int async;
- vp = bp->b_vp;
- lbn = bp->b_lblkno;
+ vp = bp->b_vp;
+ async = vp->v_mount->mnt_flag & MNT_ASYNC;
+ lblocksize = vp->v_mount->mnt_stat.f_iosize;
+ lbn = bp->b_lblkno;
/* Initialize vnode to beginning of file. */
if (lbn == 0)
vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0;
- if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
- (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) {
- maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1;
+ if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 ||
+ (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) {
+ maxclen = MAXPHYS / lblocksize - 1;
if (vp->v_clen != 0) {
/*
* Next block is not sequential.
*
* If we are not writing at end of file, the process
- * seeked to another point in the file since its
- * last write, or we have reached our maximum
- * cluster size, then push the previous cluster.
- * Otherwise try reallocating to make it sequential.
+ * seeked to another point in the file since its last
+ * write, or we have reached our maximum cluster size,
+ * then push the previous cluster. Otherwise try
+ * reallocating to make it sequential.
*/
cursize = vp->v_lastw - vp->v_cstart + 1;
- if ((lbn + 1) * bp->b_bcount != filesize ||
+#ifndef notyet_block_reallocation_enabled
+ if (((u_quad_t)(lbn + 1) * lblocksize) != filesize ||
+ lbn != vp->v_lastw + 1 ||
+ vp->v_clen <= cursize) {
+ if (!async)
+ cluster_wbuild(vp, lblocksize,
+ vp->v_cstart, cursize);
+ }
+#else
+ if (!doreallocblks ||
+ (lbn + 1) * lblocksize != filesize ||
lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) {
- cluster_wbuild(vp, NULL, bp->b_bcount,
- vp->v_cstart, cursize, lbn);
+ if (!async)
+ cluster_wbuild(vp, lblocksize,
+ vp->v_cstart, cursize);
} else {
struct buf **bpp, **endbp;
struct cluster_save *buflist;
@@ -528,8 +559,8 @@ cluster_write(bp, filesize)
bpp < endbp; bpp++)
brelse(*bpp);
free(buflist, M_SEGMENT);
- cluster_wbuild(vp, NULL, bp->b_bcount,
- vp->v_cstart, cursize, lbn);
+ cluster_wbuild(vp, lblocksize,
+ vp->v_cstart, cursize);
} else {
/*
* Succeeded, keep building cluster.
@@ -543,14 +574,16 @@ cluster_write(bp, filesize)
return;
}
}
+#endif /* notyet_block_reallocation_enabled */
}
/*
- * Consider beginning a cluster.
- * If at end of file, make cluster as large as possible,
- * otherwise find size of existing cluster.
+ * Consider beginning a cluster. If at end of file, make
+ * cluster as large as possible, otherwise find size of
+ * existing cluster.
*/
- if ((lbn + 1) * bp->b_bcount != filesize &&
- (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) ||
+ if (((u_quad_t) (lbn + 1) * lblocksize) != filesize &&
+ (bp->b_blkno == bp->b_lblkno) &&
+ (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) ||
bp->b_blkno == -1)) {
bawrite(bp);
vp->v_clen = 0;
@@ -559,26 +592,25 @@ cluster_write(bp, filesize)
vp->v_lastw = lbn;
return;
}
- vp->v_clen = maxclen;
- if (maxclen == 0) { /* I/O not contiguous */
+ vp->v_clen = maxclen;
+ if (!async && maxclen == 0) { /* I/O not contiguous */
vp->v_cstart = lbn + 1;
- bawrite(bp);
- } else { /* Wait for rest of cluster */
+ bawrite(bp);
+ } else { /* Wait for rest of cluster */
vp->v_cstart = lbn;
- bdwrite(bp);
+ bdwrite(bp);
}
} else if (lbn == vp->v_cstart + vp->v_clen) {
/*
* At end of cluster, write it out.
*/
- cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart,
- vp->v_clen + 1, lbn);
+ bdwrite(bp);
+ cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1);
vp->v_clen = 0;
vp->v_cstart = lbn + 1;
} else
/*
- * In the middle of a cluster, so just delay the
- * I/O for now.
+ * In the middle of a cluster, so just delay the I/O for now.
*/
bdwrite(bp);
vp->v_lastw = lbn;
@@ -592,165 +624,168 @@ cluster_write(bp, filesize)
* performed. Check to see that it doesn't fall in the middle of
* the current block (if last_bp == NULL).
*/
-void
-cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn)
+int
+cluster_wbuild(vp, size, start_lbn, len)
struct vnode *vp;
- struct buf *last_bp;
long size;
daddr_t start_lbn;
int len;
- daddr_t lbn;
{
- struct cluster_save *b_save;
struct buf *bp, *tbp;
- caddr_t cp;
- int i, s;
-
-#ifdef DIAGNOSTIC
- if (size != vp->v_mount->mnt_stat.f_iosize)
- panic("cluster_wbuild: size %d != filesize %d\n",
- size, vp->v_mount->mnt_stat.f_iosize);
-#endif
-redo:
- while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) {
- ++start_lbn;
- --len;
- }
-
- /* Get more memory for current buffer */
- if (len <= 1) {
- if (last_bp) {
- bawrite(last_bp);
- } else if (len) {
- bp = getblk(vp, start_lbn, size, 0, 0);
- bawrite(bp);
+ int i, j, s;
+ int totalwritten = 0;
+ int dbsize = btodb(size);
+ while (len > 0) {
+ s = splbio();
+ if ( ((tbp = gbincore(vp, start_lbn)) == NULL) ||
+ ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) {
+ ++start_lbn;
+ --len;
+ splx(s);
+ continue;
}
- return;
- }
-
- bp = getblk(vp, start_lbn, size, 0, 0);
- if (!(bp->b_flags & B_DELWRI)) {
- ++start_lbn;
- --len;
- brelse(bp);
- goto redo;
- }
+ bremfree(tbp);
+ tbp->b_flags |= B_BUSY;
+ tbp->b_flags &= ~B_DONE;
+ splx(s);
/*
- * Extra memory in the buffer, punt on this buffer.
- * XXX we could handle this in most cases, but we would have to
- * push the extra memory down to after our max possible cluster
- * size and then potentially pull it back up if the cluster was
- * terminated prematurely--too much hassle.
+ * Extra memory in the buffer, punt on this buffer. XXX we could
+ * handle this in most cases, but we would have to push the extra
+ * memory down to after our max possible cluster size and then
+ * potentially pull it back up if the cluster was terminated
+ * prematurely--too much hassle.
*/
- if (bp->b_bcount != bp->b_bufsize) {
- ++start_lbn;
- --len;
- bawrite(bp);
- goto redo;
- }
+ if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) ||
+ (tbp->b_bcount != tbp->b_bufsize) ||
+ (tbp->b_bcount != size) ||
+ len == 1) {
+ totalwritten += tbp->b_bufsize;
+ bawrite(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
- --len;
- b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save),
- M_SEGMENT, M_WAITOK);
- b_save->bs_bcount = bp->b_bcount;
- b_save->bs_bufsize = bp->b_bufsize;
- b_save->bs_nchildren = 0;
- b_save->bs_children = (struct buf **)(b_save + 1);
- b_save->bs_saveaddr = bp->b_saveaddr;
- bp->b_saveaddr = (caddr_t) b_save;
-
- bp->b_flags |= B_CALL;
- bp->b_iodone = cluster_callback;
- cp = (char *)bp->b_data + size;
- for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) {
- /*
- * Block is not in core or the non-sequential block
- * ending our cluster was part of the cluster (in which
- * case we don't want to write it twice).
- */
- if (!incore(vp, start_lbn) ||
- last_bp == NULL && start_lbn == lbn)
- break;
+ bp = trypbuf();
+ if (bp == NULL) {
+ totalwritten += tbp->b_bufsize;
+ bawrite(tbp);
+ ++start_lbn;
+ --len;
+ continue;
+ }
- /*
- * Get the desired block buffer (unless it is the final
- * sequential block whose buffer was passed in explictly
- * as last_bp).
- */
- if (last_bp == NULL || start_lbn != lbn) {
- tbp = getblk(vp, start_lbn, size, 0, 0);
- if (!(tbp->b_flags & B_DELWRI)) {
- brelse(tbp);
- break;
- }
- } else
- tbp = last_bp;
-
- ++b_save->bs_nchildren;
-
- /* Move memory from children to parent */
- if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) {
- printf("Clustered Block: %d addr %x bufsize: %d\n",
- bp->b_lblkno, bp->b_blkno, bp->b_bufsize);
- printf("Child Block: %d addr: %x\n", tbp->b_lblkno,
- tbp->b_blkno);
- panic("Clustered write to wrong blocks");
+ TAILQ_INIT(&bp->b_cluster.cluster_head);
+ bp->b_bcount = 0;
+ bp->b_bufsize = 0;
+ bp->b_npages = 0;
+ if (tbp->b_wcred != NOCRED) {
+ bp->b_wcred = tbp->b_wcred;
+ crhold(bp->b_wcred);
}
- pagemove(tbp->b_data, cp, size);
- bp->b_bcount += size;
- bp->b_bufsize += size;
+ bp->b_blkno = tbp->b_blkno;
+ bp->b_lblkno = tbp->b_lblkno;
+ (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK;
+ bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | (tbp->b_flags & (B_VMIO|B_NEEDCOMMIT));
+ bp->b_iodone = cluster_callback;
+ pbgetvp(vp, bp);
+
+ for (i = 0; i < len; ++i, ++start_lbn) {
+ if (i != 0) {
+ s = splbio();
+ if ((tbp = gbincore(vp, start_lbn)) == NULL) {
+ splx(s);
+ break;
+ }
- tbp->b_bufsize -= size;
- tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
- tbp->b_flags |= (B_ASYNC | B_AGE);
- s = splbio();
- reassignbuf(tbp, tbp->b_vp); /* put on clean list */
- ++tbp->b_vp->v_numoutput;
- splx(s);
- b_save->bs_children[i] = tbp;
+ if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|B_BUSY|B_DELWRI|B_NEEDCOMMIT)) != (B_DELWRI|B_CLUSTEROK|(bp->b_flags & (B_VMIO|B_NEEDCOMMIT)))) {
+ splx(s);
+ break;
+ }
- cp += size;
- }
+ if (tbp->b_wcred != bp->b_wcred) {
+ splx(s);
+ break;
+ }
- if (i == 0) {
- /* None to cluster */
- bp->b_saveaddr = b_save->bs_saveaddr;
- bp->b_flags &= ~B_CALL;
- bp->b_iodone = NULL;
- free(b_save, M_SEGMENT);
- }
- bawrite(bp);
- if (i < len) {
- len -= i + 1;
- start_lbn += 1;
- goto redo;
+ if ((tbp->b_bcount != size) ||
+ ((bp->b_blkno + dbsize * i) != tbp->b_blkno) ||
+ ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) {
+ splx(s);
+ break;
+ }
+ bremfree(tbp);
+ tbp->b_flags |= B_BUSY;
+ tbp->b_flags &= ~B_DONE;
+ splx(s);
+ }
+ if (tbp->b_flags & B_VMIO) {
+ for (j = 0; j < tbp->b_npages; j += 1) {
+ vm_page_t m;
+ m = tbp->b_pages[j];
+ ++m->busy;
+ ++m->object->paging_in_progress;
+ if ((bp->b_npages == 0) ||
+ (bp->b_pages[bp->b_npages - 1] != m)) {
+ bp->b_pages[bp->b_npages] = m;
+ bp->b_npages++;
+ }
+ }
+ }
+ bp->b_bcount += size;
+ bp->b_bufsize += size;
+
+ tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI);
+ tbp->b_flags |= B_ASYNC;
+ s = splbio();
+ reassignbuf(tbp, tbp->b_vp); /* put on clean list */
+ ++tbp->b_vp->v_numoutput;
+ splx(s);
+ TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head,
+ tbp, b_cluster.cluster_entry);
+ }
+ pmap_qenter(trunc_page((vm_offset_t) bp->b_data),
+ (vm_page_t *) bp->b_pages, bp->b_npages);
+ if (bp->b_bufsize > bp->b_kvasize)
+ panic("cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n",
+ bp->b_bufsize, bp->b_kvasize);
+ bp->b_kvasize = bp->b_bufsize;
+ totalwritten += bp->b_bufsize;
+ bp->b_dirtyoff = 0;
+ bp->b_dirtyend = bp->b_bufsize;
+ bawrite(bp);
+
+ len -= i;
}
+ return totalwritten;
}
+#ifdef notyet_block_reallocation_enabled
/*
* Collect together all the buffers in a cluster.
* Plus add one additional buffer.
*/
-struct cluster_save *
+static struct cluster_save *
cluster_collectbufs(vp, last_bp)
struct vnode *vp;
struct buf *last_bp;
{
struct cluster_save *buflist;
- daddr_t lbn;
+ daddr_t lbn;
int i, len;
len = vp->v_lastw - vp->v_cstart + 1;
buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist),
M_SEGMENT, M_WAITOK);
buflist->bs_nchildren = 0;
- buflist->bs_children = (struct buf **)(buflist + 1);
+ buflist->bs_children = (struct buf **) (buflist + 1);
for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++)
- (void)bread(vp, lbn, last_bp->b_bcount, NOCRED,
- &buflist->bs_children[i]);
+ (void) bread(vp, lbn, last_bp->b_bcount, NOCRED,
+ &buflist->bs_children[i]);
buflist->bs_children[i] = last_bp;
buflist->bs_nchildren = i + 1;
return (buflist);
}
+#endif /* notyet_block_reallocation_enabled */
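The rewritten cluster_wbuild() makes a single pass over the range: buffers that are missing, busy, not delayed-write, or not cluster-eligible are written out (or skipped) individually, while runs of eligible, physically contiguous delayed-write buffers are chained onto one pbuf and issued as a single I/O. A much simplified sketch of just the run-gathering test, with the buffer cache reduced to a plain array (sbuf, gather_run, and the two-flag model are illustrative):

#define B_DELWRI        0x01            /* delayed write pending */
#define B_CLUSTEROK     0x02            /* safe to coalesce */

struct sbuf {
        int     flags;
        long    blkno;                  /* physical block number */
};

/*
 * Count how many consecutive delayed-write, cluster-eligible,
 * physically contiguous buffers begin at bufs[start].
 */
static int
gather_run(struct sbuf *bufs, int start, int len, int dbsize)
{
        int i;

        for (i = 0; i < len; i++) {
                struct sbuf *tbp = &bufs[start + i];

                if ((tbp->flags & (B_DELWRI | B_CLUSTEROK)) !=
                    (B_DELWRI | B_CLUSTEROK))
                        break;
                if (i != 0 &&
                    tbp->blkno != bufs[start].blkno + (long)dbsize * i)
                        break;          /* not physically contiguous */
        }
        return (i);                     /* length of the run to coalesce */
}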
diff --git a/sys/kern/vfs_conf.c b/sys/kern/vfs_conf.c
index 9b57797..779a1c4 100644
--- a/sys/kern/vfs_conf.c
+++ b/sys/kern/vfs_conf.c
@@ -1,6 +1,7 @@
/*
* Copyright (c) 1989, 1993, 1995
* The Regents of the University of California. All rights reserved.
+ * Copyright (c) 1995 Artisoft, Inc. All Rights Reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -30,219 +31,123 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)vfs_conf.c 8.11 (Berkeley) 5/10/95
+ * @(#)vfs_conf.c 8.8 (Berkeley) 3/31/94
+ * $Id$
*/
-#include <sys/param.h>
-#include <sys/mount.h>
-#include <sys/vnode.h>
-
/*
- * These define the root filesystem, device, and root filesystem type.
+ * PURPOSE: This file abstracts the root mounting interface from
+ * the per file system semantics for handling mounts,
+ * the overall intent of which is to move the BSD
+ * internals dependence out of the FS code, both to
+ * make the FS code more portable and to free up some
+ * of the BSD internals so that they may more easily
+ * be changed.
+ *
+ * NOTE1: Code is single entry/single exit to aid debugging
+ * and conversion for kernel multithreading.
+ *
+ * NOTE2: Code notes lock state in headers on entry and exit
+ * as an aid to conversion for kernel multithreading
+ * and SMP reentrancy
*/
-struct mount *rootfs;
-struct vnode *rootvnode;
-int (*mountroot)() = NULL;
+#include <sys/param.h> /* dev_t (types.h)*/
+#include <sys/systm.h> /* rootvp*/
+#include <sys/proc.h> /* curproc*/
+#include <sys/vnode.h> /* NULLVP*/
+#include <sys/mount.h> /* struct mount*/
+#include <sys/malloc.h> /* M_MOUNT*/
/*
- * Set up the initial array of known filesystem types.
+ * GLOBALS
*/
-extern struct vfsops ufs_vfsops;
-extern int ffs_mountroot();
-extern struct vfsops lfs_vfsops;
-extern int lfs_mountroot();
-extern struct vfsops mfs_vfsops;
-extern int mfs_mountroot();
-extern struct vfsops cd9660_vfsops;
-extern int cd9660_mountroot();
-extern struct vfsops msdos_vfsops;
-extern struct vfsops adosfs_vfsops;
-extern struct vfsops nfs_vfsops;
-extern int nfs_mountroot();
-extern struct vfsops afs_vfsops;
-extern struct vfsops procfs_vfsops;
-extern struct vfsops null_vfsops;
-extern struct vfsops union_vfsops;
-extern struct vfsops umap_vfsops;
-extern struct vfsops portal_vfsops;
-extern struct vfsops fdesc_vfsops;
-extern struct vfsops kernfs_vfsops;
/*
- * Set up the filesystem operations for vnodes.
+ * These define the root filesystem, device, and root filesystem type.
*/
-static struct vfsconf vfsconflist[] = {
-
- /* Fast Filesystem */
-#ifdef FFS
- { &ufs_vfsops, "ufs", 1, 0, MNT_LOCAL, ffs_mountroot, NULL },
-#endif
-
- /* Log-based Filesystem */
-#ifdef LFS
- { &lfs_vfsops, "lfs", 5, 0, MNT_LOCAL, lfs_mountroot, NULL },
-#endif
-
- /* Memory-based Filesystem */
-#ifdef MFS
- { &mfs_vfsops, "mfs", 3, 0, MNT_LOCAL, mfs_mountroot, NULL },
-#endif
-
- /* ISO9660 (aka CDROM) Filesystem */
-#ifdef CD9660
- { &cd9660_vfsops, "cd9660", 14, 0, MNT_LOCAL, cd9660_mountroot, NULL },
-#endif
-
- /* MSDOS Filesystem */
-#ifdef MSDOS
- { &msdos_vfsops, "msdos", 4, 0, MNT_LOCAL, NULL, NULL },
-#endif
-
- /* AmigaDOS Filesystem */
-#ifdef ADOSFS
- { &adosfs_vfsops, "adosfs", 16, 0, MNT_LOCAL, NULL, NULL },
-#endif
-
- /* Sun-compatible Network Filesystem */
-#ifdef NFS
- { &nfs_vfsops, "nfs", 2, 0, 0, nfs_mountroot, NULL },
-#endif
-
- /* Andrew Filesystem */
-#ifdef AFS
- { &afs_vfsops, "andrewfs", 13, 0, 0, afs_mountroot, NULL },
-#endif
-
- /* /proc Filesystem */
-#ifdef PROCFS
- { &procfs_vfsops, "procfs", 12, 0, 0, NULL, NULL },
-#endif
-
- /* Loopback (Minimal) Filesystem Layer */
-#ifdef NULLFS
- { &null_vfsops, "loopback", 9, 0, 0, NULL, NULL },
-#endif
-
- /* Union (translucent) Filesystem */
-#ifdef UNION
- { &union_vfsops, "union", 15, 0, 0, NULL, NULL },
-#endif
-
- /* User/Group Identifer Remapping Filesystem */
-#ifdef UMAPFS
- { &umap_vfsops, "umap", 10, 0, 0, NULL, NULL },
-#endif
-
- /* Portal Filesystem */
-#ifdef PORTAL
- { &portal_vfsops, "portal", 8, 0, 0, NULL, NULL },
-#endif
-
- /* File Descriptor Filesystem */
-#ifdef FDESC
- { &fdesc_vfsops, "fdesc", 7, 0, 0, NULL, NULL },
-#endif
-
- /* Kernel Information Filesystem */
-#ifdef KERNFS
- { &kernfs_vfsops, "kernfs", 11, 0, 0, NULL, NULL },
-#endif
-
-};
+struct mount *rootfs;
+struct vnode *rootvnode;
+char *mountrootfsname;
/*
- * Initially the size of the list, vfs_init will set maxvfsconf
+ * vfs_init() will set maxvfsconf
* to the highest defined type number.
*/
-int maxvfsconf = sizeof(vfsconflist) / sizeof (struct vfsconf);
-struct vfsconf *vfsconf = vfsconflist;
+int maxvfsconf;
+struct vfsconf *vfsconf;
/*
+ * Common root mount code shared by all filesystems
+ */
+#define ROOTNAME "root_device"
+
+/*
+ * vfs_mountrootfs
+ *
+ * Common entry point for root mounts
+ *
+ * PARAMETERS:
+ * fsname name of the filesystem
+ *
+ * RETURNS: 0 Success
+ * !0 error number (errno.h)
*
- * vfs_opv_descs enumerates the list of vnode classes, each with it's own
- * vnode operation vector. It is consulted at system boot to build operation
- * vectors. It is NULL terminated.
+ * LOCK STATE:
+ * ENTRY
+ * <no locks held>
+ * EXIT
+ * <no locks held>
*
+ * NOTES:
+ * This code currently works only for the FFS
+ * file system type. This is a matter of
+ * fixing the other file systems, not this code!
*/
-extern struct vnodeopv_desc ffs_vnodeop_opv_desc;
-extern struct vnodeopv_desc ffs_specop_opv_desc;
-extern struct vnodeopv_desc ffs_fifoop_opv_desc;
-extern struct vnodeopv_desc lfs_vnodeop_opv_desc;
-extern struct vnodeopv_desc lfs_specop_opv_desc;
-extern struct vnodeopv_desc lfs_fifoop_opv_desc;
-extern struct vnodeopv_desc mfs_vnodeop_opv_desc;
-extern struct vnodeopv_desc dead_vnodeop_opv_desc;
-extern struct vnodeopv_desc fifo_vnodeop_opv_desc;
-extern struct vnodeopv_desc spec_vnodeop_opv_desc;
-extern struct vnodeopv_desc nfsv2_vnodeop_opv_desc;
-extern struct vnodeopv_desc spec_nfsv2nodeop_opv_desc;
-extern struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc;
-extern struct vnodeopv_desc fdesc_vnodeop_opv_desc;
-extern struct vnodeopv_desc portal_vnodeop_opv_desc;
-extern struct vnodeopv_desc null_vnodeop_opv_desc;
-extern struct vnodeopv_desc umap_vnodeop_opv_desc;
-extern struct vnodeopv_desc kernfs_vnodeop_opv_desc;
-extern struct vnodeopv_desc procfs_vnodeop_opv_desc;
-extern struct vnodeopv_desc cd9660_vnodeop_opv_desc;
-extern struct vnodeopv_desc cd9660_specop_opv_desc;
-extern struct vnodeopv_desc cd9660_fifoop_opv_desc;
-extern struct vnodeopv_desc union_vnodeop_opv_desc;
-
-struct vnodeopv_desc *vfs_opv_descs[] = {
- &ffs_vnodeop_opv_desc,
- &ffs_specop_opv_desc,
-#ifdef FIFO
- &ffs_fifoop_opv_desc,
-#endif
- &dead_vnodeop_opv_desc,
-#ifdef FIFO
- &fifo_vnodeop_opv_desc,
-#endif
- &spec_vnodeop_opv_desc,
-#ifdef LFS
- &lfs_vnodeop_opv_desc,
- &lfs_specop_opv_desc,
-#ifdef FIFO
- &lfs_fifoop_opv_desc,
-#endif
-#endif
-#ifdef MFS
- &mfs_vnodeop_opv_desc,
-#endif
-#ifdef NFS
- &nfsv2_vnodeop_opv_desc,
- &spec_nfsv2nodeop_opv_desc,
-#ifdef FIFO
- &fifo_nfsv2nodeop_opv_desc,
-#endif
-#endif
-#ifdef FDESC
- &fdesc_vnodeop_opv_desc,
-#endif
-#ifdef PORTAL
- &portal_vnodeop_opv_desc,
-#endif
-#ifdef NULLFS
- &null_vnodeop_opv_desc,
-#endif
-#ifdef UMAPFS
- &umap_vnodeop_opv_desc,
-#endif
-#ifdef KERNFS
- &kernfs_vnodeop_opv_desc,
-#endif
-#ifdef PROCFS
- &procfs_vnodeop_opv_desc,
-#endif
-#ifdef CD9660
- &cd9660_vnodeop_opv_desc,
- &cd9660_specop_opv_desc,
-#ifdef FIFO
- &cd9660_fifoop_opv_desc,
-#endif
-#endif
-#ifdef UNION
- &union_vnodeop_opv_desc,
-#endif
- NULL
-};
+int
+vfs_mountrootfs(fsname)
+ char *fsname;
+{
+ struct mount *mp;
+ int err = 0;
+ struct proc *p = curproc; /* XXX */
+
+ /*
+ * New root mount structure
+ */
+ err = vfs_rootmountalloc(fsname, ROOTNAME, &mp);
+ if (err)
+ return (err);
+ mp->mnt_flag |= MNT_ROOTFS;
+
+ /*
+ * Attempt the mount
+ */
+ err = VFS_MOUNT(mp, NULL, NULL, NULL, p);
+ if (err)
+ goto error_2;
+
+ simple_lock(&mountlist_slock);
+ /* Add fs to list of mounted file systems*/
+ CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ simple_unlock(&mountlist_slock);
+
+ vfs_unbusy(mp, p);
+
+ /* root mount, update system time from FS specific data*/
+ inittodr(mp->mnt_time);
+
+ goto success;
+
+
+error_2: /* mount error*/
+
+ vfs_unbusy(mp, p);
+
+error_1: /* lock error*/
+
+ /* free mount struct before failing*/
+ free(mp, M_MOUNT);
+
+success:
+ return (err);
+}
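With the static vfsconf table gone, root mounting is now driven by a filesystem name. A hedged sketch of how a boot path might invoke the new entry point; only vfs_mountrootfs() and mountrootfsname appear in this diff, and the call site itself is assumed:

extern char *mountrootfsname;           /* set by boot code, e.g. "ufs" */
extern int vfs_mountrootfs(char *fsname);
extern int printf(const char *fmt, ...);

int
mountroot_example(void)
{
        int err;

        err = vfs_mountrootfs(mountrootfsname);
        if (err)
                printf("root mount of %s failed: %d\n",
                    mountrootfsname, err);
        return (err);
}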
diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c
new file mode 100644
index 0000000..0b487fd
--- /dev/null
+++ b/sys/kern/vfs_export.c
@@ -0,0 +1,2079 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
+ * $Id: vfs_subr.c,v 1.79 1997/03/04 18:31:56 bde Exp $
+ */
+
+/*
+ * External virtual filesystem routines
+ */
+#include "opt_ddb.h"
+#include "opt_devfs.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
+#include <sys/proc.h>
+#include <sys/mount.h>
+#include <sys/time.h>
+#include <sys/vnode.h>
+#include <sys/stat.h>
+#include <sys/namei.h>
+#include <sys/ucred.h>
+#include <sys/buf.h>
+#include <sys/errno.h>
+#include <sys/malloc.h>
+#include <sys/domain.h>
+#include <sys/mbuf.h>
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_pager.h>
+#include <vm/vnode_pager.h>
+#include <sys/sysctl.h>
+
+#include <miscfs/specfs/specdev.h>
+
+#ifdef DDB
+extern void printlockedvnodes __P((void));
+#endif
+static void vclean __P((struct vnode *vp, int flags, struct proc *p));
+extern void vgonel __P((struct vnode *vp, struct proc *p));
+unsigned long numvnodes;
+extern void vfs_unmountroot __P((struct mount *rootfs));
+extern void vputrele __P((struct vnode *vp, int put));
+
+enum vtype iftovt_tab[16] = {
+ VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
+ VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
+};
+int vttoif_tab[9] = {
+ 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
+ S_IFSOCK, S_IFIFO, S_IFMT,
+};
+
+/*
+ * Insq/Remq for the vnode usage lists.
+ */
+#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs)
+#define bufremvn(bp) { \
+ LIST_REMOVE(bp, b_vnbufs); \
+ (bp)->b_vnbufs.le_next = NOLIST; \
+}
+TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
+static u_long freevnodes = 0;
+
+struct mntlist mountlist; /* mounted filesystem list */
+struct simplelock mountlist_slock;
+static struct simplelock mntid_slock;
+struct simplelock mntvnode_slock;
+struct simplelock vnode_free_list_slock;
+static struct simplelock spechash_slock;
+
+int desiredvnodes;
+SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");
+
+static void vfs_free_addrlist __P((struct netexport *nep));
+static int vfs_free_netcred __P((struct radix_node *rn, void *w));
+static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
+ struct export_args *argp));
+
+/*
+ * Initialize the vnode management data structures.
+ */
+void
+vntblinit()
+{
+
+ desiredvnodes = maxproc + vm_object_cache_max;
+ simple_lock_init(&mntvnode_slock);
+ simple_lock_init(&mntid_slock);
+ simple_lock_init(&spechash_slock);
+ TAILQ_INIT(&vnode_free_list);
+ simple_lock_init(&vnode_free_list_slock);
+ CIRCLEQ_INIT(&mountlist);
+}
+
+/*
+ * Mark a mount point as busy. Used to synchronize access and to delay
+ * unmounting. Interlock is not released on failure.
+ */
+int
+vfs_busy(mp, flags, interlkp, p)
+ struct mount *mp;
+ int flags;
+ struct simplelock *interlkp;
+ struct proc *p;
+{
+ int lkflags;
+
+ if (mp->mnt_flag & MNT_UNMOUNT) {
+ if (flags & LK_NOWAIT)
+ return (ENOENT);
+ mp->mnt_flag |= MNT_MWAIT;
+ if (interlkp) {
+ simple_unlock(interlkp);
+ }
+ /*
+ * Since all busy locks are shared except the exclusive
+ * lock granted when unmounting, the only place that a
+ * wakeup needs to be done is at the release of the
+ * exclusive lock at the end of dounmount.
+ */
+ tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
+ if (interlkp) {
+ simple_lock(interlkp);
+ }
+ return (ENOENT);
+ }
+ lkflags = LK_SHARED;
+ if (interlkp)
+ lkflags |= LK_INTERLOCK;
+ if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p))
+ panic("vfs_busy: unexpected lock failure");
+ return (0);
+}
+
+/*
+ * Free a busy filesystem.
+ */
+void
+vfs_unbusy(mp, p)
+ struct mount *mp;
+ struct proc *p;
+{
+
+ lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p);
+}
+
+/*
+ * Lookup a filesystem type, and if found allocate and initialize
+ * a mount structure for it.
+ *
+ * Devname is usually updated by mount(8) after booting.
+ */
+int
+vfs_rootmountalloc(fstypename, devname, mpp)
+ char *fstypename;
+ char *devname;
+ struct mount **mpp;
+{
+ struct proc *p = curproc; /* XXX */
+ struct vfsconf *vfsp;
+ struct mount *mp;
+
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstypename))
+ break;
+ if (vfsp == NULL)
+ return (ENODEV);
+ mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK);
+ bzero((char *)mp, (u_long)sizeof(struct mount));
+ lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
+ (void)vfs_busy(mp, LK_NOWAIT, 0, p);
+ LIST_INIT(&mp->mnt_vnodelist);
+ mp->mnt_vfc = vfsp;
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_flag = MNT_RDONLY;
+ mp->mnt_vnodecovered = NULLVP;
+ vfsp->vfc_refcount++;
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+ strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
+ mp->mnt_stat.f_mntonname[0] = '/';
+ mp->mnt_stat.f_mntonname[1] = 0;
+ (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
+ *mpp = mp;
+ return (0);
+}
+
+/*
+ * Find an appropriate filesystem to use for the root. If a filesystem
+ * has not been preselected, walk through the list of known filesystems
+ * trying those that have mountroot routines, and try them until one
+ * works or we have tried them all.
+ */
+#ifdef notdef /* XXX JH */
+int
+lite2_vfs_mountroot(void)
+{
+ struct vfsconf *vfsp;
+ extern int (*lite2_mountroot)(void);
+ int error;
+
+ if (lite2_mountroot != NULL)
+ return ((*lite2_mountroot)());
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
+ if (vfsp->vfc_mountroot == NULL)
+ continue;
+ if ((error = (*vfsp->vfc_mountroot)()) == 0)
+ return (0);
+ printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
+ }
+ return (ENODEV);
+}
+#endif
+
+/*
+ * Lookup a mount point by filesystem identifier.
+ */
+struct mount *
+vfs_getvfs(fsid)
+ fsid_t *fsid;
+{
+ register struct mount *mp;
+
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
+ mp = mp->mnt_list.cqe_next) {
+ if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
+ mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
+ simple_unlock(&mountlist_slock);
+ return (mp);
+ }
+ }
+ simple_unlock(&mountlist_slock);
+ return ((struct mount *) 0);
+}
+
+/*
+ * Get a new unique fsid
+ */
+void
+vfs_getnewfsid(mp)
+ struct mount *mp;
+{
+ static u_short xxxfs_mntid;
+
+ fsid_t tfsid;
+ int mtype;
+
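+ /*
+ * val[0] encodes a pseudo device number: the major is the filesystem
+ * type number offset past the real block majors (nblkdev + mtype),
+ * the minor is a per-boot mount counter. The loop below probes with
+ * vfs_getvfs() until the pair is unique among mounted filesystems.
+ */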
+ simple_lock(&mntid_slock);
+ mtype = mp->mnt_vfc->vfc_typenum;
+ mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
+ mp->mnt_stat.f_fsid.val[1] = mtype;
+ if (xxxfs_mntid == 0)
+ ++xxxfs_mntid;
+ tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
+ tfsid.val[1] = mtype;
+ if (mountlist.cqh_first != (void *)&mountlist) {
+ while (vfs_getvfs(&tfsid)) {
+ tfsid.val[0]++;
+ xxxfs_mntid++;
+ }
+ }
+ mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
+ simple_unlock(&mntid_slock);
+}
+
+/*
+ * Set vnode attributes to VNOVAL
+ */
+void
+vattr_null(vap)
+ register struct vattr *vap;
+{
+
+ vap->va_type = VNON;
+ vap->va_size = VNOVAL;
+ vap->va_bytes = VNOVAL;
+ vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
+ vap->va_fsid = vap->va_fileid =
+ vap->va_blocksize = vap->va_rdev =
+ vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
+ vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
+ vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
+ vap->va_flags = vap->va_gen = VNOVAL;
+ vap->va_vaflags = 0;
+}
+
+/*
+ * Routines having to do with the management of the vnode table.
+ */
+extern vop_t **dead_vnodeop_p;
+
+/*
+ * Return the next vnode from the free list.
+ */
+int
+getnewvnode(tag, mp, vops, vpp)
+ enum vtagtype tag;
+ struct mount *mp;
+ vop_t **vops;
+ struct vnode **vpp;
+{
+ struct proc *p = curproc; /* XXX */
+ struct vnode *vp;
+
+ simple_lock(&vnode_free_list_slock);
+retry:
+ /*
+ * we allocate a new vnode if
+ * 1. we don't have any free
+ * Pretty obvious, we actually used to panic, but that
+ * is a silly thing to do.
+ * 2. we haven't filled our pool yet
+ * We don't want to trash the incore (VM-)vnodecache.
+ * 3. if less than 1/4th of our vnodes are free.
+ * We don't want to trash the namei cache either.
+ */
+ if (freevnodes < (numvnodes >> 2) ||
+ numvnodes < desiredvnodes ||
+ vnode_free_list.tqh_first == NULL) {
+ simple_unlock(&vnode_free_list_slock);
+ vp = (struct vnode *) malloc((u_long) sizeof *vp,
+ M_VNODE, M_WAITOK);
+ bzero((char *) vp, sizeof *vp);
+ numvnodes++;
+ } else {
+ for (vp = vnode_free_list.tqh_first;
+ vp != NULLVP; vp = vp->v_freelist.tqe_next) {
+ if (simple_lock_try(&vp->v_interlock))
+ break;
+ }
+ /*
+ * Unless this is a bad time of the month, at most
+ * the first NCPUS items on the free list are
+ * locked, so this is close enough to being empty.
+ */
+ if (vp == NULLVP) {
+ simple_unlock(&vnode_free_list_slock);
+ tablefull("vnode");
+ *vpp = 0;
+ return (ENFILE);
+ }
+ if (vp->v_usecount)
+ panic("free vnode isn't");
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ if (vp->v_usage > 0) {
+ simple_unlock(&vp->v_interlock);
+ --vp->v_usage;
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ goto retry;
+ }
+ freevnodes--;
+
+ /* see comment on why 0xdeadb is set at end of vgone (below) */
+ vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb;
+ simple_unlock(&vnode_free_list_slock);
+ vp->v_lease = NULL;
+ if (vp->v_type != VBAD)
+ vgonel(vp, p);
+ else {
+ simple_unlock(&vp->v_interlock);
+ }
+
+#ifdef DIAGNOSTIC
+ {
+ int s;
+
+ if (vp->v_data)
+ panic("cleaned vnode isn't");
+ s = splbio();
+ if (vp->v_numoutput)
+ panic("Clean vnode has pending I/O's");
+ splx(s);
+ }
+#endif
+ vp->v_flag = 0;
+ vp->v_lastr = 0;
+ vp->v_lastw = 0;
+ vp->v_lasta = 0;
+ vp->v_cstart = 0;
+ vp->v_clen = 0;
+ vp->v_socket = 0;
+ vp->v_writecount = 0; /* XXX */
+ vp->v_usage = 0;
+ }
+ vp->v_type = VNON;
+ cache_purge(vp);
+ vp->v_tag = tag;
+ vp->v_op = vops;
+ insmntque(vp, mp);
+ *vpp = vp;
+ vp->v_usecount = 1;
+ vp->v_data = 0;
+ return (0);
+}
+
+/*
+ * Move a vnode from one mount queue to another.
+ */
+void
+insmntque(vp, mp)
+ register struct vnode *vp;
+ register struct mount *mp;
+{
+
+ simple_lock(&mntvnode_slock);
+ /*
+ * Delete from old mount point vnode list, if on one.
+ */
+ if (vp->v_mount != NULL)
+ LIST_REMOVE(vp, v_mntvnodes);
+ /*
+ * Insert into list of vnodes for the new mount point, if available.
+ */
+ if ((vp->v_mount = mp) == NULL) {
+ simple_unlock(&mntvnode_slock);
+ return;
+ }
+ LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
+ simple_unlock(&mntvnode_slock);
+}
+
+/*
+ * Update outstanding I/O count and do wakeup if requested.
+ */
+void
+vwakeup(bp)
+ register struct buf *bp;
+{
+ register struct vnode *vp;
+
+ bp->b_flags &= ~B_WRITEINPROG;
+ if ((vp = bp->b_vp)) {
+ vp->v_numoutput--;
+ if (vp->v_numoutput < 0)
+ panic("vwakeup: neg numoutput");
+ if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
+ vp->v_flag &= ~VBWAIT;
+ wakeup((caddr_t) &vp->v_numoutput);
+ }
+ }
+}
+
+/*
+ * Flush out and invalidate all buffers associated with a vnode.
+ * Called with the underlying object locked.
+ */
+int
+vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
+ register struct vnode *vp;
+ int flags;
+ struct ucred *cred;
+ struct proc *p;
+ int slpflag, slptimeo;
+{
+ register struct buf *bp;
+ struct buf *nbp, *blist;
+ int s, error;
+ vm_object_t object;
+
+ if (flags & V_SAVE) {
+ if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
+ return (error);
+ if (vp->v_dirtyblkhd.lh_first != NULL)
+ panic("vinvalbuf: dirty bufs");
+ }
+
+ s = splbio();
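+ /*
+ * With V_SAVEMETA set, buffers with negative logical block numbers
+ * (indirect blocks and other metadata) are skipped below and thus
+ * preserved; only file data buffers are flushed and invalidated.
+ */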
+ for (;;) {
+ if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
+ while (blist && blist->b_lblkno < 0)
+ blist = blist->b_vnbufs.le_next;
+ if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
+ (flags & V_SAVEMETA))
+ while (blist && blist->b_lblkno < 0)
+ blist = blist->b_vnbufs.le_next;
+ if (!blist)
+ break;
+
+ for (bp = blist; bp; bp = nbp) {
+ nbp = bp->b_vnbufs.le_next;
+ if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
+ continue;
+ if (bp->b_flags & B_BUSY) {
+ bp->b_flags |= B_WANTED;
+ error = tsleep((caddr_t) bp,
+ slpflag | (PRIBIO + 1), "vinvalbuf",
+ slptimeo);
+ if (error) {
+ splx(s);
+ return (error);
+ }
+ break;
+ }
+ bremfree(bp);
+ bp->b_flags |= B_BUSY;
+ /*
+ * XXX Since there are no node locks for NFS, I
+ * believe there is a slight chance that a delayed
+ * write will occur while sleeping just above, so
+ * check for it.
+ */
+ if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
+ (void) VOP_BWRITE(bp);
+ break;
+ }
+ bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF);
+ brelse(bp);
+ }
+ }
+
+ while (vp->v_numoutput > 0) {
+ vp->v_flag |= VBWAIT;
+ tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
+ }
+
+ splx(s);
+
+ /*
+ * Destroy the copy in the VM cache, too.
+ */
+ object = vp->v_object;
+ if (object != NULL) {
+ vm_object_page_remove(object, 0, object->size,
+ (flags & V_SAVE) ? TRUE : FALSE);
+ }
+ if (!(flags & V_SAVEMETA) &&
+ (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
+ panic("vinvalbuf: flush failed");
+ return (0);
+}
+
+/*
+ * Associate a buffer with a vnode.
+ */
+void
+bgetvp(vp, bp)
+ register struct vnode *vp;
+ register struct buf *bp;
+{
+ int s;
+
+ if (bp->b_vp)
+ panic("bgetvp: not free");
+ VHOLD(vp);
+ bp->b_vp = vp;
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ bp->b_dev = vp->v_rdev;
+ else
+ bp->b_dev = NODEV;
+ /*
+ * Insert onto list for new vnode.
+ */
+ s = splbio();
+ bufinsvn(bp, &vp->v_cleanblkhd);
+ splx(s);
+}
+
+/*
+ * Disassociate a buffer from a vnode.
+ */
+void
+brelvp(bp)
+ register struct buf *bp;
+{
+ struct vnode *vp;
+ int s;
+
+ if (bp->b_vp == (struct vnode *) 0)
+ panic("brelvp: NULL");
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ s = splbio();
+ if (bp->b_vnbufs.le_next != NOLIST)
+ bufremvn(bp);
+ splx(s);
+
+ vp = bp->b_vp;
+ bp->b_vp = (struct vnode *) 0;
+ HOLDRELE(vp);
+}
+
+/*
+ * Associate a p-buffer with a vnode.
+ */
+void
+pbgetvp(vp, bp)
+ register struct vnode *vp;
+ register struct buf *bp;
+{
+#if defined(DIAGNOSTIC)
+ if (bp->b_vp)
+ panic("pbgetvp: not free");
+#endif
+ bp->b_vp = vp;
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ bp->b_dev = vp->v_rdev;
+ else
+ bp->b_dev = NODEV;
+}
+
+/*
+ * Disassociate a p-buffer from a vnode.
+ */
+void
+pbrelvp(bp)
+ register struct buf *bp;
+{
+ struct vnode *vp;
+
+#if defined(DIAGNOSTIC)
+ if (bp->b_vp == (struct vnode *) 0)
+ panic("pbrelvp: NULL");
+#endif
+
+ bp->b_vp = (struct vnode *) 0;
+}
+
+/*
+ * Reassign a buffer from one vnode to another.
+ * Used to assign file specific control information
+ * (indirect blocks) to the vnode to which they belong.
+ */
+void
+reassignbuf(bp, newvp)
+ register struct buf *bp;
+ register struct vnode *newvp;
+{
+ int s;
+
+ if (newvp == NULL) {
+		printf("reassignbuf: NULL\n");
+ return;
+ }
+
+ s = splbio();
+ /*
+ * Delete from old vnode list, if on one.
+ */
+ if (bp->b_vnbufs.le_next != NOLIST)
+ bufremvn(bp);
+ /*
+ * If dirty, put on list of dirty buffers; otherwise insert onto list
+ * of clean buffers.
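+	 * The dirty list is kept sorted by logical block number so that
+	 * buffers can later be flushed in ascending block order.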
+ */
+ if (bp->b_flags & B_DELWRI) {
+ struct buf *tbp;
+
+ tbp = newvp->v_dirtyblkhd.lh_first;
+ if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) {
+ bufinsvn(bp, &newvp->v_dirtyblkhd);
+ } else {
+ while (tbp->b_vnbufs.le_next &&
+ (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) {
+ tbp = tbp->b_vnbufs.le_next;
+ }
+ LIST_INSERT_AFTER(tbp, bp, b_vnbufs);
+ }
+ } else {
+ bufinsvn(bp, &newvp->v_cleanblkhd);
+ }
+ splx(s);
+}
+
+#ifndef DEVFS_ROOT
+/*
+ * Create a vnode for a block device.
+ * Used for root filesystem, argdev, and swap areas.
+ * Also used for memory file system special devices.
+ */
+int
+bdevvp(dev, vpp)
+ dev_t dev;
+ struct vnode **vpp;
+{
+ register struct vnode *vp;
+ struct vnode *nvp;
+ int error;
+
+ if (dev == NODEV)
+ return (0);
+ error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp);
+ if (error) {
+ *vpp = 0;
+ return (error);
+ }
+ vp = nvp;
+ vp->v_type = VBLK;
+ if ((nvp = checkalias(vp, dev, (struct mount *) 0))) {
+ vput(vp);
+ vp = nvp;
+ }
+ *vpp = vp;
+ return (0);
+}
+#endif /* !DEVFS_ROOT */
+
+/*
+ * Check to see if the new vnode represents a special device
+ * for which we already have a vnode (either because of
+ * bdevvp() or because of a different vnode representing
+ * the same block device). If such an alias exists, deallocate
+ * the existing contents and return the aliased vnode. The
+ * caller is responsible for filling it with its new contents.
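+ * Special device vnodes are hashed by device number (speclisth), so
+ * all aliases of a given dev_t live on the same chain.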
+ */
+struct vnode *
+checkalias(nvp, nvp_rdev, mp)
+ register struct vnode *nvp;
+ dev_t nvp_rdev;
+ struct mount *mp;
+{
+ struct proc *p = curproc; /* XXX */
+ struct vnode *vp;
+ struct vnode **vpp;
+
+ if (nvp->v_type != VBLK && nvp->v_type != VCHR)
+ return (NULLVP);
+
+ vpp = &speclisth[SPECHASH(nvp_rdev)];
+loop:
+ simple_lock(&spechash_slock);
+ for (vp = *vpp; vp; vp = vp->v_specnext) {
+ if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
+ continue;
+ /*
+ * Alias, but not in use, so flush it out.
+ */
+ simple_lock(&vp->v_interlock);
+ if (vp->v_usecount == 0) {
+ simple_unlock(&spechash_slock);
+ vgonel(vp, p);
+ goto loop;
+ }
+ if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
+ simple_unlock(&spechash_slock);
+ goto loop;
+ }
+ break;
+ }
+ if (vp == NULL || vp->v_tag != VT_NON) {
+ MALLOC(nvp->v_specinfo, struct specinfo *,
+ sizeof(struct specinfo), M_VNODE, M_WAITOK);
+ nvp->v_rdev = nvp_rdev;
+ nvp->v_hashchain = vpp;
+ nvp->v_specnext = *vpp;
+ nvp->v_specflags = 0;
+ simple_unlock(&spechash_slock);
+ *vpp = nvp;
+ if (vp != NULLVP) {
+ nvp->v_flag |= VALIASED;
+ vp->v_flag |= VALIASED;
+ vput(vp);
+ }
+ return (NULLVP);
+ }
+ simple_unlock(&spechash_slock);
+ VOP_UNLOCK(vp, 0, p);
+ simple_lock(&vp->v_interlock);
+ vclean(vp, 0, p);
+ vp->v_op = nvp->v_op;
+ vp->v_tag = nvp->v_tag;
+ nvp->v_type = VNON;
+ insmntque(vp, mp);
+ return (vp);
+}
+
+/*
+ * Grab a particular vnode from the free list, increment its
+ * reference count and lock it. The vnode lock bit is set if the
+ * vnode is being eliminated in vgone. The process is awakened
+ * when the transition is completed, and an error is returned to
+ * indicate that the vnode is no longer usable (possibly having
+ * been changed to a new file system type).
+ */
+int
+vget(vp, flags, p)
+ register struct vnode *vp;
+ int flags;
+ struct proc *p;
+{
+ int error;
+
+ /*
+ * If the vnode is in the process of being cleaned out for
+ * another use, we wait for the cleaning to finish and then
+ * return failure. Cleaning is determined by checking that
+ * the VXLOCK flag is set.
+ */
+ if ((flags & LK_INTERLOCK) == 0) {
+ simple_lock(&vp->v_interlock);
+ }
+ if (vp->v_flag & VXLOCK) {
+ vp->v_flag |= VXWANT;
+ simple_unlock(&vp->v_interlock);
+ tsleep((caddr_t)vp, PINOD, "vget", 0);
+ return (ENOENT);
+ }
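+	/*
+	 * A use count of zero means the vnode is on the free list;
+	 * remove it before granting the new reference.
+	 */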
+ if (vp->v_usecount == 0) {
+ simple_lock(&vnode_free_list_slock);
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ simple_unlock(&vnode_free_list_slock);
+ freevnodes--;
+ }
+ vp->v_usecount++;
+ /*
+ * Create the VM object, if needed
+ */
+ if ((vp->v_type == VREG) &&
+ ((vp->v_object == NULL) ||
+ (vp->v_object->flags & OBJ_VFS_REF) == 0)) {
+ /*
+ * XXX vfs_object_create probably needs the interlock.
+ */
+ simple_unlock(&vp->v_interlock);
+ vfs_object_create(vp, curproc, curproc->p_ucred, 0);
+ simple_lock(&vp->v_interlock);
+ }
+ if (flags & LK_TYPE_MASK) {
+ if (error = vn_lock(vp, flags | LK_INTERLOCK, p))
+ vrele(vp);
+ return (error);
+ }
+ simple_unlock(&vp->v_interlock);
+ return (0);
+}
+
+/*
+ * Stubs to use when there is no locking to be done on the underlying object.
+ * A minimal shared lock is necessary to ensure that the underlying object
+ * is not revoked while an operation is in progress. So, an active shared
+ * count is maintained in an auxiliary vnode lock structure.
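+ * Exclusive lock requests are therefore mapped down to shared locks
+ * in the (currently disabled) lock manager path below.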
+ */
+int
+vop_nolock(ap)
+ struct vop_lock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ struct proc *a_p;
+ } */ *ap;
+{
+#ifdef notyet
+ /*
+ * This code cannot be used until all the non-locking filesystems
+ * (notably NFS) are converted to properly lock and release nodes.
+ * Also, certain vnode operations change the locking state within
+ * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
+ * and symlink). Ideally these operations should not change the
+ * lock state, but should be changed to let the caller of the
+ * function unlock them. Otherwise all intermediate vnode layers
+ * (such as union, umapfs, etc) must catch these functions to do
+ * the necessary locking at their layer. Note that the inactive
+ * and lookup operations also change their lock state, but this
+ * cannot be avoided, so these two operations will always need
+ * to be handled in intermediate layers.
+ */
+ struct vnode *vp = ap->a_vp;
+ int vnflags, flags = ap->a_flags;
+
+ if (vp->v_vnlock == NULL) {
+ if ((flags & LK_TYPE_MASK) == LK_DRAIN)
+ return (0);
+ MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock),
+ M_VNODE, M_WAITOK);
+ lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
+ }
+ switch (flags & LK_TYPE_MASK) {
+ case LK_DRAIN:
+ vnflags = LK_DRAIN;
+ break;
+ case LK_EXCLUSIVE:
+ case LK_SHARED:
+ vnflags = LK_SHARED;
+ break;
+ case LK_UPGRADE:
+ case LK_EXCLUPGRADE:
+ case LK_DOWNGRADE:
+ return (0);
+ case LK_RELEASE:
+ default:
+ panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
+ }
+ if (flags & LK_INTERLOCK)
+ vnflags |= LK_INTERLOCK;
+ return(lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
+#else /* for now */
+ /*
+ * Since we are not using the lock manager, we must clear
+ * the interlock here.
+ */
+ if (ap->a_flags & LK_INTERLOCK) {
+ simple_unlock(&ap->a_vp->v_interlock);
+ }
+ return (0);
+#endif
+}
+
+/*
+ * Do the inverse of vop_nolock, handling the interlock in a compatible way.
+ */
+int
+vop_nounlock(ap)
+ struct vop_unlock_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ struct proc *a_p;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ if (vp->v_vnlock == NULL) {
+ if (ap->a_flags & LK_INTERLOCK)
+ simple_unlock(&ap->a_vp->v_interlock);
+ return (0);
+ }
+ return (lockmgr(vp->v_vnlock, LK_RELEASE | ap->a_flags,
+ &ap->a_vp->v_interlock, ap->a_p));
+}
+
+/*
+ * Return whether or not the node is in use.
+ */
+int
+vop_noislocked(ap)
+ struct vop_islocked_args /* {
+ struct vnode *a_vp;
+ } */ *ap;
+{
+ struct vnode *vp = ap->a_vp;
+
+ if (vp->v_vnlock == NULL)
+ return (0);
+ return (lockstatus(vp->v_vnlock));
+}
+
+/* #ifdef DIAGNOSTIC */
+/*
+ * Vnode reference, just increment the count
+ */
+void
+vref(vp)
+ struct vnode *vp;
+{
+ simple_lock(&vp->v_interlock);
+ if (vp->v_usecount <= 0)
+ panic("vref used where vget required");
+
+ vp->v_usecount++;
+
+ if ((vp->v_type == VREG) &&
+ ((vp->v_object == NULL) ||
+ ((vp->v_object->flags & OBJ_VFS_REF) == 0)) ) {
+ /*
+		 * We need to keep VP locked while the object is
+		 * created. This is necessary to keep the system
+		 * from re-entrantly creating it multiple times.
+ * XXX vfs_object_create probably needs the interlock?
+ */
+ simple_unlock(&vp->v_interlock);
+ vfs_object_create(vp, curproc, curproc->p_ucred, 0);
+ return;
+ }
+ simple_unlock(&vp->v_interlock);
+}
+
+/*
+ * Vnode put/release.
+ * If count drops to zero, call inactive routine and return to freelist.
+ */
+void
+vputrele(vp, put)
+ struct vnode *vp;
+ int put;
+{
+ struct proc *p = curproc; /* XXX */
+
+#ifdef DIAGNOSTIC
+ if (vp == NULL)
+ panic("vputrele: null vp");
+#endif
+ simple_lock(&vp->v_interlock);
+ vp->v_usecount--;
+
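+	/*
+	 * If the only remaining reference is held by the vnode's VM
+	 * object, drop the object's VFS reference so that the object
+	 * can be reclaimed.
+	 */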
+ if ((vp->v_usecount == 1) &&
+ vp->v_object &&
+ (vp->v_object->flags & OBJ_VFS_REF)) {
+ vp->v_object->flags &= ~OBJ_VFS_REF;
+ if (put) {
+ VOP_UNLOCK(vp, LK_INTERLOCK, p);
+ } else {
+ simple_unlock(&vp->v_interlock);
+ }
+ vm_object_deallocate(vp->v_object);
+ return;
+ }
+
+ if (vp->v_usecount > 0) {
+ if (put) {
+ VOP_UNLOCK(vp, LK_INTERLOCK, p);
+ } else {
+ simple_unlock(&vp->v_interlock);
+ }
+ return;
+ }
+
+ if (vp->v_usecount < 0) {
+#ifdef DIAGNOSTIC
+ vprint("vputrele: negative ref count", vp);
+#endif
+ panic("vputrele: negative ref cnt");
+ }
+ simple_lock(&vnode_free_list_slock);
+ if (vp->v_flag & VAGE) {
+ vp->v_flag &= ~VAGE;
+ vp->v_usage = 0;
+		if (vp->v_tag != VT_TFS)
+ TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+ } else {
+		if (vp->v_tag != VT_TFS)
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ }
+ freevnodes++;
+ simple_unlock(&vnode_free_list_slock);
+
+ /*
+ * If we are doing a vput, the node is already locked, and we must
+ * call VOP_INACTIVE with the node locked. So, in the case of
+ * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
+ */
+ if (put) {
+ simple_unlock(&vp->v_interlock);
+ VOP_INACTIVE(vp, p);
+ } else if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
+ VOP_INACTIVE(vp, p);
+ }
+}
+
+/*
+ * vput(), just unlock and vrele()
+ */
+void
+vput(vp)
+ struct vnode *vp;
+{
+ vputrele(vp, 1);
+}
+
+void
+vrele(vp)
+ struct vnode *vp;
+{
+ vputrele(vp, 0);
+}
+
+#ifdef DIAGNOSTIC
+/*
+ * Page or buffer structure gets a reference.
+ */
+void
+vhold(vp)
+ register struct vnode *vp;
+{
+
+ simple_lock(&vp->v_interlock);
+ vp->v_holdcnt++;
+ simple_unlock(&vp->v_interlock);
+}
+
+/*
+ * Page or buffer structure frees a reference.
+ */
+void
+holdrele(vp)
+ register struct vnode *vp;
+{
+
+ simple_lock(&vp->v_interlock);
+ if (vp->v_holdcnt <= 0)
+ panic("holdrele: holdcnt");
+ vp->v_holdcnt--;
+ simple_unlock(&vp->v_interlock);
+}
+#endif /* DIAGNOSTIC */
+
+/*
+ * Remove any vnodes in the vnode table belonging to mount point mp.
+ *
+ * If MNT_NOFORCE is specified, there should not be any active vnodes;
+ * an error is returned if any are found (nb: this is a user error, not
+ * a system error). If MNT_FORCE is specified, detach any active vnodes
+ * that are found.
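+ * Returns 0 on success, or EBUSY if any busy vnodes remain.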
+ */
+#ifdef DIAGNOSTIC
+static int busyprt = 0; /* print out busy vnodes */
+SYSCTL_INT(_debug, 1, busyprt, CTLFLAG_RW, &busyprt, 0, "");
+#endif
+
+int
+vflush(mp, skipvp, flags)
+ struct mount *mp;
+ struct vnode *skipvp;
+ int flags;
+{
+ struct proc *p = curproc; /* XXX */
+ struct vnode *vp, *nvp;
+ int busy = 0;
+
+ simple_lock(&mntvnode_slock);
+loop:
+ for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
+ /*
+ * Make sure this vnode wasn't reclaimed in getnewvnode().
+ * Start over if it has (it won't be on the list anymore).
+ */
+ if (vp->v_mount != mp)
+ goto loop;
+ nvp = vp->v_mntvnodes.le_next;
+ /*
+ * Skip over a selected vnode.
+ */
+ if (vp == skipvp)
+ continue;
+
+ simple_lock(&vp->v_interlock);
+ /*
+		 * Skip over vnodes marked VSYSTEM.
+ */
+ if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
+ simple_unlock(&vp->v_interlock);
+ continue;
+ }
+ /*
+ * If WRITECLOSE is set, only flush out regular file vnodes
+ * open for writing.
+ */
+ if ((flags & WRITECLOSE) &&
+ (vp->v_writecount == 0 || vp->v_type != VREG)) {
+ simple_unlock(&vp->v_interlock);
+ continue;
+ }
+
+ if (vp->v_object && (vp->v_object->flags & OBJ_VFS_REF)) {
+ simple_unlock(&vp->v_interlock);
+ simple_unlock(&mntvnode_slock);
+ vm_object_reference(vp->v_object);
+ pager_cache(vp->v_object, FALSE);
+ vp->v_object->flags &= ~OBJ_VFS_REF;
+ vm_object_deallocate(vp->v_object);
+ simple_lock(&mntvnode_slock);
+ simple_lock(&vp->v_interlock);
+ }
+
+ /*
+ * With v_usecount == 0, all we need to do is clear out the
+ * vnode data structures and we are done.
+ */
+ if (vp->v_usecount == 0) {
+ simple_unlock(&mntvnode_slock);
+ vgonel(vp, p);
+ simple_lock(&mntvnode_slock);
+ continue;
+ }
+
+ /*
+ * If FORCECLOSE is set, forcibly close the vnode. For block
+ * or character devices, revert to an anonymous device. For
+ * all other files, just kill them.
+ */
+ if (flags & FORCECLOSE) {
+ simple_unlock(&mntvnode_slock);
+ if (vp->v_type != VBLK && vp->v_type != VCHR) {
+ vgonel(vp, p);
+ } else {
+ vclean(vp, 0, p);
+ vp->v_op = spec_vnodeop_p;
+ insmntque(vp, (struct mount *) 0);
+ }
+ simple_lock(&mntvnode_slock);
+ continue;
+ }
+#ifdef DIAGNOSTIC
+ if (busyprt)
+ vprint("vflush: busy vnode", vp);
+#endif
+ simple_unlock(&vp->v_interlock);
+ busy++;
+ }
+ simple_unlock(&mntvnode_slock);
+ if (busy)
+ return (EBUSY);
+ return (0);
+}
+
+/*
+ * Disassociate the underlying file system from a vnode.
+ */
+static void
+vclean(struct vnode *vp, int flags, struct proc *p)
+{
+ int active;
+
+ /*
+	 * Check to see if the vnode is in use. If so, we have to reference it
+ * before we clean it out so that its count cannot fall to zero and
+ * generate a race against ourselves to recycle it.
+ */
+ if ((active = vp->v_usecount))
+ vp->v_usecount++;
+ /*
+ * Prevent the vnode from being recycled or brought into use while we
+ * clean it out.
+ */
+ if (vp->v_flag & VXLOCK)
+ panic("vclean: deadlock");
+ vp->v_flag |= VXLOCK;
+ /*
+ * Even if the count is zero, the VOP_INACTIVE routine may still
+ * have the object locked while it cleans it out. The VOP_LOCK
+ * ensures that the VOP_INACTIVE routine is done with its work.
+ * For active vnodes, it ensures that no other activity can
+ * occur while the underlying object is being cleaned out.
+ */
+ VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
+ /*
+ * Clean out any buffers associated with the vnode.
+ */
+ if (flags & DOCLOSE)
+ vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
+ /*
+ * If purging an active vnode, it must be closed and
+ * deactivated before being reclaimed. Note that the
+ * VOP_INACTIVE will unlock the vnode.
+ */
+ if (active) {
+ if (flags & DOCLOSE)
+ VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
+ VOP_INACTIVE(vp, p);
+ } else {
+ /*
+ * Any other processes trying to obtain this lock must first
+ * wait for VXLOCK to clear, then call the new lock operation.
+ */
+ VOP_UNLOCK(vp, 0, p);
+ }
+ /*
+ * Reclaim the vnode.
+ */
+ if (VOP_RECLAIM(vp, p))
+ panic("vclean: cannot reclaim");
+ if (active)
+ vrele(vp);
+ cache_purge(vp);
+ if (vp->v_vnlock) {
+ if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
+ vprint("vclean: lock not drained", vp);
+ FREE(vp->v_vnlock, M_VNODE);
+ vp->v_vnlock = NULL;
+ }
+
+ /*
+ * Done with purge, notify sleepers of the grim news.
+ */
+ vp->v_op = dead_vnodeop_p;
+ vp->v_tag = VT_NON;
+ vp->v_flag &= ~VXLOCK;
+ if (vp->v_flag & VXWANT) {
+ vp->v_flag &= ~VXWANT;
+ wakeup((caddr_t) vp);
+ }
+}
+
+/*
+ * Eliminate all activity associated with the requested vnode
+ * and with all vnodes aliased to the requested vnode.
+ */
+int
+vop_revoke(ap)
+ struct vop_revoke_args /* {
+ struct vnode *a_vp;
+ int a_flags;
+ } */ *ap;
+{
+ struct vnode *vp, *vq;
+ struct proc *p = curproc; /* XXX */
+
+#ifdef DIAGNOSTIC
+ if ((ap->a_flags & REVOKEALL) == 0)
+ panic("vop_revoke");
+#endif
+
+ vp = ap->a_vp;
+ simple_lock(&vp->v_interlock);
+
+ if (vp->v_flag & VALIASED) {
+ /*
+ * If a vgone (or vclean) is already in progress,
+ * wait until it is done and return.
+ */
+ if (vp->v_flag & VXLOCK) {
+ vp->v_flag |= VXWANT;
+ simple_unlock(&vp->v_interlock);
+ tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0);
+ return (0);
+ }
+ /*
+ * Ensure that vp will not be vgone'd while we
+ * are eliminating its aliases.
+ */
+ vp->v_flag |= VXLOCK;
+ simple_unlock(&vp->v_interlock);
+ while (vp->v_flag & VALIASED) {
+ simple_lock(&spechash_slock);
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_rdev != vp->v_rdev ||
+ vq->v_type != vp->v_type || vp == vq)
+ continue;
+ simple_unlock(&spechash_slock);
+ vgone(vq);
+ break;
+ }
+ if (vq == NULLVP) {
+ simple_unlock(&spechash_slock);
+ }
+ }
+ /*
+		 * Remove the lock so that vgone below will
+		 * really eliminate the vnode, after which
+		 * vgone will awaken any sleepers.
+ */
+ simple_lock(&vp->v_interlock);
+ vp->v_flag &= ~VXLOCK;
+ }
+ vgonel(vp, p);
+ return (0);
+}
+
+/*
+ * Recycle an unused vnode to the front of the free list.
+ * Release the passed interlock if the vnode will be recycled.
+ */
+int
+vrecycle(vp, inter_lkp, p)
+ struct vnode *vp;
+ struct simplelock *inter_lkp;
+ struct proc *p;
+{
+
+ simple_lock(&vp->v_interlock);
+ if (vp->v_usecount == 0) {
+ if (inter_lkp) {
+ simple_unlock(inter_lkp);
+ }
+ vgonel(vp, p);
+ return (1);
+ }
+ simple_unlock(&vp->v_interlock);
+ return (0);
+}
+
+/*
+ * Eliminate all activity associated with a vnode
+ * in preparation for reuse.
+ */
+void
+vgone(vp)
+ register struct vnode *vp;
+{
+ struct proc *p = curproc; /* XXX */
+
+ simple_lock(&vp->v_interlock);
+ vgonel(vp, p);
+}
+
+/*
+ * vgone, with the vp interlock held.
+ */
+void
+vgonel(vp, p)
+ struct vnode *vp;
+ struct proc *p;
+{
+ struct vnode *vq;
+ struct vnode *vx;
+
+ /*
+ * If a vgone (or vclean) is already in progress,
+ * wait until it is done and return.
+ */
+ if (vp->v_flag & VXLOCK) {
+ vp->v_flag |= VXWANT;
+ simple_unlock(&vp->v_interlock);
+ tsleep((caddr_t)vp, PINOD, "vgone", 0);
+ return;
+ }
+
+ if (vp->v_object) {
+ vp->v_object->flags |= OBJ_VNODE_GONE;
+ }
+
+ /*
+ * Clean out the filesystem specific data.
+ */
+ vclean(vp, DOCLOSE, p);
+ /*
+ * Delete from old mount point vnode list, if on one.
+ */
+ if (vp->v_mount != NULL)
+ insmntque(vp, (struct mount *)0);
+ /*
+ * If special device, remove it from special device alias list
+ * if it is on one.
+ */
+ if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
+ simple_lock(&spechash_slock);
+ if (*vp->v_hashchain == vp) {
+ *vp->v_hashchain = vp->v_specnext;
+ } else {
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_specnext != vp)
+ continue;
+ vq->v_specnext = vp->v_specnext;
+ break;
+ }
+ if (vq == NULL)
+ panic("missing bdev");
+ }
+ if (vp->v_flag & VALIASED) {
+ vx = NULL;
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_rdev != vp->v_rdev ||
+ vq->v_type != vp->v_type)
+ continue;
+ if (vx)
+ break;
+ vx = vq;
+ }
+ if (vx == NULL)
+ panic("missing alias");
+ if (vq == NULL)
+ vx->v_flag &= ~VALIASED;
+ vp->v_flag &= ~VALIASED;
+ }
+ simple_unlock(&spechash_slock);
+ FREE(vp->v_specinfo, M_VNODE);
+ vp->v_specinfo = NULL;
+ }
+
+ /*
+ * If it is on the freelist and not already at the head,
+ * move it to the head of the list. The test of the back
+ * pointer and the reference count of zero is because
+ * it will be removed from the free list by getnewvnode,
+ * but will not have its reference count incremented until
+ * after calling vgone. If the reference count were
+ * incremented first, vgone would (incorrectly) try to
+ * close the previous instance of the underlying object.
+ * So, the back pointer is explicitly set to `0xdeadb' in
+ * getnewvnode after removing it from the freelist to ensure
+ * that we do not try to move it here.
+ */
+ if (vp->v_usecount == 0) {
+ simple_lock(&vnode_free_list_slock);
+ if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
+ vnode_free_list.tqh_first != vp) {
+ TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+ }
+ simple_unlock(&vnode_free_list_slock);
+ }
+
+ vp->v_type = VBAD;
+}
+
+/*
+ * Lookup a vnode by device number.
+ */
+int
+vfinddev(dev, type, vpp)
+ dev_t dev;
+ enum vtype type;
+ struct vnode **vpp;
+{
+ register struct vnode *vp;
+ int rc = 0;
+
+ simple_lock(&spechash_slock);
+ for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
+ if (dev != vp->v_rdev || type != vp->v_type)
+ continue;
+ *vpp = vp;
+ rc = 1;
+ break;
+ }
+ simple_unlock(&spechash_slock);
+ return (rc);
+}
+
+/*
+ * Calculate the total number of references to a special device.
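+ * All aliases on the spec hash chain are visited and their use counts
+ * summed; unreferenced aliases are flushed out along the way.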
+ */
+int
+vcount(vp)
+ register struct vnode *vp;
+{
+ struct vnode *vq, *vnext;
+ int count;
+
+loop:
+ if ((vp->v_flag & VALIASED) == 0)
+ return (vp->v_usecount);
+ simple_lock(&spechash_slock);
+ for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
+ vnext = vq->v_specnext;
+ if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
+ continue;
+ /*
+ * Alias, but not in use, so flush it out.
+ */
+ if (vq->v_usecount == 0 && vq != vp) {
+ simple_unlock(&spechash_slock);
+ vgone(vq);
+ goto loop;
+ }
+ count += vq->v_usecount;
+ }
+ simple_unlock(&spechash_slock);
+ return (count);
+}
+
+/*
+ * Print out a description of a vnode.
+ */
+static char *typename[] =
+{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
+
+void
+vprint(label, vp)
+ char *label;
+ register struct vnode *vp;
+{
+ char buf[64];
+
+ if (label != NULL)
+ printf("%s: ", label);
+ printf("type %s, usecount %d, writecount %d, refcount %ld,",
+ typename[vp->v_type], vp->v_usecount, vp->v_writecount,
+ vp->v_holdcnt);
+ buf[0] = '\0';
+ if (vp->v_flag & VROOT)
+ strcat(buf, "|VROOT");
+ if (vp->v_flag & VTEXT)
+ strcat(buf, "|VTEXT");
+ if (vp->v_flag & VSYSTEM)
+ strcat(buf, "|VSYSTEM");
+ if (vp->v_flag & VXLOCK)
+ strcat(buf, "|VXLOCK");
+ if (vp->v_flag & VXWANT)
+ strcat(buf, "|VXWANT");
+ if (vp->v_flag & VBWAIT)
+ strcat(buf, "|VBWAIT");
+ if (vp->v_flag & VALIASED)
+ strcat(buf, "|VALIASED");
+ if (buf[0] != '\0')
+ printf(" flags (%s)", &buf[1]);
+ if (vp->v_data == NULL) {
+ printf("\n");
+ } else {
+ printf("\n\t");
+ VOP_PRINT(vp);
+ }
+}
+
+#ifdef DDB
+/*
+ * List all of the locked vnodes in the system.
+ * Called when debugging the kernel.
+ */
+void
+printlockedvnodes()
+{
+ struct proc *p = curproc; /* XXX */
+ struct mount *mp, *nmp;
+ struct vnode *vp;
+
+ printf("Locked vnodes\n");
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
+ nmp = mp->mnt_list.cqe_next;
+ continue;
+ }
+ for (vp = mp->mnt_vnodelist.lh_first;
+ vp != NULL;
+ vp = vp->v_mntvnodes.le_next) {
+ if (VOP_ISLOCKED(vp))
+ vprint((char *)0, vp);
+ }
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ }
+ simple_unlock(&mountlist_slock);
+}
+#endif
+
+/*
+ * Top-level filesystem-related information gathering.
+ */
+static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
+
+static int
+vfs_sysctl SYSCTL_HANDLER_ARGS
+{
+ int *name = (int *)arg1 - 1; /* XXX */
+ u_int namelen = arg2 + 1; /* XXX */
+ struct vfsconf *vfsp;
+
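+	/*
+	 * name/namelen were stepped back one level above so that name[0]
+	 * is the filesystem type number (VFS_GENERIC for this node) and
+	 * name[1] selects the request.
+	 */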
+#ifndef NO_COMPAT_PRELITE2
+ /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
+ if (namelen == 1)
+ return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
+#endif
+
+#ifdef notyet
+ /* all sysctl names at this level are at least name and field */
+ if (namelen < 2)
+ return (ENOTDIR); /* overloaded */
+ if (name[0] != VFS_GENERIC) {
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (vfsp->vfc_typenum == name[0])
+ break;
+ if (vfsp == NULL)
+ return (EOPNOTSUPP);
+ return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
+ oldp, oldlenp, newp, newlen, p));
+ }
+#endif
+ switch (name[1]) {
+ case VFS_MAXTYPENUM:
+ if (namelen != 2)
+ return (ENOTDIR);
+ return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
+ case VFS_CONF:
+ if (namelen != 3)
+ return (ENOTDIR); /* overloaded */
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (vfsp->vfc_typenum == name[2])
+ break;
+ if (vfsp == NULL)
+ return (EOPNOTSUPP);
+ return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
+ }
+ return (EOPNOTSUPP);
+}
+
+SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
+ "Generic filesystem");
+
+#ifndef NO_COMPAT_PRELITE2
+
+static int
+sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
+{
+ int error;
+ struct vfsconf *vfsp;
+ struct ovfsconf ovfs;
+
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
+ ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
+ strcpy(ovfs.vfc_name, vfsp->vfc_name);
+ ovfs.vfc_index = vfsp->vfc_typenum;
+ ovfs.vfc_refcount = vfsp->vfc_refcount;
+ ovfs.vfc_flags = vfsp->vfc_flags;
+ error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
+ if (error)
+ return error;
+ }
+ return 0;
+}
+
+#endif /* !NO_COMPAT_PRELITE2 */
+
+int kinfo_vdebug = 1;
+int kinfo_vgetfailed;
+
+#define KINFO_VNODESLOP 10
+/*
+ * Dump vnode list (via sysctl).
+ * Copyout address of vnode followed by vnode.
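+ * The length estimate pads numvnodes with KINFO_VNODESLOP entries to
+ * allow for vnodes created while the copy is in progress.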
+ */
+/* ARGSUSED */
+static int
+sysctl_vnode SYSCTL_HANDLER_ARGS
+{
+ struct proc *p = curproc; /* XXX */
+ struct mount *mp, *nmp;
+ struct vnode *nvp, *vp;
+ int error;
+
+#define VPTRSZ sizeof (struct vnode *)
+#define VNODESZ sizeof (struct vnode)
+
+ req->lock = 0;
+ if (!req->oldptr) /* Make an estimate */
+ return (SYSCTL_OUT(req, 0,
+ (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
+
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
+ nmp = mp->mnt_list.cqe_next;
+ continue;
+ }
+again:
+ simple_lock(&mntvnode_slock);
+ for (vp = mp->mnt_vnodelist.lh_first;
+ vp != NULL;
+ vp = nvp) {
+ /*
+ * Check that the vp is still associated with
+ * this filesystem. RACE: could have been
+ * recycled onto the same filesystem.
+ */
+ if (vp->v_mount != mp) {
+ simple_unlock(&mntvnode_slock);
+ if (kinfo_vdebug)
+ printf("kinfo: vp changed\n");
+ goto again;
+ }
+ nvp = vp->v_mntvnodes.le_next;
+ simple_unlock(&mntvnode_slock);
+ if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
+ (error = SYSCTL_OUT(req, vp, VNODESZ)))
+ return (error);
+ simple_lock(&mntvnode_slock);
+ }
+ simple_unlock(&mntvnode_slock);
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ }
+ simple_unlock(&mountlist_slock);
+
+ return (0);
+}
+
+SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, 0, sysctl_vnode, "S,vnode", "");
+
+/*
+ * Check to see if a filesystem is mounted on a block device.
+ */
+int
+vfs_mountedon(vp)
+ struct vnode *vp;
+{
+ struct vnode *vq;
+ int error = 0;
+
+ if (vp->v_specflags & SI_MOUNTEDON)
+ return (EBUSY);
+ if (vp->v_flag & VALIASED) {
+ simple_lock(&spechash_slock);
+ for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
+ if (vq->v_rdev != vp->v_rdev ||
+ vq->v_type != vp->v_type)
+ continue;
+ if (vq->v_specflags & SI_MOUNTEDON) {
+ error = EBUSY;
+ break;
+ }
+ }
+ simple_unlock(&spechash_slock);
+ }
+ return (error);
+}
+
+/*
+ * Unmount all filesystems. The list is traversed in reverse order
+ * of mounting to avoid dependencies.
+ */
+void
+vfs_unmountall()
+{
+ struct mount *mp, *nmp;
+ struct proc *p = initproc; /* XXX XXX should this be proc0? */
+ int error;
+
+ /*
+ * Since this only runs when rebooting, it is not interlocked.
+ */
+ for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
+ nmp = mp->mnt_list.cqe_prev;
+ error = dounmount(mp, MNT_FORCE, p);
+ if (error) {
+ printf("unmount of %s failed (",
+ mp->mnt_stat.f_mntonname);
+ if (error == EBUSY)
+ printf("BUSY)\n");
+ else
+ printf("%d)\n", error);
+ }
+ }
+}
+
+/*
+ * Build hash lists of net addresses and hang them off the mount point.
+ * Called by ufs_mount() to set up the lists of export addresses.
+ */
+static int
+vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
+ struct export_args *argp)
+{
+ register struct netcred *np;
+ register struct radix_node_head *rnh;
+ register int i;
+ struct radix_node *rn;
+ struct sockaddr *saddr, *smask = 0;
+ struct domain *dom;
+ int error;
+
+ if (argp->ex_addrlen == 0) {
+ if (mp->mnt_flag & MNT_DEFEXPORTED)
+ return (EPERM);
+ np = &nep->ne_defexported;
+ np->netc_exflags = argp->ex_flags;
+ np->netc_anon = argp->ex_anon;
+ np->netc_anon.cr_ref = 1;
+ mp->mnt_flag |= MNT_DEFEXPORTED;
+ return (0);
+ }
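+	/*
+	 * The netcred, the export address, and the mask are allocated as
+	 * one contiguous block; saddr and smask point into it.
+	 */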
+ i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
+ np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
+ bzero((caddr_t) np, i);
+ saddr = (struct sockaddr *) (np + 1);
+ if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
+ goto out;
+ if (saddr->sa_len > argp->ex_addrlen)
+ saddr->sa_len = argp->ex_addrlen;
+ if (argp->ex_masklen) {
+ smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
+		error = copyin(argp->ex_mask, (caddr_t) smask, argp->ex_masklen);
+ if (error)
+ goto out;
+ if (smask->sa_len > argp->ex_masklen)
+ smask->sa_len = argp->ex_masklen;
+ }
+ i = saddr->sa_family;
+ if ((rnh = nep->ne_rtable[i]) == 0) {
+ /*
+		 * Seems silly to initialize every AF when most are not used,
+		 * so do so on demand here.
+ */
+ for (dom = domains; dom; dom = dom->dom_next)
+ if (dom->dom_family == i && dom->dom_rtattach) {
+ dom->dom_rtattach((void **) &nep->ne_rtable[i],
+ dom->dom_rtoffset);
+ break;
+ }
+ if ((rnh = nep->ne_rtable[i]) == 0) {
+ error = ENOBUFS;
+ goto out;
+ }
+ }
+ rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
+ np->netc_rnodes);
+ if (rn == 0 || np != (struct netcred *) rn) { /* already exists */
+ error = EPERM;
+ goto out;
+ }
+ np->netc_exflags = argp->ex_flags;
+ np->netc_anon = argp->ex_anon;
+ np->netc_anon.cr_ref = 1;
+ return (0);
+out:
+ free(np, M_NETADDR);
+ return (error);
+}
+
+/* ARGSUSED */
+static int
+vfs_free_netcred(struct radix_node *rn, void *w)
+{
+ register struct radix_node_head *rnh = (struct radix_node_head *) w;
+
+ (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
+ free((caddr_t) rn, M_NETADDR);
+ return (0);
+}
+
+/*
+ * Free the net address hash lists that are hanging off the mount points.
+ */
+static void
+vfs_free_addrlist(struct netexport *nep)
+{
+ register int i;
+ register struct radix_node_head *rnh;
+
+ for (i = 0; i <= AF_MAX; i++)
+ if ((rnh = nep->ne_rtable[i])) {
+ (*rnh->rnh_walktree) (rnh, vfs_free_netcred,
+ (caddr_t) rnh);
+ free((caddr_t) rnh, M_RTABLE);
+ nep->ne_rtable[i] = 0;
+ }
+}
+
+int
+vfs_export(mp, nep, argp)
+ struct mount *mp;
+ struct netexport *nep;
+ struct export_args *argp;
+{
+ int error;
+
+ if (argp->ex_flags & MNT_DELEXPORT) {
+ vfs_free_addrlist(nep);
+ mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
+ }
+ if (argp->ex_flags & MNT_EXPORTED) {
+ if ((error = vfs_hang_addrlist(mp, nep, argp)))
+ return (error);
+ mp->mnt_flag |= MNT_EXPORTED;
+ }
+ return (0);
+}
+
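+/*
+ * Look up the export credentials that apply to a client address,
+ * falling back to the default export entry when no specific
+ * address match exists.
+ */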
+struct netcred *
+vfs_export_lookup(mp, nep, nam)
+ register struct mount *mp;
+ struct netexport *nep;
+ struct mbuf *nam;
+{
+ register struct netcred *np;
+ register struct radix_node_head *rnh;
+ struct sockaddr *saddr;
+
+ np = NULL;
+ if (mp->mnt_flag & MNT_EXPORTED) {
+ /*
+ * Lookup in the export list first.
+ */
+ if (nam != NULL) {
+ saddr = mtod(nam, struct sockaddr *);
+ rnh = nep->ne_rtable[saddr->sa_family];
+ if (rnh != NULL) {
+ np = (struct netcred *)
+ (*rnh->rnh_matchaddr)((caddr_t)saddr,
+ rnh);
+ if (np && np->netc_rnodes->rn_flags & RNF_ROOT)
+ np = NULL;
+ }
+ }
+ /*
+ * If no address match, use the default if it exists.
+ */
+ if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED)
+ np = &nep->ne_defexported;
+ }
+ return (np);
+}
+
+/*
+ * Perform msync on all vnodes under a mount point.
+ * The mount point must be locked.
+ */
+void
+vfs_msync(struct mount *mp, int flags)
+{
+ struct vnode *vp, *nvp;
+loop:
+ for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
+
+ if (vp->v_mount != mp)
+ goto loop;
+ nvp = vp->v_mntvnodes.le_next;
+ if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT))
+ continue;
+ if (vp->v_object &&
+ (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
+ vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE);
+ }
+ }
+}
+
+/*
+ * Create the VM object needed for VMIO and mmap support. This
+ * is done for all VREG files in the system. Some filesystems can also
+ * take advantage of the additional metadata buffering of the VMIO
+ * code by putting the device node into VMIO mode as well.
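+ * If the existing object is being torn down (OBJ_DEAD), we sleep and
+ * retry until it is gone before creating a new one.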
+ */
+int
+vfs_object_create(vp, p, cred, waslocked)
+ struct vnode *vp;
+ struct proc *p;
+ struct ucred *cred;
+ int waslocked;
+{
+ struct vattr vat;
+ vm_object_t object;
+ int error = 0;
+
+retry:
+ if ((object = vp->v_object) == NULL) {
+ if (vp->v_type == VREG) {
+ if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
+ goto retn;
+ (void) vnode_pager_alloc(vp,
+ OFF_TO_IDX(round_page(vat.va_size)), 0, 0);
+ } else {
+ /*
+ * This simply allocates the biggest object possible
+ * for a VBLK vnode. This should be fixed, but doesn't
+ * cause any problems (yet).
+ */
+ (void) vnode_pager_alloc(vp, INT_MAX, 0, 0);
+ }
+ vp->v_object->flags |= OBJ_VFS_REF;
+ } else {
+ if (object->flags & OBJ_DEAD) {
+ if (waslocked)
+ VOP_UNLOCK(vp, 0, p);
+ tsleep(object, PVM, "vodead", 0);
+ if (waslocked)
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ goto retry;
+ }
+ if ((object->flags & OBJ_VFS_REF) == 0) {
+ object->flags |= OBJ_VFS_REF;
+ vm_object_reference(object);
+ }
+ }
+ if (vp->v_object)
+ vp->v_flag |= VVMIO;
+
+retn:
+ return error;
+}
diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c
new file mode 100644
index 0000000..2997fe5
--- /dev/null
+++ b/sys/kern/vfs_extattr.c
@@ -0,0 +1,2756 @@
+/*
+ * Copyright (c) 1989, 1993
+ * The Regents of the University of California. All rights reserved.
+ * (c) UNIX System Laboratories, Inc.
+ * All or some portions of this file are derived from material licensed
+ * to the University of California by American Telephone and Telegraph
+ * Co. or Unix System Laboratories, Inc. and are reproduced herein with
+ * the permission of UNIX System Laboratories, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94
+ * $Id: vfs_syscalls.c,v 1.60 1997/03/23 03:36:35 bde Exp $
+ */
+
+/*
+ * XXX - The following is required because of some magic done
+ * in getdirentries() below which is only done if the translucent
+ * filesystem `UNION' is compiled into the kernel. This is broken,
+ * but I don't have time to study the code deeply enough to understand
+ * what's going on and determine an appropriate fix. -GAW
+ */
+#include "opt_union.h"
+
+#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
+#include <sys/namei.h>
+#include <sys/filedesc.h>
+#include <sys/kernel.h>
+#include <sys/fcntl.h>
+#include <sys/file.h>
+#include <sys/stat.h>
+#include <sys/unistd.h>
+#include <sys/vnode.h>
+#include <sys/mount.h>
+#include <sys/proc.h>
+#include <sys/uio.h>
+#include <sys/malloc.h>
+#include <sys/dirent.h>
+
+#ifdef UNION
+#include <miscfs/union/union.h>
+#endif
+
+#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <sys/sysctl.h>
+
+static int change_dir __P((struct nameidata *ndp, struct proc *p));
+static void checkdirs __P((struct vnode *olddp));
+
+/*
+ * Virtual File System System Calls
+ */
+
+/*
+ * Mount a file system.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mount_args {
+ char *type;
+ char *path;
+ int flags;
+ caddr_t data;
+};
+#endif
+/* ARGSUSED */
+int
+mount(p, uap, retval)
+ struct proc *p;
+ register struct mount_args /* {
+ syscallarg(char *) type;
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(caddr_t) data;
+ } */ *uap;
+ register_t *retval;
+{
+ struct vnode *vp;
+ struct mount *mp;
+ struct vfsconf *vfsp;
+ int error, flag = 0;
+ struct vattr va;
+ u_long fstypenum;
+ struct nameidata nd;
+ char fstypename[MFSNAMELEN];
+
+ /*
+ * Get vnode to be covered
+ */
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
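+	/*
+	 * MNT_UPDATE modifies an existing mount in place; the vnode
+	 * named must be the root of the mounted filesystem.
+	 */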
+ if (SCARG(uap, flags) & MNT_UPDATE) {
+ if ((vp->v_flag & VROOT) == 0) {
+ vput(vp);
+ return (EINVAL);
+ }
+ mp = vp->v_mount;
+ flag = mp->mnt_flag;
+ /*
+ * We only allow the filesystem to be reloaded if it
+ * is currently mounted read-only.
+ */
+ if ((SCARG(uap, flags) & MNT_RELOAD) &&
+ ((mp->mnt_flag & MNT_RDONLY) == 0)) {
+ vput(vp);
+ return (EOPNOTSUPP); /* Needs translation */
+ }
+ mp->mnt_flag |=
+ SCARG(uap, flags) & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE);
+ /*
+ * Only root, or the user that did the original mount is
+ * permitted to update it.
+ */
+ if (mp->mnt_stat.f_owner != p->p_ucred->cr_uid &&
+ (error = suser(p->p_ucred, &p->p_acflag))) {
+ vput(vp);
+ return (error);
+ }
+ /*
+ * Do not allow NFS export by non-root users. Silently
+ * enforce MNT_NOSUID and MNT_NODEV for non-root users.
+ */
+ if (p->p_ucred->cr_uid != 0) {
+ if (SCARG(uap, flags) & MNT_EXPORTED) {
+ vput(vp);
+ return (EPERM);
+ }
+ SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV;
+ }
+ if (vfs_busy(mp, LK_NOWAIT, 0, p)) {
+ vput(vp);
+ return (EBUSY);
+ }
+ VOP_UNLOCK(vp, 0, p);
+ goto update;
+ }
+ /*
+ * If the user is not root, ensure that they own the directory
+ * onto which we are attempting to mount.
+ */
+ if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) ||
+ (va.va_uid != p->p_ucred->cr_uid &&
+ (error = suser(p->p_ucred, &p->p_acflag)))) {
+ vput(vp);
+ return (error);
+ }
+ /*
+ * Do not allow NFS export by non-root users. Silently
+ * enforce MNT_NOSUID and MNT_NODEV for non-root users.
+ */
+ if (p->p_ucred->cr_uid != 0) {
+ if (SCARG(uap, flags) & MNT_EXPORTED) {
+ vput(vp);
+ return (EPERM);
+ }
+ SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV;
+ }
+ if (error = vinvalbuf(vp, V_SAVE, p->p_ucred, p, 0, 0))
+ return (error);
+ if (vp->v_type != VDIR) {
+ vput(vp);
+ return (ENOTDIR);
+ }
+#ifdef COMPAT_43
+ /*
+ * Historically filesystem types were identified by number. If we
+ * get an integer for the filesystem type instead of a string, we
+ * check to see if it matches one of the historic filesystem types.
+ */
+ fstypenum = (u_long)SCARG(uap, type);
+ if (fstypenum < maxvfsconf) {
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (vfsp->vfc_typenum == fstypenum)
+ break;
+ if (vfsp == NULL) {
+ vput(vp);
+ return (ENODEV);
+ }
+ strncpy(fstypename, vfsp->vfc_name, MFSNAMELEN);
+ } else
+#endif /* COMPAT_43 */
+ if (error = copyinstr(SCARG(uap, type), fstypename, MFSNAMELEN, NULL)) {
+ vput(vp);
+ return (error);
+ }
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
+ if (!strcmp(vfsp->vfc_name, fstypename))
+ break;
+ if (vfsp == NULL) {
+ vput(vp);
+ return (ENODEV);
+ }
+ if (vp->v_mountedhere != NULL) {
+ vput(vp);
+ return (EBUSY);
+ }
+
+ /*
+ * Allocate and initialize the filesystem.
+ */
+ mp = (struct mount *)malloc((u_long)sizeof(struct mount),
+ M_MOUNT, M_WAITOK);
+ bzero((char *)mp, (u_long)sizeof(struct mount));
+ lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0);
+ (void)vfs_busy(mp, LK_NOWAIT, 0, p);
+ mp->mnt_op = vfsp->vfc_vfsops;
+ mp->mnt_vfc = vfsp;
+ vfsp->vfc_refcount++;
+ mp->mnt_stat.f_type = vfsp->vfc_typenum;
+ mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
+ strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
+ vp->v_mountedhere = mp;
+ mp->mnt_vnodecovered = vp;
+ mp->mnt_stat.f_owner = p->p_ucred->cr_uid;
+update:
+ /*
+ * Set the mount level flags.
+ */
+ if (SCARG(uap, flags) & MNT_RDONLY)
+ mp->mnt_flag |= MNT_RDONLY;
+ else if (mp->mnt_flag & MNT_RDONLY)
+ mp->mnt_flag |= MNT_WANTRDWR;
+ mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
+ MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME);
+ mp->mnt_flag |= SCARG(uap, flags) & (MNT_NOSUID | MNT_NOEXEC |
+ MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE |
+ MNT_NOATIME);
+ /*
+ * Mount the filesystem.
+ */
+ error = VFS_MOUNT(mp, SCARG(uap, path), SCARG(uap, data), &nd, p);
+ if (mp->mnt_flag & MNT_UPDATE) {
+ vrele(vp);
+ if (mp->mnt_flag & MNT_WANTRDWR)
+ mp->mnt_flag &= ~MNT_RDONLY;
+ mp->mnt_flag &=~
+ (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_WANTRDWR);
+ if (error)
+ mp->mnt_flag = flag;
+ vfs_unbusy(mp, p);
+ return (error);
+ }
+ /*
+ * Put the new filesystem on the mount list after root.
+ */
+ cache_purge(vp);
+ if (!error) {
+ simple_lock(&mountlist_slock);
+ CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ simple_unlock(&mountlist_slock);
+ checkdirs(vp);
+ VOP_UNLOCK(vp, 0, p);
+ vfs_unbusy(mp, p);
+ if (error = VFS_START(mp, 0, p))
+ vrele(vp);
+ } else {
+ mp->mnt_vnodecovered->v_mountedhere = (struct mount *)0;
+ mp->mnt_vfc->vfc_refcount--;
+ vfs_unbusy(mp, p);
+ free((caddr_t)mp, M_MOUNT);
+ vput(vp);
+ }
+ return (error);
+}
+
+/*
+ * Scan all active processes to see if any of them have a current
+ * or root directory onto which the new filesystem has just been
+ * mounted. If so, replace them with the new mount point.
+ */
+static void
+checkdirs(olddp)
+ struct vnode *olddp;
+{
+ struct filedesc *fdp;
+ struct vnode *newdp;
+ struct proc *p;
+
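+	/*
+	 * If the covered vnode's only reference is the mount point's
+	 * own, no process can have it as a current or root directory.
+	 */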
+ if (olddp->v_usecount == 1)
+ return;
+ if (VFS_ROOT(olddp->v_mountedhere, &newdp))
+ panic("mount: lost mount");
+ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) {
+ fdp = p->p_fd;
+ if (fdp->fd_cdir == olddp) {
+ vrele(fdp->fd_cdir);
+ VREF(newdp);
+ fdp->fd_cdir = newdp;
+ }
+ if (fdp->fd_rdir == olddp) {
+ vrele(fdp->fd_rdir);
+ VREF(newdp);
+ fdp->fd_rdir = newdp;
+ }
+ }
+ if (rootvnode == olddp) {
+ vrele(rootvnode);
+ VREF(newdp);
+ rootvnode = newdp;
+ }
+ vput(newdp);
+}
+
+/*
+ * Unmount a file system.
+ *
+ * Note: unmount takes a path to the vnode mounted on as argument,
+ * not the special file (as before).
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unmount_args {
+ char *path;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+unmount(p, uap, retval)
+ struct proc *p;
+ register struct unmount_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct mount *mp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ mp = vp->v_mount;
+
+ /*
+ * Only root, or the user that did the original mount is
+ * permitted to unmount this filesystem.
+ */
+ if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) &&
+ (error = suser(p->p_ucred, &p->p_acflag))) {
+ vput(vp);
+ return (error);
+ }
+
+ /*
+ * Don't allow unmounting the root file system.
+ */
+ if (mp->mnt_flag & MNT_ROOTFS) {
+ vput(vp);
+ return (EINVAL);
+ }
+
+ /*
+ * Must be the root of the filesystem
+ */
+ if ((vp->v_flag & VROOT) == 0) {
+ vput(vp);
+ return (EINVAL);
+ }
+ vput(vp);
+ return (dounmount(mp, SCARG(uap, flags), p));
+}
+
+/*
+ * Do the actual file system unmount.
+ */
+int
+dounmount(mp, flags, p)
+ register struct mount *mp;
+ int flags;
+ struct proc *p;
+{
+ struct vnode *coveredvp;
+ int error;
+
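+	/*
+	 * Mark the mount as unmounting and drain its lock so that no
+	 * new operations can begin while the unmount is in progress.
+	 */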
+ simple_lock(&mountlist_slock);
+ mp->mnt_flag |= MNT_UNMOUNT;
+ lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p);
+ mp->mnt_flag &=~ MNT_ASYNC;
+ vfs_msync(mp, MNT_NOWAIT);
+ vnode_pager_umount(mp); /* release cached vnodes */
+ cache_purgevfs(mp); /* remove cache entries for this file sys */
+ if (((mp->mnt_flag & MNT_RDONLY) ||
+ (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) ||
+ (flags & MNT_FORCE))
+ error = VFS_UNMOUNT(mp, flags, p);
+ simple_lock(&mountlist_slock);
+ if (error) {
+ mp->mnt_flag &= ~MNT_UNMOUNT;
+ lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK | LK_REENABLE,
+ &mountlist_slock, p);
+ return (error);
+ }
+ CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
+ if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
+ coveredvp->v_mountedhere = (struct mount *)0;
+ vrele(coveredvp);
+ }
+ mp->mnt_vfc->vfc_refcount--;
+ if (mp->mnt_vnodelist.lh_first != NULL)
+ panic("unmount: dangling vnode");
+ lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_slock, p);
+ if (mp->mnt_flag & MNT_MWAIT)
+ wakeup((caddr_t)mp);
+ free((caddr_t)mp, M_MOUNT);
+ return (0);
+}
+
+/*
+ * Sync each mounted filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct sync_args {
+ int dummy;
+};
+#endif
+
+#ifdef DEBUG
+int syncprt = 0;
+SYSCTL_INT(_debug, 0, syncprt, CTLFLAG_RW, &syncprt, 0, "");
+#endif
+
+/* ARGSUSED */
+int
+sync(p, uap, retval)
+ struct proc *p;
+ struct sync_args *uap;
+ register_t *retval;
+{
+ register struct mount *mp, *nmp;
+ int asyncflag;
+
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
+ nmp = mp->mnt_list.cqe_next;
+ continue;
+ }
+ if ((mp->mnt_flag & MNT_RDONLY) == 0) {
+ asyncflag = mp->mnt_flag & MNT_ASYNC;
+ mp->mnt_flag &= ~MNT_ASYNC;
+ vfs_msync(mp, MNT_NOWAIT);
+ VFS_SYNC(mp, MNT_NOWAIT, p != NULL ? p->p_ucred : NOCRED, p);
+ if (asyncflag)
+ mp->mnt_flag |= MNT_ASYNC;
+ }
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ }
+ simple_unlock(&mountlist_slock);
+#if 0
+/*
+ * XXX don't call vfs_bufstats() yet because that routine
+ * was not imported in the Lite2 merge.
+ */
+#ifdef DIAGNOSTIC
+ if (syncprt)
+ vfs_bufstats();
+#endif /* DIAGNOSTIC */
+#endif
+ return (0);
+}
+
+/*
+ * Change filesystem quotas.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct quotactl_args {
+ char *path;
+ int cmd;
+ int uid;
+ caddr_t arg;
+};
+#endif
+/* ARGSUSED */
+int
+quotactl(p, uap, retval)
+ struct proc *p;
+ register struct quotactl_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) cmd;
+ syscallarg(int) uid;
+ syscallarg(caddr_t) arg;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct mount *mp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ mp = nd.ni_vp->v_mount;
+ vrele(nd.ni_vp);
+ return (VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
+ SCARG(uap, arg), p));
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct statfs_args {
+ char *path;
+ struct statfs *buf;
+};
+#endif
+/* ARGSUSED */
+int
+statfs(p, uap, retval)
+ struct proc *p;
+ register struct statfs_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct statfs *) buf;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct mount *mp;
+ register struct statfs *sp;
+ int error;
+ struct nameidata nd;
+ struct statfs sb;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ mp = nd.ni_vp->v_mount;
+ sp = &mp->mnt_stat;
+ vrele(nd.ni_vp);
+ error = VFS_STATFS(mp, sp, p);
+ if (error)
+ return (error);
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ if (p->p_ucred->cr_uid != 0) {
+ bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
+ return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp)));
+}
+
+/*
+ * Get filesystem statistics.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fstatfs_args {
+ int fd;
+ struct statfs *buf;
+};
+#endif
+/* ARGSUSED */
+int
+fstatfs(p, uap, retval)
+ struct proc *p;
+ register struct fstatfs_args /* {
+ syscallarg(int) fd;
+ syscallarg(struct statfs *) buf;
+ } */ *uap;
+ register_t *retval;
+{
+ struct file *fp;
+ struct mount *mp;
+ register struct statfs *sp;
+ int error;
+ struct statfs sb;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ mp = ((struct vnode *)fp->f_data)->v_mount;
+ sp = &mp->mnt_stat;
+ error = VFS_STATFS(mp, sp, p);
+ if (error)
+ return (error);
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ if (p->p_ucred->cr_uid != 0) {
+ bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
+ return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp)));
+}
+
+/*
+ * Get statistics on all filesystems.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getfsstat_args {
+ struct statfs *buf;
+ long bufsize;
+ int flags;
+};
+#endif
+int
+getfsstat(p, uap, retval)
+ struct proc *p;
+ register struct getfsstat_args /* {
+ syscallarg(struct statfs *) buf;
+ syscallarg(long) bufsize;
+ syscallarg(int) flags;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct mount *mp, *nmp;
+ register struct statfs *sp;
+ caddr_t sfsp;
+ long count, maxcount, error;
+
+ maxcount = SCARG(uap, bufsize) / sizeof(struct statfs);
+ sfsp = (caddr_t)SCARG(uap, buf);
+ count = 0;
+ simple_lock(&mountlist_slock);
+ for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
+ if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
+ nmp = mp->mnt_list.cqe_next;
+ continue;
+ }
+ if (sfsp && count < maxcount) {
+ sp = &mp->mnt_stat;
+ /*
+ * If MNT_NOWAIT is specified, do not refresh the
+ * fsstat cache. MNT_WAIT overrides MNT_NOWAIT.
+ */
+ if (((SCARG(uap, flags) & MNT_NOWAIT) == 0 ||
+ (SCARG(uap, flags) & MNT_WAIT)) &&
+ (error = VFS_STATFS(mp, sp, p))) {
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ continue;
+ }
+ sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ error = copyout((caddr_t)sp, sfsp, sizeof(*sp));
+ if (error) {
+ vfs_unbusy(mp, p);
+ return (error);
+ }
+ sfsp += sizeof(*sp);
+ }
+ count++;
+ simple_lock(&mountlist_slock);
+ nmp = mp->mnt_list.cqe_next;
+ vfs_unbusy(mp, p);
+ }
+ simple_unlock(&mountlist_slock);
+ if (sfsp && count > maxcount)
+ *retval = maxcount;
+ else
+ *retval = count;
+ return (0);
+}
+
+/*
+ * Change current working directory to a given file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchdir_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+fchdir(p, uap, retval)
+ struct proc *p;
+ struct fchdir_args /* {
+ syscallarg(int) fd;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct filedesc *fdp = p->p_fd;
+ struct vnode *vp, *tdp;
+ struct mount *mp;
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(fdp, SCARG(uap, fd), &fp))
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ VREF(vp);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ if (vp->v_type != VDIR)
+ error = ENOTDIR;
+ else
+ error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
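+	/*
+	 * If a filesystem is mounted on the directory, descend to the
+	 * root of the topmost mounted filesystem.
+	 */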
+ while (!error && (mp = vp->v_mountedhere) != NULL) {
+ if (vfs_busy(mp, 0, 0, p))
+ continue;
+ error = VFS_ROOT(mp, &tdp);
+ vfs_unbusy(mp, p);
+ if (error)
+ break;
+ vput(vp);
+ vp = tdp;
+ }
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ VOP_UNLOCK(vp, 0, p);
+ vrele(fdp->fd_cdir);
+ fdp->fd_cdir = vp;
+ return (0);
+}
+
+/*
+ * Change current working directory (``.'').
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chdir_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+chdir(p, uap, retval)
+ struct proc *p;
+ struct chdir_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct filedesc *fdp = p->p_fd;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = change_dir(&nd, p))
+ return (error);
+ vrele(fdp->fd_cdir);
+ fdp->fd_cdir = nd.ni_vp;
+ return (0);
+}
+
+/*
+ * Change notion of root (``/'') directory.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chroot_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+chroot(p, uap, retval)
+ struct proc *p;
+ struct chroot_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct filedesc *fdp = p->p_fd;
+ int error;
+ struct nameidata nd;
+
+ error = suser(p->p_ucred, &p->p_acflag);
+ if (error)
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = change_dir(&nd, p))
+ return (error);
+ if (fdp->fd_rdir != NULL)
+ vrele(fdp->fd_rdir);
+ fdp->fd_rdir = nd.ni_vp;
+ return (0);
+}
+
+/*
+ * Common routine for chroot and chdir.
+ */
+static int
+change_dir(ndp, p)
+ register struct nameidata *ndp;
+ struct proc *p;
+{
+ struct vnode *vp;
+ int error;
+
+ error = namei(ndp);
+ if (error)
+ return (error);
+ vp = ndp->ni_vp;
+ if (vp->v_type != VDIR)
+ error = ENOTDIR;
+ else
+ error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p);
+ if (error)
+ vput(vp);
+ else
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
+
+/*
+ * Check permissions, allocate an open file structure,
+ * and call the device open routine if any.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct open_args {
+ char *path;
+ int flags;
+ int mode;
+};
+#endif
+int
+open(p, uap, retval)
+ struct proc *p;
+ register struct open_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(int) mode;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct filedesc *fdp = p->p_fd;
+ register struct file *fp;
+ register struct vnode *vp;
+ int flags, cmode;
+ struct file *nfp;
+ int type, indx, error;
+ struct flock lf;
+ struct nameidata nd;
+
+ error = falloc(p, &nfp, &indx);
+ if (error)
+ return (error);
+ fp = nfp;
+ flags = FFLAGS(SCARG(uap, flags));
+ cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
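+	/*
+	 * Prime p_dupfd with a negative marker; an open of a /dev/fd
+	 * node resets it to the descriptor to duplicate, which the
+	 * ENODEV/ENXIO error path below hands to dupfdopen().
+	 */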
+ p->p_dupfd = -indx - 1; /* XXX check for fdopen */
+ error = vn_open(&nd, flags, cmode);
+ if (error) {
+ ffree(fp);
+ if ((error == ENODEV || error == ENXIO) &&
+ p->p_dupfd >= 0 && /* XXX from fdopen */
+ (error =
+ dupfdopen(fdp, indx, p->p_dupfd, flags, error)) == 0) {
+ *retval = indx;
+ return (0);
+ }
+ if (error == ERESTART)
+ error = EINTR;
+ fdp->fd_ofiles[indx] = NULL;
+ return (error);
+ }
+ p->p_dupfd = 0;
+ vp = nd.ni_vp;
+
+ fp->f_flag = flags & FMASK;
+ fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE);
+ fp->f_ops = &vnops;
+ fp->f_data = (caddr_t)vp;
+ if (flags & (O_EXLOCK | O_SHLOCK)) {
+ lf.l_whence = SEEK_SET;
+ lf.l_start = 0;
+ lf.l_len = 0;
+ if (flags & O_EXLOCK)
+ lf.l_type = F_WRLCK;
+ else
+ lf.l_type = F_RDLCK;
+ type = F_FLOCK;
+ if ((flags & FNONBLOCK) == 0)
+ type |= F_WAIT;
+ VOP_UNLOCK(vp, 0, p);
+ if (error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) {
+ (void) vn_close(vp, fp->f_flag, fp->f_cred, p);
+ ffree(fp);
+ fdp->fd_ofiles[indx] = NULL;
+ return (error);
+ }
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ fp->f_flag |= FHASLOCK;
+ }
+ VOP_UNLOCK(vp, 0, p);
+ *retval = indx;
+ return (0);
+}
+
+#ifdef COMPAT_43
+/*
+ * Create a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ocreat_args {
+ char *path;
+ int mode;
+};
+#endif
+int
+ocreat(p, uap, retval)
+ struct proc *p;
+ register struct ocreat_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+ register_t *retval;
+{
+ struct open_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ syscallarg(int) mode;
+ } */ nuap;
+
+ SCARG(&nuap, path) = SCARG(uap, path);
+ SCARG(&nuap, mode) = SCARG(uap, mode);
+ SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC;
+ return (open(p, &nuap, retval));
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Create a special file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mknod_args {
+ char *path;
+ int mode;
+ int dev;
+};
+#endif
+/* ARGSUSED */
+int
+mknod(p, uap, retval)
+ struct proc *p;
+ register struct mknod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ syscallarg(int) dev;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ int whiteout;
+ struct nameidata nd;
+
+ error = suser(p->p_ucred, &p->p_acflag);
+ if (error)
+ return (error);
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL)
+ error = EEXIST;
+ else {
+ VATTR_NULL(&vattr);
+ vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask;
+ vattr.va_rdev = SCARG(uap, dev);
+ whiteout = 0;
+
+ switch (SCARG(uap, mode) & S_IFMT) {
+ case S_IFMT: /* used by badsect to flag bad sectors */
+ vattr.va_type = VBAD;
+ break;
+ case S_IFCHR:
+ vattr.va_type = VCHR;
+ break;
+ case S_IFBLK:
+ vattr.va_type = VBLK;
+ break;
+ case S_IFWHT:
+ whiteout = 1;
+ break;
+ default:
+ error = EINVAL;
+ break;
+ }
+ }
+ if (!error) {
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ if (whiteout) {
+ error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE);
+ if (error)
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ vput(nd.ni_dvp);
+ } else {
+ error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp,
+ &nd.ni_cnd, &vattr);
+ }
+ } else {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (vp)
+ vrele(vp);
+ }
+ return (error);
+}
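
A minimal userland sketch of driving the S_IFMT switch above; the device path and major/minor pair are hypothetical, and the suser() check means it must run as root.

#include <sys/types.h>
#include <sys/stat.h>
#include <stdio.h>

int
main(void)
{
	dev_t dev = makedev(12, 0);	/* hypothetical major/minor pair */

	/* S_IFCHR selects the VCHR case above; suser() means root only. */
	if (mknod("/dev/mydev", S_IFCHR | 0600, dev) < 0) {
		perror("mknod");
		return (1);
	}
	return (0);
}
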
+
+/*
+ * Create a named pipe.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkfifo_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+mkfifo(p, uap, retval)
+ struct proc *p;
+ register struct mkfifo_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+ register_t *retval;
+{
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ if (nd.ni_vp != NULL) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ return (EEXIST);
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VFIFO;
+ vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask;
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ return (VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr));
+}
+
+/*
+ * Make a hard file link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct link_args {
+ char *path;
+ char *link;
+};
+#endif
+/* ARGSUSED */
+int
+link(p, uap, retval)
+ struct proc *p;
+ register struct link_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) link;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct nameidata nd;
+ int error;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type == VDIR)
+ error = EPERM; /* POSIX */
+ else {
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, link), p);
+ error = namei(&nd);
+ if (!error) {
+ if (nd.ni_vp != NULL) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (nd.ni_vp)
+ vrele(nd.ni_vp);
+ error = EEXIST;
+ } else {
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred,
+ LEASE_WRITE);
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
+ }
+ }
+ }
+ vrele(vp);
+ return (error);
+}
+
+/*
+ * Make a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct symlink_args {
+ char *path;
+ char *link;
+};
+#endif
+/* ARGSUSED */
+int
+symlink(p, uap, retval)
+ struct proc *p;
+ register struct symlink_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) link;
+ } */ *uap;
+ register_t *retval;
+{
+ struct vattr vattr;
+ char *path;
+ int error;
+ struct nameidata nd;
+
+ MALLOC(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK);
+ if (error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL))
+ goto out;
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, link), p);
+ if (error = namei(&nd))
+ goto out;
+ if (nd.ni_vp) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(nd.ni_vp);
+ error = EEXIST;
+ goto out;
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask;
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path);
+out:
+ FREE(path, M_NAMEI);
+ return (error);
+}
+
+/*
+ * Delete a whiteout from the filesystem.
+ */
+/* ARGSUSED */
+int
+undelete(p, uap, retval)
+ struct proc *p;
+ register struct undelete_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+ register_t *retval;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ error = namei(&nd);
+ if (error)
+ return (error);
+
+ if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == nd.ni_vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (nd.ni_vp)
+ vrele(nd.ni_vp);
+ return (EEXIST);
+ }
+
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ if (error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE))
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ vput(nd.ni_dvp);
+ return (error);
+}
+
+/*
+ * Delete a name from the filesystem.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct unlink_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+unlink(p, uap, retval)
+ struct proc *p;
+ struct unlink_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+
+ if (vp->v_type == VDIR)
+ error = EPERM; /* POSIX */
+ else {
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ *
+ * XXX: can this only be a VDIR case?
+ */
+ if (vp->v_flag & VROOT)
+ error = EBUSY;
+ else
+ (void) vnode_pager_uncache(vp, p);
+ }
+
+ if (!error) {
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
+ } else {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ if (vp != NULLVP)
+ vput(vp);
+ }
+ return (error);
+}
+
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lseek_args {
+ int fd;
+ int pad;
+ off_t offset;
+ int whence;
+};
+#endif
+int
+lseek(p, uap, retval)
+ struct proc *p;
+ register struct lseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) offset;
+ syscallarg(int) whence;
+ } */ *uap;
+ register_t *retval; /* XXX */
+{
+ struct ucred *cred = p->p_ucred;
+ register struct filedesc *fdp = p->p_fd;
+ register struct file *fp;
+ struct vattr vattr;
+ int error;
+
+ if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL)
+ return (EBADF);
+ if (fp->f_type != DTYPE_VNODE)
+ return (ESPIPE);
+ switch (SCARG(uap, whence)) {
+ case L_INCR:
+ fp->f_offset += SCARG(uap, offset);
+ break;
+ case L_XTND:
+ error=VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p);
+ if (error)
+ return (error);
+ fp->f_offset = SCARG(uap, offset) + vattr.va_size;
+ break;
+ case L_SET:
+ fp->f_offset = SCARG(uap, offset);
+ break;
+ default:
+ return (EINVAL);
+ }
+ *(off_t *)retval = fp->f_offset;
+ return (0);
+}
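
The whence values L_SET, L_INCR and L_XTND are the kernel-side spellings of SEEK_SET, SEEK_CUR and SEEK_END. A small sketch of the L_XTND case, which is the one that costs a VOP_GETATTR:

#include <sys/types.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	off_t size;
	int fd;

	if (argc != 2 || (fd = open(argv[1], O_RDONLY)) < 0) {
		perror("open");
		return (1);
	}
	/* SEEK_END is L_XTND above: va_size (via VOP_GETATTR) + offset. */
	size = lseek(fd, (off_t)0, SEEK_END);
	printf("%s: %lld bytes\n", argv[1], (long long)size);
	close(fd);
	return (0);
}
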
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Reposition read/write file offset.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olseek_args {
+ int fd;
+ long offset;
+ int whence;
+};
+#endif
+int
+olseek(p, uap, retval)
+ struct proc *p;
+ register struct olseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(long) offset;
+ syscallarg(int) whence;
+ } */ *uap;
+ register_t *retval;
+{
+ struct lseek_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) offset;
+ syscallarg(int) whence;
+ } */ nuap;
+ off_t qret;
+ int error;
+
+ SCARG(&nuap, fd) = SCARG(uap, fd);
+ SCARG(&nuap, offset) = SCARG(uap, offset);
+ SCARG(&nuap, whence) = SCARG(uap, whence);
+ error = lseek(p, &nuap, (register_t *) &qret);
+ *(long *)retval = qret;
+ return (error);
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Check access permissions.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct access_args {
+ char *path;
+ int flags;
+};
+#endif
+int
+access(p, uap, retval)
+ struct proc *p;
+ register struct access_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct ucred *cred = p->p_ucred;
+ register struct vnode *vp;
+ int error, flags, t_gid, t_uid;
+ struct nameidata nd;
+
+ t_uid = cred->cr_uid;
+ t_gid = cred->cr_groups[0];
+ cred->cr_uid = p->p_cred->p_ruid;
+ cred->cr_groups[0] = p->p_cred->p_rgid;
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ goto out1;
+ vp = nd.ni_vp;
+
+ /* Flags == 0 means only check for existence. */
+ if (SCARG(uap, flags)) {
+ flags = 0;
+ if (SCARG(uap, flags) & R_OK)
+ flags |= VREAD;
+ if (SCARG(uap, flags) & W_OK)
+ flags |= VWRITE;
+ if (SCARG(uap, flags) & X_OK)
+ flags |= VEXEC;
+ if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0)
+ error = VOP_ACCESS(vp, flags, cred, p);
+ }
+ vput(vp);
+out1:
+ cred->cr_uid = t_uid;
+ cred->cr_groups[0] = t_gid;
+ return (error);
+}
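
A short sketch of what the credential swap above buys userland: the check is made against the real uid/gid rather than the effective ones.

#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	if (argc != 2)
		return (1);
	/*
	 * The check runs with the *real* uid/gid, per the credential
	 * swap above -- the point of access() for set-uid programs.
	 * The usual caveat applies: the file can change between
	 * access() and a later open().
	 */
	if (access(argv[1], R_OK | W_OK) == 0)
		printf("%s: readable and writable by the real user\n",
		    argv[1]);
	else
		perror("access");
	return (0);
}
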
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ostat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+ostat(p, uap, retval)
+ struct proc *p;
+ register struct ostat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct ostat *) ub;
+ } */ *uap;
+ register_t *retval;
+{
+ struct stat sb;
+ struct ostat osb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = vn_stat(nd.ni_vp, &sb, p);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ cvtstat(&sb, &osb);
+ error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb));
+ return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct olstat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+olstat(p, uap, retval)
+ struct proc *p;
+ register struct olstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct ostat *) ub;
+ } */ *uap;
+ register_t *retval;
+{
+ struct vnode *vp, *dvp;
+ struct stat sb, sb1;
+ struct ostat osb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKPARENT, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ /*
+	 * For symbolic links, always return the attributes of the
+ * containing directory, except for mode, size, and links.
+ */
+ vp = nd.ni_vp;
+ dvp = nd.ni_dvp;
+ if (vp->v_type != VLNK) {
+ if (dvp == vp)
+ vrele(dvp);
+ else
+ vput(dvp);
+ error = vn_stat(vp, &sb, p);
+ vput(vp);
+ if (error)
+ return (error);
+ } else {
+ error = vn_stat(dvp, &sb, p);
+ vput(dvp);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ error = vn_stat(vp, &sb1, p);
+ vput(vp);
+ if (error)
+ return (error);
+ sb.st_mode &= ~S_IFDIR;
+ sb.st_mode |= S_IFLNK;
+ sb.st_nlink = sb1.st_nlink;
+ sb.st_size = sb1.st_size;
+ sb.st_blocks = sb1.st_blocks;
+ }
+ cvtstat(&sb, &osb);
+ error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb));
+ return (error);
+}
+
+/*
+ * Convert from an old to a new stat structure.
+ */
+void
+cvtstat(st, ost)
+ struct stat *st;
+ struct ostat *ost;
+{
+
+ ost->st_dev = st->st_dev;
+ ost->st_ino = st->st_ino;
+ ost->st_mode = st->st_mode;
+ ost->st_nlink = st->st_nlink;
+ ost->st_uid = st->st_uid;
+ ost->st_gid = st->st_gid;
+ ost->st_rdev = st->st_rdev;
+ if (st->st_size < (quad_t)1 << 32)
+ ost->st_size = st->st_size;
+ else
+ ost->st_size = -2;
+ ost->st_atime = st->st_atime;
+ ost->st_mtime = st->st_mtime;
+ ost->st_ctime = st->st_ctime;
+ ost->st_blksize = st->st_blksize;
+ ost->st_blocks = st->st_blocks;
+ ost->st_flags = st->st_flags;
+ ost->st_gen = st->st_gen;
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
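
The clamp in cvtstat() exists because the old stat's st_size field cannot represent sizes of 2^32 bytes or more; such files report -2. A tiny standalone illustration of the same arithmetic:

#include <stdio.h>

int
main(void)
{
	long long size = (1LL << 32) + 12345;	/* a file past 4GB */
	int osize = size < (1LL << 32) ? (int)size : -2;

	/* Mirrors the clamp in cvtstat(): prints -2. */
	printf("%d\n", osize);
	return (0);
}
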
+
+/*
+ * Get file status; this version follows links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct stat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+stat(p, uap, retval)
+ struct proc *p;
+ register struct stat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct stat *) ub;
+ } */ *uap;
+ register_t *retval;
+{
+ struct stat sb;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = vn_stat(nd.ni_vp, &sb, p);
+ vput(nd.ni_vp);
+ if (error)
+ return (error);
+ error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb));
+ return (error);
+}
+
+/*
+ * Get file status; this version does not follow links.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct lstat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
+/* ARGSUSED */
+int
+lstat(p, uap, retval)
+ struct proc *p;
+ register struct lstat_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct stat *) ub;
+ } */ *uap;
+ register_t *retval;
+{
+ int error;
+ struct vnode *vp, *dvp;
+ struct stat sb, sb1;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKPARENT, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ /*
+	 * For symbolic links, always return the attributes of the containing
+ * directory, except for mode, size, inode number, and links.
+ */
+ vp = nd.ni_vp;
+ dvp = nd.ni_dvp;
+ if (vp->v_type != VLNK) {
+ if (dvp == vp)
+ vrele(dvp);
+ else
+ vput(dvp);
+ error = vn_stat(vp, &sb, p);
+ vput(vp);
+ if (error)
+ return (error);
+ } else {
+ error = vn_stat(dvp, &sb, p);
+ vput(dvp);
+ if (error) {
+ vput(vp);
+ return (error);
+ }
+ error = vn_stat(vp, &sb1, p);
+ vput(vp);
+ if (error)
+ return (error);
+ sb.st_mode &= ~S_IFDIR;
+ sb.st_mode |= S_IFLNK;
+ sb.st_nlink = sb1.st_nlink;
+ sb.st_size = sb1.st_size;
+ sb.st_blocks = sb1.st_blocks;
+ sb.st_ino = sb1.st_ino;
+ }
+ error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb));
+ return (error);
+}
+
+/*
+ * Get configurable pathname variables.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct pathconf_args {
+ char *path;
+ int name;
+};
+#endif
+/* ARGSUSED */
+int
+pathconf(p, uap, retval)
+ struct proc *p;
+ register struct pathconf_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) name;
+ } */ *uap;
+ register_t *retval;
+{
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval);
+ vput(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Return target name of a symbolic link.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct readlink_args {
+ char *path;
+ char *buf;
+ int count;
+};
+#endif
+/* ARGSUSED */
+int
+readlink(p, uap, retval)
+ struct proc *p;
+ register struct readlink_args /* {
+ syscallarg(char *) path;
+ syscallarg(char *) buf;
+ syscallarg(int) count;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct iovec aiov;
+ struct uio auio;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type != VLNK)
+ error = EINVAL;
+ else {
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_offset = 0;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+ auio.uio_resid = SCARG(uap, count);
+		error = VOP_READLINK(vp, &auio, p->p_ucred);
+		/* auio.uio_resid is only initialized in this branch */
+		*retval = SCARG(uap, count) - auio.uio_resid;
+	}
+	vput(vp);
+	return (error);
+}
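
A minimal userland sketch; note that readlink() never NUL-terminates, and the return value is the byte count computed above as count minus uio_resid.

#include <stdio.h>
#include <unistd.h>

int
main(int argc, char **argv)
{
	char buf[1024];
	int n;

	/* readlink() does not NUL-terminate; n is count - uio_resid. */
	if (argc != 2 || (n = readlink(argv[1], buf, sizeof(buf) - 1)) < 0) {
		perror("readlink");
		return (1);
	}
	buf[n] = '\0';
	printf("%s -> %s\n", argv[1], buf);
	return (0);
}
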
+
+/*
+ * Change flags of a file given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chflags_args {
+ char *path;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+chflags(p, uap, retval)
+ struct proc *p;
+ register struct chflags_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) flags;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_flags = SCARG(uap, flags);
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Change flags of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchflags_args {
+ int fd;
+ int flags;
+};
+#endif
+/* ARGSUSED */
+int
+fchflags(p, uap, retval)
+ struct proc *p;
+ register struct fchflags_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) flags;
+ } */ *uap;
+ register_t *retval;
+{
+ struct vattr vattr;
+ struct vnode *vp;
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_flags = SCARG(uap, flags);
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
+
+/*
+ * Change mode of a file given path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chmod_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+chmod(p, uap, retval)
+ struct proc *p;
+ register struct chmod_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_mode = SCARG(uap, mode) & ALLPERMS;
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Change mode of a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchmod_args {
+ int fd;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+fchmod(p, uap, retval)
+ struct proc *p;
+ register struct fchmod_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) mode;
+ } */ *uap;
+ register_t *retval;
+{
+ struct vattr vattr;
+ struct vnode *vp;
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_mode = SCARG(uap, mode) & ALLPERMS;
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
+
+/*
+ * Set ownership given a path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct chown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+chown(p, uap, retval)
+ struct proc *p;
+ register struct chown_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_uid = SCARG(uap, uid);
+ vattr.va_gid = SCARG(uap, gid);
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Set ownership given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fchown_args {
+ int fd;
+ int uid;
+ int gid;
+};
+#endif
+/* ARGSUSED */
+int
+fchown(p, uap, retval)
+ struct proc *p;
+ register struct fchown_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) uid;
+ syscallarg(int) gid;
+ } */ *uap;
+ register_t *retval;
+{
+ struct vattr vattr;
+ struct vnode *vp;
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ VATTR_NULL(&vattr);
+ vattr.va_uid = SCARG(uap, uid);
+ vattr.va_gid = SCARG(uap, gid);
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
+
+/*
+ * Set the access and modification times of a file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct utimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
+/* ARGSUSED */
+int
+utimes(p, uap, retval)
+ struct proc *p;
+ register struct utimes_args /* {
+ syscallarg(char *) path;
+ syscallarg(struct timeval *) tptr;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct timeval tv[2];
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ VATTR_NULL(&vattr);
+ if (SCARG(uap, tptr) == NULL) {
+ microtime(&tv[0]);
+ tv[1] = tv[0];
+ vattr.va_vaflags |= VA_UTIMES_NULL;
+ } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv,
+ sizeof (tv)))
+ return (error);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ vattr.va_atime.tv_sec = tv[0].tv_sec;
+ vattr.va_atime.tv_nsec = tv[0].tv_usec * 1000;
+ vattr.va_mtime.tv_sec = tv[1].tv_sec;
+ vattr.va_mtime.tv_nsec = tv[1].tv_usec * 1000;
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ vput(vp);
+ return (error);
+}
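
The tv_usec * 1000 conversions above turn microsecond timevals into the nanosecond timespecs kept in the vattr. A small userland sketch setting distinct access and modification times:

#include <sys/time.h>
#include <stdio.h>

int
main(int argc, char **argv)
{
	struct timeval tv[2];

	if (argc != 2)
		return (1);
	gettimeofday(&tv[0], NULL);
	tv[1] = tv[0];			/* tv[0] = atime, tv[1] = mtime */
	tv[0].tv_sec -= 3600;		/* pretend access was an hour ago */
	if (utimes(argv[1], tv) < 0) {
		perror("utimes");
		return (1);
	}
	return (0);
}
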
+
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct truncate_args {
+ char *path;
+ int pad;
+ off_t length;
+};
+#endif
+/* ARGSUSED */
+int
+truncate(p, uap, retval)
+ struct proc *p;
+ register struct truncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+	if (SCARG(uap, length) < 0)
+		return (EINVAL);
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ if (vp->v_type == VDIR)
+ error = EISDIR;
+ else if ((error = vn_writechk(vp)) == 0 &&
+ (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = SCARG(uap, length);
+ error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
+ }
+ vput(vp);
+ return (error);
+}
+
+/*
+ * Truncate a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ftruncate_args {
+ int fd;
+ int pad;
+ off_t length;
+};
+#endif
+/* ARGSUSED */
+int
+ftruncate(p, uap, retval)
+ struct proc *p;
+ register struct ftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ *uap;
+ register_t *retval;
+{
+ struct vattr vattr;
+ struct vnode *vp;
+ struct file *fp;
+ int error;
+
+	if (SCARG(uap, length) < 0)
+		return (EINVAL);
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ if ((fp->f_flag & FWRITE) == 0)
+ return (EINVAL);
+ vp = (struct vnode *)fp->f_data;
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ if (vp->v_type == VDIR)
+ error = EISDIR;
+ else if ((error = vn_writechk(vp)) == 0) {
+ VATTR_NULL(&vattr);
+ vattr.va_size = SCARG(uap, length);
+ error = VOP_SETATTR(vp, &vattr, fp->f_cred, p);
+ }
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
+
+#if defined(COMPAT_43) || defined(COMPAT_SUNOS)
+/*
+ * Truncate a file given its path name.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct otruncate_args {
+ char *path;
+ long length;
+};
+#endif
+/* ARGSUSED */
+int
+otruncate(p, uap, retval)
+ struct proc *p;
+ register struct otruncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(long) length;
+ } */ *uap;
+ register_t *retval;
+{
+ struct truncate_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ nuap;
+
+ SCARG(&nuap, path) = SCARG(uap, path);
+ SCARG(&nuap, length) = SCARG(uap, length);
+ return (truncate(p, &nuap, retval));
+}
+
+/*
+ * Truncate a file given a file descriptor.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct oftruncate_args {
+ int fd;
+ long length;
+};
+#endif
+/* ARGSUSED */
+int
+oftruncate(p, uap, retval)
+ struct proc *p;
+ register struct oftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(long) length;
+ } */ *uap;
+ register_t *retval;
+{
+ struct ftruncate_args /* {
+ syscallarg(int) fd;
+ syscallarg(int) pad;
+ syscallarg(off_t) length;
+ } */ nuap;
+
+ SCARG(&nuap, fd) = SCARG(uap, fd);
+ SCARG(&nuap, length) = SCARG(uap, length);
+ return (ftruncate(p, &nuap, retval));
+}
+#endif /* COMPAT_43 || COMPAT_SUNOS */
+
+/*
+ * Sync an open file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct fsync_args {
+ int fd;
+};
+#endif
+/* ARGSUSED */
+int
+fsync(p, uap, retval)
+ struct proc *p;
+ struct fsync_args /* {
+ syscallarg(int) fd;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct file *fp;
+ int error;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ vp = (struct vnode *)fp->f_data;
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ if (vp->v_object) {
+		vm_object_page_clean(vp->v_object, 0, 0, 0, FALSE);
+ }
+ error = VOP_FSYNC(vp, fp->f_cred,
+ (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC)) ?
+ MNT_NOWAIT : MNT_WAIT, p);
+ VOP_UNLOCK(vp, 0, p);
+ return (error);
+}
+
+/*
+ * Rename files. Source and destination must either both be directories,
+ * or both not be directories. If target is a directory, it must be empty.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rename_args {
+ char *from;
+ char *to;
+};
+#endif
+/* ARGSUSED */
+int
+rename(p, uap, retval)
+ struct proc *p;
+ register struct rename_args /* {
+ syscallarg(char *) from;
+ syscallarg(char *) to;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *tvp, *fvp, *tdvp;
+ struct nameidata fromnd, tond;
+ int error;
+
+ NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE,
+ SCARG(uap, from), p);
+ if (error = namei(&fromnd))
+ return (error);
+ fvp = fromnd.ni_vp;
+ NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART,
+ UIO_USERSPACE, SCARG(uap, to), p);
+ if (fromnd.ni_vp->v_type == VDIR)
+ tond.ni_cnd.cn_flags |= WILLBEDIR;
+ if (error = namei(&tond)) {
+ /* Translate error code for rename("dir1", "dir2/."). */
+ if (error == EISDIR && fvp->v_type == VDIR)
+ error = EINVAL;
+ VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ goto out1;
+ }
+ tdvp = tond.ni_dvp;
+ tvp = tond.ni_vp;
+ if (tvp != NULL) {
+ if (fvp->v_type == VDIR && tvp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) {
+ error = EISDIR;
+ goto out;
+ }
+ }
+ if (fvp == tdvp)
+ error = EINVAL;
+ /*
+ * If source is the same as the destination (that is the
+ * same inode number with the same name in the same directory),
+ * then there is nothing to do.
+ */
+ if (fvp == tvp && fromnd.ni_dvp == tdvp &&
+ fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen &&
+ !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr,
+ fromnd.ni_cnd.cn_namelen))
+ error = -1;
+out:
+ if (!error) {
+ VOP_LEASE(tdvp, p, p->p_ucred, LEASE_WRITE);
+ if (fromnd.ni_dvp != tdvp)
+ VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ if (tvp) {
+ VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE);
+ (void) vnode_pager_uncache(tvp, p);
+ }
+ error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
+ tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
+ } else {
+ VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd);
+ if (tdvp == tvp)
+ vrele(tdvp);
+ else
+ vput(tdvp);
+ if (tvp)
+ vput(tvp);
+ VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
+ vrele(fromnd.ni_dvp);
+ vrele(fvp);
+ }
+ vrele(tond.ni_startdir);
+ FREE(tond.ni_cnd.cn_pnbuf, M_NAMEI);
+out1:
+ if (fromnd.ni_startdir)
+ vrele(fromnd.ni_startdir);
+ FREE(fromnd.ni_cnd.cn_pnbuf, M_NAMEI);
+ if (error == -1)
+ return (0);
+ return (error);
+}
+
+/*
+ * Make a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct mkdir_args {
+ char *path;
+ int mode;
+};
+#endif
+/* ARGSUSED */
+int
+mkdir(p, uap, retval)
+ struct proc *p;
+ register struct mkdir_args /* {
+ syscallarg(char *) path;
+ syscallarg(int) mode;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
+ nd.ni_cnd.cn_flags |= WILLBEDIR;
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp != NULL) {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vrele(vp);
+ return (EEXIST);
+ }
+ VATTR_NULL(&vattr);
+ vattr.va_type = VDIR;
+ vattr.va_mode = (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_fd->fd_cmask;
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr);
+ if (!error)
+ vput(nd.ni_vp);
+ return (error);
+}
+
+/*
+ * Remove a directory file.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct rmdir_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+rmdir(p, uap, retval)
+ struct proc *p;
+ struct rmdir_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE,
+ SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (vp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto out;
+ }
+ /*
+ * No rmdir "." please.
+ */
+ if (nd.ni_dvp == vp) {
+ error = EINVAL;
+ goto out;
+ }
+ /*
+ * The root of a mounted filesystem cannot be deleted.
+ */
+ if (vp->v_flag & VROOT)
+ error = EBUSY;
+out:
+ if (!error) {
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd);
+ } else {
+ VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
+ if (nd.ni_dvp == vp)
+ vrele(nd.ni_dvp);
+ else
+ vput(nd.ni_dvp);
+ vput(vp);
+ }
+ return (error);
+}
+
+#ifdef COMPAT_43
+/*
+ * Read a block of directory entries in a file system independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct ogetdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+ogetdirentries(p, uap, retval)
+ struct proc *p;
+ register struct ogetdirentries_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(u_int) count;
+ syscallarg(long *) basep;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct file *fp;
+ struct uio auio, kuio;
+ struct iovec aiov, kiov;
+ struct dirent *dp, *edp;
+ caddr_t dirbuf;
+ int error, eofflag, readcnt;
+ long loff;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ if ((fp->f_flag & FREAD) == 0)
+ return (EBADF);
+ vp = (struct vnode *)fp->f_data;
+unionread:
+ if (vp->v_type != VDIR)
+ return (EINVAL);
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+ auio.uio_resid = SCARG(uap, count);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ loff = auio.uio_offset = fp->f_offset;
+# if (BYTE_ORDER != LITTLE_ENDIAN)
+ if (vp->v_mount->mnt_maxsymlinklen <= 0) {
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ fp->f_offset = auio.uio_offset;
+ } else
+# endif
+ {
+ kuio = auio;
+ kuio.uio_iov = &kiov;
+ kuio.uio_segflg = UIO_SYSSPACE;
+ kiov.iov_len = SCARG(uap, count);
+ MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK);
+ kiov.iov_base = dirbuf;
+ error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
+ NULL, NULL);
+ fp->f_offset = kuio.uio_offset;
+ if (error == 0) {
+ readcnt = SCARG(uap, count) - kuio.uio_resid;
+ edp = (struct dirent *)&dirbuf[readcnt];
+ for (dp = (struct dirent *)dirbuf; dp < edp; ) {
+# if (BYTE_ORDER == LITTLE_ENDIAN)
+ /*
+ * The expected low byte of
+ * dp->d_namlen is our dp->d_type.
+ * The high MBZ byte of dp->d_namlen
+ * is our dp->d_namlen.
+ */
+ dp->d_type = dp->d_namlen;
+ dp->d_namlen = 0;
+# else
+ /*
+ * The dp->d_type is the high byte
+ * of the expected dp->d_namlen,
+ * so must be zero'ed.
+ */
+ dp->d_type = 0;
+# endif
+ if (dp->d_reclen > 0) {
+ dp = (struct dirent *)
+ ((char *)dp + dp->d_reclen);
+ } else {
+ error = EIO;
+ break;
+ }
+ }
+ if (dp >= edp)
+ error = uiomove(dirbuf, readcnt, &auio);
+ }
+ FREE(dirbuf, M_TEMP);
+ }
+ VOP_UNLOCK(vp, 0, p);
+ if (error)
+ return (error);
+
+#ifdef UNION
+{
+ if ((SCARG(uap, count) == auio.uio_resid) &&
+ (vp->v_op == union_vnodeop_p)) {
+ struct vnode *lvp;
+
+ lvp = union_dircache(vp, p);
+ if (lvp != NULLVP) {
+ struct vattr va;
+
+ /*
+ * If the directory is opaque,
+ * then don't show lower entries
+ */
+ error = VOP_GETATTR(vp, &va, fp->f_cred, p);
+ if (va.va_flags & OPAQUE) {
+ vput(lvp);
+ lvp = NULL;
+ }
+ }
+
+ if (lvp != NULLVP) {
+ error = VOP_OPEN(lvp, FREAD, fp->f_cred, p);
+ if (error) {
+ vput(lvp);
+ return (error);
+ }
+ VOP_UNLOCK(lvp, 0, p);
+ fp->f_data = (caddr_t) lvp;
+ fp->f_offset = 0;
+ error = vn_close(vp, FREAD, fp->f_cred, p);
+ if (error)
+ return (error);
+ vp = lvp;
+ goto unionread;
+ }
+ }
+}
+#endif /* UNION */
+
+ if ((SCARG(uap, count) == auio.uio_resid) &&
+ (vp->v_flag & VROOT) &&
+ (vp->v_mount->mnt_flag & MNT_UNION)) {
+ struct vnode *tvp = vp;
+ vp = vp->v_mount->mnt_vnodecovered;
+ VREF(vp);
+ fp->f_data = (caddr_t) vp;
+ fp->f_offset = 0;
+ vrele(tvp);
+ goto unionread;
+ }
+ error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep),
+ sizeof(long));
+ *retval = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
+#endif /* COMPAT_43 */
+
+/*
+ * Read a block of directory entries in a file system independent format.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct getdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
+int
+getdirentries(p, uap, retval)
+ struct proc *p;
+ register struct getdirentries_args /* {
+ syscallarg(int) fd;
+ syscallarg(char *) buf;
+ syscallarg(u_int) count;
+ syscallarg(long *) basep;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct file *fp;
+ struct uio auio;
+ struct iovec aiov;
+ long loff;
+ int error, eofflag;
+
+ if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
+ return (error);
+ if ((fp->f_flag & FREAD) == 0)
+ return (EBADF);
+ vp = (struct vnode *)fp->f_data;
+unionread:
+ if (vp->v_type != VDIR)
+ return (EINVAL);
+ aiov.iov_base = SCARG(uap, buf);
+ aiov.iov_len = SCARG(uap, count);
+ auio.uio_iov = &aiov;
+ auio.uio_iovcnt = 1;
+ auio.uio_rw = UIO_READ;
+ auio.uio_segflg = UIO_USERSPACE;
+ auio.uio_procp = p;
+ auio.uio_resid = SCARG(uap, count);
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ loff = auio.uio_offset = fp->f_offset;
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
+ fp->f_offset = auio.uio_offset;
+ VOP_UNLOCK(vp, 0, p);
+ if (error)
+ return (error);
+
+#ifdef UNION
+{
+ if ((SCARG(uap, count) == auio.uio_resid) &&
+ (vp->v_op == union_vnodeop_p)) {
+ struct vnode *lvp;
+
+ lvp = union_dircache(vp, p);
+ if (lvp != NULLVP) {
+ struct vattr va;
+
+ /*
+ * If the directory is opaque,
+ * then don't show lower entries
+ */
+ error = VOP_GETATTR(vp, &va, fp->f_cred, p);
+ if (va.va_flags & OPAQUE) {
+ vput(lvp);
+ lvp = NULL;
+ }
+ }
+
+ if (lvp != NULLVP) {
+ error = VOP_OPEN(lvp, FREAD, fp->f_cred, p);
+ if (error) {
+ vput(lvp);
+ return (error);
+ }
+ VOP_UNLOCK(lvp, 0, p);
+ fp->f_data = (caddr_t) lvp;
+ fp->f_offset = 0;
+ error = vn_close(vp, FREAD, fp->f_cred, p);
+ if (error)
+ return (error);
+ vp = lvp;
+ goto unionread;
+ }
+ }
+}
+#endif /* UNION */
+
+ if ((SCARG(uap, count) == auio.uio_resid) &&
+ (vp->v_flag & VROOT) &&
+ (vp->v_mount->mnt_flag & MNT_UNION)) {
+ struct vnode *tvp = vp;
+ vp = vp->v_mount->mnt_vnodecovered;
+ VREF(vp);
+ fp->f_data = (caddr_t) vp;
+ fp->f_offset = 0;
+ vrele(tvp);
+ goto unionread;
+ }
+ error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep),
+ sizeof(long));
+ *retval = SCARG(uap, count) - auio.uio_resid;
+ return (error);
+}
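
A userland sketch of consuming this interface; the buffer holds variable-length records that are walked by d_reclen, exactly as the conversion loop in ogetdirentries() does. (Header locations and the exact prototype of the raw getdirentries() call vary between systems; the era-appropriate int/long forms are assumed here.)

#include <sys/types.h>
#include <dirent.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	char buf[4096], *cp;
	long base;
	int fd, n;

	if ((fd = open(".", O_RDONLY)) < 0)
		return (1);
	while ((n = getdirentries(fd, buf, sizeof(buf), &base)) > 0) {
		/* Records are variable length; step by d_reclen. */
		for (cp = buf; cp < buf + n; ) {
			struct dirent *dp = (struct dirent *)cp;

			printf("%s\n", dp->d_name);
			cp += dp->d_reclen;
		}
	}
	close(fd);
	return (0);
}
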
+
+/*
+ * Set the mode mask for creation of filesystem nodes.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct umask_args {
+ int newmask;
+};
+#endif
+int
+umask(p, uap, retval)
+ struct proc *p;
+ struct umask_args /* {
+ syscallarg(int) newmask;
+ } */ *uap;
+ int *retval; /* XXX */
+{
+ register struct filedesc *fdp;
+
+ fdp = p->p_fd;
+ *retval = fdp->fd_cmask;
+ fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS;
+ return (0);
+}
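
The saved mask is consumed at create time -- see the cmode computation in open() above, (mode & ~fd_cmask) & ALLPERMS. A small sketch; the file path is hypothetical:

#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>

int
main(void)
{
	mode_t old = umask(022);
	int fd;

	/* Created with 0666 & ~022 = 0644, per the fd_cmask logic above. */
	fd = open("/tmp/demo", O_WRONLY | O_CREAT, 0666);
	if (fd >= 0)
		close(fd);
	umask(old);		/* umask() returns the previous mask */
	return (0);
}
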
+
+/*
+ * Void all references to file by ripping underlying filesystem
+ * away from vnode.
+ */
+#ifndef _SYS_SYSPROTO_H_
+struct revoke_args {
+ char *path;
+};
+#endif
+/* ARGSUSED */
+int
+revoke(p, uap, retval)
+ struct proc *p;
+ register struct revoke_args /* {
+ syscallarg(char *) path;
+ } */ *uap;
+ register_t *retval;
+{
+ register struct vnode *vp;
+ struct vattr vattr;
+ int error;
+ struct nameidata nd;
+
+ NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
+ if (error = namei(&nd))
+ return (error);
+ vp = nd.ni_vp;
+ if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p))
+ goto out;
+ if (p->p_ucred->cr_uid != vattr.va_uid &&
+ (error = suser(p->p_ucred, &p->p_acflag)))
+ goto out;
+ if (vp->v_usecount > 1 || (vp->v_flag & VALIASED))
+ VOP_REVOKE(vp, REVOKEALL);
+out:
+ vrele(vp);
+ return (error);
+}
+
+/*
+ * Convert a user file descriptor to a kernel file entry.
+ */
+int
+getvnode(fdp, fd, fpp)
+ struct filedesc *fdp;
+ int fd;
+ struct file **fpp;
+{
+ struct file *fp;
+
+ if ((u_int)fd >= fdp->fd_nfiles ||
+ (fp = fdp->fd_ofiles[fd]) == NULL)
+ return (EBADF);
+ if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO)
+ return (EINVAL);
+ *fpp = fp;
+ return (0);
+}
diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c
index b5abe58..21061e8 100644
--- a/sys/kern/vfs_init.c
+++ b/sys/kern/vfs_init.c
@@ -35,11 +35,14 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)vfs_init.c 8.5 (Berkeley) 5/11/95
+ * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94
+ * $Id: vfs_init.c,v 1.24 1997/02/22 09:39:32 peter Exp $
*/
#include <sys/param.h>
+#include <sys/systm.h>
+#include <sys/kernel.h>
#include <sys/mount.h>
#include <sys/time.h>
#include <sys/vnode.h>
@@ -49,6 +52,12 @@
#include <sys/buf.h>
#include <sys/errno.h>
#include <sys/malloc.h>
+#include <sys/proc.h>
+
+static void vfs_op_init __P((void));
+
+static void vfsinit __P((void *));
+SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vfsinit, NULL)
/*
* Sigh, such primitive tools are these...
@@ -59,8 +68,13 @@
#define DODEBUG(A)
#endif
-extern struct vnodeopv_desc *vfs_opv_descs[];
- /* a list of lists of vnodeops defns */
+struct vfsconf void_vfsconf;
+
+extern struct linker_set vfs_opv_descs_;
+#define vfs_opv_descs ((struct vnodeopv_desc **)vfs_opv_descs_.ls_items)
+
+extern struct linker_set vfs_set;
+
extern struct vnodeop_desc *vfs_op_descs[];
/* and the operations they perform */
/*
@@ -69,9 +83,7 @@ extern struct vnodeop_desc *vfs_op_descs[];
* extra level of indirection for arrays. It's an interesting
* "feature" of C.
*/
-int vfs_opv_numops;
-
-typedef (*PFI)(); /* the standard Pointer to a Function returning an Int */
+static int vfs_opv_numops;
/*
* A miscellaneous routine.
@@ -101,33 +113,35 @@ vn_default_error()
* that is a(whole)nother story.) This is a feature.
*/
void
-vfs_opv_init()
+vfs_opv_init(struct vnodeopv_desc **them)
{
int i, j, k;
- int (***opv_desc_vector_p)();
- int (**opv_desc_vector)();
+ vop_t ***opv_desc_vector_p;
+ vop_t **opv_desc_vector;
struct vnodeopv_entry_desc *opve_descp;
/*
* Allocate the dynamic vectors and fill them in.
*/
- for (i=0; vfs_opv_descs[i]; i++) {
- opv_desc_vector_p = vfs_opv_descs[i]->opv_desc_vector_p;
+ for (i=0; them[i]; i++) {
+ opv_desc_vector_p = them[i]->opv_desc_vector_p;
/*
* Allocate and init the vector, if it needs it.
* Also handle backwards compatibility.
*/
if (*opv_desc_vector_p == NULL) {
/* XXX - shouldn't be M_VNODE */
- MALLOC(*opv_desc_vector_p, PFI*,
- vfs_opv_numops*sizeof(PFI), M_VNODE, M_WAITOK);
- bzero (*opv_desc_vector_p, vfs_opv_numops*sizeof(PFI));
+ MALLOC(*opv_desc_vector_p, vop_t **,
+ vfs_opv_numops * sizeof(vop_t *), M_VNODE,
+ M_WAITOK);
+ bzero(*opv_desc_vector_p,
+ vfs_opv_numops * sizeof(vop_t *));
DODEBUG(printf("vector at %x allocated\n",
opv_desc_vector_p));
}
opv_desc_vector = *opv_desc_vector_p;
- for (j=0; vfs_opv_descs[i]->opv_desc_ops[j].opve_op; j++) {
- opve_descp = &(vfs_opv_descs[i]->opv_desc_ops[j]);
+ for (j=0; them[i]->opv_desc_ops[j].opve_op; j++) {
+ opve_descp = &(them[i]->opv_desc_ops[j]);
/*
* Sanity check: is this operation listed
@@ -166,8 +180,8 @@ vfs_opv_init()
* with their default. (Sigh, an O(n^3) algorithm. I
* could make it better, but that'd be work, and n is small.)
*/
- for (i = 0; vfs_opv_descs[i]; i++) {
- opv_desc_vector = *(vfs_opv_descs[i]->opv_desc_vector_p);
+ for (i = 0; them[i]; i++) {
+ opv_desc_vector = *(them[i]->opv_desc_vector_p);
/*
* Force every operations vector to have a default routine.
*/
@@ -176,7 +190,7 @@ vfs_opv_init()
}
for (k = 0; k<vfs_opv_numops; k++)
if (opv_desc_vector[k] == NULL)
- opv_desc_vector[k] =
+ opv_desc_vector[k] =
opv_desc_vector[VOFFSET(vop_default)];
}
}
@@ -184,7 +198,7 @@ vfs_opv_init()
/*
* Initialize known vnode operations vectors.
*/
-void
+static void
vfs_op_init()
{
int i;
@@ -216,10 +230,13 @@ struct vattr va_null;
/*
* Initialize the vnode structures and initialize each file system type.
*/
-vfsinit()
+/* ARGSUSED*/
+static void
+vfsinit(dummy)
+ void *dummy;
{
- struct vfsconf *vfsp;
- int i, maxtypenum;
+ struct vfsconf **vfc;
+ int maxtypenum;
/*
* Initialize the vnode table
@@ -233,15 +250,19 @@ vfsinit()
* Build vnode operation vectors.
*/
vfs_op_init();
- vfs_opv_init(); /* finish the job */
+ vfs_opv_init(vfs_opv_descs); /* finish the job */
/*
* Initialize each file system type.
*/
vattr_null(&va_null);
maxtypenum = 0;
- for (vfsp = vfsconf, i = 1; i <= maxvfsconf; i++, vfsp++) {
- if (i < maxvfsconf)
- vfsp->vfc_next = vfsp + 1;
+ vfc = (struct vfsconf **)vfs_set.ls_items;
+ vfsconf = *vfc; /* simulate Lite2 vfsconf array */
+ while (*vfc) {
+ struct vfsconf *vfsp = *vfc;
+
+ vfc++;
+ vfsp->vfc_next = *vfc;
if (maxtypenum <= vfsp->vfc_typenum)
maxtypenum = vfsp->vfc_typenum + 1;
(*vfsp->vfc_vfsops->vfs_init)(vfsp);
@@ -249,3 +270,30 @@ vfsinit()
/* next vfc_typenum to be used */
maxvfsconf = maxtypenum;
}
+
+/*
+ * kernel related system variables.
+ */
+
+/*
+ * This goop is here to support a loadable NFS module... grumble...
+ */
+int (*lease_check_hook) __P((struct vop_lease_args *))
+ = 0;
+void (*lease_updatetime) __P((int))
+ = 0;
+
+int
+lease_check(ap)
+ struct vop_lease_args /* {
+ struct vnode *a_vp;
+ struct proc *a_p;
+ struct ucred *a_cred;
+ int a_flag;
+ } */ *ap;
+{
+ if (lease_check_hook)
+ return (*lease_check_hook)(ap);
+ else
+ return 0;
+}
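
The hook variables above implement a simple function-pointer indirection so a loadable module can supply the real code. A self-contained sketch of the same pattern (names are illustrative, not the kernel's):

#include <stdio.h>

/* NULL until a "module" registers an implementation. */
static int (*check_hook)(int) = NULL;

static int
check(int arg)
{
	/* Same shape as lease_check() above: use the hook if present. */
	if (check_hook)
		return ((*check_hook)(arg));
	return (0);
}

static int
module_impl(int arg)
{
	return (arg * 2);
}

int
main(void)
{
	printf("%d\n", check(21));	/* 0: no hook registered */
	check_hook = module_impl;
	printf("%d\n", check(21));	/* 42: the module's code ran */
	return (0);
}
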
diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c
index 826fbfe..0c04b01 100644
--- a/sys/kern/vfs_lookup.c
+++ b/sys/kern/vfs_lookup.c
@@ -35,10 +35,14 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)vfs_lookup.c 8.10 (Berkeley) 5/27/95
+ * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94
+ * $Id$
*/
+#include "opt_ktrace.h"
+
#include <sys/param.h>
+#include <sys/systm.h>
#include <sys/syslimits.h>
#include <sys/time.h>
#include <sys/namei.h>
@@ -105,10 +109,17 @@ namei(ndp)
MALLOC(cnp->cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK);
if (ndp->ni_segflg == UIO_SYSSPACE)
error = copystr(ndp->ni_dirp, cnp->cn_pnbuf,
- MAXPATHLEN, &ndp->ni_pathlen);
+ MAXPATHLEN, (u_int *)&ndp->ni_pathlen);
else
error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf,
- MAXPATHLEN, &ndp->ni_pathlen);
+ MAXPATHLEN, (u_int *)&ndp->ni_pathlen);
+
+ /*
+ * Don't allow empty pathnames.
+ */
+ if (!error && *cnp->cn_pnbuf == '\0')
+ error = ENOENT;
+
if (error) {
free(cnp->cn_pnbuf, M_NAMEI);
ndp->ni_vp = NULL;
@@ -143,7 +154,8 @@ namei(ndp)
VREF(dp);
}
ndp->ni_startdir = dp;
- if (error = lookup(ndp)) {
+ error = lookup(ndp);
+ if (error) {
FREE(cnp->cn_pnbuf, M_NAMEI);
return (error);
}
@@ -176,7 +188,8 @@ namei(ndp)
auio.uio_segflg = UIO_SYSSPACE;
auio.uio_procp = (struct proc *)0;
auio.uio_resid = MAXPATHLEN;
- if (error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred)) {
+ error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred);
+ if (error) {
if (ndp->ni_pathlen > 1)
free(cp, M_NAMEI);
break;
@@ -226,7 +239,7 @@ namei(ndp)
* the target is returned locked, otherwise it is returned unlocked.
* When creating or renaming and LOCKPARENT is specified, the target may not
* be ".". When deleting and LOCKPARENT is specified, the target may be ".".
- *
+ *
* Overall outline of lookup:
*
* dirloop:
@@ -254,6 +267,7 @@ lookup(ndp)
int docache; /* == 0 do not cache last component */
int wantparent; /* 1 => wantparent or lockparent flag */
int rdonly; /* lookup read-only flag bit */
+ int trailing_slash;
int error = 0;
struct componentname *cnp = &ndp->ni_cnd;
struct proc *p = cnp->cn_proc;
@@ -264,7 +278,8 @@ lookup(ndp)
wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT);
docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE;
if (cnp->cn_nameiop == DELETE ||
- (wantparent && cnp->cn_nameiop != CREATE))
+ (wantparent && cnp->cn_nameiop != CREATE &&
+ cnp->cn_nameiop != LOOKUP))
docache = 0;
rdonly = cnp->cn_flags & RDONLY;
ndp->ni_dvp = NULL;
@@ -300,6 +315,25 @@ dirloop:
#endif
ndp->ni_pathlen -= cnp->cn_namelen;
ndp->ni_next = cp;
+
+ /*
+ * Replace multiple slashes by a single slash and trailing slashes
+ * by a null. This must be done before VOP_LOOKUP() because some
+	 * fs's don't know about trailing slashes.  Remember whether there
+	 * were trailing slashes so that symlinks, existing non-directories,
+	 * and non-existing files that won't be directories can be handled
+	 * specially later.
+ */
+ trailing_slash = 0;
+ while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) {
+ cp++;
+ ndp->ni_pathlen--;
+ if (*cp == '\0') {
+ trailing_slash = 1;
+ *ndp->ni_next = '\0'; /* XXX for direnter() ... */
+ }
+ }
+ ndp->ni_next = cp;
+
cnp->cn_flags |= MAKEENTRY;
if (*cp == '\0' && docache == 0)
cnp->cn_flags &= ~MAKEENTRY;
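
A standalone approximation of the slash normalization the hunk above performs component by component: runs of slashes collapse to one, and a trailing slash is remembered and stripped. This sketch does the whole string at once, so edge cases such as the bare path "/" differ from the kernel's handling:

#include <stdio.h>

static int
squash_slashes(char *path)
{
	char *src = path, *dst = path;
	int trailing_slash = 0;

	while (*src != '\0') {
		if (*src == '/') {
			while (*src == '/')	/* collapse "//" runs */
				src++;
			if (*src == '\0') {	/* remember, then strip */
				trailing_slash = 1;
				break;
			}
			*dst++ = '/';
		} else
			*dst++ = *src++;
	}
	*dst = '\0';
	return (trailing_slash);
}

int
main(void)
{
	char buf[] = "usr//local///bin/";

	/* Prints: 1 usr/local/bin */
	printf("%d %s\n", squash_slashes(buf), buf);
	return (0);
}
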
@@ -404,6 +438,11 @@ unionlookup:
error = EROFS;
goto bad;
}
+ if (*cp == '\0' && trailing_slash &&
+ !(cnp->cn_flags & WILLBEDIR)) {
+ error = ENOENT;
+ goto bad;
+ }
/*
* We return with ni_vp NULL to indicate that the entry
* doesn't currently exist, leaving a pointer to the
@@ -431,6 +470,7 @@ unionlookup:
}
dp = ndp->ni_vp;
+
/*
* Check to see if the vnode has been mounted on;
* if so find the root of the mounted file system.
@@ -451,11 +491,20 @@ unionlookup:
* Check for symbolic link
*/
if ((dp->v_type == VLNK) &&
- ((cnp->cn_flags & FOLLOW) || *ndp->ni_next == '/')) {
+ ((cnp->cn_flags & FOLLOW) || trailing_slash ||
+ *ndp->ni_next == '/')) {
cnp->cn_flags |= ISSYMLINK;
return (0);
}
+ /*
+ * Check for bogus trailing slashes.
+ */
+ if (trailing_slash && dp->v_type != VDIR) {
+ error = ENOTDIR;
+ goto bad2;
+ }
+
nextname:
/*
* Not a symbolic link. If more pathname,
diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c
new file mode 100644
index 0000000..779a1c4
--- /dev/null
+++ b/sys/kern/vfs_mount.c
@@ -0,0 +1,153 @@
+/*
+ * Copyright (c) 1989, 1993, 1995
+ * The Regents of the University of California. All rights reserved.
+ * Copyright (c) 1995 Artisoft, Inc. All Rights Reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * 3. All advertising materials mentioning features or use of this software
+ * must display the following acknowledgement:
+ * This product includes software developed by the University of
+ * California, Berkeley and its contributors.
+ * 4. Neither the name of the University nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ * @(#)vfs_conf.c 8.8 (Berkeley) 3/31/94
+ * $Id$
+ */
+
+/*
+ * PURPOSE: This file abstracts the root mounting interface from
+ * the per file system semantics for handling mounts,
+ * the overall intent of which is to move the BSD
+ * internals dependence out of the FS code, both to
+ * make the FS code more portable and to free up some
+ * of the BSD internals so that they may more easily
+ * be changed.
+ *
+ * NOTE1: Code is single entry/single exit to aid debugging
+ * and conversion for kernel multithreading.
+ *
+ * NOTE2: Code notes lock state in headers on entry and exit
+ * as an aid to conversion for kernel multithreading
+ *		and SMP reentrancy.
+ */
+#include <sys/param.h> /* dev_t (types.h)*/
+#include <sys/systm.h> /* rootvp*/
+#include <sys/proc.h> /* curproc*/
+#include <sys/vnode.h> /* NULLVP*/
+#include <sys/mount.h> /* struct mount*/
+#include <sys/malloc.h> /* M_MOUNT*/
+
+/*
+ * GLOBALS
+ */
+
+/*
+ * These define the root filesystem, device, and root filesystem type.
+ */
+struct mount *rootfs;
+struct vnode *rootvnode;
+char *mountrootfsname;
+
+/*
+ * vfs_init() will set maxvfsconf
+ * to the highest defined type number.
+ */
+int maxvfsconf;
+struct vfsconf *vfsconf;
+
+/*
+ * Common root mount code shared by all filesystems
+ */
+#define ROOTNAME "root_device"
+
+/*
+ * vfs_mountrootfs
+ *
+ * Common entry point for root mounts
+ *
+ * PARAMETERS:
+ * fsname name of the filesystem
+ *
+ * RETURNS: 0 Success
+ * !0 error number (errno.h)
+ *
+ * LOCK STATE:
+ * ENTRY
+ * <no locks held>
+ * EXIT
+ * <no locks held>
+ *
+ * NOTES:
+ *	This code is currently supported only for the FFS
+ *	file system type.  This is a matter of fixing the
+ *	other file systems, not this code!
+ */
+int
+vfs_mountrootfs(fsname)
+ char *fsname;
+{
+ struct mount *mp;
+ int err = 0;
+ struct proc *p = curproc; /* XXX */
+
+ /*
+ * New root mount structure
+ */
+ err = vfs_rootmountalloc(fsname, ROOTNAME, &mp);
+ if (err)
+ return (err);
+ mp->mnt_flag |= MNT_ROOTFS;
+
+ /*
+ * Attempt the mount
+ */
+ err = VFS_MOUNT(mp, NULL, NULL, NULL, p);
+ if (err)
+ goto error_2;
+
+ simple_lock(&mountlist_slock);
+ /* Add fs to list of mounted file systems*/
+ CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list);
+ simple_unlock(&mountlist_slock);
+
+ vfs_unbusy(mp, p);
+
+ /* root mount, update system time from FS specific data*/
+ inittodr(mp->mnt_time);
+
+ goto success;
+
+
+error_2:	/* mount error */
+
+	vfs_unbusy(mp, p);
+
+error_1:	/* lock error (label currently unreferenced) */
+
+	/* free mount struct before failing */
+	free(mp, M_MOUNT);
+
+success:
+	return (err);
+}
diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c
index f891e02..0b487fd 100644
--- a/sys/kern/vfs_subr.c
+++ b/sys/kern/vfs_subr.c
@@ -36,14 +36,19 @@
* SUCH DAMAGE.
*
* @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95
+ * $Id: vfs_subr.c,v 1.79 1997/03/04 18:31:56 bde Exp $
*/
/*
* External virtual filesystem routines
*/
+#include "opt_ddb.h"
+#include "opt_devfs.h"
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/kernel.h>
+#include <sys/file.h>
#include <sys/proc.h>
#include <sys/mount.h>
#include <sys/time.h>
@@ -58,15 +63,29 @@
#include <sys/mbuf.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
+#include <vm/vm_pager.h>
+#include <vm/vnode_pager.h>
#include <sys/sysctl.h>
#include <miscfs/specfs/specdev.h>
+#ifdef DDB
+extern void printlockedvnodes __P((void));
+#endif
+static void vclean __P((struct vnode *vp, int flags, struct proc *p));
+extern void vgonel __P((struct vnode *vp, struct proc *p));
+unsigned long numvnodes;
+extern void vfs_unmountroot __P((struct mount *rootfs));
+extern void vputrele __P((struct vnode *vp, int put));
+
enum vtype iftovt_tab[16] = {
VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON,
VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD,
};
-int vttoif_tab[9] = {
+int vttoif_tab[9] = {
0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK,
S_IFSOCK, S_IFIFO, S_IFMT,
};
@@ -80,13 +99,23 @@ int vttoif_tab[9] = {
(bp)->b_vnbufs.le_next = NOLIST; \
}
TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */
-struct mntlist mountlist; /* mounted filesystem list */
+static u_long freevnodes = 0;
+
+struct mntlist mountlist; /* mounted filesystem list */
struct simplelock mountlist_slock;
static struct simplelock mntid_slock;
struct simplelock mntvnode_slock;
struct simplelock vnode_free_list_slock;
static struct simplelock spechash_slock;
+int desiredvnodes;
+SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, "");
+
+static void vfs_free_addrlist __P((struct netexport *nep));
+static int vfs_free_netcred __P((struct radix_node *rn, void *w));
+static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep,
+ struct export_args *argp));
+
/*
* Initialize the vnode management data structures.
*/
@@ -94,6 +123,7 @@ void
vntblinit()
{
+ desiredvnodes = maxproc + vm_object_cache_max;
simple_lock_init(&mntvnode_slock);
simple_lock_init(&mntid_slock);
simple_lock_init(&spechash_slock);
@@ -119,17 +149,19 @@ vfs_busy(mp, flags, interlkp, p)
if (flags & LK_NOWAIT)
return (ENOENT);
mp->mnt_flag |= MNT_MWAIT;
- if (interlkp)
+ if (interlkp) {
simple_unlock(interlkp);
+ }
/*
* Since all busy locks are shared except the exclusive
* lock granted when unmounting, the only place that a
* wakeup needs to be done is at the release of the
* exclusive lock at the end of dounmount.
*/
- sleep((caddr_t)mp, PVFS);
- if (interlkp)
+ tsleep((caddr_t)mp, PVFS, "vfs_busy", 0);
+ if (interlkp) {
simple_lock(interlkp);
+ }
return (ENOENT);
}
lkflags = LK_SHARED;
@@ -187,6 +219,7 @@ vfs_rootmountalloc(fstypename, devname, mpp)
mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK;
strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN);
mp->mnt_stat.f_mntonname[0] = '/';
+ mp->mnt_stat.f_mntonname[1] = 0;
(void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0);
*mpp = mp;
return (0);
@@ -198,15 +231,16 @@ vfs_rootmountalloc(fstypename, devname, mpp)
* trying those that have mountroot routines, and try them until one
* works or we have tried them all.
*/
+#ifdef notdef /* XXX JH */
int
-vfs_mountroot()
+lite2_vfs_mountroot(void)
{
struct vfsconf *vfsp;
- extern int (*mountroot)(void);
+ extern int (*lite2_mountroot)(void);
int error;
- if (mountroot != NULL)
- return ((*mountroot)());
+ if (lite2_mountroot != NULL)
+ return ((*lite2_mountroot)());
for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
if (vfsp->vfc_mountroot == NULL)
continue;
@@ -216,6 +250,7 @@ vfs_mountroot()
}
return (ENODEV);
}
+#endif
/*
* Lookup a mount point by filesystem identifier.
@@ -228,15 +263,15 @@ vfs_getvfs(fsid)
simple_lock(&mountlist_slock);
for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
- mp = mp->mnt_list.cqe_next) {
+ mp = mp->mnt_list.cqe_next) {
if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
simple_unlock(&mountlist_slock);
return (mp);
- }
+ }
}
simple_unlock(&mountlist_slock);
- return ((struct mount *)0);
+ return ((struct mount *) 0);
}
/*
@@ -246,12 +281,12 @@ void
vfs_getnewfsid(mp)
struct mount *mp;
{
-static u_short xxxfs_mntid;
+ static u_short xxxfs_mntid;
fsid_t tfsid;
int mtype;
- simple_lock(&mntid_slock);
+ simple_lock(&mntid_slock);
mtype = mp->mnt_vfc->vfc_typenum;
mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
mp->mnt_stat.f_fsid.val[1] = mtype;
@@ -278,25 +313,22 @@ vattr_null(vap)
{
vap->va_type = VNON;
- vap->va_size = vap->va_bytes = VNOVAL;
+ vap->va_size = VNOVAL;
+ vap->va_bytes = VNOVAL;
vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
- vap->va_fsid = vap->va_fileid =
- vap->va_blocksize = vap->va_rdev =
- vap->va_atime.ts_sec = vap->va_atime.ts_nsec =
- vap->va_mtime.ts_sec = vap->va_mtime.ts_nsec =
- vap->va_ctime.ts_sec = vap->va_ctime.ts_nsec =
- vap->va_flags = vap->va_gen = VNOVAL;
+ vap->va_fsid = vap->va_fileid =
+ vap->va_blocksize = vap->va_rdev =
+ vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
+ vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
+ vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
+ vap->va_flags = vap->va_gen = VNOVAL;
vap->va_vaflags = 0;
}
/*
* Routines having to do with the management of the vnode table.
*/
-extern int (**dead_vnodeop_p)();
-static void vclean __P((struct vnode *vp, int flag, struct proc *p));
-extern void vgonel __P((struct vnode *vp, struct proc *p));
-long numvnodes;
-extern struct vattr va_null;
+extern vop_t **dead_vnodeop_p;
/*
* Return the next vnode from the free list.
@@ -305,23 +337,31 @@ int
getnewvnode(tag, mp, vops, vpp)
enum vtagtype tag;
struct mount *mp;
- int (**vops)();
+ vop_t **vops;
struct vnode **vpp;
{
struct proc *p = curproc; /* XXX */
struct vnode *vp;
- int s;
- int cnt;
-top:
simple_lock(&vnode_free_list_slock);
- if ((vnode_free_list.tqh_first == NULL &&
- numvnodes < 2 * desiredvnodes) ||
- numvnodes < desiredvnodes) {
+retry:
+ /*
+ * we allocate a new vnode if
+ * 1. we don't have any free
+ * Pretty obvious, we actually used to panic, but that
+ * is a silly thing to do.
+	 * 2. we haven't filled our pool yet
+	 *	We don't want to trash the incore (VM-)vnodecache.
+	 * 3. less than 1/4th of our vnodes are free.
+ * We don't want to trash the namei cache either.
+ */
+ if (freevnodes < (numvnodes >> 2) ||
+ numvnodes < desiredvnodes ||
+ vnode_free_list.tqh_first == NULL) {
simple_unlock(&vnode_free_list_slock);
- vp = (struct vnode *)malloc((u_long)sizeof *vp,
+ vp = (struct vnode *) malloc((u_long) sizeof *vp,
M_VNODE, M_WAITOK);
- bzero((char *)vp, sizeof *vp);
+ bzero((char *) vp, sizeof *vp);
numvnodes++;
} else {
for (vp = vnode_free_list.tqh_first;
@@ -343,31 +383,45 @@ top:
if (vp->v_usecount)
panic("free vnode isn't");
TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
+ if (vp->v_usage > 0) {
+ simple_unlock(&vp->v_interlock);
+ --vp->v_usage;
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ goto retry;
+ }
+ freevnodes--;
+
/* see comment on why 0xdeadb is set at end of vgone (below) */
- vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb;
+ vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb;
simple_unlock(&vnode_free_list_slock);
vp->v_lease = NULL;
if (vp->v_type != VBAD)
vgonel(vp, p);
- else
+ else {
simple_unlock(&vp->v_interlock);
+ }
+
#ifdef DIAGNOSTIC
- if (vp->v_data)
- panic("cleaned vnode isn't");
- s = splbio();
- if (vp->v_numoutput)
- panic("Clean vnode has pending I/O's");
- splx(s);
+ {
+ int s;
+
+ if (vp->v_data)
+ panic("cleaned vnode isn't");
+ s = splbio();
+ if (vp->v_numoutput)
+ panic("Clean vnode has pending I/O's");
+ splx(s);
+ }
#endif
vp->v_flag = 0;
vp->v_lastr = 0;
- vp->v_ralen = 0;
- vp->v_maxra = 0;
vp->v_lastw = 0;
vp->v_lasta = 0;
vp->v_cstart = 0;
vp->v_clen = 0;
vp->v_socket = 0;
+ vp->v_writecount = 0; /* XXX */
+ vp->v_usage = 0;
}
vp->v_type = VNON;
cache_purge(vp);
@@ -385,8 +439,8 @@ top:
*/
void
insmntque(vp, mp)
- struct vnode *vp;
- struct mount *mp;
+ register struct vnode *vp;
+ register struct mount *mp;
{
simple_lock(&mntvnode_slock);
@@ -398,8 +452,11 @@ insmntque(vp, mp)
/*
* Insert into list of vnodes for the new mount point, if available.
*/
- if ((vp->v_mount = mp) != NULL)
- LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
+ if ((vp->v_mount = mp) == NULL) {
+ simple_unlock(&mntvnode_slock);
+ return;
+ }
+ LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
simple_unlock(&mntvnode_slock);
}
@@ -413,14 +470,13 @@ vwakeup(bp)
register struct vnode *vp;
bp->b_flags &= ~B_WRITEINPROG;
- if (vp = bp->b_vp) {
- if (--vp->v_numoutput < 0)
+ if ((vp = bp->b_vp)) {
+ vp->v_numoutput--;
+ if (vp->v_numoutput < 0)
panic("vwakeup: neg numoutput");
- if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) {
- if (vp->v_numoutput < 0)
- panic("vwakeup: neg numoutput 2");
+ if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
vp->v_flag &= ~VBWAIT;
- wakeup((caddr_t)&vp->v_numoutput);
+ wakeup((caddr_t) &vp->v_numoutput);
}
}
}
@@ -440,15 +496,18 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
register struct buf *bp;
struct buf *nbp, *blist;
int s, error;
+ vm_object_t object;
if (flags & V_SAVE) {
- if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p))
+ if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
return (error);
if (vp->v_dirtyblkhd.lh_first != NULL)
panic("vinvalbuf: dirty bufs");
}
+
+ s = splbio();
for (;;) {
- if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA)
+ if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
while (blist && blist->b_lblkno < 0)
blist = blist->b_vnbufs.le_next;
if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
@@ -460,35 +519,51 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
for (bp = blist; bp; bp = nbp) {
nbp = bp->b_vnbufs.le_next;
- if (flags & V_SAVEMETA && bp->b_lblkno < 0)
+ if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
continue;
- s = splbio();
if (bp->b_flags & B_BUSY) {
bp->b_flags |= B_WANTED;
- error = tsleep((caddr_t)bp,
- slpflag | (PRIBIO + 1), "vinvalbuf",
- slptimeo);
- splx(s);
- if (error)
+ error = tsleep((caddr_t) bp,
+ slpflag | (PRIBIO + 1), "vinvalbuf",
+ slptimeo);
+ if (error) {
+ splx(s);
return (error);
+ }
break;
}
bremfree(bp);
bp->b_flags |= B_BUSY;
- splx(s);
/*
- * XXX Since there are no node locks for NFS, I believe
- * there is a slight chance that a delayed write will
- * occur while sleeping just above, so check for it.
+ * XXX Since there are no node locks for NFS, I
+ * believe there is a slight chance that a delayed
+ * write will occur while sleeping just above, so
+ * check for it.
*/
if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
(void) VOP_BWRITE(bp);
break;
}
- bp->b_flags |= B_INVAL;
+ bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF);
brelse(bp);
}
}
+
+ while (vp->v_numoutput > 0) {
+ vp->v_flag |= VBWAIT;
+ tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
+ }
+
+ splx(s);
+
+ /*
+ * Destroy the copy in the VM cache, too.
+ */
+ object = vp->v_object;
+ if (object != NULL) {
+ vm_object_page_remove(object, 0, object->size,
+ (flags & V_SAVE) ? TRUE : FALSE);
+ }
if (!(flags & V_SAVEMETA) &&
(vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
panic("vinvalbuf: flush failed");
@@ -503,6 +578,7 @@ bgetvp(vp, bp)
register struct vnode *vp;
register struct buf *bp;
{
+ int s;
if (bp->b_vp)
panic("bgetvp: not free");
@@ -515,7 +591,9 @@ bgetvp(vp, bp)
/*
* Insert onto list for new vnode.
*/
+ s = splbio();
bufinsvn(bp, &vp->v_cleanblkhd);
+ splx(s);
}
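/*
 * Sketch of the locking idiom the hunks above and below introduce:
 * the vnode buffer lists are also touched from biodone() at interrupt
 * time, so list surgery is bracketed by splbio()/splx():
 *
 *	s = splbio();		raise IPL past disk interrupts
 *	bufinsvn(bp, &vp->v_cleanblkhd);
 *	splx(s);		restore the previous level
 */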
/*
@@ -526,20 +604,60 @@ brelvp(bp)
register struct buf *bp;
{
struct vnode *vp;
+ int s;
if (bp->b_vp == (struct vnode *) 0)
panic("brelvp: NULL");
/*
* Delete from old vnode list, if on one.
*/
+ s = splbio();
if (bp->b_vnbufs.le_next != NOLIST)
bufremvn(bp);
+ splx(s);
+
vp = bp->b_vp;
bp->b_vp = (struct vnode *) 0;
HOLDRELE(vp);
}
/*
+ * Associate a p-buffer with a vnode.
+ */
+void
+pbgetvp(vp, bp)
+ register struct vnode *vp;
+ register struct buf *bp;
+{
+#if defined(DIAGNOSTIC)
+ if (bp->b_vp)
+ panic("pbgetvp: not free");
+#endif
+ bp->b_vp = vp;
+ if (vp->v_type == VBLK || vp->v_type == VCHR)
+ bp->b_dev = vp->v_rdev;
+ else
+ bp->b_dev = NODEV;
+}
+
+/*
+ * Disassociate a p-buffer from a vnode.
+ */
+void
+pbrelvp(bp)
+ register struct buf *bp;
+{
+ struct vnode *vp;
+
+#if defined(DIAGNOSTIC)
+ if (bp->b_vp == (struct vnode *) 0)
+ panic("pbrelvp: NULL");
+#endif
+
+ bp->b_vp = (struct vnode *) 0;
+}
+
+/*
* Reassign a buffer from one vnode to another.
* Used to assign file specific control information
* (indirect blocks) to the vnode to which they belong.
@@ -549,28 +667,43 @@ reassignbuf(bp, newvp)
register struct buf *bp;
register struct vnode *newvp;
{
- register struct buflists *listheadp;
+ int s;
if (newvp == NULL) {
printf("reassignbuf: NULL");
return;
}
+
+ s = splbio();
/*
* Delete from old vnode list, if on one.
*/
if (bp->b_vnbufs.le_next != NOLIST)
bufremvn(bp);
/*
- * If dirty, put on list of dirty buffers;
- * otherwise insert onto list of clean buffers.
+ * If dirty, put on list of dirty buffers; otherwise insert onto list
+ * of clean buffers.
*/
- if (bp->b_flags & B_DELWRI)
- listheadp = &newvp->v_dirtyblkhd;
- else
- listheadp = &newvp->v_cleanblkhd;
- bufinsvn(bp, listheadp);
+ if (bp->b_flags & B_DELWRI) {
+ struct buf *tbp;
+
+ tbp = newvp->v_dirtyblkhd.lh_first;
+ if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) {
+ bufinsvn(bp, &newvp->v_dirtyblkhd);
+ } else {
+ while (tbp->b_vnbufs.le_next &&
+ (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) {
+ tbp = tbp->b_vnbufs.le_next;
+ }
+ LIST_INSERT_AFTER(tbp, bp, b_vnbufs);
+ }
+ } else {
+ bufinsvn(bp, &newvp->v_cleanblkhd);
+ }
+ splx(s);
}
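/*
 * The B_DELWRI branch above keeps v_dirtyblkhd sorted by b_lblkno,
 * presumably so writeback proceeds in ascending block order.  A
 * hypothetical DIAGNOSTIC-style checker for that invariant:
 *
 *	static void
 *	check_dirty_order(struct vnode *vp)
 *	{
 *		struct buf *bp;
 *
 *		for (bp = vp->v_dirtyblkhd.lh_first;
 *		    bp && bp->b_vnbufs.le_next; bp = bp->b_vnbufs.le_next)
 *			if (bp->b_lblkno > bp->b_vnbufs.le_next->b_lblkno)
 *				panic("v_dirtyblkhd out of order");
 *	}
 */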
+#ifndef DEVFS_ROOT
/*
* Create a vnode for a block device.
* Used for root filesystem, argdev, and swap areas.
@@ -585,24 +718,23 @@ bdevvp(dev, vpp)
struct vnode *nvp;
int error;
- if (dev == NODEV) {
- *vpp = NULLVP;
- return (ENODEV);
- }
- error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp);
+ if (dev == NODEV)
+ return (0);
+ error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp);
if (error) {
- *vpp = NULLVP;
+ *vpp = 0;
return (error);
}
vp = nvp;
vp->v_type = VBLK;
- if (nvp = checkalias(vp, dev, (struct mount *)0)) {
+ if ((nvp = checkalias(vp, dev, (struct mount *) 0))) {
vput(vp);
vp = nvp;
}
*vpp = vp;
return (0);
}
+#endif /* !DEVFS_ROOT */
/*
* Check to see if the new vnode represents a special device
@@ -648,7 +780,7 @@ loop:
}
if (vp == NULL || vp->v_tag != VT_NON) {
MALLOC(nvp->v_specinfo, struct specinfo *,
- sizeof(struct specinfo), M_VNODE, M_WAITOK);
+ sizeof(struct specinfo), M_VNODE, M_WAITOK);
nvp->v_rdev = nvp_rdev;
nvp->v_hashchain = vpp;
nvp->v_specnext = *vpp;
@@ -683,7 +815,7 @@ loop:
*/
int
vget(vp, flags, p)
- struct vnode *vp;
+ register struct vnode *vp;
int flags;
struct proc *p;
{
@@ -695,8 +827,9 @@ vget(vp, flags, p)
* return failure. Cleaning is determined by checking that
* the VXLOCK flag is set.
*/
- if ((flags & LK_INTERLOCK) == 0)
+ if ((flags & LK_INTERLOCK) == 0) {
simple_lock(&vp->v_interlock);
+ }
if (vp->v_flag & VXLOCK) {
vp->v_flag |= VXWANT;
simple_unlock(&vp->v_interlock);
@@ -707,8 +840,22 @@ vget(vp, flags, p)
simple_lock(&vnode_free_list_slock);
TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
simple_unlock(&vnode_free_list_slock);
+ freevnodes--;
}
vp->v_usecount++;
+ /*
+ * Create the VM object, if needed
+ */
+ if ((vp->v_type == VREG) &&
+ ((vp->v_object == NULL) ||
+ (vp->v_object->flags & OBJ_VFS_REF) == 0)) {
+ /*
+ * XXX vfs_object_create probably needs the interlock.
+ */
+ simple_unlock(&vp->v_interlock);
+ vfs_object_create(vp, curproc, curproc->p_ucred, 0);
+ simple_lock(&vp->v_interlock);
+ }
if (flags & LK_TYPE_MASK) {
if (error = vn_lock(vp, flags | LK_INTERLOCK, p))
vrele(vp);
@@ -781,14 +928,15 @@ vop_nolock(ap)
* Since we are not using the lock manager, we must clear
* the interlock here.
*/
- if (ap->a_flags & LK_INTERLOCK)
+ if (ap->a_flags & LK_INTERLOCK) {
simple_unlock(&ap->a_vp->v_interlock);
+ }
return (0);
#endif
}
/*
- * Decrement the active use count.
+ * Do the inverse of vop_nolock, handling the interlock in a compatible way.
*/
int
vop_nounlock(ap)
@@ -800,9 +948,13 @@ vop_nounlock(ap)
{
struct vnode *vp = ap->a_vp;
- if (vp->v_vnlock == NULL)
+ if (vp->v_vnlock == NULL) {
+ if (ap->a_flags & LK_INTERLOCK)
+ simple_unlock(&ap->a_vp->v_interlock);
return (0);
- return (lockmgr(vp->v_vnlock, LK_RELEASE, NULL, ap->a_p));
+ }
+ return (lockmgr(vp->v_vnlock, LK_RELEASE | ap->a_flags,
+ &ap->a_vp->v_interlock, ap->a_p));
}
/*
@@ -821,91 +973,124 @@ vop_noislocked(ap)
return (lockstatus(vp->v_vnlock));
}
+/* #ifdef DIAGNOSTIC */
/*
- * Vnode reference.
+ * Vnode reference, just increment the count.
*/
void
vref(vp)
struct vnode *vp;
{
-
simple_lock(&vp->v_interlock);
if (vp->v_usecount <= 0)
panic("vref used where vget required");
+
vp->v_usecount++;
+
+ if ((vp->v_type == VREG) &&
+ ((vp->v_object == NULL) ||
+ ((vp->v_object->flags & OBJ_VFS_REF) == 0)) ) {
+ /*
+		 * We need to lock the vnode while the object is
+		 * created.  This is necessary to
+ * keep the system from re-entrantly doing it
+ * multiple times.
+ * XXX vfs_object_create probably needs the interlock?
+ */
+ simple_unlock(&vp->v_interlock);
+ vfs_object_create(vp, curproc, curproc->p_ucred, 0);
+ return;
+ }
simple_unlock(&vp->v_interlock);
}
/*
- * vput(), just unlock and vrele()
+ * Vnode put/release.
+ * If count drops to zero, call inactive routine and return to freelist.
*/
void
-vput(vp)
+vputrele(vp, put)
struct vnode *vp;
+ int put;
{
struct proc *p = curproc; /* XXX */
-#ifdef DIGANOSTIC
+#ifdef DIAGNOSTIC
if (vp == NULL)
- panic("vput: null vp");
+ panic("vputrele: null vp");
#endif
simple_lock(&vp->v_interlock);
vp->v_usecount--;
+
+ if ((vp->v_usecount == 1) &&
+ vp->v_object &&
+ (vp->v_object->flags & OBJ_VFS_REF)) {
+ vp->v_object->flags &= ~OBJ_VFS_REF;
+ if (put) {
+ VOP_UNLOCK(vp, LK_INTERLOCK, p);
+ } else {
+ simple_unlock(&vp->v_interlock);
+ }
+ vm_object_deallocate(vp->v_object);
+ return;
+ }
+
if (vp->v_usecount > 0) {
- simple_unlock(&vp->v_interlock);
- VOP_UNLOCK(vp, 0, p);
+ if (put) {
+ VOP_UNLOCK(vp, LK_INTERLOCK, p);
+ } else {
+ simple_unlock(&vp->v_interlock);
+ }
return;
}
+
+ if (vp->v_usecount < 0) {
#ifdef DIAGNOSTIC
- if (vp->v_usecount < 0 || vp->v_writecount != 0) {
- vprint("vput: bad ref count", vp);
- panic("vput: ref cnt");
- }
+ vprint("vputrele: negative ref count", vp);
#endif
- /*
- * insert at tail of LRU list
- */
+ panic("vputrele: negative ref cnt");
+ }
simple_lock(&vnode_free_list_slock);
- TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ if (vp->v_flag & VAGE) {
+ vp->v_flag &= ~VAGE;
+ vp->v_usage = 0;
+		if (vp->v_tag != VT_TFS)
+ TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
+ } else {
+		if (vp->v_tag != VT_TFS)
+ TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
+ }
+ freevnodes++;
simple_unlock(&vnode_free_list_slock);
- simple_unlock(&vp->v_interlock);
- VOP_INACTIVE(vp, p);
+
+ /*
+ * If we are doing a vput, the node is already locked, and we must
+ * call VOP_INACTIVE with the node locked. So, in the case of
+ * vrele, we explicitly lock the vnode before calling VOP_INACTIVE.
+ */
+ if (put) {
+ simple_unlock(&vp->v_interlock);
+ VOP_INACTIVE(vp, p);
+ } else if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) {
+ VOP_INACTIVE(vp, p);
+ }
}
/*
- * Vnode release.
- * If count drops to zero, call inactive routine and return to freelist.
+ * vput(), just unlock and vrele()
*/
void
-vrele(vp)
+vput(vp)
struct vnode *vp;
{
- struct proc *p = curproc; /* XXX */
+ vputrele(vp, 1);
+}
-#ifdef DIAGNOSTIC
- if (vp == NULL)
- panic("vrele: null vp");
-#endif
- simple_lock(&vp->v_interlock);
- vp->v_usecount--;
- if (vp->v_usecount > 0) {
- simple_unlock(&vp->v_interlock);
- return;
- }
-#ifdef DIAGNOSTIC
- if (vp->v_usecount < 0 || vp->v_writecount != 0) {
- vprint("vrele: bad ref count", vp);
- panic("vrele: ref cnt");
- }
-#endif
- /*
- * insert at tail of LRU list
- */
- simple_lock(&vnode_free_list_slock);
- TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
- simple_unlock(&vnode_free_list_slock);
- if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0)
- VOP_INACTIVE(vp, p);
+void
+vrele(vp)
+ struct vnode *vp;
+{
+ vputrele(vp, 0);
}
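/*
 * Resulting caller contract (sketch): both routines drop one
 * reference via vputrele(); they differ only in the lock state they
 * expect on entry:
 *
 *	vn_lock(vp, LK_EXCLUSIVE, p);
 *	...use vp...
 *	vput(vp);	vp was locked; unlock + release in one call
 *
 *	...vp unlocked...
 *	vrele(vp);	release only; locks internally for VOP_INACTIVE
 */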
#ifdef DIAGNOSTIC
@@ -947,8 +1132,8 @@ holdrele(vp)
* that are found.
*/
#ifdef DIAGNOSTIC
-int busyprt = 0; /* print out busy vnodes */
-struct ctldebug debug1 = { "busyprt", &busyprt };
+static int busyprt = 0; /* print out busy vnodes */
+SYSCTL_INT(_debug, 1, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif
int
@@ -964,6 +1149,10 @@ vflush(mp, skipvp, flags)
simple_lock(&mntvnode_slock);
loop:
for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
+ /*
+ * Make sure this vnode wasn't reclaimed in getnewvnode().
+	 * Start over if it was (it won't be on the list anymore).
+ */
if (vp->v_mount != mp)
goto loop;
nvp = vp->v_mntvnodes.le_next;
@@ -982,17 +1171,29 @@ loop:
continue;
}
/*
- * If WRITECLOSE is set, only flush out regular file
- * vnodes open for writing.
+ * If WRITECLOSE is set, only flush out regular file vnodes
+ * open for writing.
*/
if ((flags & WRITECLOSE) &&
(vp->v_writecount == 0 || vp->v_type != VREG)) {
simple_unlock(&vp->v_interlock);
continue;
}
+
+ if (vp->v_object && (vp->v_object->flags & OBJ_VFS_REF)) {
+ simple_unlock(&vp->v_interlock);
+ simple_unlock(&mntvnode_slock);
+ vm_object_reference(vp->v_object);
+ pager_cache(vp->v_object, FALSE);
+ vp->v_object->flags &= ~OBJ_VFS_REF;
+ vm_object_deallocate(vp->v_object);
+ simple_lock(&mntvnode_slock);
+ simple_lock(&vp->v_interlock);
+ }
+
/*
- * With v_usecount == 0, all we need to do is clear
- * out the vnode data structures and we are done.
+ * With v_usecount == 0, all we need to do is clear out the
+ * vnode data structures and we are done.
*/
if (vp->v_usecount == 0) {
simple_unlock(&mntvnode_slock);
@@ -1000,10 +1201,11 @@ loop:
simple_lock(&mntvnode_slock);
continue;
}
+
/*
- * If FORCECLOSE is set, forcibly close the vnode.
- * For block or character devices, revert to an
- * anonymous device. For all other files, just kill them.
+ * If FORCECLOSE is set, forcibly close the vnode. For block
+ * or character devices, revert to an anonymous device. For
+ * all other files, just kill them.
*/
if (flags & FORCECLOSE) {
simple_unlock(&mntvnode_slock);
@@ -1012,7 +1214,7 @@ loop:
} else {
vclean(vp, 0, p);
vp->v_op = spec_vnodeop_p;
- insmntque(vp, (struct mount *)0);
+ insmntque(vp, (struct mount *) 0);
}
simple_lock(&mntvnode_slock);
continue;
@@ -1032,27 +1234,22 @@ loop:
/*
* Disassociate the underlying file system from a vnode.
- * The vnode interlock is held on entry.
*/
static void
-vclean(vp, flags, p)
- struct vnode *vp;
- int flags;
- struct proc *p;
+vclean(struct vnode *vp, int flags, struct proc *p)
{
int active;
/*
- * Check to see if the vnode is in use.
- * If so we have to reference it before we clean it out
- * so that its count cannot fall to zero and generate a
- * race against ourselves to recycle it.
+ * Check to see if the vnode is in use. If so we have to reference it
+ * before we clean it out so that its count cannot fall to zero and
+ * generate a race against ourselves to recycle it.
*/
- if (active = vp->v_usecount)
+ if ((active = vp->v_usecount))
vp->v_usecount++;
/*
- * Prevent the vnode from being recycled or
- * brought into use while we clean it out.
+ * Prevent the vnode from being recycled or brought into use while we
+ * clean it out.
*/
if (vp->v_flag & VXLOCK)
panic("vclean: deadlock");
@@ -1109,12 +1306,12 @@ vclean(vp, flags, p)
vp->v_flag &= ~VXLOCK;
if (vp->v_flag & VXWANT) {
vp->v_flag &= ~VXWANT;
- wakeup((caddr_t)vp);
+ wakeup((caddr_t) vp);
}
}
/*
- * Eliminate all activity associated with the requested vnode
+ * Eliminate all activity associated with the requested vnode
* and with all vnodes aliased to the requested vnode.
*/
int
@@ -1162,8 +1359,9 @@ vop_revoke(ap)
vgone(vq);
break;
}
- if (vq == NULLVP)
+ if (vq == NULLVP) {
simple_unlock(&spechash_slock);
+ }
}
/*
* Remove the lock so that vgone below will
@@ -1190,8 +1388,9 @@ vrecycle(vp, inter_lkp, p)
simple_lock(&vp->v_interlock);
if (vp->v_usecount == 0) {
- if (inter_lkp)
+ if (inter_lkp) {
simple_unlock(inter_lkp);
+ }
vgonel(vp, p);
return (1);
}
@@ -1205,7 +1404,7 @@ vrecycle(vp, inter_lkp, p)
*/
void
vgone(vp)
- struct vnode *vp;
+ register struct vnode *vp;
{
struct proc *p = curproc; /* XXX */
@@ -1234,6 +1433,11 @@ vgonel(vp, p)
tsleep((caddr_t)vp, PINOD, "vgone", 0);
return;
}
+
+ if (vp->v_object) {
+ vp->v_object->flags |= OBJ_VNODE_GONE;
+ }
+
/*
* Clean out the filesystem specific data.
*/
@@ -1281,6 +1485,7 @@ vgonel(vp, p)
FREE(vp->v_specinfo, M_VNODE);
vp->v_specinfo = NULL;
}
+
/*
* If it is on the freelist and not already at the head,
* move it to the head of the list. The test of the back
@@ -1297,12 +1502,13 @@ vgonel(vp, p)
if (vp->v_usecount == 0) {
simple_lock(&vnode_free_list_slock);
if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
- vnode_free_list.tqh_first != vp) {
+ vnode_free_list.tqh_first != vp) {
TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
}
simple_unlock(&vnode_free_list_slock);
}
+
vp->v_type = VBAD;
}
@@ -1315,7 +1521,7 @@ vfinddev(dev, type, vpp)
enum vtype type;
struct vnode **vpp;
{
- struct vnode *vp;
+ register struct vnode *vp;
int rc = 0;
simple_lock(&spechash_slock);
@@ -1335,7 +1541,7 @@ vfinddev(dev, type, vpp)
*/
int
vcount(vp)
- struct vnode *vp;
+ register struct vnode *vp;
{
struct vnode *vq, *vnext;
int count;
@@ -1366,7 +1572,7 @@ loop:
* Print out a description of a vnode.
*/
static char *typename[] =
- { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" };
+{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};
void
vprint(label, vp)
@@ -1377,9 +1583,9 @@ vprint(label, vp)
if (label != NULL)
printf("%s: ", label);
- printf("type %s, usecount %d, writecount %d, refcount %d,",
- typename[vp->v_type], vp->v_usecount, vp->v_writecount,
- vp->v_holdcnt);
+ printf("type %s, usecount %d, writecount %d, refcount %ld,",
+ typename[vp->v_type], vp->v_usecount, vp->v_writecount,
+ vp->v_holdcnt);
buf[0] = '\0';
if (vp->v_flag & VROOT)
strcat(buf, "|VROOT");
@@ -1405,7 +1611,7 @@ vprint(label, vp)
}
}
-#ifdef DEBUG
+#ifdef DDB
/*
* List all of the locked vnodes in the system.
* Called when debugging the kernel.
@@ -1441,19 +1647,22 @@ printlockedvnodes()
/*
* Top level filesystem related information gathering.
*/
-int
-vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
- int *name;
- u_int namelen;
- void *oldp;
- size_t *oldlenp;
- void *newp;
- size_t newlen;
- struct proc *p;
+static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS);
+
+static int
+vfs_sysctl SYSCTL_HANDLER_ARGS
{
- struct ctldebug *cdp;
+ int *name = (int *)arg1 - 1; /* XXX */
+ u_int namelen = arg2 + 1; /* XXX */
struct vfsconf *vfsp;
+#ifndef NO_COMPAT_PRELITE2
+ /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */
+ if (namelen == 1)
+ return (sysctl_ovfs_conf(oidp, arg1, arg2, req));
+#endif
+
+#ifdef notyet
/* all sysctl names at this level are at least name and field */
if (namelen < 2)
return (ENOTDIR); /* overloaded */
@@ -1466,58 +1675,83 @@ vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p)
return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1,
oldp, oldlenp, newp, newlen, p));
}
+#endif
switch (name[1]) {
case VFS_MAXTYPENUM:
- return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf));
+ if (namelen != 2)
+ return (ENOTDIR);
+ return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int)));
case VFS_CONF:
- if (namelen < 3)
+ if (namelen != 3)
return (ENOTDIR); /* overloaded */
for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next)
if (vfsp->vfc_typenum == name[2])
break;
if (vfsp == NULL)
return (EOPNOTSUPP);
- return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp,
- sizeof(struct vfsconf)));
+ return (SYSCTL_OUT(req, vfsp, sizeof *vfsp));
}
return (EOPNOTSUPP);
}
+SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl,
+ "Generic filesystem");
+
+#ifndef NO_COMPAT_PRELITE2
+
+static int
+sysctl_ovfs_conf SYSCTL_HANDLER_ARGS
+{
+ int error;
+ struct vfsconf *vfsp;
+ struct ovfsconf ovfs;
+
+ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
+ ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */
+ strcpy(ovfs.vfc_name, vfsp->vfc_name);
+ ovfs.vfc_index = vfsp->vfc_typenum;
+ ovfs.vfc_refcount = vfsp->vfc_refcount;
+ ovfs.vfc_flags = vfsp->vfc_flags;
+ error = SYSCTL_OUT(req, &ovfs, sizeof ovfs);
+ if (error)
+ return error;
+ }
+ return 0;
+}
+
+#endif /* !NO_COMPAT_PRELITE2 */
+
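/*
 * Sketch of the new-style handler convention used above (hypothetical
 * oid number and format string, following the SYSCTL_INT(_debug, 1,
 * busyprt, ...) example elsewhere in this file).  SYSCTL_OUT copies
 * kernel data out through req and itself handles the size-probe case
 * where the caller passed no buffer:
 *
 *	static int
 *	sysctl_example SYSCTL_HANDLER_ARGS
 *	{
 *		int datum = 42;
 *
 *		return (SYSCTL_OUT(req, &datum, sizeof(datum)));
 *	}
 *	SYSCTL_PROC(_debug, 2, example, CTLTYPE_INT | CTLFLAG_RD,
 *	    0, 0, sysctl_example, "I", "");
 */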
int kinfo_vdebug = 1;
int kinfo_vgetfailed;
+
#define KINFO_VNODESLOP 10
/*
* Dump vnode list (via sysctl).
* Copyout address of vnode followed by vnode.
*/
/* ARGSUSED */
-int
-sysctl_vnode(where, sizep, p)
- char *where;
- size_t *sizep;
- struct proc *p;
+static int
+sysctl_vnode SYSCTL_HANDLER_ARGS
{
+ struct proc *p = curproc; /* XXX */
struct mount *mp, *nmp;
struct vnode *nvp, *vp;
- char *bp = where, *savebp;
- char *ewhere;
int error;
#define VPTRSZ sizeof (struct vnode *)
#define VNODESZ sizeof (struct vnode)
- if (where == NULL) {
- *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ);
- return (0);
- }
- ewhere = where + *sizep;
-
+
+ req->lock = 0;
+ if (!req->oldptr) /* Make an estimate */
+ return (SYSCTL_OUT(req, 0,
+ (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ)));
+
simple_lock(&mountlist_slock);
for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
nmp = mp->mnt_list.cqe_next;
continue;
}
- savebp = bp;
again:
simple_lock(&mntvnode_slock);
for (vp = mp->mnt_vnodelist.lh_first;
@@ -1532,20 +1766,13 @@ again:
simple_unlock(&mntvnode_slock);
if (kinfo_vdebug)
printf("kinfo: vp changed\n");
- bp = savebp;
goto again;
}
nvp = vp->v_mntvnodes.le_next;
- if (bp + VPTRSZ + VNODESZ > ewhere) {
- simple_unlock(&mntvnode_slock);
- *sizep = bp - where;
- return (ENOMEM);
- }
simple_unlock(&mntvnode_slock);
- if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) ||
- (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ)))
+ if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) ||
+ (error = SYSCTL_OUT(req, vp, VNODESZ)))
return (error);
- bp += VPTRSZ + VNODESZ;
simple_lock(&mntvnode_slock);
}
simple_unlock(&mntvnode_slock);
@@ -1555,10 +1782,12 @@ again:
}
simple_unlock(&mountlist_slock);
- *sizep = bp - where;
return (0);
}
+SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD,
+ 0, 0, sysctl_vnode, "S,vnode", "");
+
/*
* Check to see if a filesystem is mounted on a block device.
*/
@@ -1595,14 +1824,23 @@ void
vfs_unmountall()
{
struct mount *mp, *nmp;
- struct proc *p = curproc; /* XXX */
+ struct proc *p = initproc; /* XXX XXX should this be proc0? */
+ int error;
/*
* Since this only runs when rebooting, it is not interlocked.
*/
for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
nmp = mp->mnt_list.cqe_prev;
- (void) dounmount(mp, MNT_FORCE, p);
+ error = dounmount(mp, MNT_FORCE, p);
+ if (error) {
+ printf("unmount of %s failed (",
+ mp->mnt_stat.f_mntonname);
+ if (error == EBUSY)
+ printf("BUSY)\n");
+ else
+ printf("%d)\n", error);
+ }
}
}
@@ -1611,10 +1849,8 @@ vfs_unmountall()
* Called by ufs_mount() to set up the lists of export addresses.
*/
static int
-vfs_hang_addrlist(mp, nep, argp)
- struct mount *mp;
- struct netexport *nep;
- struct export_args *argp;
+vfs_hang_addrlist(struct mount *mp, struct netexport *nep,
+ struct export_args *argp)
{
register struct netcred *np;
register struct radix_node_head *rnh;
@@ -1635,16 +1871,16 @@ vfs_hang_addrlist(mp, nep, argp)
return (0);
}
i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen;
- np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK);
- bzero((caddr_t)np, i);
- saddr = (struct sockaddr *)(np + 1);
- if (error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen))
+ np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK);
+ bzero((caddr_t) np, i);
+ saddr = (struct sockaddr *) (np + 1);
+ if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen)))
goto out;
if (saddr->sa_len > argp->ex_addrlen)
saddr->sa_len = argp->ex_addrlen;
if (argp->ex_masklen) {
- smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen);
- error = copyin(argp->ex_addr, (caddr_t)smask, argp->ex_masklen);
+ smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen);
+ error = copyin(argp->ex_addr, (caddr_t) smask, argp->ex_masklen);
if (error)
goto out;
if (smask->sa_len > argp->ex_masklen)
@@ -1653,13 +1889,13 @@ vfs_hang_addrlist(mp, nep, argp)
i = saddr->sa_family;
if ((rnh = nep->ne_rtable[i]) == 0) {
/*
- * Seems silly to initialize every AF when most are not
- * used, do so on demand here
+ * Seems silly to initialize every AF when most are not used,
+ * do so on demand here
*/
for (dom = domains; dom; dom = dom->dom_next)
if (dom->dom_family == i && dom->dom_rtattach) {
- dom->dom_rtattach((void **)&nep->ne_rtable[i],
- dom->dom_rtoffset);
+ dom->dom_rtattach((void **) &nep->ne_rtable[i],
+ dom->dom_rtoffset);
break;
}
if ((rnh = nep->ne_rtable[i]) == 0) {
@@ -1667,23 +1903,11 @@ vfs_hang_addrlist(mp, nep, argp)
goto out;
}
}
- rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh,
- np->netc_rnodes);
- if (rn == 0) {
- /*
- * One of the reasons that rnh_addaddr may fail is that
- * the entry already exists. To check for this case, we
- * look up the entry to see if it is there. If so, we
- * do not need to make a new entry but do return success.
- */
- free(np, M_NETADDR);
- rn = (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh);
- if (rn != 0 && (rn->rn_flags & RNF_ROOT) == 0 &&
- ((struct netcred *)rn)->netc_exflags == argp->ex_flags &&
- !bcmp((caddr_t)&((struct netcred *)rn)->netc_anon,
- (caddr_t)&argp->ex_anon, sizeof(struct ucred)))
- return (0);
- return (EPERM);
+ rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh,
+ np->netc_rnodes);
+ if (rn == 0 || np != (struct netcred *) rn) { /* already exists */
+ error = EPERM;
+ goto out;
}
np->netc_exflags = argp->ex_flags;
np->netc_anon = argp->ex_anon;
@@ -1696,14 +1920,12 @@ out:
/* ARGSUSED */
static int
-vfs_free_netcred(rn, w)
- struct radix_node *rn;
- caddr_t w;
+vfs_free_netcred(struct radix_node *rn, void *w)
{
- register struct radix_node_head *rnh = (struct radix_node_head *)w;
+ register struct radix_node_head *rnh = (struct radix_node_head *) w;
- (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh);
- free((caddr_t)rn, M_NETADDR);
+ (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh);
+ free((caddr_t) rn, M_NETADDR);
return (0);
}
@@ -1711,17 +1933,16 @@ vfs_free_netcred(rn, w)
* Free the net address hash lists that are hanging off the mount points.
*/
static void
-vfs_free_addrlist(nep)
- struct netexport *nep;
+vfs_free_addrlist(struct netexport *nep)
{
register int i;
register struct radix_node_head *rnh;
for (i = 0; i <= AF_MAX; i++)
- if (rnh = nep->ne_rtable[i]) {
- (*rnh->rnh_walktree)(rnh, vfs_free_netcred,
- (caddr_t)rnh);
- free((caddr_t)rnh, M_RTABLE);
+ if ((rnh = nep->ne_rtable[i])) {
+ (*rnh->rnh_walktree) (rnh, vfs_free_netcred,
+ (caddr_t) rnh);
+ free((caddr_t) rnh, M_RTABLE);
nep->ne_rtable[i] = 0;
}
}
@@ -1739,7 +1960,7 @@ vfs_export(mp, nep, argp)
mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED);
}
if (argp->ex_flags & MNT_EXPORTED) {
- if (error = vfs_hang_addrlist(mp, nep, argp))
+ if ((error = vfs_hang_addrlist(mp, nep, argp)))
return (error);
mp->mnt_flag |= MNT_EXPORTED;
}
@@ -1780,3 +2001,79 @@ vfs_export_lookup(mp, nep, nam)
}
return (np);
}
+
+/*
+ * Perform msync on all vnodes under a mount point.
+ * The mount point must be locked.
+ */
+void
+vfs_msync(struct mount *mp, int flags)
+{
+ struct vnode *vp, *nvp;
+loop:
+ for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {
+
+ if (vp->v_mount != mp)
+ goto loop;
+ nvp = vp->v_mntvnodes.le_next;
+ if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT))
+ continue;
+ if (vp->v_object &&
+ (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
+ vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE);
+ }
+ }
+}
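/*
 * Usage sketch: dounmount() and sync() (see the vfs_syscalls.c hunks
 * below) call this before VFS_SYNC so that dirty mmap()ed pages reach
 * the buffer cache before the filesystem sync runs:
 *
 *	vfs_msync(mp, MNT_NOWAIT);
 *	VFS_SYNC(mp, MNT_NOWAIT, cred, p);
 */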
+
+/*
+ * Create the VM object needed for VMIO and mmap support. This
+ * is done for all VREG files in the system. Some filesystems might
+ * gain the additional metadata buffering capability of the
+ * VMIO code by making the device node VMIO mode as well.
+ */
+int
+vfs_object_create(vp, p, cred, waslocked)
+ struct vnode *vp;
+ struct proc *p;
+ struct ucred *cred;
+ int waslocked;
+{
+ struct vattr vat;
+ vm_object_t object;
+ int error = 0;
+
+retry:
+ if ((object = vp->v_object) == NULL) {
+ if (vp->v_type == VREG) {
+ if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
+ goto retn;
+ (void) vnode_pager_alloc(vp,
+ OFF_TO_IDX(round_page(vat.va_size)), 0, 0);
+ } else {
+ /*
+ * This simply allocates the biggest object possible
+ * for a VBLK vnode. This should be fixed, but doesn't
+ * cause any problems (yet).
+ */
+ (void) vnode_pager_alloc(vp, INT_MAX, 0, 0);
+ }
+ vp->v_object->flags |= OBJ_VFS_REF;
+ } else {
+ if (object->flags & OBJ_DEAD) {
+ if (waslocked)
+ VOP_UNLOCK(vp, 0, p);
+ tsleep(object, PVM, "vodead", 0);
+ if (waslocked)
+ vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
+ goto retry;
+ }
+ if ((object->flags & OBJ_VFS_REF) == 0) {
+ object->flags |= OBJ_VFS_REF;
+ vm_object_reference(object);
+ }
+ }
+ if (vp->v_object)
+ vp->v_flag |= VVMIO;
+
+retn:
+ return error;
+}
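/*
 * Caller pattern (as in vget() and vref() earlier in this diff): the
 * object is created lazily the first time a VREG vnode is referenced
 * without a VFS-held VM object:
 *
 *	if (vp->v_type == VREG &&
 *	    (vp->v_object == NULL ||
 *	    (vp->v_object->flags & OBJ_VFS_REF) == 0))
 *		vfs_object_create(vp, curproc, curproc->p_ucred, 0);
 */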
diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c
index 0cf7680..2997fe5 100644
--- a/sys/kern/vfs_syscalls.c
+++ b/sys/kern/vfs_syscalls.c
@@ -35,16 +35,30 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95
+ * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94
+ * $Id: vfs_syscalls.c,v 1.60 1997/03/23 03:36:35 bde Exp $
*/
+/*
+ * XXX - The following is required because of some magic done
+ * in getdirentries() below which is only done if the translucent
+ * filesystem `UNION' is compiled into the kernel. This is broken,
+ * but I don't have time to study the code deeply enough to understand
+ * what's going on and determine an appropriate fix. -GAW
+ */
+#include "opt_union.h"
+
#include <sys/param.h>
#include <sys/systm.h>
+#include <sys/sysent.h>
+#include <sys/sysproto.h>
#include <sys/namei.h>
#include <sys/filedesc.h>
#include <sys/kernel.h>
+#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
+#include <sys/unistd.h>
#include <sys/vnode.h>
#include <sys/mount.h>
#include <sys/proc.h>
@@ -52,9 +66,14 @@
#include <sys/malloc.h>
#include <sys/dirent.h>
-#include <sys/syscallargs.h>
+#ifdef UNION
+#include <miscfs/union/union.h>
+#endif
#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vm_extern.h>
#include <sys/sysctl.h>
static int change_dir __P((struct nameidata *ndp, struct proc *p));
@@ -67,6 +86,14 @@ static void checkdirs __P((struct vnode *olddp));
/*
* Mount a file system.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct mount_args {
+ char *type;
+ char *path;
+ int flags;
+ caddr_t data;
+};
+#endif
/* ARGSUSED */
int
mount(p, uap, retval)
@@ -82,7 +109,7 @@ mount(p, uap, retval)
struct vnode *vp;
struct mount *mp;
struct vfsconf *vfsp;
- int error, flag;
+ int error, flag = 0;
struct vattr va;
u_long fstypenum;
struct nameidata nd;
@@ -228,9 +255,10 @@ update:
else if (mp->mnt_flag & MNT_RDONLY)
mp->mnt_flag |= MNT_WANTRDWR;
mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV |
- MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC);
+ MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME);
mp->mnt_flag |= SCARG(uap, flags) & (MNT_NOSUID | MNT_NOEXEC |
- MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC);
+ MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE |
+ MNT_NOATIME);
/*
* Mount the filesystem.
*/
@@ -313,6 +341,12 @@ checkdirs(olddp)
* Note: unmount takes a path to the vnode mounted on as argument,
* not special file (as before).
*/
+#ifndef _SYS_SYSPROTO_H_
+struct unmount_args {
+ char *path;
+ int flags;
+};
+#endif
/* ARGSUSED */
int
unmount(p, uap, retval)
@@ -380,6 +414,7 @@ dounmount(mp, flags, p)
mp->mnt_flag |= MNT_UNMOUNT;
lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p);
mp->mnt_flag &=~ MNT_ASYNC;
+ vfs_msync(mp, MNT_NOWAIT);
vnode_pager_umount(mp); /* release cached vnodes */
cache_purgevfs(mp); /* remove cache entries for this file sys */
if (((mp->mnt_flag & MNT_RDONLY) ||
@@ -411,16 +446,22 @@ dounmount(mp, flags, p)
/*
* Sync each mounted filesystem.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct sync_args {
+ int dummy;
+};
+#endif
+
#ifdef DEBUG
int syncprt = 0;
-struct ctldebug debug0 = { "syncprt", &syncprt };
+SYSCTL_INT(_debug, 0, syncprt, CTLFLAG_RW, &syncprt, 0, "");
#endif
/* ARGSUSED */
int
sync(p, uap, retval)
struct proc *p;
- void *uap;
+ struct sync_args *uap;
register_t *retval;
{
register struct mount *mp, *nmp;
@@ -435,7 +476,8 @@ sync(p, uap, retval)
if ((mp->mnt_flag & MNT_RDONLY) == 0) {
asyncflag = mp->mnt_flag & MNT_ASYNC;
mp->mnt_flag &= ~MNT_ASYNC;
- VFS_SYNC(mp, MNT_NOWAIT, p->p_ucred, p);
+ vfs_msync(mp, MNT_NOWAIT);
+ VFS_SYNC(mp, MNT_NOWAIT, p != NULL ? p->p_ucred : NOCRED, p);
if (asyncflag)
mp->mnt_flag |= MNT_ASYNC;
}
@@ -444,16 +486,30 @@ sync(p, uap, retval)
vfs_unbusy(mp, p);
}
simple_unlock(&mountlist_slock);
+#if 0
+/*
+ * XXX don't call vfs_bufstats() yet because that routine
+ * was not imported in the Lite2 merge.
+ */
#ifdef DIAGNOSTIC
if (syncprt)
vfs_bufstats();
#endif /* DIAGNOSTIC */
+#endif
return (0);
}
/*
* Change filesystem quotas.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct quotactl_args {
+ char *path;
+ int cmd;
+ int uid;
+ caddr_t arg;
+};
+#endif
/* ARGSUSED */
int
quotactl(p, uap, retval)
@@ -482,6 +538,12 @@ quotactl(p, uap, retval)
/*
* Get filesystem statistics.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct statfs_args {
+ char *path;
+ struct statfs *buf;
+};
+#endif
/* ARGSUSED */
int
statfs(p, uap, retval)
@@ -496,6 +558,7 @@ statfs(p, uap, retval)
register struct statfs *sp;
int error;
struct nameidata nd;
+ struct statfs sb;
NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
if (error = namei(&nd))
@@ -503,15 +566,27 @@ statfs(p, uap, retval)
mp = nd.ni_vp->v_mount;
sp = &mp->mnt_stat;
vrele(nd.ni_vp);
- if (error = VFS_STATFS(mp, sp, p))
+ error = VFS_STATFS(mp, sp, p);
+ if (error)
return (error);
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ if (p->p_ucred->cr_uid != 0) {
+ bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp)));
}
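/*
 * Effect of the uid check above (sketch): unprivileged callers get a
 * copy of the statfs data with f_fsid zeroed, presumably so they
 * cannot learn the filesystem ID (an ingredient of NFS file handles):
 *
 *	struct statfs sb;
 *	statfs("/", &sb);	as non-root: sb.f_fsid.val == {0, 0}
 */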
/*
* Get filesystem statistics.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct fstatfs_args {
+ int fd;
+ struct statfs *buf;
+};
+#endif
/* ARGSUSED */
int
fstatfs(p, uap, retval)
@@ -526,20 +601,34 @@ fstatfs(p, uap, retval)
struct mount *mp;
register struct statfs *sp;
int error;
+ struct statfs sb;
if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
return (error);
mp = ((struct vnode *)fp->f_data)->v_mount;
sp = &mp->mnt_stat;
- if (error = VFS_STATFS(mp, sp, p))
+ error = VFS_STATFS(mp, sp, p);
+ if (error)
return (error);
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
+ if (p->p_ucred->cr_uid != 0) {
+ bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb));
+ sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0;
+ sp = &sb;
+ }
return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp)));
}
/*
* Get statistics on all filesystems.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct getfsstat_args {
+ struct statfs *buf;
+ long bufsize;
+ int flags;
+};
+#endif
int
getfsstat(p, uap, retval)
struct proc *p;
@@ -579,8 +668,11 @@ getfsstat(p, uap, retval)
continue;
}
sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK;
- if (error = copyout((caddr_t)sp, sfsp, sizeof(*sp)))
+ error = copyout((caddr_t)sp, sfsp, sizeof(*sp));
+ if (error) {
+ vfs_unbusy(mp, p);
return (error);
+ }
sfsp += sizeof(*sp);
}
count++;
@@ -599,6 +691,11 @@ getfsstat(p, uap, retval)
/*
* Change current working directory to a given file descriptor.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct fchdir_args {
+ int fd;
+};
+#endif
/* ARGSUSED */
int
fchdir(p, uap, retval)
@@ -646,6 +743,11 @@ fchdir(p, uap, retval)
/*
* Change current working directory (``.'').
*/
+#ifndef _SYS_SYSPROTO_H_
+struct chdir_args {
+ char *path;
+};
+#endif
/* ARGSUSED */
int
chdir(p, uap, retval)
@@ -671,6 +773,11 @@ chdir(p, uap, retval)
/*
* Change notion of root (``/'') directory.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct chroot_args {
+ char *path;
+};
+#endif
/* ARGSUSED */
int
chroot(p, uap, retval)
@@ -684,7 +791,8 @@ chroot(p, uap, retval)
int error;
struct nameidata nd;
- if (error = suser(p->p_ucred, &p->p_acflag))
+ error = suser(p->p_ucred, &p->p_acflag);
+ if (error)
return (error);
NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
SCARG(uap, path), p);
@@ -707,7 +815,8 @@ change_dir(ndp, p)
struct vnode *vp;
int error;
- if (error = namei(ndp))
+ error = namei(ndp);
+ if (error)
return (error);
vp = ndp->ni_vp;
if (vp->v_type != VDIR)
@@ -725,6 +834,13 @@ change_dir(ndp, p)
* Check permissions, allocate an open file structure,
* and call the device open routine if any.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct open_args {
+ char *path;
+ int flags;
+ int mode;
+};
+#endif
int
open(p, uap, retval)
struct proc *p;
@@ -743,16 +859,17 @@ open(p, uap, retval)
int type, indx, error;
struct flock lf;
struct nameidata nd;
- extern struct fileops vnops;
- if (error = falloc(p, &nfp, &indx))
+ error = falloc(p, &nfp, &indx);
+ if (error)
return (error);
fp = nfp;
flags = FFLAGS(SCARG(uap, flags));
cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT;
NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
p->p_dupfd = -indx - 1; /* XXX check for fdopen */
- if (error = vn_open(&nd, flags, cmode)) {
+ error = vn_open(&nd, flags, cmode);
+ if (error) {
ffree(fp);
if ((error == ENODEV || error == ENXIO) &&
p->p_dupfd >= 0 && /* XXX from fdopen */
@@ -768,8 +885,9 @@ open(p, uap, retval)
}
p->p_dupfd = 0;
vp = nd.ni_vp;
+
fp->f_flag = flags & FMASK;
- fp->f_type = DTYPE_VNODE;
+ fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE);
fp->f_ops = &vnops;
fp->f_data = (caddr_t)vp;
if (flags & (O_EXLOCK | O_SHLOCK)) {
@@ -802,10 +920,16 @@ open(p, uap, retval)
/*
* Create a file.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct ocreat_args {
+ char *path;
+ int mode;
+};
+#endif
int
-compat_43_creat(p, uap, retval)
+ocreat(p, uap, retval)
struct proc *p;
- register struct compat_43_creat_args /* {
+ register struct ocreat_args /* {
syscallarg(char *) path;
syscallarg(int) mode;
} */ *uap;
@@ -827,6 +951,13 @@ compat_43_creat(p, uap, retval)
/*
* Create a special file.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct mknod_args {
+ char *path;
+ int mode;
+ int dev;
+};
+#endif
/* ARGSUSED */
int
mknod(p, uap, retval)
@@ -844,7 +975,8 @@ mknod(p, uap, retval)
int whiteout;
struct nameidata nd;
- if (error = suser(p->p_ucred, &p->p_acflag))
+ error = suser(p->p_ucred, &p->p_acflag);
+ if (error)
return (error);
NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
if (error = namei(&nd))
@@ -902,6 +1034,12 @@ mknod(p, uap, retval)
/*
* Create a named pipe.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct mkfifo_args {
+ char *path;
+ int mode;
+};
+#endif
/* ARGSUSED */
int
mkfifo(p, uap, retval)
@@ -916,9 +1054,6 @@ mkfifo(p, uap, retval)
int error;
struct nameidata nd;
-#ifndef FIFO
- return (EOPNOTSUPP);
-#else
NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
if (error = namei(&nd))
return (error);
@@ -936,12 +1071,17 @@ mkfifo(p, uap, retval)
vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask;
VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
return (VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr));
-#endif /* FIFO */
}
/*
* Make a hard file link.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct link_args {
+ char *path;
+ char *link;
+};
+#endif
/* ARGSUSED */
int
link(p, uap, retval)
@@ -960,20 +1100,13 @@ link(p, uap, retval)
if (error = namei(&nd))
return (error);
vp = nd.ni_vp;
- if (vp->v_type != VDIR ||
- (error = suser(p->p_ucred, &p->p_acflag)) == 0) {
- nd.ni_cnd.cn_nameiop = CREATE;
- nd.ni_cnd.cn_flags = LOCKPARENT;
- nd.ni_dirp = SCARG(uap, link);
- if ((error = namei(&nd)) == 0) {
- if (nd.ni_vp != NULL)
- error = EEXIST;
- if (!error) {
- VOP_LEASE(nd.ni_dvp, p, p->p_ucred,
- LEASE_WRITE);
- VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
- error = VOP_LINK(vp, nd.ni_dvp, &nd.ni_cnd);
- } else {
+ if (vp->v_type == VDIR)
+ error = EPERM; /* POSIX */
+ else {
+ NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, link), p);
+ error = namei(&nd);
+ if (!error) {
+ if (nd.ni_vp != NULL) {
VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd);
if (nd.ni_dvp == nd.ni_vp)
vrele(nd.ni_dvp);
@@ -981,6 +1114,12 @@ link(p, uap, retval)
vput(nd.ni_dvp);
if (nd.ni_vp)
vrele(nd.ni_vp);
+ error = EEXIST;
+ } else {
+ VOP_LEASE(nd.ni_dvp, p, p->p_ucred,
+ LEASE_WRITE);
+ VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
+ error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd);
}
}
}
@@ -991,6 +1130,12 @@ link(p, uap, retval)
/*
* Make a symbolic link.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct symlink_args {
+ char *path;
+ char *link;
+};
+#endif
/* ARGSUSED */
int
symlink(p, uap, retval)
@@ -1073,6 +1218,11 @@ undelete(p, uap, retval)
/*
* Delete a name from the filesystem.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct unlink_args {
+ char *path;
+};
+#endif
/* ARGSUSED */
int
unlink(p, uap, retval)
@@ -1093,15 +1243,18 @@ unlink(p, uap, retval)
VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
- if (vp->v_type != VDIR ||
- (error = suser(p->p_ucred, &p->p_acflag)) == 0) {
+ if (vp->v_type == VDIR)
+ error = EPERM; /* POSIX */
+ else {
/*
* The root of a mounted filesystem cannot be deleted.
+ *
+ * XXX: can this only be a VDIR case?
*/
if (vp->v_flag & VROOT)
error = EBUSY;
else
- (void)vnode_pager_uncache(vp);
+ (void) vnode_pager_uncache(vp, p);
}
if (!error) {
@@ -1122,6 +1275,14 @@ unlink(p, uap, retval)
/*
* Reposition read/write file offset.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct lseek_args {
+ int fd;
+ int pad;
+ off_t offset;
+ int whence;
+};
+#endif
int
lseek(p, uap, retval)
struct proc *p;
@@ -1131,7 +1292,7 @@ lseek(p, uap, retval)
syscallarg(off_t) offset;
syscallarg(int) whence;
} */ *uap;
- register_t *retval;
+ register_t *retval; /* XXX */
{
struct ucred *cred = p->p_ucred;
register struct filedesc *fdp = p->p_fd;
@@ -1149,8 +1310,8 @@ lseek(p, uap, retval)
fp->f_offset += SCARG(uap, offset);
break;
case L_XTND:
- if (error =
- VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p))
+		error = VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p);
+ if (error)
return (error);
fp->f_offset = SCARG(uap, offset) + vattr.va_size;
break;
@@ -1168,10 +1329,17 @@ lseek(p, uap, retval)
/*
* Reposition read/write file offset.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct olseek_args {
+ int fd;
+ long offset;
+ int whence;
+};
+#endif
int
-compat_43_lseek(p, uap, retval)
+olseek(p, uap, retval)
struct proc *p;
- register struct compat_43_lseek_args /* {
+ register struct olseek_args /* {
syscallarg(int) fd;
syscallarg(long) offset;
syscallarg(int) whence;
@@ -1190,7 +1358,7 @@ compat_43_lseek(p, uap, retval)
SCARG(&nuap, fd) = SCARG(uap, fd);
SCARG(&nuap, offset) = SCARG(uap, offset);
SCARG(&nuap, whence) = SCARG(uap, whence);
- error = lseek(p, &nuap, &qret);
+ error = lseek(p, &nuap, (register_t *) &qret);
*(long *)retval = qret;
return (error);
}
@@ -1199,6 +1367,12 @@ compat_43_lseek(p, uap, retval)
/*
* Check access permissions.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct access_args {
+ char *path;
+ int flags;
+};
+#endif
int
access(p, uap, retval)
struct proc *p;
@@ -1246,11 +1420,17 @@ out1:
/*
* Get file status; this version follows links.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct ostat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
/* ARGSUSED */
int
-compat_43_stat(p, uap, retval)
+ostat(p, uap, retval)
struct proc *p;
- register struct compat_43_stat_args /* {
+ register struct ostat_args /* {
syscallarg(char *) path;
syscallarg(struct ostat *) ub;
} */ *uap;
@@ -1277,11 +1457,17 @@ compat_43_stat(p, uap, retval)
/*
* Get file status; this version does not follow links.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct olstat_args {
+ char *path;
+ struct ostat *ub;
+};
+#endif
/* ARGSUSED */
int
-compat_43_lstat(p, uap, retval)
+olstat(p, uap, retval)
struct proc *p;
- register struct compat_43_lstat_args /* {
+ register struct olstat_args /* {
syscallarg(char *) path;
syscallarg(struct ostat *) ub;
} */ *uap;
@@ -1367,6 +1553,12 @@ cvtstat(st, ost)
/*
* Get file status; this version follows links.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct stat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
/* ARGSUSED */
int
stat(p, uap, retval)
@@ -1396,6 +1588,12 @@ stat(p, uap, retval)
/*
* Get file status; this version does not follow links.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct lstat_args {
+ char *path;
+ struct stat *ub;
+};
+#endif
/* ARGSUSED */
int
lstat(p, uap, retval)
@@ -1455,6 +1653,12 @@ lstat(p, uap, retval)
/*
* Get configurable pathname variables.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct pathconf_args {
+ char *path;
+ int name;
+};
+#endif
/* ARGSUSED */
int
pathconf(p, uap, retval)
@@ -1480,6 +1684,13 @@ pathconf(p, uap, retval)
/*
* Return target name of a symbolic link.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct readlink_args {
+ char *path;
+ char *buf;
+ int count;
+};
+#endif
/* ARGSUSED */
int
readlink(p, uap, retval)
@@ -1524,6 +1735,12 @@ readlink(p, uap, retval)
/*
* Change flags of a file given a path name.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct chflags_args {
+ char *path;
+ int flags;
+};
+#endif
/* ARGSUSED */
int
chflags(p, uap, retval)
@@ -1555,6 +1772,12 @@ chflags(p, uap, retval)
/*
* Change flags of a file given a file descriptor.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct fchflags_args {
+ int fd;
+ int flags;
+};
+#endif
/* ARGSUSED */
int
fchflags(p, uap, retval)
@@ -1585,6 +1808,12 @@ fchflags(p, uap, retval)
/*
* Change mode of a file given path name.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct chmod_args {
+ char *path;
+ int mode;
+};
+#endif
/* ARGSUSED */
int
chmod(p, uap, retval)
@@ -1616,6 +1845,12 @@ chmod(p, uap, retval)
/*
* Change mode of a file given a file descriptor.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct fchmod_args {
+ int fd;
+ int mode;
+};
+#endif
/* ARGSUSED */
int
fchmod(p, uap, retval)
@@ -1646,6 +1881,13 @@ fchmod(p, uap, retval)
/*
* Set ownership given a path name.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct chown_args {
+ char *path;
+ int uid;
+ int gid;
+};
+#endif
/* ARGSUSED */
int
chown(p, uap, retval)
@@ -1679,6 +1921,13 @@ chown(p, uap, retval)
/*
* Set ownership given a file descriptor.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct fchown_args {
+ int fd;
+ int uid;
+ int gid;
+};
+#endif
/* ARGSUSED */
int
fchown(p, uap, retval)
@@ -1711,6 +1960,12 @@ fchown(p, uap, retval)
/*
* Set the access and modification times of a file.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct utimes_args {
+ char *path;
+ struct timeval *tptr;
+};
+#endif
/* ARGSUSED */
int
utimes(p, uap, retval)
@@ -1741,10 +1996,10 @@ utimes(p, uap, retval)
vp = nd.ni_vp;
VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
- vattr.va_atime.ts_sec = tv[0].tv_sec;
- vattr.va_atime.ts_nsec = tv[0].tv_usec * 1000;
- vattr.va_mtime.ts_sec = tv[1].tv_sec;
- vattr.va_mtime.ts_nsec = tv[1].tv_usec * 1000;
+ vattr.va_atime.tv_sec = tv[0].tv_sec;
+ vattr.va_atime.tv_nsec = tv[0].tv_usec * 1000;
+ vattr.va_mtime.tv_sec = tv[1].tv_sec;
+ vattr.va_mtime.tv_nsec = tv[1].tv_usec * 1000;
error = VOP_SETATTR(vp, &vattr, p->p_ucred, p);
vput(vp);
return (error);
@@ -1753,6 +2008,13 @@ utimes(p, uap, retval)
/*
* Truncate a file given its path name.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct truncate_args {
+ char *path;
+ int pad;
+ off_t length;
+};
+#endif
/* ARGSUSED */
int
truncate(p, uap, retval)
@@ -1769,6 +2031,8 @@ truncate(p, uap, retval)
int error;
struct nameidata nd;
+ if (uap->length < 0)
+		return (EINVAL);
NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
if (error = namei(&nd))
return (error);
@@ -1790,6 +2054,13 @@ truncate(p, uap, retval)
/*
* Truncate a file given a file descriptor.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct ftruncate_args {
+ int fd;
+ int pad;
+ off_t length;
+};
+#endif
/* ARGSUSED */
int
ftruncate(p, uap, retval)
@@ -1806,6 +2077,8 @@ ftruncate(p, uap, retval)
struct file *fp;
int error;
+ if (uap->length < 0)
+		return (EINVAL);
if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp))
return (error);
if ((fp->f_flag & FWRITE) == 0)
@@ -1828,11 +2101,17 @@ ftruncate(p, uap, retval)
/*
* Truncate a file given its path name.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct otruncate_args {
+ char *path;
+ long length;
+};
+#endif
/* ARGSUSED */
int
-compat_43_truncate(p, uap, retval)
+otruncate(p, uap, retval)
struct proc *p;
- register struct compat_43_truncate_args /* {
+ register struct otruncate_args /* {
syscallarg(char *) path;
syscallarg(long) length;
} */ *uap;
@@ -1852,11 +2131,17 @@ compat_43_truncate(p, uap, retval)
/*
* Truncate a file given a file descriptor.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct oftruncate_args {
+ int fd;
+ long length;
+};
+#endif
/* ARGSUSED */
int
-compat_43_ftruncate(p, uap, retval)
+oftruncate(p, uap, retval)
struct proc *p;
- register struct compat_43_ftruncate_args /* {
+ register struct oftruncate_args /* {
syscallarg(int) fd;
syscallarg(long) length;
} */ *uap;
@@ -1877,6 +2162,11 @@ compat_43_ftruncate(p, uap, retval)
/*
* Sync an open file.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct fsync_args {
+ int fd;
+};
+#endif
/* ARGSUSED */
int
fsync(p, uap, retval)
@@ -1894,7 +2184,12 @@ fsync(p, uap, retval)
return (error);
vp = (struct vnode *)fp->f_data;
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
- error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p);
+ if (vp->v_object) {
+		vm_object_page_clean(vp->v_object, 0, 0, 0, FALSE);
+ }
+ error = VOP_FSYNC(vp, fp->f_cred,
+ (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC)) ?
+ MNT_NOWAIT : MNT_WAIT, p);
VOP_UNLOCK(vp, 0, p);
return (error);
}
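/*
 * Ordering sketch for the new fsync() path: dirty VM pages are first
 * pushed into the buffer cache, then VOP_FSYNC writes the buffers out
 * (NOWAIT if the mount is async):
 *
 *	vm_object_page_clean(vp->v_object, 0, 0, 0, FALSE);
 *	VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p);
 */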
@@ -1903,6 +2198,12 @@ fsync(p, uap, retval)
* Rename files. Source and destination must either both be directories,
* or both not be directories. If target is a directory, it must be empty.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct rename_args {
+ char *from;
+ char *to;
+};
+#endif
/* ARGSUSED */
int
rename(p, uap, retval)
@@ -1924,7 +2225,12 @@ rename(p, uap, retval)
fvp = fromnd.ni_vp;
NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART,
UIO_USERSPACE, SCARG(uap, to), p);
+ if (fromnd.ni_vp->v_type == VDIR)
+ tond.ni_cnd.cn_flags |= WILLBEDIR;
if (error = namei(&tond)) {
+ /* Translate error code for rename("dir1", "dir2/."). */
+ if (error == EISDIR && fvp->v_type == VDIR)
+ error = EINVAL;
VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd);
vrele(fromnd.ni_dvp);
vrele(fvp);
@@ -1958,8 +2264,10 @@ out:
VOP_LEASE(tdvp, p, p->p_ucred, LEASE_WRITE);
if (fromnd.ni_dvp != tdvp)
VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE);
- if (tvp)
+ if (tvp) {
VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE);
+ (void) vnode_pager_uncache(tvp, p);
+ }
error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd,
tond.ni_dvp, tond.ni_vp, &tond.ni_cnd);
} else {
@@ -1988,6 +2296,12 @@ out1:
/*
* Make a directory file.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct mkdir_args {
+ char *path;
+ int mode;
+};
+#endif
/* ARGSUSED */
int
mkdir(p, uap, retval)
@@ -2004,6 +2318,7 @@ mkdir(p, uap, retval)
struct nameidata nd;
NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p);
+ nd.ni_cnd.cn_flags |= WILLBEDIR;
if (error = namei(&nd))
return (error);
vp = nd.ni_vp;
@@ -2029,6 +2344,11 @@ mkdir(p, uap, retval)
/*
* Remove a directory file.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct rmdir_args {
+ char *path;
+};
+#endif
/* ARGSUSED */
int
rmdir(p, uap, retval)
@@ -2083,10 +2403,18 @@ out:
/*
* Read a block of directory entries in a file system independent format.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct ogetdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
int
-compat_43_getdirentries(p, uap, retval)
+ogetdirentries(p, uap, retval)
struct proc *p;
- register struct compat_43_getdirentries_args /* {
+ register struct ogetdirentries_args /* {
syscallarg(int) fd;
syscallarg(char *) buf;
syscallarg(u_int) count;
@@ -2124,7 +2452,7 @@ unionread:
# if (BYTE_ORDER != LITTLE_ENDIAN)
if (vp->v_mount->mnt_maxsymlinklen <= 0) {
error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
- (int *)0, (u_long *)0);
+ NULL, NULL);
fp->f_offset = auio.uio_offset;
} else
# endif
@@ -2136,7 +2464,7 @@ unionread:
MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK);
kiov.iov_base = dirbuf;
error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag,
- (int *)0, (u_long *)0);
+ NULL, NULL);
fp->f_offset = kuio.uio_offset;
if (error == 0) {
readcnt = SCARG(uap, count) - kuio.uio_resid;
@@ -2178,9 +2506,6 @@ unionread:
#ifdef UNION
{
- extern int (**union_vnodeop_p)();
- extern struct vnode *union_dircache __P((struct vnode*, struct proc*));
-
if ((SCARG(uap, count) == auio.uio_resid) &&
(vp->v_op == union_vnodeop_p)) {
struct vnode *lvp;
@@ -2240,6 +2565,14 @@ unionread:
/*
* Read a block of directory entries in a file system independent format.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct getdirentries_args {
+ int fd;
+ char *buf;
+ u_int count;
+ long *basep;
+};
+#endif
int
getdirentries(p, uap, retval)
struct proc *p;
@@ -2276,8 +2609,7 @@ unionread:
auio.uio_resid = SCARG(uap, count);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
loff = auio.uio_offset = fp->f_offset;
- error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag,
- (int *)0, (u_long *)0);
+ error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL);
fp->f_offset = auio.uio_offset;
VOP_UNLOCK(vp, 0, p);
if (error)
@@ -2285,9 +2617,6 @@ unionread:
#ifdef UNION
{
- extern int (**union_vnodeop_p)();
- extern struct vnode *union_dircache __P((struct vnode*, struct proc*));
-
if ((SCARG(uap, count) == auio.uio_resid) &&
(vp->v_op == union_vnodeop_p)) {
struct vnode *lvp;
@@ -2346,13 +2675,18 @@ unionread:
/*
* Set the mode mask for creation of filesystem nodes.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct umask_args {
+ int newmask;
+};
+#endif
int
umask(p, uap, retval)
struct proc *p;
struct umask_args /* {
syscallarg(int) newmask;
} */ *uap;
- register_t *retval;
+ int *retval; /* XXX */
{
register struct filedesc *fdp;
@@ -2366,6 +2700,11 @@ umask(p, uap, retval)
* Void all references to file by ripping underlying filesystem
* away from vnode.
*/
+#ifndef _SYS_SYSPROTO_H_
+struct revoke_args {
+ char *path;
+};
+#endif
/* ARGSUSED */
int
revoke(p, uap, retval)
@@ -2402,15 +2741,15 @@ out:
int
getvnode(fdp, fd, fpp)
struct filedesc *fdp;
- struct file **fpp;
int fd;
+ struct file **fpp;
{
struct file *fp;
if ((u_int)fd >= fdp->fd_nfiles ||
(fp = fdp->fd_ofiles[fd]) == NULL)
return (EBADF);
- if (fp->f_type != DTYPE_VNODE)
+ if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO)
return (EINVAL);
*fpp = fp;
return (0);
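
getvnode() now takes its arguments in (fdp, fd, fpp) order, matching its callers, and lets FIFOs through the type check alongside plain vnodes. A self-contained sketch of the validation logic, with simplified types and illustrative DTYPE_* values:

	#include <errno.h>
	#include <stddef.h>

	#define DTYPE_VNODE 1	/* illustrative, not the kernel's values */
	#define DTYPE_FIFO  4

	struct file {
		int f_type;
	};

	static int
	getvnode_check(struct file **ofiles, unsigned int nfiles,
	    unsigned int fd, struct file **fpp)
	{
		struct file *fp;

		if (fd >= nfiles || (fp = ofiles[fd]) == NULL)
			return (EBADF);
		if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO)
			return (EINVAL);
		*fpp = fp;
		return (0);
	}

	int
	main(void)
	{
		struct file f = { DTYPE_FIFO };
		struct file *table[1] = { &f };
		struct file *fp;

		return (getvnode_check(table, 1, 0, &fp));	/* FIFOs now pass */
	}
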
diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c
index 3cfc6fd..cb6c932 100644
--- a/sys/kern/vfs_vnops.c
+++ b/sys/kern/vfs_vnops.c
@@ -35,12 +35,14 @@
* OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
* SUCH DAMAGE.
*
- * @(#)vfs_vnops.c 8.14 (Berkeley) 6/15/95
+ * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94
+ * $Id: vfs_vnops.c,v 1.33 1997/03/23 03:36:38 bde Exp $
*/
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/kernel.h>
+#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/stat.h>
#include <sys/buf.h>
@@ -48,10 +50,22 @@
#include <sys/mount.h>
#include <sys/namei.h>
#include <sys/vnode.h>
-#include <sys/ioctl.h>
-#include <sys/tty.h>
+#include <sys/filio.h>
+#include <sys/ttycom.h>
#include <vm/vm.h>
+#include <vm/vm_param.h>
+#include <vm/vm_object.h>
+#include <vm/vnode_pager.h>
+
+static int vn_closefile __P((struct file *fp, struct proc *p));
+static int vn_ioctl __P((struct file *fp, int com, caddr_t data,
+ struct proc *p));
+static int vn_read __P((struct file *fp, struct uio *uio,
+ struct ucred *cred));
+static int vn_select __P((struct file *fp, int which, struct proc *p));
+static int vn_write __P((struct file *fp, struct uio *uio,
+ struct ucred *cred));
struct fileops vnops =
{ vn_read, vn_write, vn_ioctl, vn_select, vn_closefile };
@@ -60,6 +74,7 @@ struct fileops vnops =
* Common code for vnode open operations.
* Check permissions, and call the VOP_OPEN or VOP_CREATE routine.
*/
+int
vn_open(ndp, fmode, cmode)
register struct nameidata *ndp;
int fmode, cmode;
@@ -76,7 +91,8 @@ vn_open(ndp, fmode, cmode)
ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF;
if ((fmode & O_EXCL) == 0)
ndp->ni_cnd.cn_flags |= FOLLOW;
- if (error = namei(ndp))
+ error = namei(ndp);
+ if (error)
return (error);
if (ndp->ni_vp == NULL) {
VATTR_NULL(vap);
@@ -107,7 +123,8 @@ vn_open(ndp, fmode, cmode)
} else {
ndp->ni_cnd.cn_nameiop = LOOKUP;
ndp->ni_cnd.cn_flags = FOLLOW | LOCKLEAF;
- if (error = namei(ndp))
+ error = namei(ndp);
+ if (error)
return (error);
vp = ndp->ni_vp;
}
@@ -117,7 +134,8 @@ vn_open(ndp, fmode, cmode)
}
if ((fmode & O_CREAT) == 0) {
if (fmode & FREAD) {
- if (error = VOP_ACCESS(vp, VREAD, cred, p))
+ error = VOP_ACCESS(vp, VREAD, cred, p);
+ if (error)
goto bad;
}
if (fmode & (FWRITE | O_TRUNC)) {
@@ -125,8 +143,11 @@ vn_open(ndp, fmode, cmode)
error = EISDIR;
goto bad;
}
- if ((error = vn_writechk(vp)) ||
- (error = VOP_ACCESS(vp, VWRITE, cred, p)))
+ error = vn_writechk(vp);
+ if (error)
+ goto bad;
+ error = VOP_ACCESS(vp, VWRITE, cred, p);
+ if (error)
goto bad;
}
}
@@ -136,11 +157,21 @@ vn_open(ndp, fmode, cmode)
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */
VATTR_NULL(vap);
vap->va_size = 0;
- if (error = VOP_SETATTR(vp, vap, cred, p))
+ error = VOP_SETATTR(vp, vap, cred, p);
+ if (error)
goto bad;
}
- if (error = VOP_OPEN(vp, fmode, cred, p))
+ error = VOP_OPEN(vp, fmode, cred, p);
+ if (error)
goto bad;
+ /*
+ * Make sure that a VM object is created for VMIO support.
+ */
+ if (vp->v_type == VREG) {
+ if ((error = vfs_object_create(vp, p, cred, 1)) != 0)
+ goto bad;
+ }
+
if (fmode & FWRITE)
vp->v_writecount++;
return (0);
@@ -153,6 +184,7 @@ bad:
* Check for write permissions on the specified vnode.
* Prototype text segments cannot be written.
*/
+int
vn_writechk(vp)
register struct vnode *vp;
{
@@ -162,7 +194,7 @@ vn_writechk(vp)
* the vnode, try to free it up once. If
* we fail, we can't allow writing.
*/
- if ((vp->v_flag & VTEXT) && !vnode_pager_uncache(vp))
+ if (vp->v_flag & VTEXT)
return (ETXTBSY);
return (0);
}
@@ -170,6 +202,7 @@ vn_writechk(vp)
/*
* Vnode close call
*/
+int
vn_close(vp, flags, cred, p)
register struct vnode *vp;
int flags;
@@ -188,6 +221,7 @@ vn_close(vp, flags, cred, p)
/*
* Package up an I/O request on a vnode into a uio and do it.
*/
+int
vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p)
enum uio_rw rw;
struct vnode *vp;
@@ -233,6 +267,7 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p)
/*
* File table vnode read routine.
*/
+static int
vn_read(fp, uio, cred)
struct file *fp;
struct uio *uio;
@@ -241,14 +276,46 @@ vn_read(fp, uio, cred)
struct vnode *vp = (struct vnode *)fp->f_data;
struct proc *p = uio->uio_procp;
int count, error;
+ int flag, seq;
VOP_LEASE(vp, p, cred, LEASE_READ);
vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
uio->uio_offset = fp->f_offset;
count = uio->uio_resid;
- error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0,
- cred);
+ flag = 0;
+ if (fp->f_flag & FNONBLOCK)
+ flag |= IO_NDELAY;
+
+ /*
+ * Sequential read heuristic.
+ * If we have been doing sequential input,
+ * a rewind operation doesn't turn off
+ * sequential input mode.
+ */
+ if (((fp->f_offset == 0) && (fp->f_seqcount > 0)) ||
+ (fp->f_offset == fp->f_nextread)) {
+ int tmpseq = fp->f_seqcount;
+ /*
+ * XXX we assume that the filesystem block size is
+ * the default. Not true, but still gives us a pretty
+ * good indicator of how sequential the read operations
+ * are.
+ */
+ tmpseq += ((count + BKVASIZE - 1) / BKVASIZE);
+ if (tmpseq >= CHAR_MAX)
+ tmpseq = CHAR_MAX;
+ fp->f_seqcount = tmpseq;
+ flag |= (fp->f_seqcount << 16);
+ } else {
+ if (fp->f_seqcount > 1)
+ fp->f_seqcount = 1;
+ else
+ fp->f_seqcount = 0;
+ }
+
+ error = VOP_READ(vp, uio, flag, cred);
fp->f_offset += count - uio->uio_resid;
+ fp->f_nextread = fp->f_offset;
VOP_UNLOCK(vp, 0, p);
return (error);
}
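
The vn_read() hunk above encodes a sequential-read score in bits 16 and up of the ioflag handed to VOP_READ(), so the filesystem can size its read-ahead. A standalone model of the arithmetic (BKVASIZE is machine-dependent in the kernel; 8192 is only an assumed example value):

	#include <limits.h>
	#include <stdio.h>

	#define BKVASIZE 8192	/* assumption for illustration */

	static int f_seqcount;
	static long f_offset, f_nextread;

	static int
	read_ioflag(long count)
	{
		int flag = 0;

		if ((f_offset == 0 && f_seqcount > 0) ||
		    f_offset == f_nextread) {
			/* Sequential: bump the score by blocks read. */
			int tmpseq = f_seqcount +
			    (count + BKVASIZE - 1) / BKVASIZE;

			if (tmpseq >= CHAR_MAX)
				tmpseq = CHAR_MAX;
			f_seqcount = tmpseq;
			flag |= f_seqcount << 16;
		} else if (f_seqcount > 1)
			f_seqcount = 1;
		else
			f_seqcount = 0;
		f_offset += count;	/* assume the read consumed it all */
		f_nextread = f_offset;
		return (flag);
	}

	int
	main(void)
	{
		int i;

		for (i = 0; i < 4; i++)
			printf("read %d: ioflag 0x%x\n", i, read_ioflag(65536));
		return (0);
	}
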
@@ -256,6 +323,7 @@ vn_read(fp, uio, cred)
/*
* File table vnode write routine.
*/
+static int
vn_write(fp, uio, cred)
struct file *fp;
struct uio *uio;
@@ -288,6 +356,7 @@ vn_write(fp, uio, cred)
/*
* File table vnode stat routine.
*/
+int
vn_stat(vp, sb, p)
struct vnode *vp;
register struct stat *sb;
@@ -344,17 +413,27 @@ vn_stat(vp, sb, p)
sb->st_ctimespec = vap->va_ctime;
sb->st_blksize = vap->va_blocksize;
sb->st_flags = vap->va_flags;
- sb->st_gen = vap->va_gen;
+ if (p->p_ucred->cr_uid != 0)
+ sb->st_gen = 0;
+ else
+ sb->st_gen = vap->va_gen;
+
+#if (S_BLKSIZE == 512)
+ /* Optimize this case */
+ sb->st_blocks = vap->va_bytes >> 9;
+#else
sb->st_blocks = vap->va_bytes / S_BLKSIZE;
+#endif
return (0);
}
/*
* File table vnode ioctl routine.
*/
+static int
vn_ioctl(fp, com, data, p)
struct file *fp;
- u_long com;
+ int com;
caddr_t data;
struct proc *p;
{
@@ -367,7 +446,8 @@ vn_ioctl(fp, com, data, p)
case VREG:
case VDIR:
if (com == FIONREAD) {
- if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p))
+ error = VOP_GETATTR(vp, &vattr, p->p_ucred, p);
+ if (error)
return (error);
*(int *)data = vattr.va_size - fp->f_offset;
return (0);
@@ -384,8 +464,15 @@ vn_ioctl(fp, com, data, p)
case VBLK:
error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p);
if (error == 0 && com == TIOCSCTTY) {
+
+ /* Do nothing if reassigning same control tty */
+ if (p->p_session->s_ttyvp == vp)
+ return (0);
+
+ /* Get rid of reference to old control tty */
if (p->p_session->s_ttyvp)
vrele(p->p_session->s_ttyvp);
+
p->p_session->s_ttyvp = vp;
VREF(vp);
}
@@ -396,6 +483,7 @@ vn_ioctl(fp, com, data, p)
/*
* File table vnode select routine.
*/
+static int
vn_select(fp, which, p)
struct file *fp;
int which;
@@ -407,6 +495,19 @@ vn_select(fp, which, p)
}
/*
+ * File table vnode close routine.
+ */
+static int
+vn_closefile(fp, p)
+ struct file *fp;
+ struct proc *p;
+{
+
+ return (vn_close(((struct vnode *)fp->f_data), fp->f_flag,
+ fp->f_cred, p));
+}
+
+/*
* Check that the vnode is still valid, and if so
* acquire requested lock.
*/
@@ -419,8 +520,9 @@ vn_lock(vp, flags, p)
int error;
do {
- if ((flags & LK_INTERLOCK) == 0)
+ if ((flags & LK_INTERLOCK) == 0) {
simple_lock(&vp->v_interlock);
+ }
if (vp->v_flag & VXLOCK) {
vp->v_flag |= VXWANT;
simple_unlock(&vp->v_interlock);
@@ -435,15 +537,3 @@ vn_lock(vp, flags, p)
} while (flags & LK_RETRY);
return (error);
}
-
-/*
- * File table vnode close routine.
- */
-vn_closefile(fp, p)
- struct file *fp;
- struct proc *p;
-{
-
- return (vn_close(((struct vnode *)fp->f_data), fp->f_flag,
- fp->f_cred, p));
-}
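
With vn_read(), vn_write(), vn_ioctl(), vn_select() and vn_closefile() now static, the only way into them is the vnops fileops table. A self-contained sketch of that dispatch pattern (struct layouts simplified; the real struct fileops also carries write/ioctl/select entries):

	#include <stdio.h>

	struct file;

	struct fileops {
		int (*fo_read)(struct file *fp, char *buf, int len);
		int (*fo_close)(struct file *fp);
	};

	struct file {
		struct fileops *f_ops;
		void *f_data;	/* vnode, socket, pipe, ... */
	};

	static int
	vn_read_stub(struct file *fp, char *buf, int len)
	{
		(void)fp;
		(void)buf;
		printf("vnode read, %d bytes\n", len);
		return (0);
	}

	static int
	vn_closefile_stub(struct file *fp)
	{
		(void)fp;
		printf("vnode close\n");
		return (0);
	}

	static struct fileops vnops_stub = { vn_read_stub, vn_closefile_stub };

	int
	main(void)
	{
		struct file f = { &vnops_stub, NULL };
		char buf[16];

		/* Callers go through f_ops, never the static functions. */
		(*f.f_ops->fo_read)(&f, buf, (int)sizeof(buf));
		(*f.f_ops->fo_close)(&f);
		return (0);
	}
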
diff --git a/sys/kern/vnode_if.pl b/sys/kern/vnode_if.pl
new file mode 100644
index 0000000..75f49a7
--- /dev/null
+++ b/sys/kern/vnode_if.pl
@@ -0,0 +1,459 @@
+#!/bin/sh -
+#
+# Copyright (c) 1992, 1993
+# The Regents of the University of California. All rights reserved.
+#
+# Redistribution and use in source and binary forms, with or without
+# modification, are permitted provided that the following conditions
+# are met:
+# 1. Redistributions of source code must retain the above copyright
+# notice, this list of conditions and the following disclaimer.
+# 2. Redistributions in binary form must reproduce the above copyright
+# notice, this list of conditions and the following disclaimer in the
+# documentation and/or other materials provided with the distribution.
+# 3. All advertising materials mentioning features or use of this software
+# must display the following acknowledgement:
+# This product includes software developed by the University of
+# California, Berkeley and its contributors.
+# 4. Neither the name of the University nor the names of its contributors
+# may be used to endorse or promote products derived from this software
+# without specific prior written permission.
+#
+# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+# ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+# SUCH DAMAGE.
+#
+# @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
+# $Id$
+#
+
+# Script to produce VFS front-end sugar.
+#
+# usage: vnode_if.sh srcfile
+# (where srcfile is currently /sys/kern/vnode_if.src)
+#
+# These awk scripts are not particularly well written, specifically they
+# don't use arrays well and figure out the same information repeatedly.
+# Please rewrite them if you actually understand how to use awk. Note,
+# they use nawk extensions and gawk's toupper.
+
+if [ $# -ne 1 ] ; then
+ echo 'usage: vnode_if.sh srcfile'
+ exit 1
+fi
+
+# Name of the source file.
+SRC=$1
+
+# Names of the created files.
+CFILE=vnode_if.c
+HEADER=vnode_if.h
+
+# Awk program (must support nawk extensions and gawk's "toupper")
+# Use "awk" at Berkeley, "gawk" elsewhere.
+AWK=awk
+
+# Print out header information for vnode_if.h.
+cat << END_OF_LEADING_COMMENT > $HEADER
+/*
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
+ *
+ * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
+ */
+
+extern struct vnodeop_desc vop_default_desc;
+END_OF_LEADING_COMMENT
+
+# Awk script to take vnode_if.src and turn it into vnode_if.h.
+$AWK '
+ NF == 0 || $0 ~ "^#" {
+ next;
+ }
+ {
+ # Get the function name.
+ name = $1;
+ uname = toupper(name);
+
+ # Get the function arguments.
+ for (c1 = 0;; ++c1) {
+ if (getline <= 0)
+ exit
+ if ($0 ~ "^};")
+ break;
+ a[c1] = $0;
+ }
+
+ # Print out the vop_F_args structure.
+ printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n",
+ name);
+ for (c2 = 0; c2 < c1; ++c2) {
+ c3 = split(a[c2], t);
+ printf("\t");
+ if (t[2] ~ "WILLRELE")
+ c4 = 3;
+ else
+ c4 = 2;
+ for (; c4 < c3; ++c4)
+ printf("%s ", t[c4]);
+ beg = match(t[c3], "[^*]");
+ printf("%sa_%s\n",
+ substr(t[c4], 0, beg - 1), substr(t[c4], beg));
+ }
+ printf("};\n");
+
+ # Print out extern declaration.
+ printf("extern struct vnodeop_desc %s_desc;\n", name);
+
+ # Print out prototype.
+ printf("static int %s __P((\n", uname);
+ sep = ",\n";
+ for (c2 = 0; c2 < c1; ++c2) {
+ if (c2 == c1 - 1)
+ sep = "));\n";
+ c3 = split(a[c2], t);
+ printf("\t");
+ if (t[2] ~ "WILLRELE")
+ c4 = 3;
+ else
+ c4 = 2;
+ for (; c4 < c3; ++c4)
+ printf("%s ", t[c4]);
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("%s%s%s",
+ substr(t[c4], 0, beg - 1),
+ substr(t[c4], beg, end - beg), sep);
+ }
+
+ # Print out inline struct.
+ printf("static inline int %s(", uname);
+ sep = ", ";
+ for (c2 = 0; c2 < c1; ++c2) {
+ if (c2 == c1 - 1)
+ sep = ")\n";
+ c3 = split(a[c2], t);
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("%s%s", substr(t[c3], beg, end - beg), sep);
+ }
+ for (c2 = 0; c2 < c1; ++c2) {
+ c3 = split(a[c2], t);
+ printf("\t");
+ if (t[2] ~ "WILLRELE")
+ c4 = 3;
+ else
+ c4 = 2;
+ for (; c4 < c3; ++c4)
+ printf("%s ", t[c4]);
+ beg = match(t[c3], "[^*]");
+ printf("%s%s\n",
+ substr(t[c4], 0, beg - 1), substr(t[c4], beg));
+ }
+ printf("{\n\tstruct %s_args a;\n\n", name);
+ printf("\ta.a_desc = VDESC(%s);\n", name);
+ for (c2 = 0; c2 < c1; ++c2) {
+ c3 = split(a[c2], t);
+ printf("\t");
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("a.a_%s = %s\n",
+ substr(t[c3], beg, end - beg), substr(t[c3], beg));
+ }
+ c1 = split(a[0], t);
+ beg = match(t[c1], "[^*]");
+ end = match(t[c1], ";");
+ printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n",
+ substr(t[c1], beg, end - beg), name);
+ }' < $SRC >> $HEADER
+
+# Print out header information for vnode_if.c.
+cat << END_OF_LEADING_COMMENT > $CFILE
+/*
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
+ *
+ * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
+ */
+
+#include <sys/param.h>
+#include <sys/mount.h>
+#include <sys/vnode.h>
+
+struct vnodeop_desc vop_default_desc = {
+ 0,
+ "default",
+ 0,
+ NULL,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ NULL,
+};
+
+END_OF_LEADING_COMMENT
+
+# Awk script to take vnode_if.src and turn it into vnode_if.c.
+$AWK 'function kill_surrounding_ws (s) {
+ sub (/^[ \t]*/, "", s);
+ sub (/[ \t]*$/, "", s);
+ return s;
+ }
+
+ function read_args() {
+ numargs = 0;
+ while (getline ln) {
+ if (ln ~ /}/) {
+ break;
+ };
+
+ # Delete comments, if any.
+ gsub (/\/\*.*\*\//, "", ln);
+
+ # Delete leading/trailing space.
+ ln = kill_surrounding_ws(ln);
+
+ # Pick off direction.
+ if (1 == sub(/^INOUT[ \t]+/, "", ln))
+ dir = "INOUT";
+ else if (1 == sub(/^IN[ \t]+/, "", ln))
+ dir = "IN";
+ else if (1 == sub(/^OUT[ \t]+/, "", ln))
+ dir = "OUT";
+ else
+ bail("No IN/OUT direction for \"" ln "\".");
+
+ # check for "WILLRELE"
+ if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) {
+ rele = "WILLRELE";
+ } else {
+ rele = "WONTRELE";
+ };
+
+ # kill trailing ;
+ if (1 != sub (/;$/, "", ln)) {
+ bail("Missing end-of-line ; in \"" ln "\".");
+ };
+
+ # pick off variable name
+ if (!(i = match(ln, /[A-Za-z0-9_]+$/))) {
+ bail("Missing var name \"a_foo\" in \"" ln "\".");
+ };
+ arg = substr (ln, i);
+ # Want to <<substr(ln, i) = "";>>, but nawk cannot.
+ # Hack around this.
+ ln = substr(ln, 1, i-1);
+
+ # what is left must be type
+ # (but clean it up some)
+ type = ln;
+ gsub (/[ \t]+/, " ", type); # condense whitespace
+ type = kill_surrounding_ws(type);
+
+ # (boy this was easier in Perl)
+
+ numargs++;
+ dirs[numargs] = dir;
+ reles[numargs] = rele;
+ types[numargs] = type;
+ args[numargs] = arg;
+ };
+ }
+
+ function generate_operation_vp_offsets() {
+ printf ("static int %s_vp_offsets[] = {\n", name);
+ # as a side effect, figure out the releflags
+ releflags = "";
+ vpnum = 0;
+ for (i=1; i<=numargs; i++) {
+ if (types[i] == "struct vnode *") {
+ printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n",
+ name, args[i]);
+ if (reles[i] == "WILLRELE") {
+ releflags = releflags "|VDESC_VP" vpnum "_WILLRELE";
+ };
+ vpnum++;
+ };
+ };
+ sub (/^\|/, "", releflags);
+ print "\tVDESC_NO_OFFSET";
+ print "};";
+ }
+
+ function find_arg_with_type (type) {
+ for (i=1; i<=numargs; i++) {
+ if (types[i] == type) {
+ return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")";
+ };
+ };
+ return "VDESC_NO_OFFSET";
+ }
+
+ function generate_operation_desc() {
+ printf ("struct vnodeop_desc %s_desc = {\n", name);
+ # offset
+ printf ("\t0,\n");
+ # printable name
+ printf ("\t\"%s\",\n", name);
+ # flags
+ vppwillrele = "";
+ for (i=1; i<=numargs; i++) {
+ if (types[i] == "struct vnode **" &&
+ (reles[i] == "WILLRELE")) {
+ vppwillrele = "|VDESC_VPP_WILLRELE";
+ };
+ };
+ if (releflags == "") {
+ printf ("\t0%s,\n", vppwillrele);
+ } else {
+ printf ("\t%s%s,\n", releflags, vppwillrele);
+ };
+ # vp offsets
+ printf ("\t%s_vp_offsets,\n", name);
+ # vpp (if any)
+ printf ("\t%s,\n", find_arg_with_type("struct vnode **"));
+ # cred (if any)
+ printf ("\t%s,\n", find_arg_with_type("struct ucred *"));
+ # proc (if any)
+ printf ("\t%s,\n", find_arg_with_type("struct proc *"));
+ # componentname
+ printf ("\t%s,\n", find_arg_with_type("struct componentname *"));
+ # transport layer information
+ printf ("\tNULL,\n};\n");
+ }
+
+ NF == 0 || $0 ~ "^#" {
+ next;
+ }
+ {
+ # get the function name
+ name = $1;
+
+ # get the function arguments
+ read_args();
+
+ # Print out the vop_F_vp_offsets structure. This all depends
+ # on naming conventions and nothing else.
+ generate_operation_vp_offsets();
+
+ # Print out the vnodeop_desc structure.
+ generate_operation_desc();
+
+ printf "\n";
+
+ }' < $SRC >> $CFILE
+# THINGS THAT DON'T WORK RIGHT YET.
+#
+# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as
+# arguments. This means that these operations can't function successfully
+# through a bypass routine.
+#
+# Bwrite and strategy will be replaced when the VM page/buffer cache
+# integration happens.
+#
+# To get around this problem for now we handle these ops as special cases.
+
+cat << END_OF_SPECIAL_CASES >> $HEADER
+#include <sys/buf.h>
+struct vop_strategy_args {
+ struct vnodeop_desc *a_desc;
+ struct buf *a_bp;
+};
+extern struct vnodeop_desc vop_strategy_desc;
+static int VOP_STRATEGY __P((
+ struct buf *bp));
+static inline int VOP_STRATEGY(bp)
+ struct buf *bp;
+{
+ struct vop_strategy_args a;
+
+ a.a_desc = VDESC(vop_strategy);
+ a.a_bp = bp;
+ return (VCALL((bp)->b_vp, VOFFSET(vop_strategy), &a));
+}
+
+struct vop_bwrite_args {
+ struct vnodeop_desc *a_desc;
+ struct buf *a_bp;
+};
+extern struct vnodeop_desc vop_bwrite_desc;
+static int VOP_BWRITE __P((
+ struct buf *bp));
+static inline int VOP_BWRITE(bp)
+ struct buf *bp;
+{
+ struct vop_bwrite_args a;
+
+ a.a_desc = VDESC(vop_bwrite);
+ a.a_bp = bp;
+ return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a));
+}
+END_OF_SPECIAL_CASES
+
+cat << END_OF_SPECIAL_CASES >> $CFILE
+static int vop_strategy_vp_offsets[] = {
+ VDESC_NO_OFFSET
+};
+struct vnodeop_desc vop_strategy_desc = {
+ 0,
+ "vop_strategy",
+ 0,
+ vop_strategy_vp_offsets,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ NULL,
+};
+static int vop_bwrite_vp_offsets[] = {
+ VDESC_NO_OFFSET
+};
+struct vnodeop_desc vop_bwrite_desc = {
+ 0,
+ "vop_bwrite",
+ 0,
+ vop_bwrite_vp_offsets,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ NULL,
+};
+END_OF_SPECIAL_CASES
+
+# Add the vfs_op_descs array to the C file.
+$AWK '
+ BEGIN {
+ printf("\nstruct vnodeop_desc *vfs_op_descs[] = {\n");
+ printf("\t&vop_default_desc, /* MUST BE FIRST */\n");
+ printf("\t&vop_strategy_desc, /* XXX: SPECIAL CASE */\n");
+ printf("\t&vop_bwrite_desc, /* XXX: SPECIAL CASE */\n");
+ }
+ END {
+ printf("\tNULL\n};\n");
+ }
+ NF == 0 || $0 ~ "^#" {
+ next;
+ }
+ {
+ # Get the function name.
+ printf("\t&%s_desc,\n", $1);
+
+ # Skip the function arguments.
+ for (;;) {
+ if (getline <= 0)
+ exit
+ if ($0 ~ "^};")
+ break;
+ }
+ }' < $SRC >> $CFILE
+
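
To make the generator concrete: a vnode_if.src entry such as vop_close (shown trimmed, output approximate),

	vop_close {
		IN struct vnode *vp;
		IN int fflag;
		IN struct ucred *cred;
		IN struct proc *p;
	};

comes out of the header pass as roughly

	struct vop_close_args {
		struct vnodeop_desc *a_desc;
		struct vnode *a_vp;
		int a_fflag;
		struct ucred *a_cred;
		struct proc *a_p;
	};
	extern struct vnodeop_desc vop_close_desc;
	static inline int VOP_CLOSE(vp, fflag, cred, p)
		struct vnode *vp;
		int fflag;
		struct ucred *cred;
		struct proc *p;
	{
		struct vop_close_args a;

		a.a_desc = VDESC(vop_close);
		a.a_vp = vp;
		a.a_fflag = fflag;
		a.a_cred = cred;
		a.a_p = p;
		return (VCALL(vp, VOFFSET(vop_close), &a));
	}

so every VOP_*() call site builds an args structure and dispatches through the vnode's operations vector via VCALL().
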
diff --git a/sys/kern/vnode_if.sh b/sys/kern/vnode_if.sh
index 8b74d83..75f49a7 100644
--- a/sys/kern/vnode_if.sh
+++ b/sys/kern/vnode_if.sh
@@ -1,9 +1,8 @@
#!/bin/sh -
-copyright='
-/*
- * Copyright (c) 1992, 1993, 1994, 1995
- * The Regents of the University of California. All rights reserved.
- *
+#
+# Copyright (c) 1992, 1993
+# The Regents of the University of California. All rights reserved.
+#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions
# are met:
@@ -31,17 +30,20 @@ copyright='
# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
# SUCH DAMAGE.
- *
- * from: NetBSD: vnode_if.sh,v 1.7 1994/08/25 03:04:28 cgd Exp $
- */
-'
-SCRIPT_ID='@(#)vnode_if.sh 8.7 (Berkeley) 5/11/95'
+#
+# @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
+# $Id$
+#
# Script to produce VFS front-end sugar.
#
# usage: vnode_if.sh srcfile
# (where srcfile is currently /sys/kern/vnode_if.src)
#
+# These awk scripts are not particularly well written, specifically they
+# don't use arrays well and figure out the same information repeatedly.
+# Please rewrite them if you actually understand how to use awk. Note,
+# they use nawk extensions and gawk's toupper.
if [ $# -ne 1 ] ; then
echo 'usage: vnode_if.sh srcfile'
@@ -49,180 +51,139 @@ if [ $# -ne 1 ] ; then
fi
# Name of the source file.
-src=$1
+SRC=$1
# Names of the created files.
-out_c=vnode_if.c
-out_h=vnode_if.h
-
-# Awk program (must support nawk extensions)
-# Use "awk" at Berkeley, "nawk" or "gawk" elsewhere.
-awk=${AWK:-awk}
-
-# Does this awk have a "toupper" function? (i.e. is it GNU awk)
-isgawk=`$awk 'BEGIN { print toupper("true"); exit; }' 2>/dev/null`
-
-# If this awk does not define "toupper" then define our own.
-if [ "$isgawk" = TRUE ] ; then
- # GNU awk provides it.
- toupper=
-else
- # Provide our own toupper()
- toupper='
-function toupper(str) {
- _toupper_cmd = "echo "str" |tr a-z A-Z"
- _toupper_cmd | getline _toupper_str;
- close(_toupper_cmd);
- return _toupper_str;
-}'
-fi
+CFILE=vnode_if.c
+HEADER=vnode_if.h
-#
-# This is the common part of all awk programs that read $src
-# This parses the input for one function into the arrays:
-# argdir, argtype, argname, willrele
-# and calls "doit()" to generate output for the function.
-#
-# Input to this parser is pre-processed slightly by sed
-# so this awk parser doesn't have to work so hard. The
-# changes done by the sed pre-processing step are:
-# insert a space between * and pointer name
-# replace semicolons with spaces
-#
-sed_prep='s:\*\([^\*/]\):\* \1:g
-s/;/ /'
-awk_parser='
-# Comment line
-/^#/ { next; }
-# First line of description
-/^vop_/ {
- name=$1;
- argc=0;
- next;
-}
-# Last line of description
-/^}/ {
- doit();
- next;
-}
-# Middle lines of description
-{
- argdir[argc] = $1; i=2;
- if ($2 == "WILLRELE") {
- willrele[argc] = 1;
- i++;
- } else
- willrele[argc] = 0;
- argtype[argc] = $i; i++;
- while (i < NF) {
- argtype[argc] = argtype[argc]" "$i;
- i++;
- }
- argname[argc] = $i;
- argc++;
- next;
-}
-'
+# Awk program (must support nawk extensions and gawk's "toupper")
+# Use "awk" at Berkeley, "gawk" elsewhere.
+AWK=awk
-# This is put after the copyright on each generated file.
-warning="
+# Print out header information for vnode_if.h.
+cat << END_OF_LEADING_COMMENT > $HEADER
/*
- * Warning: This file is generated automatically.
- * (Modifications made here may easily be lost!)
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
*
- * Created by the script:
- * ${SCRIPT_ID}
+ * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
*/
-"
-
-# Get rid of ugly spaces
-space_elim='s:\([^/]\*\) :\1:g'
-
-#
-# Redirect stdout to the H file.
-#
-echo "$0: Creating $out_h" 1>&2
-exec > $out_h
-# Begin stuff
-echo "$copyright"
-echo "$warning"
-echo '
extern struct vnodeop_desc vop_default_desc;
-'
-
-# Body stuff
-# This awk program needs toupper() so define it if necessary.
-sed -e "$sed_prep" $src | $awk "$toupper"'
-function doit() {
- # Declare arg struct, descriptor.
- printf("\nstruct %s_args {\n", name);
- printf("\tstruct vnodeop_desc * a_desc;\n");
- for (i=0; i<argc; i++) {
- printf("\t%s a_%s;\n", argtype[i], argname[i]);
- }
- printf("};\n");
- printf("extern struct vnodeop_desc %s_desc;\n", name);
- # Define inline function.
- printf("#define %s(", toupper(name));
- for (i=0; i<argc; i++) {
- printf("%s", argname[i]);
- if (i < (argc-1)) printf(", ");
- }
- printf(") _%s(", toupper(name));
- for (i=0; i<argc; i++) {
- printf("%s", argname[i]);
- if (i < (argc-1)) printf(", ");
- }
- printf(")\n");
- printf("static __inline int _%s(", toupper(name));
- for (i=0; i<argc; i++) {
- printf("%s", argname[i]);
- if (i < (argc-1)) printf(", ");
- }
- printf(")\n");
- for (i=0; i<argc; i++) {
- printf("\t%s %s;\n", argtype[i], argname[i]);
- }
- printf("{\n\tstruct %s_args a;\n", name);
- printf("\ta.a_desc = VDESC(%s);\n", name);
- for (i=0; i<argc; i++) {
- printf("\ta.a_%s = %s;\n", argname[i], argname[i]);
+END_OF_LEADING_COMMENT
+
+# Awk script to take vnode_if.src and turn it into vnode_if.h.
+$AWK '
+ NF == 0 || $0 ~ "^#" {
+ next;
}
- printf("\treturn (VCALL(%s%s, VOFFSET(%s), &a));\n}\n",
- argname[0], arg0special, name);
-}
-BEGIN {
- arg0special="";
-}
-END {
- printf("\n/* Special cases: */\n#include <sys/buf.h>\n");
- argc=1;
- argtype[0]="struct buf *";
- argname[0]="bp";
- arg0special="->b_vp";
- name="vop_strategy";
- doit();
- name="vop_bwrite";
- doit();
-}
-'"$awk_parser" | sed -e "$space_elim"
+ {
+ # Get the function name.
+ name = $1;
+ uname = toupper(name);
-# End stuff
-echo '
-/* End of special cases. */'
+ # Get the function arguments.
+ for (c1 = 0;; ++c1) {
+ if (getline <= 0)
+ exit
+ if ($0 ~ "^};")
+ break;
+ a[c1] = $0;
+ }
+ # Print out the vop_F_args structure.
+ printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n",
+ name);
+ for (c2 = 0; c2 < c1; ++c2) {
+ c3 = split(a[c2], t);
+ printf("\t");
+ if (t[2] ~ "WILLRELE")
+ c4 = 3;
+ else
+ c4 = 2;
+ for (; c4 < c3; ++c4)
+ printf("%s ", t[c4]);
+ beg = match(t[c3], "[^*]");
+ printf("%sa_%s\n",
+ substr(t[c4], 0, beg - 1), substr(t[c4], beg));
+ }
+ printf("};\n");
-#
-# Redirect stdout to the C file.
-#
-echo "$0: Creating $out_c" 1>&2
-exec > $out_c
+ # Print out extern declaration.
+ printf("extern struct vnodeop_desc %s_desc;\n", name);
+
+ # Print out prototype.
+ printf("static int %s __P((\n", uname);
+ sep = ",\n";
+ for (c2 = 0; c2 < c1; ++c2) {
+ if (c2 == c1 - 1)
+ sep = "));\n";
+ c3 = split(a[c2], t);
+ printf("\t");
+ if (t[2] ~ "WILLRELE")
+ c4 = 3;
+ else
+ c4 = 2;
+ for (; c4 < c3; ++c4)
+ printf("%s ", t[c4]);
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("%s%s%s",
+ substr(t[c4], 0, beg - 1),
+ substr(t[c4], beg, end - beg), sep);
+ }
+
+ # Print out inline struct.
+ printf("static inline int %s(", uname);
+ sep = ", ";
+ for (c2 = 0; c2 < c1; ++c2) {
+ if (c2 == c1 - 1)
+ sep = ")\n";
+ c3 = split(a[c2], t);
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("%s%s", substr(t[c3], beg, end - beg), sep);
+ }
+ for (c2 = 0; c2 < c1; ++c2) {
+ c3 = split(a[c2], t);
+ printf("\t");
+ if (t[2] ~ "WILLRELE")
+ c4 = 3;
+ else
+ c4 = 2;
+ for (; c4 < c3; ++c4)
+ printf("%s ", t[c4]);
+ beg = match(t[c3], "[^*]");
+ printf("%s%s\n",
+ substr(t[c4], 0, beg - 1), substr(t[c4], beg));
+ }
+ printf("{\n\tstruct %s_args a;\n\n", name);
+ printf("\ta.a_desc = VDESC(%s);\n", name);
+ for (c2 = 0; c2 < c1; ++c2) {
+ c3 = split(a[c2], t);
+ printf("\t");
+ beg = match(t[c3], "[^*]");
+ end = match(t[c3], ";");
+ printf("a.a_%s = %s\n",
+ substr(t[c3], beg, end - beg), substr(t[c3], beg));
+ }
+ c1 = split(a[0], t);
+ beg = match(t[c1], "[^*]");
+ end = match(t[c1], ";");
+ printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n",
+ substr(t[c1], beg, end - beg), name);
+ }' < $SRC >> $HEADER
+
+# Print out header information for vnode_if.c.
+cat << END_OF_LEADING_COMMENT > $CFILE
+/*
+ * This file is produced automatically.
+ * Do not modify anything in here by hand.
+ *
+ * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93
+ */
-# Begin stuff
-echo "$copyright"
-echo "$warning"
-echo '
#include <sys/param.h>
#include <sys/mount.h>
#include <sys/vnode.h>
@@ -238,107 +199,261 @@ struct vnodeop_desc vop_default_desc = {
VDESC_NO_OFFSET,
NULL,
};
-'
-
-# Body stuff
-sed -e "$sed_prep" $src | $awk '
-function do_offset(typematch) {
- for (i=0; i<argc; i++) {
- if (argtype[i] == typematch) {
- printf("\tVOPARG_OFFSETOF(struct %s_args, a_%s),\n",
- name, argname[i]);
- return i;
- };
- };
- print "\tVDESC_NO_OFFSET,";
- return -1;
-}
-function doit() {
- # Define offsets array
- printf("\nint %s_vp_offsets[] = {\n", name);
- for (i=0; i<argc; i++) {
- if (argtype[i] == "struct vnode *") {
- printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n",
- name, argname[i]);
- }
+END_OF_LEADING_COMMENT
+
+# Awk script to take vnode_if.src and turn it into vnode_if.c.
+$AWK 'function kill_surrounding_ws (s) {
+ sub (/^[ \t]*/, "", s);
+ sub (/[ \t]*$/, "", s);
+ return s;
}
- print "\tVDESC_NO_OFFSET";
- print "};";
- # Define F_desc
- printf("struct vnodeop_desc %s_desc = {\n", name);
- # offset
- printf ("\t0,\n");
- # printable name
- printf ("\t\"%s\",\n", name);
- # flags
- printf("\t0");
- vpnum = 0;
- for (i=0; i<argc; i++) {
- if (willrele[i]) {
- if (argdir[i] ~ /OUT/) {
- printf(" | VDESC_VPP_WILLRELE");
+
+ function read_args() {
+ numargs = 0;
+ while (getline ln) {
+ if (ln ~ /}/) {
+ break;
+ };
+
+ # Delete comments, if any.
+ gsub (/\/\*.*\*\//, "", ln);
+
+ # Delete leading/trailing space.
+ ln = kill_surrounding_ws(ln);
+
+ # Pick off direction.
+ if (1 == sub(/^INOUT[ \t]+/, "", ln))
+ dir = "INOUT";
+ else if (1 == sub(/^IN[ \t]+/, "", ln))
+ dir = "IN";
+ else if (1 == sub(/^OUT[ \t]+/, "", ln))
+ dir = "OUT";
+ else
+ bail("No IN/OUT direction for \"" ln "\".");
+
+ # check for "WILLRELE"
+ if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) {
+ rele = "WILLRELE";
} else {
- printf(" | VDESC_VP%s_WILLRELE", vpnum);
+ rele = "WONTRELE";
};
- vpnum++;
- }
+
+ # kill trailing ;
+ if (1 != sub (/;$/, "", ln)) {
+ bail("Missing end-of-line ; in \"" ln "\".");
+ };
+
+ # pick off variable name
+ if (!(i = match(ln, /[A-Za-z0-9_]+$/))) {
+ bail("Missing var name \"a_foo\" in \"" ln "\".");
+ };
+ arg = substr (ln, i);
+ # Want to <<substr(ln, i) = "";>>, but nawk cannot.
+ # Hack around this.
+ ln = substr(ln, 1, i-1);
+
+ # what is left must be type
+ # (but clean it up some)
+ type = ln;
+ gsub (/[ \t]+/, " ", type); # condense whitespace
+ type = kill_surrounding_ws(type);
+
+ # (boy this was easier in Perl)
+
+ numargs++;
+ dirs[numargs] = dir;
+ reles[numargs] = rele;
+ types[numargs] = type;
+ args[numargs] = arg;
+ };
}
- print ",";
- # vp offsets
- printf ("\t%s_vp_offsets,\n", name);
- # vpp (if any)
- do_offset("struct vnode **");
- # cred (if any)
- do_offset("struct ucred *");
- # proc (if any)
- do_offset("struct proc *");
- # componentname
- do_offset("struct componentname *");
- # transport layer information
- printf ("\tNULL,\n};\n");
-}
-END {
- printf("\n/* Special cases: */\n");
- argc=1;
- argdir[0]="IN";
- argtype[0]="struct buf *";
- argname[0]="bp";
- willrele[0]=0;
- name="vop_strategy";
- doit();
- name="vop_bwrite";
- doit();
+
+ function generate_operation_vp_offsets() {
+ printf ("static int %s_vp_offsets[] = {\n", name);
+ # as a side effect, figure out the releflags
+ releflags = "";
+ vpnum = 0;
+ for (i=1; i<=numargs; i++) {
+ if (types[i] == "struct vnode *") {
+ printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n",
+ name, args[i]);
+ if (reles[i] == "WILLRELE") {
+ releflags = releflags "|VDESC_VP" vpnum "_WILLRELE";
+ };
+ vpnum++;
+ };
+ };
+ sub (/^\|/, "", releflags);
+ print "\tVDESC_NO_OFFSET";
+ print "};";
+ }
+
+ function find_arg_with_type (type) {
+ for (i=1; i<=numargs; i++) {
+ if (types[i] == type) {
+ return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")";
+ };
+ };
+ return "VDESC_NO_OFFSET";
+ }
+
+ function generate_operation_desc() {
+ printf ("struct vnodeop_desc %s_desc = {\n", name);
+ # offset
+ printf ("\t0,\n");
+ # printable name
+ printf ("\t\"%s\",\n", name);
+ # flags
+ vppwillrele = "";
+ for (i=1; i<=numargs; i++) {
+ if (types[i] == "struct vnode **" &&
+ (reles[i] == "WILLRELE")) {
+ vppwillrele = "|VDESC_VPP_WILLRELE";
+ };
+ };
+ if (releflags == "") {
+ printf ("\t0%s,\n", vppwillrele);
+ } else {
+ printf ("\t%s%s,\n", releflags, vppwillrele);
+ };
+ # vp offsets
+ printf ("\t%s_vp_offsets,\n", name);
+ # vpp (if any)
+ printf ("\t%s,\n", find_arg_with_type("struct vnode **"));
+ # cred (if any)
+ printf ("\t%s,\n", find_arg_with_type("struct ucred *"));
+ # proc (if any)
+ printf ("\t%s,\n", find_arg_with_type("struct proc *"));
+ # componentname
+ printf ("\t%s,\n", find_arg_with_type("struct componentname *"));
+ # transport layer information
+ printf ("\tNULL,\n};\n");
+ }
+
+ NF == 0 || $0 ~ "^#" {
+ next;
+ }
+ {
+ # get the function name
+ name = $1;
+
+ # get the function arguments
+ read_args();
+
+ # Print out the vop_F_vp_offsets structure. This all depends
+ # on naming conventions and nothing else.
+ generate_operation_vp_offsets();
+
+ # Print out the vnodeop_desc structure.
+ generate_operation_desc();
+
+ printf "\n";
+
+ }' < $SRC >> $CFILE
+# THINGS THAT DON'T WORK RIGHT YET.
+#
+# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as
+# arguments. This means that these operations can't function successfully
+# through a bypass routine.
+#
+# Bwrite and strategy will be replaced when the VM page/buffer cache
+# integration happens.
+#
+# To get around this problem for now we handle these ops as special cases.
+
+cat << END_OF_SPECIAL_CASES >> $HEADER
+#include <sys/buf.h>
+struct vop_strategy_args {
+ struct vnodeop_desc *a_desc;
+ struct buf *a_bp;
+};
+extern struct vnodeop_desc vop_strategy_desc;
+static int VOP_STRATEGY __P((
+ struct buf *bp));
+static inline int VOP_STRATEGY(bp)
+ struct buf *bp;
+{
+ struct vop_strategy_args a;
+
+ a.a_desc = VDESC(vop_strategy);
+ a.a_bp = bp;
+ return (VCALL((bp)->b_vp, VOFFSET(vop_strategy), &a));
}
-'"$awk_parser" | sed -e "$space_elim"
-# End stuff
-echo '
-/* End of special cases. */'
+struct vop_bwrite_args {
+ struct vnodeop_desc *a_desc;
+ struct buf *a_bp;
+};
+extern struct vnodeop_desc vop_bwrite_desc;
+static int VOP_BWRITE __P((
+ struct buf *bp));
+static inline int VOP_BWRITE(bp)
+ struct buf *bp;
+{
+ struct vop_bwrite_args a;
-# Add the vfs_op_descs array to the C file.
-# Begin stuff
-echo '
-struct vnodeop_desc *vfs_op_descs[] = {
- &vop_default_desc, /* MUST BE FIRST */
- &vop_strategy_desc, /* XXX: SPECIAL CASE */
- &vop_bwrite_desc, /* XXX: SPECIAL CASE */
-'
-
-# Body stuff
-sed -e "$sed_prep" $src | $awk '
-function doit() {
- printf("\t&%s_desc,\n", name);
+ a.a_desc = VDESC(vop_bwrite);
+ a.a_bp = bp;
+ return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a));
}
-'"$awk_parser"
+END_OF_SPECIAL_CASES
-# End stuff
-echo ' NULL
+cat << END_OF_SPECIAL_CASES >> $CFILE
+static int vop_strategy_vp_offsets[] = {
+ VDESC_NO_OFFSET
+};
+struct vnodeop_desc vop_strategy_desc = {
+ 0,
+ "vop_strategy",
+ 0,
+ vop_strategy_vp_offsets,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ NULL,
+};
+static int vop_bwrite_vp_offsets[] = {
+ VDESC_NO_OFFSET
+};
+struct vnodeop_desc vop_bwrite_desc = {
+ 0,
+ "vop_bwrite",
+ 0,
+ vop_bwrite_vp_offsets,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ VDESC_NO_OFFSET,
+ NULL,
};
-'
+END_OF_SPECIAL_CASES
-exit 0
+# Add the vfs_op_descs array to the C file.
+$AWK '
+ BEGIN {
+ printf("\nstruct vnodeop_desc *vfs_op_descs[] = {\n");
+ printf("\t&vop_default_desc, /* MUST BE FIRST */\n");
+ printf("\t&vop_strategy_desc, /* XXX: SPECIAL CASE */\n");
+ printf("\t&vop_bwrite_desc, /* XXX: SPECIAL CASE */\n");
+ }
+ END {
+ printf("\tNULL\n};\n");
+ }
+ NF == 0 || $0 ~ "^#" {
+ next;
+ }
+ {
+ # Get the function name.
+ printf("\t&%s_desc,\n", $1);
+
+ # Skip the function arguments.
+ for (;;) {
+ if (getline <= 0)
+ exit
+ if ($0 ~ "^};")
+ break;
+ }
+ }' < $SRC >> $CFILE
-# Local Variables:
-# tab-width: 4
-# End:
diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src
index 1e32f29..7e3338f 100644
--- a/sys/kern/vnode_if.src
+++ b/sys/kern/vnode_if.src
@@ -31,6 +31,7 @@
# SUCH DAMAGE.
#
# @(#)vnode_if.src 8.12 (Berkeley) 5/14/95
+# $Id: vnode_if.src,v 1.9.2000.1 1996/09/17 14:32:01 peter Exp $
#
#
@@ -255,8 +256,8 @@ vop_remove {
#% link tdvp L U U
#
vop_link {
- IN WILLRELE struct vnode *vp;
- IN struct vnode *tdvp;
+ IN WILLRELE struct vnode *tdvp;
+ IN struct vnode *vp;
IN struct componentname *cnp;
};
@@ -385,6 +386,7 @@ vop_bmap {
OUT struct vnode **vpp;
IN daddr_t *bnp;
OUT int *runp;
+ OUT int *runb;
};
#
@@ -486,6 +488,23 @@ vop_update {
IN int waitfor;
};
+vop_getpages {
+ IN struct vnode *vp;
+ IN vm_page_t *m;
+ IN int count;
+ IN int reqpage;
+ IN vm_ooffset_t offset;
+};
+
+vop_putpages {
+ IN struct vnode *vp;
+ IN vm_page_t *m;
+ IN int count;
+ IN int sync;
+ IN int *rtvals;
+ IN vm_ooffset_t offset;
+};
+
#
# Needs work: no vp?
#
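
The new vop_getpages and vop_putpages entries give the VM system filesystem-specific page-in and page-out paths. Once run through the generator they become VOP_GETPAGES()/VOP_PUTPAGES() wrappers, so a vnode-pager call site would look roughly like this fragment (argument names taken from the entries above, values hypothetical):

	error = VOP_GETPAGES(vp, m, count, reqpage, offset);
	...
	error = VOP_PUTPAGES(vp, m, count, sync, rtvals, offset);
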