diff options
Diffstat (limited to 'sys/kern')
97 files changed, 40170 insertions, 7231 deletions
diff --git a/sys/kern/Make.tags.inc b/sys/kern/Make.tags.inc index 79cb83a..a09e484 100644 --- a/sys/kern/Make.tags.inc +++ b/sys/kern/Make.tags.inc @@ -1,4 +1,5 @@ -# @(#)Make.tags.inc 8.2 (Berkeley) 11/23/94 +# @(#)Make.tags.inc 8.1 (Berkeley) 6/11/93 +# $Id$ # Common files for "make tags". # Included by the Makefile for each architecture. @@ -9,7 +10,6 @@ COMM= /sys/conf/*.[ch] \ /sys/dev/*.[ch] /sys/dev/scsi/*.[ch] \ - /sys/isofs/*/*.[ch] \ /sys/kern/*.[ch] /sys/libkern/*.[ch] \ /sys/miscfs/*/*.[ch] \ /sys/net/*.[ch] /sys/netccitt/*.[ch] /sys/netinet/*.[ch] \ diff --git a/sys/kern/Makefile b/sys/kern/Makefile index 3159d20..f42a44e 100644 --- a/sys/kern/Makefile +++ b/sys/kern/Makefile @@ -1,17 +1,20 @@ -# @(#)Makefile 8.3 (Berkeley) 2/14/95 +# @(#)Makefile 8.2 (Berkeley) 3/21/94 # Makefile for kernel tags files, init_sysent, etc. -ARCH= hp300 i386 luna68k news3400 pmax sparc tahoe vax +ARCH= i386 # luna68k news3400 pmax sparc tahoe vax all: @echo "make tags, make links or init_sysent.c only" -init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscallargs.h: makesyscalls.sh syscalls.master +init_sysent.c syscalls.c ../sys/syscall.h ../sys/syscall-hide.h \ +../sys/sysproto.h: makesyscalls.sh syscalls.master -mv -f init_sysent.c init_sysent.c.bak -mv -f syscalls.c syscalls.c.bak -mv -f ../sys/syscall.h ../sys/syscall.h.bak - sh makesyscalls.sh syscalls.conf syscalls.master + -mv -f ../sys/syscall-hide.h ../sys/syscall-hide.h.bak + -mv -f ../sys/sysproto.h ../sys/sysproto.h.bak + sh makesyscalls.sh syscalls.master # Kernel tags: # Tags files are built in the top-level directory for each architecture, diff --git a/sys/kern/imgact_aout.c b/sys/kern/imgact_aout.c new file mode 100644 index 0000000..4adbd05 --- /dev/null +++ b/sys/kern/imgact_aout.c @@ -0,0 +1,210 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id$ + */ + +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/resourcevar.h> +#include <sys/exec.h> +#include <sys/mman.h> +#include <sys/imgact.h> +#include <sys/imgact_aout.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/sysent.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> + +static int exec_aout_imgact __P((struct image_params *imgp)); + +static int +exec_aout_imgact(imgp) + struct image_params *imgp; +{ + const struct exec *a_out = (const struct exec *) imgp->image_header; + struct vmspace *vmspace = imgp->proc->p_vmspace; + vm_offset_t vmaddr; + unsigned long virtual_offset; + unsigned long file_offset; + unsigned long bss_size; + int error; + + /* + * Linux and *BSD binaries look very much alike, + * only the machine id is different: + * 0x64 for Linux, 0x86 for *BSD, 0x00 for BSDI. + * NetBSD is in network byte order.. ugh. + */ + if (((a_out->a_magic >> 16) & 0xff) != 0x86 && + ((a_out->a_magic >> 16) & 0xff) != 0 && + ((((int)ntohl(a_out->a_magic)) >> 16) & 0xff) != 0x86) + return -1; + + /* + * Set file/virtual offset based on a.out variant. + * We do two cases: host byte order and network byte order + * (for NetBSD compatibility) + */ + switch ((int)(a_out->a_magic & 0xffff)) { + case ZMAGIC: + virtual_offset = 0; + if (a_out->a_text) { + file_offset = PAGE_SIZE; + } else { + /* Bill's "screwball mode" */ + file_offset = 0; + } + break; + case QMAGIC: + virtual_offset = PAGE_SIZE; + file_offset = 0; + break; + default: + /* NetBSD compatibility */ + switch ((int)(ntohl(a_out->a_magic) & 0xffff)) { + case ZMAGIC: + case QMAGIC: + virtual_offset = PAGE_SIZE; + file_offset = 0; + break; + default: + return (-1); + } + } + + bss_size = roundup(a_out->a_bss, PAGE_SIZE); + + /* + * Check various fields in header for validity/bounds. 
+ */ + if (/* entry point must lay with text region */ + a_out->a_entry < virtual_offset || + a_out->a_entry >= virtual_offset + a_out->a_text || + + /* text and data size must each be page rounded */ + a_out->a_text & PAGE_MASK || a_out->a_data & PAGE_MASK) + return (-1); + + /* text + data can't exceed file size */ + if (a_out->a_data + a_out->a_text > imgp->attr->va_size) + return (EFAULT); + + /* + * text/data/bss must not exceed limits + */ + if (/* text can't exceed maximum text size */ + a_out->a_text > MAXTSIZ || + + /* data + bss can't exceed maximum data size */ + a_out->a_data + bss_size > MAXDSIZ || + + /* data + bss can't exceed rlimit */ + a_out->a_data + bss_size > + imgp->proc->p_rlimit[RLIMIT_DATA].rlim_cur) + return (ENOMEM); + + /* copy in arguments and/or environment from old process */ + error = exec_extract_strings(imgp); + if (error) + return (error); + + /* + * Destroy old process VM and create a new one (with a new stack) + */ + exec_new_vmspace(imgp); + + /* + * Map text/data read/execute + */ + vmaddr = virtual_offset; + error = + vm_mmap(&vmspace->vm_map, /* map */ + &vmaddr, /* address */ + a_out->a_text + a_out->a_data, /* size */ + VM_PROT_READ | VM_PROT_EXECUTE, /* protection */ + VM_PROT_ALL, /* max protection */ + MAP_PRIVATE | MAP_FIXED, /* flags */ + (caddr_t)imgp->vp, /* vnode */ + file_offset); /* offset */ + if (error) + return (error); + + /* + * allow writing of data + */ + vm_map_protect(&vmspace->vm_map, + vmaddr + a_out->a_text, + vmaddr + a_out->a_text + a_out->a_data, + VM_PROT_ALL, + FALSE); + + if (bss_size != 0) { + /* + * Allocate demand-zeroed area for uninitialized data + * "bss" = 'block started by symbol' - named after the IBM 7090 + * instruction of the same name. 
+ */ + vmaddr = virtual_offset + a_out->a_text + a_out->a_data; + error = vm_map_find(&vmspace->vm_map, NULL, 0, &vmaddr, bss_size, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) + return (error); + } + + /* Fill in process VM information */ + vmspace->vm_tsize = a_out->a_text >> PAGE_SHIFT; + vmspace->vm_dsize = (a_out->a_data + bss_size) >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t) virtual_offset; + vmspace->vm_daddr = (caddr_t) virtual_offset + a_out->a_text; + + /* Fill in image_params */ + imgp->interpreted = 0; + imgp->entry_addr = a_out->a_entry; + + imgp->proc->p_sysent = &aout_sysvec; + + /* Indicate that this file should not be modified */ + imgp->vp->v_flag |= VTEXT; + + return (0); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ +static const struct execsw aout_execsw = { exec_aout_imgact, "a.out" }; +TEXT_SET(execsw_set, aout_execsw); diff --git a/sys/kern/imgact_elf.c b/sys/kern/imgact_elf.c new file mode 100644 index 0000000..525d76d --- /dev/null +++ b/sys/kern/imgact_elf.c @@ -0,0 +1,749 @@ +/*- + * Copyright (c) 1995-1996 Søren Schmidt + * Copyright (c) 1996 Peter Wemm + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer + * in this position and unchanged. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
The name of the author may not be used to endorse or promote products + * derived from this software withough specific prior written permission + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * $Id: imgact_elf.c,v 1.16 1997/02/22 09:38:56 peter Exp $ + */ + +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/resourcevar.h> +#include <sys/exec.h> +#include <sys/mman.h> +#include <sys/imgact.h> +#include <sys/imgact_elf.h> +#include <sys/kernel.h> +#include <sys/sysent.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/namei.h> +#include <sys/proc.h> +#include <sys/sysproto.h> +#include <sys/syscall.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_kern.h> +#include <vm/vm_param.h> +#include <vm/pmap.h> +#include <sys/lock.h> +#include <vm/vm_map.h> +#include <vm/vm_prot.h> +#include <vm/vm_extern.h> + +#include <machine/md_var.h> +#include <i386/linux/linux_syscall.h> +#include <i386/linux/linux.h> + +#define MAX_PHDR 32 /* XXX enough ? 
*/ + +static int map_pages __P((struct vnode *vp, vm_offset_t offset, vm_offset_t *buf, vm_size_t size)); +static void unmap_pages __P((vm_offset_t buf, vm_size_t size)); +static int elf_check_permissions __P((struct proc *p, struct vnode *vp)); +static int elf_check_header __P((const Elf32_Ehdr *hdr, int type)); +static int elf_load_section __P((struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot)); +static int elf_load_file __P((struct proc *p, char *file, u_long *addr, u_long *entry)); +static int elf_freebsd_fixup __P((int **stack_base, struct image_params *imgp)); +int exec_elf_imgact __P((struct image_params *imgp)); + +int elf_trace = 0; +SYSCTL_INT(_debug, 1, elf_trace, CTLFLAG_RW, &elf_trace, 0, ""); +#define UPRINTF if (elf_trace) uprintf + +static struct sysentvec elf_freebsd_sysvec = { + SYS_MAXSYSCALL, + sysent, + 0, + 0, + 0, + 0, + 0, + elf_freebsd_fixup, + sendsig, + sigcode, + &szsigcode, + 0, + "FreeBSD ELF" +}; + +static Elf32_Brandinfo freebsd_brand_info = { + "FreeBSD", + "", + "/usr/libexec/ld-elf.so.1", + &elf_freebsd_sysvec + }; +static Elf32_Brandinfo *elf_brand_list[MAX_BRANDS] = { + &freebsd_brand_info, + NULL, NULL, NULL, + NULL, NULL, NULL, NULL + }; + +int +elf_insert_brand_entry(Elf32_Brandinfo *entry) +{ + int i; + + for (i=1; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] == NULL) { + elf_brand_list[i] = entry; + break; + } + } + if (i == MAX_BRANDS) + return -1; + return 0; +} + +int +elf_remove_brand_entry(Elf32_Brandinfo *entry) +{ + int i; + + for (i=1; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] == entry) { + elf_brand_list[i] = NULL; + break; + } + } + if (i == MAX_BRANDS) + return -1; + return 0; +} + +static int +map_pages(struct vnode *vp, vm_offset_t offset, + vm_offset_t *buf, vm_size_t size) +{ + int error; + vm_offset_t kern_buf; + vm_size_t pageoff; + + /* + * The request may not be aligned, and may even cross several + * page boundaries in the 
file... + */ + pageoff = (offset & PAGE_MASK); + offset -= pageoff; /* start of first aligned page to map */ + size += pageoff; + size = round_page(size); /* size of aligned pages to map */ + + if (error = vm_mmap(kernel_map, + &kern_buf, + size, + VM_PROT_READ, + VM_PROT_READ, + 0, + (caddr_t)vp, + offset)) + return error; + + *buf = kern_buf + pageoff; + + return 0; +} + +static void +unmap_pages(vm_offset_t buf, vm_size_t size) +{ + vm_size_t pageoff; + + pageoff = (buf & PAGE_MASK); + buf -= pageoff; /* start of first aligned page to map */ + size += pageoff; + size = round_page(size);/* size of aligned pages to map */ + + vm_map_remove(kernel_map, buf, buf + size); +} + +static int +elf_check_permissions(struct proc *p, struct vnode *vp) +{ + struct vattr attr; + int error; + + /* + * Check number of open-for-writes on the file and deny execution + * if there are any. + */ + if (vp->v_writecount) { + return (ETXTBSY); + } + + /* Get file attributes */ + error = VOP_GETATTR(vp, &attr, p->p_ucred, p); + if (error) + return (error); + + /* + * 1) Check if file execution is disabled for the filesystem that this + * file resides on. + * 2) Insure that at least one execute bit is on - otherwise root + * will always succeed, and we don't want to happen unless the + * file really is executable. + * 3) Insure that the file is a regular file. + */ + if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || + ((attr.va_mode & 0111) == 0) || + (attr.va_type != VREG)) { + return (EACCES); + } + + /* + * Zero length files can't be exec'd + */ + if (attr.va_size == 0) + return (ENOEXEC); + + /* + * Check for execute permission to file based on current credentials. + * Then call filesystem specific open routine (which does nothing + * in the general case). 
+ */ + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + if (error) + return (error); + + error = VOP_OPEN(vp, FREAD, p->p_ucred, p); + if (error) + return (error); + + return (0); +} + +static int +elf_check_header(const Elf32_Ehdr *hdr, int type) +{ + if (!(hdr->e_ident[EI_MAG0] == ELFMAG0 && + hdr->e_ident[EI_MAG1] == ELFMAG1 && + hdr->e_ident[EI_MAG2] == ELFMAG2 && + hdr->e_ident[EI_MAG3] == ELFMAG3)) + return ENOEXEC; + + if (hdr->e_machine != EM_386 && hdr->e_machine != EM_486) + return ENOEXEC; + + if (hdr->e_type != type) + return ENOEXEC; + + return 0; +} + +static int +elf_load_section(struct vmspace *vmspace, struct vnode *vp, vm_offset_t offset, caddr_t vmaddr, size_t memsz, size_t filsz, vm_prot_t prot) +{ + size_t map_len; + vm_offset_t map_addr; + int error; + unsigned char *data_buf = 0; + size_t copy_len; + + map_addr = trunc_page(vmaddr); + + if (memsz > filsz) + map_len = trunc_page(offset+filsz) - trunc_page(offset); + else + map_len = round_page(offset+filsz) - trunc_page(offset); + + if (error = vm_mmap (&vmspace->vm_map, + &map_addr, + map_len, + prot, + VM_PROT_ALL, + MAP_PRIVATE | MAP_FIXED, + (caddr_t)vp, + trunc_page(offset))) + return error; + + if (memsz == filsz) + return 0; + + /* + * We have to map the remaining bit of the file into the kernel's + * memory map, allocate some anonymous memory, and copy that last + * bit into it. The remaining space should be .bss... 
+ */ + copy_len = (offset + filsz) - trunc_page(offset + filsz); + map_addr = trunc_page(vmaddr + filsz); + map_len = round_page(vmaddr + memsz) - map_addr; + + if (map_len != 0) { + if (error = vm_map_find(&vmspace->vm_map, NULL, 0, + &map_addr, map_len, FALSE, + VM_PROT_ALL, VM_PROT_ALL,0)) + return error; + } + + if (error = vm_mmap(kernel_map, + (vm_offset_t *)&data_buf, + PAGE_SIZE, + VM_PROT_READ, + VM_PROT_READ, + 0, + (caddr_t)vp, + trunc_page(offset + filsz))) + return error; + + error = copyout(data_buf, (caddr_t)map_addr, copy_len); + + vm_map_remove(kernel_map, (vm_offset_t)data_buf, + (vm_offset_t)data_buf + PAGE_SIZE); + + /* + * set it to the specified protection + */ + vm_map_protect(&vmspace->vm_map, map_addr, map_addr + map_len, prot, + FALSE); + + UPRINTF("bss size %d (%x)\n", map_len-copy_len, map_len-copy_len); + return error; +} + +static int +elf_load_file(struct proc *p, char *file, u_long *addr, u_long *entry) +{ + Elf32_Ehdr *hdr = NULL; + Elf32_Phdr *phdr = NULL; + struct nameidata nd; + struct vmspace *vmspace = p->p_vmspace; + vm_prot_t prot = 0; + unsigned long text_size = 0, data_size = 0; + unsigned long text_addr = 0, data_addr = 0; + int header_size = 0; + int error, i; + + NDINIT(&nd, LOOKUP, LOCKLEAF|FOLLOW, UIO_SYSSPACE, file, p); + + if (error = namei(&nd)) + goto fail; + + if (nd.ni_vp == NULL) { + error = ENOEXEC; + goto fail; + } + + /* + * Check permissions, modes, uid, etc on the file, and "open" it. + */ + error = elf_check_permissions(p, nd.ni_vp); + + /* + * No longer need this, and it prevents demand paging. + */ + VOP_UNLOCK(nd.ni_vp, 0, p); + + if (error) + goto fail; + + /* + * Map in the header + */ + if (error = map_pages(nd.ni_vp, 0, (vm_offset_t *)&hdr, sizeof(hdr))) + goto fail; + + /* + * Do we have a valid ELF header ? 
+ */ + if (error = elf_check_header(hdr, ET_DYN)) + goto fail; + + /* + * ouch, need to bounds check in case user gives us a corrupted + * file with an insane header size + */ + if (hdr->e_phnum > MAX_PHDR) { /* XXX: ever more than this? */ + error = ENOEXEC; + goto fail; + } + + header_size = hdr->e_phentsize * hdr->e_phnum; + + if (error = map_pages(nd.ni_vp, hdr->e_phoff, (vm_offset_t *)&phdr, + header_size)) + goto fail; + + for (i = 0; i < hdr->e_phnum; i++) { + switch(phdr[i].p_type) { + + case PT_NULL: /* NULL section */ + UPRINTF ("ELF(file) PT_NULL section\n"); + break; + case PT_LOAD: /* Loadable segment */ + { + UPRINTF ("ELF(file) PT_LOAD section "); + if (phdr[i].p_flags & PF_X) + prot |= VM_PROT_EXECUTE; + if (phdr[i].p_flags & PF_W) + prot |= VM_PROT_WRITE; + if (phdr[i].p_flags & PF_R) + prot |= VM_PROT_READ; + + if (error = elf_load_section(vmspace, nd.ni_vp, + phdr[i].p_offset, + (caddr_t)phdr[i].p_vaddr + + (*addr), + phdr[i].p_memsz, + phdr[i].p_filesz, prot)) + goto fail; + + /* + * Is this .text or .data ?? 
+ * + * We only handle one each of those yet XXX + */ + if (hdr->e_entry >= phdr[i].p_vaddr && + hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) { + text_addr = trunc_page(phdr[i].p_vaddr+(*addr)); + text_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + trunc_page(phdr[i].p_vaddr)); + *entry=(unsigned long)hdr->e_entry+(*addr); + UPRINTF(".text <%08x,%08x> entry=%08x\n", + text_addr, text_size, *entry); + } else { + data_addr = trunc_page(phdr[i].p_vaddr+(*addr)); + data_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + trunc_page(phdr[i].p_vaddr)); + UPRINTF(".data <%08x,%08x>\n", + data_addr, data_size); + } + } + break; + + case PT_DYNAMIC:/* Dynamic link information */ + UPRINTF ("ELF(file) PT_DYNAMIC section\n"); + break; + case PT_INTERP: /* Path to interpreter */ + UPRINTF ("ELF(file) PT_INTERP section\n"); + break; + case PT_NOTE: /* Note section */ + UPRINTF ("ELF(file) PT_NOTE section\n"); + break; + case PT_SHLIB: /* Shared lib section */ + UPRINTF ("ELF(file) PT_SHLIB section\n"); + break; + case PT_PHDR: /* Program header table info */ + UPRINTF ("ELF(file) PT_PHDR section\n"); + break; + default: + UPRINTF ("ELF(file) %d section ??\n", phdr[i].p_type ); + } + } + +fail: + if (phdr) + unmap_pages((vm_offset_t)phdr, header_size); + if (hdr) + unmap_pages((vm_offset_t)hdr, sizeof(hdr)); + + return error; +} + +int +exec_elf_imgact(struct image_params *imgp) +{ + const Elf32_Ehdr *hdr = (const Elf32_Ehdr *) imgp->image_header; + const Elf32_Phdr *phdr, *mapped_phdr = NULL; + Elf32_Auxargs *elf_auxargs = NULL; + struct vmspace *vmspace = imgp->proc->p_vmspace; + vm_prot_t prot = 0; + u_long text_size = 0, data_size = 0; + u_long text_addr = 0, data_addr = 0; + u_long addr, entry = 0, proghdr = 0; + int error, i, header_size = 0, interp_len = 0; + char *interp = NULL; + char *brand = NULL; + char path[MAXPATHLEN]; + + /* + * Do we have a valid ELF header ? 
+ */ + if (elf_check_header(hdr, ET_EXEC)) + return -1; + + /* + * From here on down, we return an errno, not -1, as we've + * detected an ELF file. + */ + + /* + * ouch, need to bounds check in case user gives us a corrupted + * file with an insane header size + */ + if (hdr->e_phnum > MAX_PHDR) { /* XXX: ever more than this? */ + return ENOEXEC; + } + + header_size = hdr->e_phentsize * hdr->e_phnum; + + if ((hdr->e_phoff > PAGE_SIZE) || + (hdr->e_phoff + header_size) > PAGE_SIZE) { + /* + * Ouch ! we only get one page full of header... + * Try to map it in ourselves, and see how we go. + */ + if (error = map_pages(imgp->vp, hdr->e_phoff, + (vm_offset_t *)&mapped_phdr, header_size)) + return (error); + /* + * Save manual mapping for cleanup + */ + phdr = mapped_phdr; + } else { + phdr = (const Elf32_Phdr*) + ((const char *)imgp->image_header + hdr->e_phoff); + } + + /* + * From this point on, we may have resources that need to be freed. + */ + if (error = exec_extract_strings(imgp)) + goto fail; + + exec_new_vmspace(imgp); + + for (i = 0; i < hdr->e_phnum; i++) { + switch(phdr[i].p_type) { + + case PT_NULL: /* NULL section */ + UPRINTF ("ELF PT_NULL section\n"); + break; + case PT_LOAD: /* Loadable segment */ + { + UPRINTF ("ELF PT_LOAD section "); + if (phdr[i].p_flags & PF_X) + prot |= VM_PROT_EXECUTE; + if (phdr[i].p_flags & PF_W) + prot |= VM_PROT_WRITE; + if (phdr[i].p_flags & PF_R) + prot |= VM_PROT_READ; + + if (error = elf_load_section(vmspace, imgp->vp, + phdr[i].p_offset, + (caddr_t)phdr[i].p_vaddr, + phdr[i].p_memsz, + phdr[i].p_filesz, prot)) + goto fail; + + /* + * Is this .text or .data ?? 
+ * + * We only handle one each of those yet XXX + */ + if (hdr->e_entry >= phdr[i].p_vaddr && + hdr->e_entry <(phdr[i].p_vaddr+phdr[i].p_memsz)) { + text_addr = trunc_page(phdr[i].p_vaddr); + text_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + text_addr); + entry = (u_long)hdr->e_entry; + UPRINTF(".text <%08x,%08x> entry=%08x\n", + text_addr, text_size, entry); + } else { + data_addr = trunc_page(phdr[i].p_vaddr); + data_size = round_page(phdr[i].p_memsz + + phdr[i].p_vaddr - + data_addr); + UPRINTF(".data <%08x,%08x>\n", + data_addr, data_size); + } + } + break; + + case PT_DYNAMIC:/* Dynamic link information */ + UPRINTF ("ELF PT_DYNAMIC section ??\n"); + break; + case PT_INTERP: /* Path to interpreter */ + UPRINTF ("ELF PT_INTERP section "); + if (phdr[i].p_filesz > MAXPATHLEN) { + error = ENOEXEC; + goto fail; + } + interp_len = MAXPATHLEN; + if (error = map_pages(imgp->vp, phdr[i].p_offset, + (vm_offset_t *)&interp, interp_len)) + goto fail; + UPRINTF("<%s>\n", interp); + break; + case PT_NOTE: /* Note section */ + UPRINTF ("ELF PT_NOTE section\n"); + break; + case PT_SHLIB: /* Shared lib section */ + UPRINTF ("ELF PT_SHLIB section\n"); + break; + case PT_PHDR: /* Program header table info */ + UPRINTF ("ELF PT_PHDR section <%x>\n", phdr[i].p_vaddr); + proghdr = phdr[i].p_vaddr; + break; + default: + UPRINTF ("ELF %d section ??\n", phdr[i].p_type); + } + } + + vmspace->vm_tsize = text_size >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t)text_addr; + vmspace->vm_dsize = data_size >> PAGE_SHIFT; + vmspace->vm_daddr = (caddr_t)data_addr; + + addr = 2*MAXDSIZ; /* May depend on OS type XXX */ + + imgp->entry_addr = entry; + + /* + * So which kind (brand) of ELF binary do we have at hand + * FreeBSD, Linux, SVR4 or something else ?? 
+ * If its has a interpreter section try that first + */ + if (interp) { + for (i=0; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] != NULL) { + if (!strcmp(interp, elf_brand_list[i]->interp_path)) { + imgp->proc->p_sysent = + elf_brand_list[i]->sysvec; + strcpy(path, elf_brand_list[i]->emul_path); + strcat(path, elf_brand_list[i]->interp_path); + UPRINTF("interpreter=<%s> %s\n", + elf_brand_list[i]->interp_path, + elf_brand_list[i]->emul_path); + break; + } + } + } + } + + /* + * If there is no interpreter, or recognition of it + * failed, se if the binary is branded. + */ + if (!interp || i == MAX_BRANDS) { + brand = (char *)&(hdr->e_ident[EI_BRAND]); + for (i=0; i<MAX_BRANDS; i++) { + if (elf_brand_list[i] != NULL) { + if (!strcmp(brand, elf_brand_list[i]->brand)) { + imgp->proc->p_sysent = elf_brand_list[i]->sysvec; + if (interp) { + strcpy(path, elf_brand_list[i]->emul_path); + strcat(path, elf_brand_list[i]->interp_path); + UPRINTF("interpreter=<%s> %s\n", + elf_brand_list[i]->interp_path, + elf_brand_list[i]->emul_path); + } + break; + } + } + } + } + if (i == MAX_BRANDS) { + uprintf("ELF binary type not known\n"); + error = ENOEXEC; + goto fail; + } + if (interp) { + if (error = elf_load_file(imgp->proc, + path, + &addr, /* XXX */ + &imgp->entry_addr)) { + uprintf("ELF interpreter %s not found\n", path); + goto fail; + } + } + + UPRINTF("Executing %s binary\n", elf_brand_list[i]->brand); + + /* + * Construct auxargs table (used by the fixup routine) + */ + elf_auxargs = malloc(sizeof(Elf32_Auxargs), M_TEMP, M_WAITOK); + elf_auxargs->execfd = -1; + elf_auxargs->phdr = proghdr; + elf_auxargs->phent = hdr->e_phentsize; + elf_auxargs->phnum = hdr->e_phnum; + elf_auxargs->pagesz = PAGE_SIZE; + elf_auxargs->base = addr; + elf_auxargs->flags = 0; + elf_auxargs->entry = entry; + elf_auxargs->trace = elf_trace; + + imgp->auxargs = elf_auxargs; + imgp->interpreted = 0; + + /* don't allow modifying the file while we run it */ + imgp->vp->v_flag |= VTEXT; + +fail: + if 
(mapped_phdr) + unmap_pages((vm_offset_t)mapped_phdr, header_size); + if (interp) + unmap_pages((vm_offset_t)interp, interp_len); + + return error; +} + +static int +elf_freebsd_fixup(int **stack_base, struct image_params *imgp) +{ + Elf32_Auxargs *args = (Elf32_Auxargs *)imgp->auxargs; + int *pos; + + pos = *stack_base + (imgp->argc + imgp->envc + 2); + + if (args->trace) { + AUXARGS_ENTRY(pos, AT_DEBUG, 1); + } + if (args->execfd != -1) { + AUXARGS_ENTRY(pos, AT_EXECFD, args->execfd); + } + AUXARGS_ENTRY(pos, AT_PHDR, args->phdr); + AUXARGS_ENTRY(pos, AT_PHENT, args->phent); + AUXARGS_ENTRY(pos, AT_PHNUM, args->phnum); + AUXARGS_ENTRY(pos, AT_PAGESZ, args->pagesz); + AUXARGS_ENTRY(pos, AT_FLAGS, args->flags); + AUXARGS_ENTRY(pos, AT_ENTRY, args->entry); + AUXARGS_ENTRY(pos, AT_BASE, args->base); + AUXARGS_ENTRY(pos, AT_NULL, 0); + + free(imgp->auxargs, M_TEMP); + imgp->auxargs = NULL; + + (*stack_base)--; + **stack_base = (int)imgp->argc; + return 0; +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ +const struct execsw elf_execsw = {exec_elf_imgact, "ELF"}; +TEXT_SET(execsw_set, elf_execsw); + diff --git a/sys/kern/imgact_gzip.c b/sys/kern/imgact_gzip.c new file mode 100644 index 0000000..9a3237f --- /dev/null +++ b/sys/kern/imgact_gzip.c @@ -0,0 +1,378 @@ +/* + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@login.dkuug.dk> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $Id$ + * + * This module handles execution of a.out files which have been run through + * "gzip". 
This saves diskspace, but wastes cpu-cycles and VM. + * + * TODO: + * text-segments should be made R/O after being filled + * is the vm-stuff safe ? + * should handle the entire header of gzip'ed stuff. + * inflate isn't quite reentrant yet... + * error-handling is a mess... + * so is the rest... + * tidy up unnecesary includes + */ + +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/imgact_aout.h> +#include <sys/kernel.h> +#include <sys/mman.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/sysent.h> +#include <sys/systm.h> +#include <sys/vnode.h> +#include <sys/inflate.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +struct imgact_gzip { + struct image_params *ip; + struct exec a_out; + int error; + int where; + u_char *inbuf; + u_long offset; + u_long output; + u_long len; + int idx; + u_long virtual_offset, file_offset, file_end, bss_size; +}; + +static int exec_gzip_imgact __P((struct image_params *imgp)); +static int NextByte __P((void *vp)); +static int do_aout_hdr __P((struct imgact_gzip *)); +static int Flush __P((void *vp, u_char *, u_long siz)); + +static int +exec_gzip_imgact(imgp) + struct image_params *imgp; +{ + int error, error2 = 0; + const u_char *p = (const u_char *) imgp->image_header; + struct imgact_gzip igz; + struct inflate infl; + struct vmspace *vmspace; + + /* If these four are not OK, it isn't a gzip file */ + if (p[0] != 0x1f) + return -1; /* 0 Simply magic */ + if (p[1] != 0x8b) + return -1; /* 1 Simply magic */ + if (p[2] != 0x08) + return -1; /* 2 Compression method */ + if (p[9] != 0x03) + return -1; /* 9 OS compressed on */ + + /* + * If this one contains anything but a comment or a filename marker, + * we don't want to chew on it + */ + if (p[3] & ~(0x18)) + return ENOEXEC; /* 3 Flags */ + + /* 
These are of no use to us */ + /* 4-7 Timestamp */ + /* 8 Extra flags */ + + bzero(&igz, sizeof igz); + bzero(&infl, sizeof infl); + infl.gz_private = (void *) &igz; + infl.gz_input = NextByte; + infl.gz_output = Flush; + + igz.ip = imgp; + igz.idx = 10; + + if (p[3] & 0x08) { /* skip a filename */ + while (p[igz.idx++]) + if (igz.idx >= PAGE_SIZE) + return ENOEXEC; + } + if (p[3] & 0x10) { /* skip a comment */ + while (p[igz.idx++]) + if (igz.idx >= PAGE_SIZE) + return ENOEXEC; + } + igz.len = imgp->attr->va_size; + + error = inflate(&infl); + + if ( !error ) { + vmspace = imgp->proc->p_vmspace; + error = vm_map_protect(&vmspace->vm_map, + (vm_offset_t) vmspace->vm_taddr, + (vm_offset_t) (vmspace->vm_taddr + + (vmspace->vm_tsize << PAGE_SHIFT)) , + VM_PROT_READ|VM_PROT_EXECUTE,0); + } + + if (igz.inbuf) { + error2 = + vm_map_remove(kernel_map, (vm_offset_t) igz.inbuf, + (vm_offset_t) igz.inbuf + PAGE_SIZE); + } + if (igz.error || error || error2) { + printf("Output=%lu ", igz.output); + printf("Inflate_error=%d igz.error=%d error2=%d where=%d\n", + error, igz.error, error2, igz.where); + } + if (igz.error) + return igz.error; + if (error) + return ENOEXEC; + if (error2) + return error2; + return 0; +} + +static int +do_aout_hdr(struct imgact_gzip * gz) +{ + int error; + struct vmspace *vmspace = gz->ip->proc->p_vmspace; + vm_offset_t vmaddr; + + /* + * Set file/virtual offset based on a.out variant. 
We do two cases: + * host byte order and network byte order (for NetBSD compatibility) + */ + switch ((int) (gz->a_out.a_magic & 0xffff)) { + case ZMAGIC: + gz->virtual_offset = 0; + if (gz->a_out.a_text) { + gz->file_offset = PAGE_SIZE; + } else { + /* Bill's "screwball mode" */ + gz->file_offset = 0; + } + break; + case QMAGIC: + gz->virtual_offset = PAGE_SIZE; + gz->file_offset = 0; + break; + default: + /* NetBSD compatibility */ + switch ((int) (ntohl(gz->a_out.a_magic) & 0xffff)) { + case ZMAGIC: + case QMAGIC: + gz->virtual_offset = PAGE_SIZE; + gz->file_offset = 0; + break; + default: + gz->where = __LINE__; + return (-1); + } + } + + gz->bss_size = roundup(gz->a_out.a_bss, PAGE_SIZE); + + /* + * Check various fields in header for validity/bounds. + */ + if ( /* entry point must lay with text region */ + gz->a_out.a_entry < gz->virtual_offset || + gz->a_out.a_entry >= gz->virtual_offset + gz->a_out.a_text || + + /* text and data size must each be page rounded */ + gz->a_out.a_text & PAGE_MASK || gz->a_out.a_data & PAGE_MASK) { + gz->where = __LINE__; + return (-1); + } + /* + * text/data/bss must not exceed limits + */ + if ( /* text can't exceed maximum text size */ + gz->a_out.a_text > MAXTSIZ || + + /* data + bss can't exceed maximum data size */ + gz->a_out.a_data + gz->bss_size > MAXDSIZ || + + /* data + bss can't exceed rlimit */ + gz->a_out.a_data + gz->bss_size > + gz->ip->proc->p_rlimit[RLIMIT_DATA].rlim_cur) { + gz->where = __LINE__; + return (ENOMEM); + } + /* Find out how far we should go */ + gz->file_end = gz->file_offset + gz->a_out.a_text + gz->a_out.a_data; + + /* copy in arguments and/or environment from old process */ + error = exec_extract_strings(gz->ip); + if (error) { + gz->where = __LINE__; + return (error); + } + /* + * Destroy old process VM and create a new one (with a new stack) + */ + exec_new_vmspace(gz->ip); + + vmaddr = gz->virtual_offset; + + error = vm_mmap(&vmspace->vm_map, + &vmaddr, + gz->a_out.a_text + gz->a_out.a_data, 
+ VM_PROT_ALL, VM_PROT_ALL, MAP_ANON | MAP_FIXED, + 0, + 0); + + if (error) { + gz->where = __LINE__; + return (error); + } + + if (gz->bss_size != 0) { + /* + * Allocate demand-zeroed area for uninitialized data. + * "bss" = 'block started by symbol' - named after the + * IBM 7090 instruction of the same name. + */ + vmaddr = gz->virtual_offset + gz->a_out.a_text + + gz->a_out.a_data; + error = vm_map_find(&vmspace->vm_map, + NULL, + 0, + &vmaddr, + gz->bss_size, + FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) { + gz->where = __LINE__; + return (error); + } + } + /* Fill in process VM information */ + vmspace->vm_tsize = gz->a_out.a_text >> PAGE_SHIFT; + vmspace->vm_dsize = (gz->a_out.a_data + gz->bss_size) >> PAGE_SHIFT; + vmspace->vm_taddr = (caddr_t) gz->virtual_offset; + vmspace->vm_daddr = (caddr_t) gz->virtual_offset + gz->a_out.a_text; + + /* Fill in image_params */ + gz->ip->interpreted = 0; + gz->ip->entry_addr = gz->a_out.a_entry; + + gz->ip->proc->p_sysent = &aout_sysvec; + + return 0; +} + +static int +NextByte(void *vp) +{ + int error; + struct imgact_gzip *igz = (struct imgact_gzip *) vp; + + if (igz->idx >= igz->len) { + igz->where = __LINE__; + return GZ_EOF; + } + if (igz->inbuf && igz->idx < (igz->offset + PAGE_SIZE)) { + return igz->inbuf[(igz->idx++) - igz->offset]; + } + if (igz->inbuf) { + error = vm_map_remove(kernel_map, (vm_offset_t) igz->inbuf, + (vm_offset_t) igz->inbuf + PAGE_SIZE); + if (error) { + igz->where = __LINE__; + igz->error = error; + return GZ_EOF; + } + } + igz->offset = igz->idx & ~PAGE_MASK; + + error = vm_mmap(kernel_map, /* map */ + (vm_offset_t *) & igz->inbuf, /* address */ + PAGE_SIZE, /* size */ + VM_PROT_READ, /* protection */ + VM_PROT_READ, /* max protection */ + 0, /* flags */ + (caddr_t) igz->ip->vp, /* vnode */ + igz->offset); /* offset */ + if (error) { + igz->where = __LINE__; + igz->error = error; + return GZ_EOF; + } + return igz->inbuf[(igz->idx++) - igz->offset]; +} + +static int +Flush(void *vp, 
u_char * ptr, u_long siz) +{ + struct imgact_gzip *gz = (struct imgact_gzip *) vp; + u_char *p = ptr, *q; + int i; + + /* First, find a a.out-header */ + if (gz->output < sizeof gz->a_out) { + q = (u_char *) & gz->a_out; + i = min(siz, sizeof gz->a_out - gz->output); + bcopy(p, q + gz->output, i); + gz->output += i; + p += i; + siz -= i; + if (gz->output == sizeof gz->a_out) { + i = do_aout_hdr(gz); + if (i == -1) { + if (!gz->where) + gz->where = __LINE__; + gz->error = ENOEXEC; + return ENOEXEC; + } else if (i) { + gz->where = __LINE__; + gz->error = i; + return ENOEXEC; + } + if (gz->file_offset < sizeof gz->a_out) { + q = (u_char *) gz->virtual_offset + gz->output - gz->file_offset; + bcopy(&gz->a_out, q, sizeof gz->a_out - gz->file_offset); + } + } + } + /* Skip over zero-padded first PAGE if needed */ + if (gz->output < gz->file_offset && (gz->output + siz) > gz->file_offset) { + i = min(siz, gz->file_offset - gz->output); + gz->output += i; + p += i; + siz -= i; + } + if (gz->output >= gz->file_offset && gz->output < gz->file_end) { + i = min(siz, gz->file_end - gz->output); + q = (u_char *) gz->virtual_offset + gz->output - gz->file_offset; + bcopy(p, q, i); + gz->output += i; + p += i; + siz -= i; + } + gz->output += siz; + return 0; +} + + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ + +static const struct execsw gzip_execsw = {exec_gzip_imgact, "gzip"}; +TEXT_SET(execsw_set, gzip_execsw); diff --git a/sys/kern/imgact_shell.c b/sys/kern/imgact_shell.c new file mode 100644 index 0000000..fb03011 --- /dev/null +++ b/sys/kern/imgact_shell.c @@ -0,0 +1,141 @@ +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/resourcevar.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/kernel.h> +#include <machine/endian.h> + +#if BYTE_ORDER == LITTLE_ENDIAN +#define SHELLMAGIC 0x2123 /* #! */ +#else +#define SHELLMAGIC 0x2321 +#endif + +#define MAXSHELLCMDLEN 64 + +static int exec_shell_imgact __P((struct image_params *imgp)); + +/* + * Shell interpreter image activator. A interpreter name beginning + * at imgp->stringbase is the minimal successful exit requirement. + */ +static int +exec_shell_imgact(imgp) + struct image_params *imgp; +{ + const char *image_header = imgp->image_header; + const char *ihp, *line_endp; + char *interp; + + /* a shell script? 
*/ + if (((const short *) image_header)[0] != SHELLMAGIC) + return(-1); + + /* + * Don't allow a shell script to be the shell for a shell + * script. :-) + */ + if (imgp->interpreted) + return(ENOEXEC); + + imgp->interpreted = 1; + + /* + * Copy shell name and arguments from image_header into string + * buffer. + */ + + /* + * Find end of line; return if the line > MAXSHELLCMDLEN long. + */ + for (ihp = &image_header[2]; *ihp != '\n'; ++ihp) { + if (ihp >= &image_header[MAXSHELLCMDLEN]) + return(ENOEXEC); + } + line_endp = ihp; + + /* reset for another pass */ + ihp = &image_header[2]; + + /* Skip over leading spaces - until the interpreter name */ + while ((*ihp == ' ') || (*ihp == '\t')) ihp++; + + /* copy the interpreter name */ + interp = imgp->interpreter_name; + while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) + *interp++ = *ihp++; + *interp = '\0'; + + /* Disallow a null interpreter filename */ + if (*imgp->interpreter_name == '\0') + return(ENOEXEC); + + /* reset for another pass */ + ihp = &image_header[2]; + + /* copy the interpreter name and arguments */ + while (ihp < line_endp) { + /* Skip over leading spaces */ + while ((*ihp == ' ') || (*ihp == '\t')) ihp++; + + if (ihp < line_endp) { + /* + * Copy to end of token. No need to watch stringspace + * because this is at the front of the string buffer + * and the maximum shell command length is tiny. + */ + while ((ihp < line_endp) && (*ihp != ' ') && (*ihp != '\t')) { + *imgp->stringp++ = *ihp++; + imgp->stringspace--; + } + + *imgp->stringp++ = 0; + imgp->stringspace--; + + imgp->argc++; + } + } + + /* set argv[0] to point to original file name */ + suword(imgp->uap->argv, (int)imgp->uap->fname); + + return(0); +} + +/* + * Tell kern_execve.c about it, with a little help from the linker. + * Since `const' objects end up in the text segment, TEXT_SET is the + * correct directive to use. + */ +static const struct execsw shell_execsw = { exec_shell_imgact, "#!" 
}; +TEXT_SET(execsw_set, shell_execsw); diff --git a/sys/kern/inflate.c b/sys/kern/inflate.c new file mode 100644 index 0000000..2024bc1 --- /dev/null +++ b/sys/kern/inflate.c @@ -0,0 +1,1072 @@ +/* + * Most parts of this file are not covered by: + * ---------------------------------------------------------------------------- + * "THE BEER-WARE LICENSE" (Revision 42): + * <phk@login.dknet.dk> wrote this file. As long as you retain this notice you + * can do whatever you want with this stuff. If we meet some day, and you think + * this stuff is worth it, you can buy me a beer in return. Poul-Henning Kamp + * ---------------------------------------------------------------------------- + * + * $Id$ + * + * + */ + +#include <sys/param.h> +#include <sys/inflate.h> +#ifdef KERNEL +#include <sys/systm.h> +#endif +#include <sys/mman.h> +#include <sys/malloc.h> + +/* needed to make inflate() work */ +#define uch u_char +#define ush u_short +#define ulg u_long + +/* Stuff to make inflate() work */ +#ifdef KERNEL +#define memzero(dest,len) bzero(dest,len) +#endif +#define NOMEMCPY +#ifdef KERNEL +#define FPRINTF printf +#else +extern void putstr (char *); +#define FPRINTF putstr +#endif + +#define FLUSH(x,y) { \ + int foo = (*x->gz_output)(x->gz_private,x->gz_slide,y); \ + if (foo) \ + return foo; \ + } + +static const int qflag = 0; + +#ifndef KERNEL /* want to use this file in kzip also */ +extern unsigned char *malloc (int, int, int); +extern void free (void*, int); +#endif + +/* + * This came from unzip-5.12. I have changed it the flow to pass + * a structure pointer around, thus hopefully making it re-entrant. + * Poul-Henning + */ + +/* inflate.c -- put in the public domain by Mark Adler + version c14o, 23 August 1994 */ + +/* You can do whatever you like with this source file, though I would + prefer that if you modify it and redistribute it that you include + comments to that effect with your name and the date. Thank you. 
+ + History: + vers date who what + ---- --------- -------------- ------------------------------------ + a ~~ Feb 92 M. Adler used full (large, one-step) lookup table + b1 21 Mar 92 M. Adler first version with partial lookup tables + b2 21 Mar 92 M. Adler fixed bug in fixed-code blocks + b3 22 Mar 92 M. Adler sped up match copies, cleaned up some + b4 25 Mar 92 M. Adler added prototypes; removed window[] (now + is the responsibility of unzip.h--also + changed name to slide[]), so needs diffs + for unzip.c and unzip.h (this allows + compiling in the small model on MSDOS); + fixed cast of q in huft_build(); + b5 26 Mar 92 M. Adler got rid of unintended macro recursion. + b6 27 Mar 92 M. Adler got rid of nextbyte() routine. fixed + bug in inflate_fixed(). + c1 30 Mar 92 M. Adler removed lbits, dbits environment variables. + changed BMAX to 16 for explode. Removed + OUTB usage, and replaced it with flush()-- + this was a 20% speed improvement! Added + an explode.c (to replace unimplod.c) that + uses the huft routines here. Removed + register union. + c2 4 Apr 92 M. Adler fixed bug for file sizes a multiple of 32k. + c3 10 Apr 92 M. Adler reduced memory of code tables made by + huft_build significantly (factor of two to + three). + c4 15 Apr 92 M. Adler added NOMEMCPY do kill use of memcpy(). + worked around a Turbo C optimization bug. + c5 21 Apr 92 M. Adler added the GZ_WSIZE #define to allow reducing + the 32K window size for specialized + applications. + c6 31 May 92 M. Adler added some typecasts to eliminate warnings + c7 27 Jun 92 G. Roelofs added some more typecasts (444: MSC bug). + c8 5 Oct 92 J-l. Gailly added ifdef'd code to deal with PKZIP bug. + c9 9 Oct 92 M. Adler removed a memory error message (~line 416). + c10 17 Oct 92 G. Roelofs changed ULONG/UWORD/byte to ulg/ush/uch, + removed old inflate, renamed inflate_entry + to inflate, added Mark's fix to a comment. + c10.5 14 Dec 92 M. Adler fix up error messages for incomplete trees. + c11 2 Jan 93 M. 
Adler fixed bug in detection of incomplete + tables, and removed assumption that EOB is + the longest code (bad assumption). + c12 3 Jan 93 M. Adler make tables for fixed blocks only once. + c13 5 Jan 93 M. Adler allow all zero length codes (pkzip 2.04c + outputs one zero length code for an empty + distance tree). + c14 12 Mar 93 M. Adler made inflate.c standalone with the + introduction of inflate.h. + c14b 16 Jul 93 G. Roelofs added (unsigned) typecast to w at 470. + c14c 19 Jul 93 J. Bush changed v[N_MAX], l[288], ll[28x+3x] arrays + to static for Amiga. + c14d 13 Aug 93 J-l. Gailly de-complicatified Mark's c[*p++]++ thing. + c14e 8 Oct 93 G. Roelofs changed memset() to memzero(). + c14f 22 Oct 93 G. Roelofs renamed quietflg to qflag; made Trace() + conditional; added inflate_free(). + c14g 28 Oct 93 G. Roelofs changed l/(lx+1) macro to pointer (Cray bug) + c14h 7 Dec 93 C. Ghisler huft_build() optimizations. + c14i 9 Jan 94 A. Verheijen set fixed_t{d,l} to NULL after freeing; + G. Roelofs check NEXTBYTE macro for GZ_EOF. + c14j 23 Jan 94 G. Roelofs removed Ghisler "optimizations"; ifdef'd + GZ_EOF check. + c14k 27 Feb 94 G. Roelofs added some typecasts to avoid warnings. + c14l 9 Apr 94 G. Roelofs fixed split comments on preprocessor lines + to avoid bug in Encore compiler. + c14m 7 Jul 94 P. Kienitz modified to allow assembler version of + inflate_codes() (define ASM_INFLATECODES) + c14n 22 Jul 94 G. Roelofs changed fprintf to FPRINTF for DLL versions + c14o 23 Aug 94 C. Spieler added a newline to a debug statement; + G. Roelofs added another typecast to avoid MSC warning + */ + + +/* + Inflate deflated (PKZIP's method 8 compressed) data. The compression + method searches for as much of the current string of bytes (up to a + length of 258) in the previous 32K bytes. If it doesn't find any + matches (of at least length 3), it codes the next byte. Otherwise, it + codes the length of the matched string and its distance backwards from + the current position. 
There is a single Huffman code that codes both + single bytes (called "literals") and match lengths. A second Huffman + code codes the distance information, which follows a length code. Each + length or distance code actually represents a base value and a number + of "extra" (sometimes zero) bits to get to add to the base value. At + the end of each deflated block is a special end-of-block (EOB) literal/ + length code. The decoding process is basically: get a literal/length + code; if EOB then done; if a literal, emit the decoded byte; if a + length then get the distance and emit the referred-to bytes from the + sliding window of previously emitted data. + + There are (currently) three kinds of inflate blocks: stored, fixed, and + dynamic. The compressor outputs a chunk of data at a time and decides + which method to use on a chunk-by-chunk basis. A chunk might typically + be 32K to 64K, uncompressed. If the chunk is uncompressible, then the + "stored" method is used. In this case, the bytes are simply stored as + is, eight bits per byte, with none of the above coding. The bytes are + preceded by a count, since there is no longer an EOB code. + + If the data is compressible, then either the fixed or dynamic methods + are used. In the dynamic method, the compressed data is preceded by + an encoding of the literal/length and distance Huffman codes that are + to be used to decode this block. The representation is itself Huffman + coded, and so is preceded by a description of that code. These code + descriptions take up a little space, and so for small blocks, there is + a predefined set of codes, called the fixed codes. The fixed method is + used if the block ends up smaller that way (usually for quite small + chunks); otherwise the dynamic method is used. In the latter case, the + codes are customized to the probabilities in the current block and so + can code it much better than the pre-determined fixed codes can. 
+ + The Huffman codes themselves are decoded using a mutli-level table + lookup, in order to maximize the speed of decoding plus the speed of + building the decoding tables. See the comments below that precede the + lbits and dbits tuning parameters. + */ + + +/* + Notes beyond the 1.93a appnote.txt: + + 1. Distance pointers never point before the beginning of the output + stream. + 2. Distance pointers can point back across blocks, up to 32k away. + 3. There is an implied maximum of 7 bits for the bit length table and + 15 bits for the actual data. + 4. If only one code exists, then it is encoded using one bit. (Zero + would be more efficient, but perhaps a little confusing.) If two + codes exist, they are coded using one bit each (0 and 1). + 5. There is no way of sending zero distance codes--a dummy must be + sent if there are none. (History: a pre 2.0 version of PKZIP would + store blocks with no distance codes, but this was discovered to be + too harsh a criterion.) Valid only for 1.93a. 2.04c does allow + zero distance codes, which is sent as one code of zero bits in + length. + 6. There are up to 286 literal/length codes. Code 256 represents the + end-of-block. Note however that the static length tree defines + 288 codes just to fill out the Huffman codes. Codes 286 and 287 + cannot be used though, since there is no length base or extra bits + defined for them. Similarily, there are up to 30 distance codes. + However, static trees define 32 codes (all 5 bits) to fill out the + Huffman codes, but the last two had better not show up in the data. + 7. Unzip can check dynamic Huffman blocks for complete code sets. + The exception is that a single code would not be complete (see #4). + 8. The five bits following the block type is really the number of + literal codes sent minus 257. + 9. Length codes 8,16,16 are interpreted as 13 length codes of 8 bits + (1+6+6). 
Therefore, to output three times the length, you output + three codes (1+1+1), whereas to output four times the same length, + you only need two codes (1+3). Hmm. + 10. In the tree reconstruction algorithm, Code = Code + Increment + only if BitLength(i) is not zero. (Pretty obvious.) + 11. Correction: 4 Bits: # of Bit Length codes - 4 (4 - 19) + 12. Note: length code 284 can represent 227-258, but length code 285 + really is 258. The last length deserves its own, short code + since it gets used a lot in very redundant files. The length + 258 is special since 258 - 3 (the min match length) is 255. + 13. The literal/length and distance code bit lengths are read as a + single stream of lengths. It is possible (and advantageous) for + a repeat code (16, 17, or 18) to go across the boundary between + the two sets of lengths. + */ + + +#define PKZIP_BUG_WORKAROUND /* PKZIP 1.93a problem--live with it */ + +/* + inflate.h must supply the uch slide[GZ_WSIZE] array and the NEXTBYTE, + FLUSH() and memzero macros. If the window size is not 32K, it + should also define GZ_WSIZE. If INFMOD is defined, it can include + compiled functions to support the NEXTBYTE and/or FLUSH() macros. + There are defaults for NEXTBYTE and FLUSH() below for use as + examples of what those functions need to do. Normally, you would + also want FLUSH() to compute a crc on the data. inflate.h also + needs to provide these typedefs: + + typedef unsigned char uch; + typedef unsigned short ush; + typedef unsigned long ulg; + + This module uses the external functions malloc() and free() (and + probably memset() or bzero() in the memzero() macro). Their + prototypes are normally found in <string.h> and <stdlib.h>. + */ +#define INFMOD /* tell inflate.h to include code to be + * compiled */ + +/* Huffman code lookup table entry--this entry is four bytes for machines + that have 16-bit pointers (e.g. PC's in the small or medium model). + Valid extra bits are 0..13. 
e == 15 is EOB (end of block), e == 16 + means that v is a literal, 16 < e < 32 means that v is a pointer to + the next table, which codes e - 16 bits, and lastly e == 99 indicates + an unused code. If a code with e == 99 is looked up, this implies an + error in the data. */ +struct huft { + uch e; /* number of extra bits or operation */ + uch b; /* number of bits in this code or subcode */ + union { + ush n; /* literal, length base, or distance + * base */ + struct huft *t; /* pointer to next level of table */ + } v; +}; + + +/* Function prototypes */ +static int huft_build __P((struct inflate *, unsigned *, unsigned, unsigned, const ush *, const ush *, struct huft **, int *)); +static int huft_free __P((struct inflate *, struct huft *)); +static int inflate_codes __P((struct inflate *, struct huft *, struct huft *, int, int)); +static int inflate_stored __P((struct inflate *)); +static int xinflate __P((struct inflate *)); +static int inflate_fixed __P((struct inflate *)); +static int inflate_dynamic __P((struct inflate *)); +static int inflate_block __P((struct inflate *, int *)); + +/* The inflate algorithm uses a sliding 32K byte window on the uncompressed + stream to find repeated byte strings. This is implemented here as a + circular buffer. The index is updated simply by incrementing and then + and'ing with 0x7fff (32K-1). */ +/* It is left to other modules to supply the 32K area. It is assumed + to be usable as if it were declared "uch slide[32768];" or as just + "uch *slide;" and then malloc'ed in the latter case. The definition + must be in unzip.h, included above. */ + + +/* Tables for deflate from PKZIP's appnote.txt. 
*/ + +/* Order of the bit length code lengths */ +static const unsigned border[] = { + 16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}; + +static const ush cplens[] = { /* Copy lengths for literal codes 257..285 */ + 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 15, 17, 19, 23, 27, 31, + 35, 43, 51, 59, 67, 83, 99, 115, 131, 163, 195, 227, 258, 0, 0}; + /* note: see note #13 above about the 258 in this list. */ + +static const ush cplext[] = { /* Extra bits for literal codes 257..285 */ + 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, + 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 0, 99, 99}; /* 99==invalid */ + +static const ush cpdist[] = { /* Copy offsets for distance codes 0..29 */ + 1, 2, 3, 4, 5, 7, 9, 13, 17, 25, 33, 49, 65, 97, 129, 193, + 257, 385, 513, 769, 1025, 1537, 2049, 3073, 4097, 6145, + 8193, 12289, 16385, 24577}; + +static const ush cpdext[] = { /* Extra bits for distance codes */ + 0, 0, 0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, + 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, + 12, 12, 13, 13}; + +/* And'ing with mask[n] masks the lower n bits */ +static const ush mask[] = { + 0x0000, + 0x0001, 0x0003, 0x0007, 0x000f, 0x001f, 0x003f, 0x007f, 0x00ff, + 0x01ff, 0x03ff, 0x07ff, 0x0fff, 0x1fff, 0x3fff, 0x7fff, 0xffff +}; + + +/* Macros for inflate() bit peeking and grabbing. + The usage is: + + NEEDBITS(glbl,j) + x = b & mask[j]; + DUMPBITS(j) + + where NEEDBITS makes sure that b has at least j bits in it, and + DUMPBITS removes the bits from b. The macros use the variable k + for the number of bits in b. Normally, b and k are register + variables for speed, and are initialized at the begining of a + routine that uses these macros from a global bit buffer and count. + + In order to not ask for more bits than there are in the compressed + stream, the Huffman tables are constructed to only ask for just + enough bits to make up the end-of-block code (value 256). Then no + bytes need to be "returned" to the buffer at the end of the last + block. 
See the huft_build() routine. + */ + +/* + * The following 2 were global variables. + * They are now fields of the inflate structure. + */ + +#define NEEDBITS(glbl,n) { \ + while(k<(n)) { \ + int c=(*glbl->gz_input)(glbl->gz_private); \ + if(c==GZ_EOF) \ + return 1; \ + b|=((ulg)c)<<k; \ + k+=8; \ + } \ + } + +#define DUMPBITS(n) {b>>=(n);k-=(n);} + +/* + Huffman code decoding is performed using a multi-level table lookup. + The fastest way to decode is to simply build a lookup table whose + size is determined by the longest code. However, the time it takes + to build this table can also be a factor if the data being decoded + is not very long. The most common codes are necessarily the + shortest codes, so those codes dominate the decoding time, and hence + the speed. The idea is you can have a shorter table that decodes the + shorter, more probable codes, and then point to subsidiary tables for + the longer codes. The time it costs to decode the longer codes is + then traded against the time it takes to make longer tables. + + This results of this trade are in the variables lbits and dbits + below. lbits is the number of bits the first level table for literal/ + length codes can decode in one step, and dbits is the same thing for + the distance codes. Subsequent tables are also less than or equal to + those sizes. These values may be adjusted either when all of the + codes are shorter than that, in which case the longest code length in + bits is used, or when the shortest code is *longer* than the requested + table size, in which case the length of the shortest code in bits is + used. + + There are two different values for the two tables, since they code a + different number of possibilities each. The literal/length table + codes 286 possible values, or in a flat code, a little over eight + bits. The distance table codes 30 possible values, or a little less + than five bits, flat. 
The optimum values for speed end up being + about one bit more than those, so lbits is 8+1 and dbits is 5+1. + The optimum values may differ though from machine to machine, and + possibly even between compilers. Your mileage may vary. + */ + +static const int lbits = 9; /* bits in base literal/length lookup table */ +static const int dbits = 6; /* bits in base distance lookup table */ + + +/* If BMAX needs to be larger than 16, then h and x[] should be ulg. */ +#define BMAX 16 /* maximum bit length of any code (16 for + * explode) */ +#define N_MAX 288 /* maximum number of codes in any set */ + +/* Given a list of code lengths and a maximum table size, make a set of + tables to decode that set of codes. Return zero on success, one if + the given code set is incomplete (the tables are still built in this + case), two if the input is invalid (all zero length codes or an + oversubscribed set of lengths), and three if not enough memory. + The code with value 256 is special, and the tables are constructed + so that no bits beyond that code are fetched when that code is + decoded. 
*/ +static int +huft_build(glbl, b, n, s, d, e, t, m) + struct inflate *glbl; + unsigned *b; /* code lengths in bits (all assumed <= BMAX) */ + unsigned n; /* number of codes (assumed <= N_MAX) */ + unsigned s; /* number of simple-valued codes (0..s-1) */ + const ush *d; /* list of base values for non-simple codes */ + const ush *e; /* list of extra bits for non-simple codes */ + struct huft **t; /* result: starting table */ + int *m; /* maximum lookup bits, returns actual */ +{ + unsigned a; /* counter for codes of length k */ + unsigned c[BMAX + 1]; /* bit length count table */ + unsigned el; /* length of EOB code (value 256) */ + unsigned f; /* i repeats in table every f entries */ + int g; /* maximum code length */ + int h; /* table level */ + register unsigned i; /* counter, current code */ + register unsigned j; /* counter */ + register int k; /* number of bits in current code */ + int lx[BMAX + 1]; /* memory for l[-1..BMAX-1] */ + int *l = lx + 1; /* stack of bits per table */ + register unsigned *p; /* pointer into c[], b[], or v[] */ + register struct huft *q;/* points to current table */ + struct huft r; /* table entry for structure assignment */ + struct huft *u[BMAX];/* table stack */ + unsigned v[N_MAX]; /* values in order of bit length */ + register int w; /* bits before this table == (l * h) */ + unsigned x[BMAX + 1]; /* bit offsets, then code stack */ + unsigned *xp; /* pointer into x */ + int y; /* number of dummy codes added */ + unsigned z; /* number of entries in current table */ + + /* Generate counts for each bit length */ + el = n > 256 ? 
b[256] : BMAX; /* set length of EOB code, if any */ +#ifdef KERNEL + memzero((char *) c, sizeof(c)); +#else + for (i = 0; i < BMAX+1; i++) + c [i] = 0; +#endif + p = b; + i = n; + do { + c[*p]++; + p++; /* assume all entries <= BMAX */ + } while (--i); + if (c[0] == n) { /* null input--all zero length codes */ + *t = (struct huft *) NULL; + *m = 0; + return 0; + } + /* Find minimum and maximum length, bound *m by those */ + for (j = 1; j <= BMAX; j++) + if (c[j]) + break; + k = j; /* minimum code length */ + if ((unsigned) *m < j) + *m = j; + for (i = BMAX; i; i--) + if (c[i]) + break; + g = i; /* maximum code length */ + if ((unsigned) *m > i) + *m = i; + + /* Adjust last length count to fill out codes, if needed */ + for (y = 1 << j; j < i; j++, y <<= 1) + if ((y -= c[j]) < 0) + return 2; /* bad input: more codes than bits */ + if ((y -= c[i]) < 0) + return 2; + c[i] += y; + + /* Generate starting offsets into the value table for each length */ + x[1] = j = 0; + p = c + 1; + xp = x + 2; + while (--i) { /* note that i == g from above */ + *xp++ = (j += *p++); + } + + /* Make a table of values in order of bit lengths */ + p = b; + i = 0; + do { + if ((j = *p++) != 0) + v[x[j]++] = i; + } while (++i < n); + + /* Generate the Huffman codes and for each, make the table entries */ + x[0] = i = 0; /* first Huffman code is zero */ + p = v; /* grab values in bit order */ + h = -1; /* no tables yet--level -1 */ + w = l[-1] = 0; /* no bits decoded yet */ + u[0] = (struct huft *) NULL; /* just to keep compilers happy */ + q = (struct huft *) NULL; /* ditto */ + z = 0; /* ditto */ + + /* go through the bit lengths (k already is bits in shortest code) */ + for (; k <= g; k++) { + a = c[k]; + while (a--) { + /* + * here i is the Huffman code of length k bits for + * value *p + */ + /* make tables up to required level */ + while (k > w + l[h]) { + w += l[h++]; /* add bits already decoded */ + + /* + * compute minimum size table less than or + * equal to *m bits + */ + z = (z = g 
- w) > (unsigned) *m ? *m : z; /* upper limit */ + if ((f = 1 << (j = k - w)) > a + 1) { /* try a k-w bit table *//* t + * oo few codes for k-w + * bit table */ + f -= a + 1; /* deduct codes from + * patterns left */ + xp = c + k; + while (++j < z) { /* try smaller tables up + * to z bits */ + if ((f <<= 1) <= *++xp) + break; /* enough codes to use + * up j bits */ + f -= *xp; /* else deduct codes + * from patterns */ + } + } + if ((unsigned) w + j > el && (unsigned) w < el) + j = el - w; /* make EOB code end at + * table */ + z = 1 << j; /* table entries for j-bit + * table */ + l[h] = j; /* set table size in stack */ + + /* allocate and link in new table */ + if ((q = (struct huft *) malloc((z + 1) * sizeof(struct huft), M_GZIP, M_WAITOK)) == + (struct huft *) NULL) { + if (h) + huft_free(glbl, u[0]); + return 3; /* not enough memory */ + } + glbl->gz_hufts += z + 1; /* track memory usage */ + *t = q + 1; /* link to list for + * huft_free() */ + *(t = &(q->v.t)) = (struct huft *) NULL; + u[h] = ++q; /* table starts after link */ + + /* connect to last table, if there is one */ + if (h) { + x[h] = i; /* save pattern for + * backing up */ + r.b = (uch) l[h - 1]; /* bits to dump before + * this table */ + r.e = (uch) (16 + j); /* bits in this table */ + r.v.t = q; /* pointer to this table */ + j = (i & ((1 << w) - 1)) >> (w - l[h - 1]); + u[h - 1][j] = r; /* connect to last table */ + } + } + + /* set up table entry in r */ + r.b = (uch) (k - w); + if (p >= v + n) + r.e = 99; /* out of values--invalid + * code */ + else if (*p < s) { + r.e = (uch) (*p < 256 ? 
16 : 15); /* 256 is end-of-block + * code */ + r.v.n = *p++; /* simple code is just the + * value */ + } else { + r.e = (uch) e[*p - s]; /* non-simple--look up + * in lists */ + r.v.n = d[*p++ - s]; + } + + /* fill code-like entries with r */ + f = 1 << (k - w); + for (j = i >> w; j < z; j += f) + q[j] = r; + + /* backwards increment the k-bit code i */ + for (j = 1 << (k - 1); i & j; j >>= 1) + i ^= j; + i ^= j; + + /* backup over finished tables */ + while ((i & ((1 << w) - 1)) != x[h]) + w -= l[--h]; /* don't need to update q */ + } + } + + /* return actual size of base table */ + *m = l[0]; + + /* Return true (1) if we were given an incomplete table */ + return y != 0 && g != 1; +} + +static int +huft_free(glbl, t) + struct inflate *glbl; + struct huft *t; /* table to free */ +/* Free the malloc'ed tables built by huft_build(), which makes a linked + list of the tables it made, with the links in a dummy first entry of + each table. */ +{ + register struct huft *p, *q; + + /* Go through linked list, freeing from the malloced (t[-1]) address. */ + p = t; + while (p != (struct huft *) NULL) { + q = (--p)->v.t; + free(p, M_GZIP); + p = q; + } + return 0; +} + +/* inflate (decompress) the codes in a deflated (compressed) block. + Return an error code or zero if it all goes ok. 
*/ +static int +inflate_codes(glbl, tl, td, bl, bd) + struct inflate *glbl; + struct huft *tl, *td;/* literal/length and distance decoder tables */ + int bl, bd; /* number of bits decoded by tl[] and td[] */ +{ + register unsigned e; /* table entry flag/number of extra bits */ + unsigned n, d; /* length and index for copy */ + unsigned w; /* current window position */ + struct huft *t; /* pointer to table entry */ + unsigned ml, md; /* masks for bl and bd bits */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local copies of globals */ + b = glbl->gz_bb; /* initialize bit buffer */ + k = glbl->gz_bk; + w = glbl->gz_wp; /* initialize window position */ + + /* inflate the coded data */ + ml = mask[bl]; /* precompute masks for speed */ + md = mask[bd]; + while (1) { /* do until end of block */ + NEEDBITS(glbl, (unsigned) bl) + if ((e = (t = tl + ((unsigned) b & ml))->e) > 16) + do { + if (e == 99) + return 1; + DUMPBITS(t->b) + e -= 16; + NEEDBITS(glbl, e) + } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16); + DUMPBITS(t->b) + if (e == 16) { /* then it's a literal */ + glbl->gz_slide[w++] = (uch) t->v.n; + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + } else { /* it's an EOB or a length */ + /* exit if end of block */ + if (e == 15) + break; + + /* get length of block to copy */ + NEEDBITS(glbl, e) + n = t->v.n + ((unsigned) b & mask[e]); + DUMPBITS(e); + + /* decode distance of block to copy */ + NEEDBITS(glbl, (unsigned) bd) + if ((e = (t = td + ((unsigned) b & md))->e) > 16) + do { + if (e == 99) + return 1; + DUMPBITS(t->b) + e -= 16; + NEEDBITS(glbl, e) + } while ((e = (t = t->v.t + ((unsigned) b & mask[e]))->e) > 16); + DUMPBITS(t->b) + NEEDBITS(glbl, e) + d = w - t->v.n - ((unsigned) b & mask[e]); + DUMPBITS(e) + /* do the copy */ + do { + n -= (e = (e = GZ_WSIZE - ((d &= GZ_WSIZE - 1) > w ? d : w)) > n ? 
n : e); +#ifndef NOMEMCPY + if (w - d >= e) { /* (this test assumes + * unsigned comparison) */ + memcpy(glbl->gz_slide + w, glbl->gz_slide + d, e); + w += e; + d += e; + } else /* do it slow to avoid memcpy() + * overlap */ +#endif /* !NOMEMCPY */ + do { + glbl->gz_slide[w++] = glbl->gz_slide[d++]; + } while (--e); + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + } while (n); + } + } + + /* restore the globals from the locals */ + glbl->gz_wp = w; /* restore global window pointer */ + glbl->gz_bb = b; /* restore global bit buffer */ + glbl->gz_bk = k; + + /* done */ + return 0; +} + +/* "decompress" an inflated type 0 (stored) block. */ +static int +inflate_stored(glbl) + struct inflate *glbl; +{ + unsigned n; /* number of bytes in block */ + unsigned w; /* current window position */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local copies of globals */ + b = glbl->gz_bb; /* initialize bit buffer */ + k = glbl->gz_bk; + w = glbl->gz_wp; /* initialize window position */ + + /* go to byte boundary */ + n = k & 7; + DUMPBITS(n); + + /* get the length and its complement */ + NEEDBITS(glbl, 16) + n = ((unsigned) b & 0xffff); + DUMPBITS(16) + NEEDBITS(glbl, 16) + if (n != (unsigned) ((~b) & 0xffff)) + return 1; /* error in compressed data */ + DUMPBITS(16) + /* read and output the compressed data */ + while (n--) { + NEEDBITS(glbl, 8) + glbl->gz_slide[w++] = (uch) b; + if (w == GZ_WSIZE) { + FLUSH(glbl, w); + w = 0; + } + DUMPBITS(8) + } + + /* restore the globals from the locals */ + glbl->gz_wp = w; /* restore global window pointer */ + glbl->gz_bb = b; /* restore global bit buffer */ + glbl->gz_bk = k; + return 0; +} + +/* decompress an inflated type 1 (fixed Huffman codes) block. We should + either replace this with a custom decoder, or at least precompute the + Huffman tables. 
*/ +static int +inflate_fixed(glbl) + struct inflate *glbl; +{ + /* if first time, set up tables for fixed blocks */ + if (glbl->gz_fixed_tl == (struct huft *) NULL) { + int i; /* temporary variable */ + static unsigned l[288]; /* length list for huft_build */ + + /* literal table */ + for (i = 0; i < 144; i++) + l[i] = 8; + for (; i < 256; i++) + l[i] = 9; + for (; i < 280; i++) + l[i] = 7; + for (; i < 288; i++) /* make a complete, but wrong code + * set */ + l[i] = 8; + glbl->gz_fixed_bl = 7; + if ((i = huft_build(glbl, l, 288, 257, cplens, cplext, + &glbl->gz_fixed_tl, &glbl->gz_fixed_bl)) != 0) { + glbl->gz_fixed_tl = (struct huft *) NULL; + return i; + } + /* distance table */ + for (i = 0; i < 30; i++) /* make an incomplete code + * set */ + l[i] = 5; + glbl->gz_fixed_bd = 5; + if ((i = huft_build(glbl, l, 30, 0, cpdist, cpdext, + &glbl->gz_fixed_td, &glbl->gz_fixed_bd)) > 1) { + huft_free(glbl, glbl->gz_fixed_tl); + glbl->gz_fixed_tl = (struct huft *) NULL; + return i; + } + } + /* decompress until an end-of-block code */ + return inflate_codes(glbl, glbl->gz_fixed_tl, glbl->gz_fixed_td, glbl->gz_fixed_bl, glbl->gz_fixed_bd) != 0; +} + +/* decompress an inflated type 2 (dynamic Huffman codes) block. 
*/ +static int +inflate_dynamic(glbl) + struct inflate *glbl; +{ + int i; /* temporary variables */ + unsigned j; + unsigned l; /* last length */ + unsigned m; /* mask for bit lengths table */ + unsigned n; /* number of lengths to get */ + struct huft *tl; /* literal/length code table */ + struct huft *td; /* distance code table */ + int bl; /* lookup bits for tl */ + int bd; /* lookup bits for td */ + unsigned nb; /* number of bit length codes */ + unsigned nl; /* number of literal/length codes */ + unsigned nd; /* number of distance codes */ +#ifdef PKZIP_BUG_WORKAROUND + unsigned ll[288 + 32]; /* literal/length and distance code + * lengths */ +#else + unsigned ll[286 + 30]; /* literal/length and distance code + * lengths */ +#endif + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local bit buffer */ + b = glbl->gz_bb; + k = glbl->gz_bk; + + /* read in table lengths */ + NEEDBITS(glbl, 5) + nl = 257 + ((unsigned) b & 0x1f); /* number of + * literal/length codes */ + DUMPBITS(5) + NEEDBITS(glbl, 5) + nd = 1 + ((unsigned) b & 0x1f); /* number of distance codes */ + DUMPBITS(5) + NEEDBITS(glbl, 4) + nb = 4 + ((unsigned) b & 0xf); /* number of bit length codes */ + DUMPBITS(4) +#ifdef PKZIP_BUG_WORKAROUND + if (nl > 288 || nd > 32) +#else + if (nl > 286 || nd > 30) +#endif + return 1; /* bad lengths */ + /* read in bit-length-code lengths */ + for (j = 0; j < nb; j++) { + NEEDBITS(glbl, 3) + ll[border[j]] = (unsigned) b & 7; + DUMPBITS(3) + } + for (; j < 19; j++) + ll[border[j]] = 0; + + /* build decoding table for trees--single level, 7 bit lookup */ + bl = 7; + if ((i = huft_build(glbl, ll, 19, 19, NULL, NULL, &tl, &bl)) != 0) { + if (i == 1) + huft_free(glbl, tl); + return i; /* incomplete code set */ + } + /* read in literal and distance code lengths */ + n = nl + nd; + m = mask[bl]; + i = l = 0; + while ((unsigned) i < n) { + NEEDBITS(glbl, (unsigned) bl) + j = (td = tl + ((unsigned) b & m))->b; + 
DUMPBITS(j) + j = td->v.n; + if (j < 16) /* length of code in bits (0..15) */ + ll[i++] = l = j; /* save last length in l */ + else if (j == 16) { /* repeat last length 3 to 6 times */ + NEEDBITS(glbl, 2) + j = 3 + ((unsigned) b & 3); + DUMPBITS(2) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = l; + } else if (j == 17) { /* 3 to 10 zero length codes */ + NEEDBITS(glbl, 3) + j = 3 + ((unsigned) b & 7); + DUMPBITS(3) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = 0; + l = 0; + } else { /* j == 18: 11 to 138 zero length codes */ + NEEDBITS(glbl, 7) + j = 11 + ((unsigned) b & 0x7f); + DUMPBITS(7) + if ((unsigned) i + j > n) + return 1; + while (j--) + ll[i++] = 0; + l = 0; + } + } + + /* free decoding table for trees */ + huft_free(glbl, tl); + + /* restore the global bit buffer */ + glbl->gz_bb = b; + glbl->gz_bk = k; + + /* build the decoding tables for literal/length and distance codes */ + bl = lbits; + i = huft_build(glbl, ll, nl, 257, cplens, cplext, &tl, &bl); + if (i != 0) { + if (i == 1 && !qflag) { + FPRINTF("(incomplete l-tree) "); + huft_free(glbl, tl); + } + return i; /* incomplete code set */ + } + bd = dbits; + i = huft_build(glbl, ll + nl, nd, 0, cpdist, cpdext, &td, &bd); + if (i != 0) { + if (i == 1 && !qflag) { + FPRINTF("(incomplete d-tree) "); +#ifdef PKZIP_BUG_WORKAROUND + i = 0; + } +#else + huft_free(glbl, td); + } + huft_free(glbl, tl); + return i; /* incomplete code set */ +#endif + } + /* decompress until an end-of-block code */ + if (inflate_codes(glbl, tl, td, bl, bd)) + return 1; + + /* free the decoding tables, return */ + huft_free(glbl, tl); + huft_free(glbl, td); + return 0; +} + +/* decompress an inflated block */ +static int +inflate_block(glbl, e) + struct inflate *glbl; + int *e; /* last block flag */ +{ + unsigned t; /* block type */ + register ulg b; /* bit buffer */ + register unsigned k; /* number of bits in bit buffer */ + + /* make local bit buffer */ + b = glbl->gz_bb; + k = glbl->gz_bk; 
+ + /* read in last block bit */ + NEEDBITS(glbl, 1) + * e = (int) b & 1; + DUMPBITS(1) + /* read in block type */ + NEEDBITS(glbl, 2) + t = (unsigned) b & 3; + DUMPBITS(2) + /* restore the global bit buffer */ + glbl->gz_bb = b; + glbl->gz_bk = k; + + /* inflate that block type */ + if (t == 2) + return inflate_dynamic(glbl); + if (t == 0) + return inflate_stored(glbl); + if (t == 1) + return inflate_fixed(glbl); + /* bad block type */ + return 2; +} + + + +/* decompress an inflated entry */ +static int +xinflate(glbl) + struct inflate *glbl; +{ + int e; /* last block flag */ + int r; /* result code */ + unsigned h; /* maximum struct huft's malloc'ed */ + + glbl->gz_fixed_tl = (struct huft *) NULL; + + /* initialize window, bit buffer */ + glbl->gz_wp = 0; + glbl->gz_bk = 0; + glbl->gz_bb = 0; + + /* decompress until the last block */ + h = 0; + do { + glbl->gz_hufts = 0; + if ((r = inflate_block(glbl, &e)) != 0) + return r; + if (glbl->gz_hufts > h) + h = glbl->gz_hufts; + } while (!e); + + /* flush out slide */ + FLUSH(glbl, glbl->gz_wp); + + /* return success */ + return 0; +} + +/* Nobody uses this - why not? 
*/ +int +inflate(glbl) + struct inflate *glbl; +{ + int i; +#ifdef KERNEL + u_char *p = NULL; + + if (!glbl->gz_slide) + p = glbl->gz_slide = malloc(GZ_WSIZE, M_GZIP, M_WAITOK); +#endif + if (!glbl->gz_slide) +#ifdef KERNEL + return(ENOMEM); +#else + return 3; /* kzip expects 3 */ +#endif + i = xinflate(glbl); + + if (glbl->gz_fixed_td != (struct huft *) NULL) { + huft_free(glbl, glbl->gz_fixed_td); + glbl->gz_fixed_td = (struct huft *) NULL; + } + if (glbl->gz_fixed_tl != (struct huft *) NULL) { + huft_free(glbl, glbl->gz_fixed_tl); + glbl->gz_fixed_tl = (struct huft *) NULL; + } +#ifdef KERNEL + if (p == glbl->gz_slide) { + free(glbl->gz_slide, M_GZIP); + glbl->gz_slide = NULL; + } +#endif + return i; +} +/* ----------------------- END INFLATE.C */ diff --git a/sys/kern/init_main.c b/sys/kern/init_main.c index 61a0a14..f108547 100644 --- a/sys/kern/init_main.c +++ b/sys/kern/init_main.c @@ -1,4 +1,7 @@ /* + * Copyright (c) 1995 Terrence R. Lambert + * All rights reserved. + * * Copyright (c) 1982, 1986, 1989, 1991, 1992, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. @@ -35,100 +38,270 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)init_main.c 8.16 (Berkeley) 5/14/95 + * @(#)init_main.c 8.9 (Berkeley) 1/21/94 + * $Id: init_main.c,v 1.58 1997/03/01 17:49:09 wosch Exp $ */ +#include "opt_rlimit.h" +#include "opt_devfs.h" + #include <sys/param.h> +#include <sys/file.h> #include <sys/filedesc.h> -#include <sys/errno.h> -#include <sys/exec.h> #include <sys/kernel.h> #include <sys/mount.h> -#include <sys/map.h> +#include <sys/sysctl.h> #include <sys/proc.h> #include <sys/resourcevar.h> #include <sys/signalvar.h> #include <sys/systm.h> #include <sys/vnode.h> -#include <sys/conf.h> -#include <sys/buf.h> -#include <sys/clist.h> -#include <sys/device.h> -#include <sys/protosw.h> +#include <sys/sysent.h> #include <sys/reboot.h> -#include <sys/user.h> -#include <sys/syscallargs.h> - -#include <ufs/ufs/quota.h> +#include <sys/sysproto.h> +#include <sys/vmmeter.h> #include <machine/cpu.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/user.h> +#include <sys/copyright.h> -#ifdef HPFPLIB -char copyright[] = -"Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California.\nCopyright (c) 1992 Hewlett-Packard Company\nCopyright (c) 1992 Motorola Inc.\nAll rights reserved.\n\n"; -#else -char copyright[] = -"Copyright (c) 1982, 1986, 1989, 1991, 1993\n\tThe Regents of the University of California. All rights reserved.\n\n"; -#endif +extern struct linker_set sysinit_set; /* XXX */ + +extern void __main __P((void)); +extern void main __P((void *framep)); /* Components of the first process -- never freed. 
*/ -struct session session0; -struct pgrp pgrp0; +static struct session session0; +static struct pgrp pgrp0; struct proc proc0; -struct pcred cred0; -struct filedesc0 filedesc0; -struct plimit limit0; -struct vmspace vmspace0; +static struct pcred cred0; +static struct filedesc0 filedesc0; +static struct plimit limit0; +static struct vmspace vmspace0; struct proc *curproc = &proc0; -struct proc *initproc, *pageproc; +struct proc *initproc; -int cmask = CMASK; +int cmask = CMASK; extern struct user *proc0paddr; -struct vnode *rootvp, *swapdev_vp; +struct vnode *rootvp; int boothowto; + struct timeval boottime; +SYSCTL_STRUCT(_kern, KERN_BOOTTIME, boottime, + CTLFLAG_RW, &boottime, timeval, ""); + struct timeval runtime; -static void start_init __P((struct proc *p, void *framep)); +/* + * Promiscuous argument pass for start_init() + * + * This is a kludge because we use a return from main() rather than a call + * to a new routine in locore.s to kick the kernel alive from locore.s. + */ +static void *init_framep; + + +#if __GNUC__ >= 2 +void __main() {} +#endif + + +/* + * This ensures that there is at least one entry so that the sysinit_set + * symbol is not undefined. A sybsystem ID of SI_SUB_DUMMY is never + * executed. + */ +SYSINIT(placeholder, SI_SUB_DUMMY,SI_ORDER_ANY, NULL, NULL) + /* * System startup; initialize the world, create process 0, mount root * filesystem, and fork to create init and pagedaemon. Most of the * hard work is done in the lower-level initialization routines including * startup(), which does memory initialization and autoconfiguration. + * + * This allows simple addition of new kernel subsystems that require + * boot time initialization. It also allows substitution of subsystem + * (for instance, a scheduler, kernel profiler, or VM system) by object + * module. Finally, it allows for optional "kernel threads", like an LFS + * cleaner. 
*/ +void main(framep) void *framep; { - register struct proc *p; - register struct filedesc0 *fdp; - register struct pdevinit *pdev; - register int i; - int s; - register_t rval[2]; - extern struct pdevinit pdevinit[]; - extern void roundrobin __P((void *)); - extern void schedcpu __P((void *)); + + register struct sysinit **sipp; /* system initialization*/ + register struct sysinit **xipp; /* interior loop of sort*/ + register struct sysinit *save; /* bubble*/ + int rval[2]; /* SI_TYPE_KTHREAD support*/ /* - * Initialize the current process pointer (curproc) before - * any possible traps/probes to simplify trap processing. + * Save the locore.s frame pointer for start_init(). */ - p = &proc0; - curproc = p; + init_framep = framep; + /* - * Attempt to find console and initialize - * in case of early panic or other messages. + * Perform a bubble sort of the system initialization objects by + * their subsystem (primary key) and order (secondary key). + * + * Since some things care about execution order, this is the + * operation which ensures continued function. */ - consinit(); - printf(copyright); + for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) { + for( xipp = sipp + 1; *xipp; xipp++) { + if( (*sipp)->subsystem < (*xipp)->subsystem || + ( (*sipp)->subsystem == (*xipp)->subsystem && + (*sipp)->order < (*xipp)->order)) + continue; /* skip*/ + save = *sipp; + *sipp = *xipp; + *xipp = save; + } + } + + /* + * Traverse the (now) ordered list of system initialization tasks. + * Perform each task, and continue on to the next task. + * + * The last item on the list is expected to be the scheduler, + * which will not return. 
+ */ + for( sipp = (struct sysinit **)sysinit_set.ls_items; *sipp; sipp++) { + if( (*sipp)->subsystem == SI_SUB_DUMMY) + continue; /* skip dummy task(s)*/ + + switch( (*sipp)->type) { + case SI_TYPE_DEFAULT: + /* no special processing*/ + (*((*sipp)->func))( (*sipp)->udata); + break; + + case SI_TYPE_KTHREAD: + /* kernel thread*/ + if (fork(&proc0, NULL, rval)) + panic("fork kernel process"); + if (rval[1]) { + (*((*sipp)->func))( (*sipp)->udata); + /* + * The call to start "init" returns + * here after the scheduler has been + * started, and returns to the caller + * in i386/i386/locore.s. This is a + * necessary part of initialization + * and is rather non-obvious. + * + * No other "kernel threads" should + * return here. Call panic() instead. + */ + return; + } + break; + + default: + panic( "init_main: unrecognized init type"); + } + } + + /* NOTREACHED*/ +} + + +/* + * Start a kernel process. This is called after a fork() call in + * main() in the file kern/init_main.c. + * + * This function is used to start "internal" daemons. + */ +/* ARGSUSED*/ +void +kproc_start(udata) + void *udata; +{ + struct kproc_desc *kp = udata; + struct proc *p = curproc; + + /* save a global descriptor, if desired*/ + if( kp->global_procpp != NULL) + *kp->global_procpp = p; + + /* this is a non-swapped system process*/ + p->p_flag |= P_INMEM | P_SYSTEM; - vm_mem_init(); - kmeminit(); - cpu_startup(); + /* set up arg0 for 'ps', et al*/ + strcpy( p->p_comm, kp->arg0); + + /* call the processes' main()...*/ + (*kp->func)(); + + /* NOTREACHED */ + panic("kproc_start: %s", kp->arg0); +} + + +/* + *************************************************************************** + **** + **** The following SYSINIT's belong elsewhere, but have not yet + **** been moved. + **** + *************************************************************************** + */ +#ifdef OMIT +/* + * Handled by vfs_mountroot (bad idea) at this time... should be + * done the same as 4.4Lite2. 
+ */ +SYSINIT(swapinit, SI_SUB_SWAP, SI_ORDER_FIRST, swapinit, NULL) +#endif /* OMIT*/ + +static void print_caddr_t __P((void *data)); +static void +print_caddr_t(data) + void *data; +{ + printf("%s", (char *)data); +} +SYSINIT(announce, SI_SUB_COPYRIGHT, SI_ORDER_FIRST, print_caddr_t, copyright) + + +/* + *************************************************************************** + **** + **** The two following SYSINT's are proc0 specific glue code. I am not + **** convinced that they can not be safely combined, but their order of + **** operation has been maintained as the same as the original init_main.c + **** for right now. + **** + **** These probably belong in init_proc.c or kern_proc.c, since they + **** deal with proc0 (the fork template process). + **** + *************************************************************************** + */ +/* ARGSUSED*/ +static void proc0_init __P((void *dummy)); +static void +proc0_init(dummy) + void *dummy; +{ + register struct proc *p; + register struct filedesc0 *fdp; + register unsigned i; + + /* + * Initialize the current process pointer (curproc) before + * any possible traps/probes to simplify trap processing. + */ + p = &proc0; + curproc = p; /* XXX redundant*/ /* * Initialize process and pgrp structures. @@ -136,6 +309,11 @@ main(framep) procinit(); /* + * Initialize sleep queue hash table + */ + sleepinit(); + + /* * Create process 0 (the swapper). */ LIST_INSERT_HEAD(&allproc, p, p_list); @@ -148,9 +326,14 @@ main(framep) session0.s_count = 1; session0.s_leader = p; + p->p_sysent = &aout_sysvec; + p->p_flag = P_INMEM | P_SYSTEM; p->p_stat = SRUN; p->p_nice = NZERO; + p->p_rtprio.type = RTP_PRIO_NORMAL; + p->p_rtprio.prio = 0; + bcopy("swapper", p->p_comm, sizeof ("swapper")); /* Create credentials. 
*/ @@ -173,8 +356,10 @@ main(framep) for (i = 0; i < sizeof(p->p_rlimit)/sizeof(p->p_rlimit[0]); i++) limit0.pl_rlimit[i].rlim_cur = limit0.pl_rlimit[i].rlim_max = RLIM_INFINITY; - limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = NOFILE; - limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = MAXUPRC; + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_cur = + limit0.pl_rlimit[RLIMIT_NOFILE].rlim_max = maxfiles; + limit0.pl_rlimit[RLIMIT_NPROC].rlim_cur = + limit0.pl_rlimit[RLIMIT_NPROC].rlim_max = maxproc; i = ptoa(cnt.v_free_count); limit0.pl_rlimit[RLIMIT_RSS].rlim_max = i; limit0.pl_rlimit[RLIMIT_MEMLOCK].rlim_max = i; @@ -185,11 +370,22 @@ main(framep) p->p_vmspace = &vmspace0; vmspace0.vm_refcnt = 1; pmap_pinit(&vmspace0.vm_pmap); - vm_map_init(&p->p_vmspace->vm_map, round_page(VM_MIN_ADDRESS), - trunc_page(VM_MAX_ADDRESS), TRUE); + vm_map_init(&vmspace0.vm_map, round_page(VM_MIN_ADDRESS), + trunc_page(VM_MAXUSER_ADDRESS), TRUE); vmspace0.vm_map.pmap = &vmspace0.vm_pmap; p->p_addr = proc0paddr; /* XXX */ +#define INCOMPAT_LITES2 +#ifdef INCOMPAT_LITES2 + /* + * proc0 needs to have a coherent frame base, too. + * This probably makes the identical call for the init proc + * that happens later unnecessary since it should inherit + * it during the fork. + */ + cpu_set_init_frame(p, init_framep); /* XXX! */ +#endif /* INCOMPAT_LITES2*/ + /* * We continue to place resource usage info and signal * actions in the user struct so they're pageable. @@ -201,104 +397,127 @@ main(framep) * Charge root for one process. */ (void)chgproccnt(0, 1); +} +SYSINIT(p0init, SI_SUB_INTRINSIC, SI_ORDER_FIRST, proc0_init, NULL) - rqinit(); - - /* Configure virtual memory system, set vm rlimits. */ - vm_init_limits(p); - - /* Initialize the file systems. */ - vfsinit(); +/* ARGSUSED*/ +static void proc0_post __P((void *dummy)); +static void +proc0_post(dummy) + void *dummy; +{ + struct timeval tv; - /* Start real time and statistics clocks. 
*/ - initclocks(); + /* + * Now can look at time, having had a chance to verify the time + * from the file system. Reset p->p_rtime as it may have been + * munched in mi_switch() after the time got set. + */ + gettime(&boottime); + proc0.p_stats->p_start = runtime = mono_time = boottime; + proc0.p_rtime.tv_sec = proc0.p_rtime.tv_usec = 0; - /* Initialize mbuf's. */ - mbinit(); + /* + * Give the ``random'' number generator a thump. + */ + microtime(&tv); + srandom(tv.tv_sec ^ tv.tv_usec); - /* Initialize clists. */ - clist_init(); + /* Initialize signal state for process 0. */ + siginit(&proc0); +} +SYSINIT(p0post, SI_SUB_INTRINSIC_POST, SI_ORDER_FIRST, proc0_post, NULL) -#ifdef SYSVSHM - /* Initialize System V style shared memory. */ - shminit(); -#endif - /* Attach pseudo-devices. */ - for (pdev = pdevinit; pdev->pdev_attach != NULL; pdev++) - (*pdev->pdev_attach)(pdev->pdev_count); - /* - * Initialize protocols. Block reception of incoming packets - * until everything is ready. - */ - s = splimp(); - ifinit(); - domaininit(); - splx(s); - -#ifdef GPROF - /* Initialize kernel profiling. */ - kmstartup(); -#endif +/* + *************************************************************************** + **** + **** The following SYSINIT's and glue code should be moved to the + **** respective files on a per subsystem basis. + **** + *************************************************************************** + */ +/* ARGSUSED*/ +static void sched_setup __P((void *dummy)); +static void +sched_setup(dummy) + void *dummy; +{ /* Kick off timeout driven events by calling first time. */ roundrobin(NULL); schedcpu(NULL); +} +SYSINIT(sched_setup, SI_SUB_KICK_SCHEDULER, SI_ORDER_FIRST, sched_setup, NULL) +/* ARGSUSED*/ +static void xxx_vfs_mountroot __P((void *fsnamep)); +static void +xxx_vfs_mountroot(fsnamep) + void *fsnamep; +{ /* Mount the root file system. 
*/ - if (vfs_mountroot()) + if (vfs_mountrootfs(*((char **) fsnamep))) panic("cannot mount root"); - mountlist.cqh_first->mnt_flag |= MNT_ROOTFS; +} +SYSINIT(mountroot, SI_SUB_ROOT, SI_ORDER_FIRST, xxx_vfs_mountroot, &mountrootfsname) + +/* ARGSUSED*/ +static void xxx_vfs_root_fdtab __P((void *dummy)); +static void +xxx_vfs_root_fdtab(dummy) + void *dummy; +{ + register struct filedesc0 *fdp = &filedesc0; /* Get the vnode for '/'. Set fdp->fd_fd.fd_cdir to reference it. */ if (VFS_ROOT(mountlist.cqh_first, &rootvnode)) panic("cannot find root vnode"); fdp->fd_fd.fd_cdir = rootvnode; VREF(fdp->fd_fd.fd_cdir); - VOP_UNLOCK(rootvnode, 0, p); + VOP_UNLOCK(rootvnode, 0, &proc0); fdp->fd_fd.fd_rdir = NULL; - swapinit(); +} +SYSINIT(retrofit, SI_SUB_ROOT_FDTAB, SI_ORDER_FIRST, xxx_vfs_root_fdtab, NULL) - /* - * Now can look at time, having had a chance to verify the time - * from the file system. Reset p->p_rtime as it may have been - * munched in mi_switch() after the time got set. - */ - p->p_stats->p_start = runtime = mono_time = boottime = time; - p->p_rtime.tv_sec = p->p_rtime.tv_usec = 0; - /* Initialize signal state for process 0. */ - siginit(p); +/* + *************************************************************************** + **** + **** The following code probably belongs in another file, like + **** kern/init_init.c. It is here for two reasons only: + **** + **** 1) This code returns to startup the system; this is + **** abnormal for a kernel thread. + **** 2) This code promiscuously uses init_frame + **** + *************************************************************************** + */ - /* Create process 1 (init(8)). */ - if (fork(p, NULL, rval)) - panic("fork init"); - if (rval[1]) { - start_init(curproc, framep); - return; - } +static void kthread_init __P((void *dummy)); +SYSINIT_KT(init,SI_SUB_KTHREAD_INIT, SI_ORDER_FIRST, kthread_init, NULL) - /* Create process 2 (the pageout daemon). 
*/ - if (fork(p, NULL, rval)) - panic("fork pager"); - if (rval[1]) { - /* - * Now in process 2. - */ - p = curproc; - pageproc = p; - p->p_flag |= P_INMEM | P_SYSTEM; /* XXX */ - bcopy("pagedaemon", curproc->p_comm, sizeof ("pagedaemon")); - vm_pageout(); - /* NOTREACHED */ - } - /* The scheduler is an infinite loop. */ - scheduler(); - /* NOTREACHED */ +static void start_init __P((struct proc *p, void *framep)); + +/* ARGSUSED*/ +static void +kthread_init(dummy) + void *dummy; +{ + + /* Create process 1 (init(8)). */ + start_init(curproc, init_framep); + + /* + * This is the only kernel thread allowed to return yo the + * caller!!! + */ + return; } + /* * List of paths to try when searching for "init". */ @@ -306,6 +525,7 @@ static char *initpaths[] = { "/sbin/init", "/sbin/oinit", "/sbin/init.bak", + "/stand/sysinstall", NULL, }; @@ -319,14 +539,8 @@ start_init(p, framep) void *framep; { vm_offset_t addr; - struct execve_args /* { - syscallarg(char *) path; - syscallarg(char **) argp; - syscallarg(char **) envp; - } */ args; - int options, i, error; - register_t retval[2]; - char flags[4] = "-", *flagsp; + struct execve_args args; + int options, i, retval[2], error; char **pathp, *path, *ucp, **uap, *arg0, *arg1; initproc = p; @@ -343,66 +557,74 @@ start_init(p, framep) /* * Need just enough stack to hold the faked-up "execve()" arguments. */ - addr = trunc_page(VM_MAX_ADDRESS - PAGE_SIZE); - if (vm_allocate(&p->p_vmspace->vm_map, &addr, PAGE_SIZE, FALSE) != 0) + addr = trunc_page(VM_MAXUSER_ADDRESS - PAGE_SIZE); + if (vm_map_find(&p->p_vmspace->vm_map, NULL, 0, &addr, PAGE_SIZE, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0) != 0) panic("init: couldn't allocate argument space"); p->p_vmspace->vm_maxsaddr = (caddr_t)addr; + p->p_vmspace->vm_ssize = 1; for (pathp = &initpaths[0]; (path = *pathp) != NULL; pathp++) { /* - * Construct the boot flag argument. + * Move out the boot flag argument. 
*/ options = 0; - flagsp = flags + 1; ucp = (char *)USRSTACK; + (void)subyte(--ucp, 0); /* trailing zero */ if (boothowto & RB_SINGLE) { - *flagsp++ = 's'; + (void)subyte(--ucp, 's'); options = 1; } #ifdef notyet if (boothowto & RB_FASTBOOT) { - *flagsp++ = 'f'; + (void)subyte(--ucp, 'f'); options = 1; } #endif - /* - * Move out the flags (arg 1), if necessary. - */ - if (options != 0) { - *flagsp++ = '\0'; - i = flagsp - flags; - (void)copyout((caddr_t)flags, (caddr_t)(ucp -= i), i); - arg1 = ucp; - } + +#ifdef BOOTCDROM + (void)subyte(--ucp, 'C'); + options = 1; +#endif + +#if defined(DEVFS) && defined(DEVFS_ROOT) + (void)subyte(--ucp, 'd'); + options = 1; +#endif + if (options == 0) + (void)subyte(--ucp, '-'); + (void)subyte(--ucp, '-'); /* leading hyphen */ + arg1 = ucp; /* * Move out the file name (also arg 0). */ - i = strlen(path) + 1; - (void)copyout((caddr_t)path, (caddr_t)(ucp -= i), i); + for (i = strlen(path) + 1; i >= 0; i--) + (void)subyte(--ucp, path[i]); arg0 = ucp; /* * Move out the arg pointers. */ - uap = (char **)((long)ucp & ~ALIGNBYTES); + uap = (char **)((int)ucp & ~(NBPW-1)); (void)suword((caddr_t)--uap, 0); /* terminator */ - if (options != 0) - (void)suword((caddr_t)--uap, (long)arg1); - (void)suword((caddr_t)--uap, (long)arg0); + (void)suword((caddr_t)--uap, (int)arg1); + (void)suword((caddr_t)--uap, (int)arg0); /* * Point at the arguments. */ - SCARG(&args, path) = arg0; - SCARG(&args, argp) = uap; - SCARG(&args, envp) = NULL; + args.fname = arg0; + args.argv = uap; + args.envv = NULL; /* * Now try to exec the program. If can't for any reason * other than it doesn't exist, complain. + * + * Otherwise return to main() which returns to btext + * which completes the system startup. 
*/ - if ((error = execve(p, &args, retval)) == 0) + if ((error = execve(p, &args, &retval[0])) == 0) return; if (error != ENOENT) printf("exec %s: error %d\n", path, error); diff --git a/sys/kern/init_sysent.c b/sys/kern/init_sysent.c index 0bbdd20..6954a04 100644 --- a/sys/kern/init_sysent.c +++ b/sys/kern/init_sysent.c @@ -2,766 +2,286 @@ * System call switch table. * * DO NOT EDIT-- this file is automatically generated. - * created from @(#)syscalls.master 8.6 (Berkeley) 3/30/95 + * created from Id: syscalls.master,v 1.33 1997/02/22 09:39:21 peter Exp */ #include <sys/param.h> -#include <sys/systm.h> -#include <sys/signal.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> -int nosys(); -int exit(); -int fork(); -int read(); -int write(); -int open(); -int close(); -int wait4(); -int link(); -int unlink(); -int chdir(); -int fchdir(); -int mknod(); -int chmod(); -int chown(); -int obreak(); -int getfsstat(); -int getpid(); -int mount(); -int unmount(); -int setuid(); -int getuid(); -int geteuid(); -int ptrace(); -int recvmsg(); -int sendmsg(); -int recvfrom(); -int accept(); -int getpeername(); -int getsockname(); -int access(); -int chflags(); -int fchflags(); -int sync(); -int kill(); -int getppid(); -int dup(); -int pipe(); -int getegid(); -int profil(); -#ifdef KTRACE -int ktrace(); -#else -#endif -int sigaction(); -int getgid(); -int sigprocmask(); -int getlogin(); -int setlogin(); -int acct(); -int sigpending(); -int sigaltstack(); -int ioctl(); -int reboot(); -int revoke(); -int symlink(); -int readlink(); -int execve(); -int umask(); -int chroot(); -int msync(); -int vfork(); -int sbrk(); -int sstk(); -int ovadvise(); -int munmap(); -int mprotect(); -int madvise(); -int mincore(); -int getgroups(); -int setgroups(); -int getpgrp(); -int setpgid(); -int setitimer(); -int swapon(); -int getitimer(); -int getdtablesize(); -int dup2(); -int fcntl(); -int select(); -int fsync(); -int setpriority(); -int socket(); -int connect(); -int getpriority(); -int 
sigreturn(); -int bind(); -int setsockopt(); -int listen(); -int sigsuspend(); -#ifdef TRACE -int vtrace(); -#else -#endif -int gettimeofday(); -int getrusage(); -int getsockopt(); -#ifdef vax -int resuba(); -#else -#endif -int readv(); -int writev(); -int settimeofday(); -int fchown(); -int fchmod(); -int rename(); -int flock(); -int mkfifo(); -int sendto(); -int shutdown(); -int socketpair(); -int mkdir(); -int rmdir(); -int utimes(); -int adjtime(); -int setsid(); -int quotactl(); -#ifdef NFS -int nfssvc(); -#else -#endif -int statfs(); -int fstatfs(); -#ifdef NFS -int getfh(); -#else -#endif -#if defined(SYSVSHM) && !defined(alpha) -#else -#endif -int setgid(); -int setegid(); -int seteuid(); -#ifdef LFS -int lfs_bmapv(); -int lfs_markv(); -int lfs_segclean(); -int lfs_segwait(); -#else -#endif -int stat(); -int fstat(); -int lstat(); -int pathconf(); -int fpathconf(); -int getrlimit(); -int setrlimit(); -int getdirentries(); -int mmap(); -int nosys(); -int lseek(); -int truncate(); -int ftruncate(); -int __sysctl(); -int mlock(); -int munlock(); -int undelete(); -#if defined(SYSVSHM) && 0 -int shmat(); -int shmctl(); -int shmdt(); -int shmget(); -#else -#endif +#include <sys/sysent.h> +#include <sys/sysproto.h> #ifdef COMPAT_43 -#define compat_43(func) __CONCAT(compat_43_,func) - -int compat_43(creat)(); -int compat_43(lseek)(); -int compat_43(stat)(); -int compat_43(lstat)(); -#ifdef KTRACE +#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name) #else +#define compat(n, name) 0, (sy_call_t *)nosys #endif -int compat_43(fstat)(); -int compat_43(getkerninfo)(); -int compat_43(getpagesize)(); -int compat_43(mmap)(); -int compat_43(wait)(); -int compat_43(gethostname)(); -int compat_43(sethostname)(); -int compat_43(accept)(); -int compat_43(send)(); -int compat_43(recv)(); -int compat_43(sigvec)(); -int compat_43(sigblock)(); -int compat_43(sigsetmask)(); -int compat_43(sigstack)(); -int compat_43(recvmsg)(); -int compat_43(sendmsg)(); -#ifdef TRACE -#else 
-#endif -#ifdef vax -#else -#endif -int compat_43(recvfrom)(); -int compat_43(setreuid)(); -int compat_43(setregid)(); -int compat_43(truncate)(); -int compat_43(ftruncate)(); -int compat_43(getpeername)(); -int compat_43(gethostid)(); -int compat_43(sethostid)(); -int compat_43(getrlimit)(); -int compat_43(setrlimit)(); -int compat_43(killpg)(); -int compat_43(quota)(); -int compat_43(getsockname)(); -#ifdef NFS -#else -#endif -int compat_43(getdirentries)(); -#ifdef NFS -#else -#endif -#if defined(SYSVSHM) && !defined(alpha) -int compat_43(shmsys)(); -#else -#endif -#ifdef LFS -#else -#endif -#if defined(SYSVSHM) && 0 -#else -#endif - -#else /* COMPAT_43 */ -#define compat_43(func) nosys -#endif /* COMPAT_43 */ - -#define s(type) sizeof(type) +/* The casts are bogus but will do for now. */ struct sysent sysent[] = { - { 0, 0, - nosys }, /* 0 = syscall */ - { 1, s(struct exit_args), - exit }, /* 1 = exit */ - { 0, 0, - fork }, /* 2 = fork */ - { 3, s(struct read_args), - read }, /* 3 = read */ - { 3, s(struct write_args), - write }, /* 4 = write */ - { 3, s(struct open_args), - open }, /* 5 = open */ - { 1, s(struct close_args), - close }, /* 6 = close */ - { 4, s(struct wait4_args), - wait4 }, /* 7 = wait4 */ - { 2, s(struct compat_43_creat_args), - compat_43(creat) }, /* 8 = compat_43 creat */ - { 2, s(struct link_args), - link }, /* 9 = link */ - { 1, s(struct unlink_args), - unlink }, /* 10 = unlink */ - { 0, 0, - nosys }, /* 11 = obsolete execv */ - { 1, s(struct chdir_args), - chdir }, /* 12 = chdir */ - { 1, s(struct fchdir_args), - fchdir }, /* 13 = fchdir */ - { 3, s(struct mknod_args), - mknod }, /* 14 = mknod */ - { 2, s(struct chmod_args), - chmod }, /* 15 = chmod */ - { 3, s(struct chown_args), - chown }, /* 16 = chown */ - { 1, s(struct obreak_args), - obreak }, /* 17 = break */ - { 3, s(struct getfsstat_args), - getfsstat }, /* 18 = getfsstat */ - { 3, s(struct compat_43_lseek_args), - compat_43(lseek) }, /* 19 = compat_43 lseek */ - { 0, 0, - 
getpid }, /* 20 = getpid */ - { 4, s(struct mount_args), - mount }, /* 21 = mount */ - { 2, s(struct unmount_args), - unmount }, /* 22 = unmount */ - { 1, s(struct setuid_args), - setuid }, /* 23 = setuid */ - { 0, 0, - getuid }, /* 24 = getuid */ - { 0, 0, - geteuid }, /* 25 = geteuid */ - { 4, s(struct ptrace_args), - ptrace }, /* 26 = ptrace */ - { 3, s(struct recvmsg_args), - recvmsg }, /* 27 = recvmsg */ - { 3, s(struct sendmsg_args), - sendmsg }, /* 28 = sendmsg */ - { 6, s(struct recvfrom_args), - recvfrom }, /* 29 = recvfrom */ - { 3, s(struct accept_args), - accept }, /* 30 = accept */ - { 3, s(struct getpeername_args), - getpeername }, /* 31 = getpeername */ - { 3, s(struct getsockname_args), - getsockname }, /* 32 = getsockname */ - { 2, s(struct access_args), - access }, /* 33 = access */ - { 2, s(struct chflags_args), - chflags }, /* 34 = chflags */ - { 2, s(struct fchflags_args), - fchflags }, /* 35 = fchflags */ - { 0, 0, - sync }, /* 36 = sync */ - { 2, s(struct kill_args), - kill }, /* 37 = kill */ - { 2, s(struct compat_43_stat_args), - compat_43(stat) }, /* 38 = compat_43 stat */ - { 0, 0, - getppid }, /* 39 = getppid */ - { 2, s(struct compat_43_lstat_args), - compat_43(lstat) }, /* 40 = compat_43 lstat */ - { 1, s(struct dup_args), - dup }, /* 41 = dup */ - { 0, 0, - pipe }, /* 42 = pipe */ - { 0, 0, - getegid }, /* 43 = getegid */ - { 4, s(struct profil_args), - profil }, /* 44 = profil */ -#ifdef KTRACE - { 4, s(struct ktrace_args), - ktrace }, /* 45 = ktrace */ -#else - { 0, 0, - nosys }, /* 45 = unimplemented ktrace */ -#endif - { 3, s(struct sigaction_args), - sigaction }, /* 46 = sigaction */ - { 0, 0, - getgid }, /* 47 = getgid */ - { 2, s(struct sigprocmask_args), - sigprocmask }, /* 48 = sigprocmask */ - { 2, s(struct getlogin_args), - getlogin }, /* 49 = getlogin */ - { 1, s(struct setlogin_args), - setlogin }, /* 50 = setlogin */ - { 1, s(struct acct_args), - acct }, /* 51 = acct */ - { 0, 0, - sigpending }, /* 52 = sigpending */ - { 
2, s(struct sigaltstack_args), - sigaltstack }, /* 53 = sigaltstack */ - { 3, s(struct ioctl_args), - ioctl }, /* 54 = ioctl */ - { 1, s(struct reboot_args), - reboot }, /* 55 = reboot */ - { 1, s(struct revoke_args), - revoke }, /* 56 = revoke */ - { 2, s(struct symlink_args), - symlink }, /* 57 = symlink */ - { 3, s(struct readlink_args), - readlink }, /* 58 = readlink */ - { 3, s(struct execve_args), - execve }, /* 59 = execve */ - { 1, s(struct umask_args), - umask }, /* 60 = umask */ - { 1, s(struct chroot_args), - chroot }, /* 61 = chroot */ - { 2, s(struct compat_43_fstat_args), - compat_43(fstat) }, /* 62 = compat_43 fstat */ - { 4, s(struct compat_43_getkerninfo_args), - compat_43(getkerninfo) }, /* 63 = compat_43 getkerninfo */ - { 0, 0, - compat_43(getpagesize) }, /* 64 = compat_43 getpagesize */ - { 2, s(struct msync_args), - msync }, /* 65 = msync */ - { 0, 0, - vfork }, /* 66 = vfork */ - { 0, 0, - nosys }, /* 67 = obsolete vread */ - { 0, 0, - nosys }, /* 68 = obsolete vwrite */ - { 1, s(struct sbrk_args), - sbrk }, /* 69 = sbrk */ - { 1, s(struct sstk_args), - sstk }, /* 70 = sstk */ - { 6, s(struct compat_43_mmap_args), - compat_43(mmap) }, /* 71 = compat_43 mmap */ - { 1, s(struct ovadvise_args), - ovadvise }, /* 72 = vadvise */ - { 2, s(struct munmap_args), - munmap }, /* 73 = munmap */ - { 3, s(struct mprotect_args), - mprotect }, /* 74 = mprotect */ - { 3, s(struct madvise_args), - madvise }, /* 75 = madvise */ - { 0, 0, - nosys }, /* 76 = obsolete vhangup */ - { 0, 0, - nosys }, /* 77 = obsolete vlimit */ - { 3, s(struct mincore_args), - mincore }, /* 78 = mincore */ - { 2, s(struct getgroups_args), - getgroups }, /* 79 = getgroups */ - { 2, s(struct setgroups_args), - setgroups }, /* 80 = setgroups */ - { 0, 0, - getpgrp }, /* 81 = getpgrp */ - { 2, s(struct setpgid_args), - setpgid }, /* 82 = setpgid */ - { 3, s(struct setitimer_args), - setitimer }, /* 83 = setitimer */ - { 0, 0, - compat_43(wait) }, /* 84 = compat_43 wait */ - { 1, 
s(struct swapon_args), - swapon }, /* 85 = swapon */ - { 2, s(struct getitimer_args), - getitimer }, /* 86 = getitimer */ - { 2, s(struct compat_43_gethostname_args), - compat_43(gethostname) }, /* 87 = compat_43 gethostname */ - { 2, s(struct compat_43_sethostname_args), - compat_43(sethostname) }, /* 88 = compat_43 sethostname */ - { 0, 0, - getdtablesize }, /* 89 = getdtablesize */ - { 2, s(struct dup2_args), - dup2 }, /* 90 = dup2 */ - { 0, 0, - nosys }, /* 91 = unimplemented getdopt */ - { 3, s(struct fcntl_args), - fcntl }, /* 92 = fcntl */ - { 5, s(struct select_args), - select }, /* 93 = select */ - { 0, 0, - nosys }, /* 94 = unimplemented setdopt */ - { 1, s(struct fsync_args), - fsync }, /* 95 = fsync */ - { 3, s(struct setpriority_args), - setpriority }, /* 96 = setpriority */ - { 3, s(struct socket_args), - socket }, /* 97 = socket */ - { 3, s(struct connect_args), - connect }, /* 98 = connect */ - { 3, s(struct compat_43_accept_args), - compat_43(accept) }, /* 99 = compat_43 accept */ - { 2, s(struct getpriority_args), - getpriority }, /* 100 = getpriority */ - { 4, s(struct compat_43_send_args), - compat_43(send) }, /* 101 = compat_43 send */ - { 4, s(struct compat_43_recv_args), - compat_43(recv) }, /* 102 = compat_43 recv */ - { 1, s(struct sigreturn_args), - sigreturn }, /* 103 = sigreturn */ - { 3, s(struct bind_args), - bind }, /* 104 = bind */ - { 5, s(struct setsockopt_args), - setsockopt }, /* 105 = setsockopt */ - { 2, s(struct listen_args), - listen }, /* 106 = listen */ - { 0, 0, - nosys }, /* 107 = obsolete vtimes */ - { 3, s(struct compat_43_sigvec_args), - compat_43(sigvec) }, /* 108 = compat_43 sigvec */ - { 1, s(struct compat_43_sigblock_args), - compat_43(sigblock) }, /* 109 = compat_43 sigblock */ - { 1, s(struct compat_43_sigsetmask_args), - compat_43(sigsetmask) }, /* 110 = compat_43 sigsetmask */ - { 1, s(struct sigsuspend_args), - sigsuspend }, /* 111 = sigsuspend */ - { 2, s(struct compat_43_sigstack_args), - compat_43(sigstack) 
}, /* 112 = compat_43 sigstack */ - { 3, s(struct compat_43_recvmsg_args), - compat_43(recvmsg) }, /* 113 = compat_43 recvmsg */ - { 3, s(struct compat_43_sendmsg_args), - compat_43(sendmsg) }, /* 114 = compat_43 sendmsg */ -#ifdef TRACE - { 2, s(struct vtrace_args), - vtrace }, /* 115 = vtrace */ -#else - { 0, 0, - nosys }, /* 115 = obsolete vtrace */ -#endif - { 2, s(struct gettimeofday_args), - gettimeofday }, /* 116 = gettimeofday */ - { 2, s(struct getrusage_args), - getrusage }, /* 117 = getrusage */ - { 5, s(struct getsockopt_args), - getsockopt }, /* 118 = getsockopt */ -#ifdef vax - { 1, s(struct resuba_args), - resuba }, /* 119 = resuba */ -#else - { 0, 0, - nosys }, /* 119 = unimplemented resuba */ -#endif - { 3, s(struct readv_args), - readv }, /* 120 = readv */ - { 3, s(struct writev_args), - writev }, /* 121 = writev */ - { 2, s(struct settimeofday_args), - settimeofday }, /* 122 = settimeofday */ - { 3, s(struct fchown_args), - fchown }, /* 123 = fchown */ - { 2, s(struct fchmod_args), - fchmod }, /* 124 = fchmod */ - { 6, s(struct compat_43_recvfrom_args), - compat_43(recvfrom) }, /* 125 = compat_43 recvfrom */ - { 2, s(struct compat_43_setreuid_args), - compat_43(setreuid) }, /* 126 = compat_43 setreuid */ - { 2, s(struct compat_43_setregid_args), - compat_43(setregid) }, /* 127 = compat_43 setregid */ - { 2, s(struct rename_args), - rename }, /* 128 = rename */ - { 2, s(struct compat_43_truncate_args), - compat_43(truncate) }, /* 129 = compat_43 truncate */ - { 2, s(struct compat_43_ftruncate_args), - compat_43(ftruncate) }, /* 130 = compat_43 ftruncate */ - { 2, s(struct flock_args), - flock }, /* 131 = flock */ - { 2, s(struct mkfifo_args), - mkfifo }, /* 132 = mkfifo */ - { 6, s(struct sendto_args), - sendto }, /* 133 = sendto */ - { 2, s(struct shutdown_args), - shutdown }, /* 134 = shutdown */ - { 4, s(struct socketpair_args), - socketpair }, /* 135 = socketpair */ - { 2, s(struct mkdir_args), - mkdir }, /* 136 = mkdir */ - { 1, s(struct 
rmdir_args), - rmdir }, /* 137 = rmdir */ - { 2, s(struct utimes_args), - utimes }, /* 138 = utimes */ - { 0, 0, - nosys }, /* 139 = obsolete 4.2 sigreturn */ - { 2, s(struct adjtime_args), - adjtime }, /* 140 = adjtime */ - { 3, s(struct compat_43_getpeername_args), - compat_43(getpeername) }, /* 141 = compat_43 getpeername */ - { 0, 0, - compat_43(gethostid) }, /* 142 = compat_43 gethostid */ - { 1, s(struct compat_43_sethostid_args), - compat_43(sethostid) }, /* 143 = compat_43 sethostid */ - { 2, s(struct compat_43_getrlimit_args), - compat_43(getrlimit) }, /* 144 = compat_43 getrlimit */ - { 2, s(struct compat_43_setrlimit_args), - compat_43(setrlimit) }, /* 145 = compat_43 setrlimit */ - { 2, s(struct compat_43_killpg_args), - compat_43(killpg) }, /* 146 = compat_43 killpg */ - { 0, 0, - setsid }, /* 147 = setsid */ - { 4, s(struct quotactl_args), - quotactl }, /* 148 = quotactl */ - { 0, 0, - compat_43(quota) }, /* 149 = compat_43 quota */ - { 3, s(struct compat_43_getsockname_args), - compat_43(getsockname) }, /* 150 = compat_43 getsockname */ - { 0, 0, - nosys }, /* 151 = unimplemented */ - { 0, 0, - nosys }, /* 152 = unimplemented */ - { 0, 0, - nosys }, /* 153 = unimplemented */ - { 0, 0, - nosys }, /* 154 = unimplemented */ + { 0, (sy_call_t *)nosys }, /* 0 = syscall */ + { 1, (sy_call_t *)exit }, /* 1 = exit */ + { 0, (sy_call_t *)fork }, /* 2 = fork */ + { 3, (sy_call_t *)read }, /* 3 = read */ + { 3, (sy_call_t *)write }, /* 4 = write */ + { 3, (sy_call_t *)open }, /* 5 = open */ + { 1, (sy_call_t *)close }, /* 6 = close */ + { 4, (sy_call_t *)wait4 }, /* 7 = wait4 */ + { compat(2,creat) }, /* 8 = old creat */ + { 2, (sy_call_t *)link }, /* 9 = link */ + { 1, (sy_call_t *)unlink }, /* 10 = unlink */ + { 0, (sy_call_t *)nosys }, /* 11 = obsolete execv */ + { 1, (sy_call_t *)chdir }, /* 12 = chdir */ + { 1, (sy_call_t *)fchdir }, /* 13 = fchdir */ + { 3, (sy_call_t *)mknod }, /* 14 = mknod */ + { 2, (sy_call_t *)chmod }, /* 15 = chmod */ + { 3, 
(sy_call_t *)chown }, /* 16 = chown */ + { 1, (sy_call_t *)obreak }, /* 17 = break */ + { 3, (sy_call_t *)getfsstat }, /* 18 = getfsstat */ + { compat(3,lseek) }, /* 19 = old lseek */ + { 0, (sy_call_t *)getpid }, /* 20 = getpid */ + { 4, (sy_call_t *)mount }, /* 21 = mount */ + { 2, (sy_call_t *)unmount }, /* 22 = unmount */ + { 1, (sy_call_t *)setuid }, /* 23 = setuid */ + { 0, (sy_call_t *)getuid }, /* 24 = getuid */ + { 0, (sy_call_t *)geteuid }, /* 25 = geteuid */ + { 4, (sy_call_t *)ptrace }, /* 26 = ptrace */ + { 3, (sy_call_t *)recvmsg }, /* 27 = recvmsg */ + { 3, (sy_call_t *)sendmsg }, /* 28 = sendmsg */ + { 6, (sy_call_t *)recvfrom }, /* 29 = recvfrom */ + { 3, (sy_call_t *)accept }, /* 30 = accept */ + { 3, (sy_call_t *)getpeername }, /* 31 = getpeername */ + { 3, (sy_call_t *)getsockname }, /* 32 = getsockname */ + { 2, (sy_call_t *)access }, /* 33 = access */ + { 2, (sy_call_t *)chflags }, /* 34 = chflags */ + { 2, (sy_call_t *)fchflags }, /* 35 = fchflags */ + { 0, (sy_call_t *)sync }, /* 36 = sync */ + { 2, (sy_call_t *)kill }, /* 37 = kill */ + { compat(2,stat) }, /* 38 = old stat */ + { 0, (sy_call_t *)getppid }, /* 39 = getppid */ + { compat(2,lstat) }, /* 40 = old lstat */ + { 1, (sy_call_t *)dup }, /* 41 = dup */ + { 0, (sy_call_t *)pipe }, /* 42 = pipe */ + { 0, (sy_call_t *)getegid }, /* 43 = getegid */ + { 4, (sy_call_t *)profil }, /* 44 = profil */ + { 4, (sy_call_t *)ktrace }, /* 45 = ktrace */ + { 3, (sy_call_t *)sigaction }, /* 46 = sigaction */ + { 0, (sy_call_t *)getgid }, /* 47 = getgid */ + { 2, (sy_call_t *)sigprocmask }, /* 48 = sigprocmask */ + { 2, (sy_call_t *)getlogin }, /* 49 = getlogin */ + { 1, (sy_call_t *)setlogin }, /* 50 = setlogin */ + { 1, (sy_call_t *)acct }, /* 51 = acct */ + { 0, (sy_call_t *)sigpending }, /* 52 = sigpending */ + { 2, (sy_call_t *)sigaltstack }, /* 53 = sigaltstack */ + { 3, (sy_call_t *)ioctl }, /* 54 = ioctl */ + { 1, (sy_call_t *)reboot }, /* 55 = reboot */ + { 1, (sy_call_t *)revoke }, /* 56 = 
revoke */ + { 2, (sy_call_t *)symlink }, /* 57 = symlink */ + { 3, (sy_call_t *)readlink }, /* 58 = readlink */ + { 3, (sy_call_t *)execve }, /* 59 = execve */ + { 1, (sy_call_t *)umask }, /* 60 = umask */ + { 1, (sy_call_t *)chroot }, /* 61 = chroot */ + { compat(2,fstat) }, /* 62 = old fstat */ + { compat(4,getkerninfo) }, /* 63 = old getkerninfo */ + { compat(0,getpagesize) }, /* 64 = old getpagesize */ + { 3, (sy_call_t *)msync }, /* 65 = msync */ + { 0, (sy_call_t *)vfork }, /* 66 = vfork */ + { 0, (sy_call_t *)nosys }, /* 67 = obsolete vread */ + { 0, (sy_call_t *)nosys }, /* 68 = obsolete vwrite */ + { 1, (sy_call_t *)sbrk }, /* 69 = sbrk */ + { 1, (sy_call_t *)sstk }, /* 70 = sstk */ + { compat(6,mmap) }, /* 71 = old mmap */ + { 1, (sy_call_t *)ovadvise }, /* 72 = vadvise */ + { 2, (sy_call_t *)munmap }, /* 73 = munmap */ + { 3, (sy_call_t *)mprotect }, /* 74 = mprotect */ + { 3, (sy_call_t *)madvise }, /* 75 = madvise */ + { 0, (sy_call_t *)nosys }, /* 76 = obsolete vhangup */ + { 0, (sy_call_t *)nosys }, /* 77 = obsolete vlimit */ + { 3, (sy_call_t *)mincore }, /* 78 = mincore */ + { 2, (sy_call_t *)getgroups }, /* 79 = getgroups */ + { 2, (sy_call_t *)setgroups }, /* 80 = setgroups */ + { 0, (sy_call_t *)getpgrp }, /* 81 = getpgrp */ + { 2, (sy_call_t *)setpgid }, /* 82 = setpgid */ + { 3, (sy_call_t *)setitimer }, /* 83 = setitimer */ + { compat(0,wait) }, /* 84 = old wait */ + { 1, (sy_call_t *)swapon }, /* 85 = swapon */ + { 2, (sy_call_t *)getitimer }, /* 86 = getitimer */ + { compat(2,gethostname) }, /* 87 = old gethostname */ + { compat(2,sethostname) }, /* 88 = old sethostname */ + { 0, (sy_call_t *)getdtablesize }, /* 89 = getdtablesize */ + { 2, (sy_call_t *)dup2 }, /* 90 = dup2 */ + { 0, (sy_call_t *)nosys }, /* 91 = getdopt */ + { 3, (sy_call_t *)fcntl }, /* 92 = fcntl */ + { 5, (sy_call_t *)select }, /* 93 = select */ + { 0, (sy_call_t *)nosys }, /* 94 = setdopt */ + { 1, (sy_call_t *)fsync }, /* 95 = fsync */ + { 3, (sy_call_t *)setpriority 
}, /* 96 = setpriority */ + { 3, (sy_call_t *)socket }, /* 97 = socket */ + { 3, (sy_call_t *)connect }, /* 98 = connect */ + { compat(3,accept) }, /* 99 = old accept */ + { 2, (sy_call_t *)getpriority }, /* 100 = getpriority */ + { compat(4,send) }, /* 101 = old send */ + { compat(4,recv) }, /* 102 = old recv */ + { 1, (sy_call_t *)sigreturn }, /* 103 = sigreturn */ + { 3, (sy_call_t *)bind }, /* 104 = bind */ + { 5, (sy_call_t *)setsockopt }, /* 105 = setsockopt */ + { 2, (sy_call_t *)listen }, /* 106 = listen */ + { 0, (sy_call_t *)nosys }, /* 107 = obsolete vtimes */ + { compat(3,sigvec) }, /* 108 = old sigvec */ + { compat(1,sigblock) }, /* 109 = old sigblock */ + { compat(1,sigsetmask) }, /* 110 = old sigsetmask */ + { 1, (sy_call_t *)sigsuspend }, /* 111 = sigsuspend */ + { compat(2,sigstack) }, /* 112 = old sigstack */ + { compat(3,recvmsg) }, /* 113 = old recvmsg */ + { compat(3,sendmsg) }, /* 114 = old sendmsg */ + { 0, (sy_call_t *)nosys }, /* 115 = obsolete vtrace */ + { 2, (sy_call_t *)gettimeofday }, /* 116 = gettimeofday */ + { 2, (sy_call_t *)getrusage }, /* 117 = getrusage */ + { 5, (sy_call_t *)getsockopt }, /* 118 = getsockopt */ + { 0, (sy_call_t *)nosys }, /* 119 = resuba */ + { 3, (sy_call_t *)readv }, /* 120 = readv */ + { 3, (sy_call_t *)writev }, /* 121 = writev */ + { 2, (sy_call_t *)settimeofday }, /* 122 = settimeofday */ + { 3, (sy_call_t *)fchown }, /* 123 = fchown */ + { 2, (sy_call_t *)fchmod }, /* 124 = fchmod */ + { compat(6,recvfrom) }, /* 125 = old recvfrom */ + { 2, (sy_call_t *)setreuid }, /* 126 = setreuid */ + { 2, (sy_call_t *)setregid }, /* 127 = setregid */ + { 2, (sy_call_t *)rename }, /* 128 = rename */ + { compat(2,truncate) }, /* 129 = old truncate */ + { compat(2,ftruncate) }, /* 130 = old ftruncate */ + { 2, (sy_call_t *)flock }, /* 131 = flock */ + { 2, (sy_call_t *)mkfifo }, /* 132 = mkfifo */ + { 6, (sy_call_t *)sendto }, /* 133 = sendto */ + { 2, (sy_call_t *)shutdown }, /* 134 = shutdown */ + { 4, (sy_call_t 
*)socketpair }, /* 135 = socketpair */ + { 2, (sy_call_t *)mkdir }, /* 136 = mkdir */ + { 1, (sy_call_t *)rmdir }, /* 137 = rmdir */ + { 2, (sy_call_t *)utimes }, /* 138 = utimes */ + { 0, (sy_call_t *)nosys }, /* 139 = obsolete 4.2 sigreturn */ + { 2, (sy_call_t *)adjtime }, /* 140 = adjtime */ + { compat(3,getpeername) }, /* 141 = old getpeername */ + { compat(0,gethostid) }, /* 142 = old gethostid */ + { compat(1,sethostid) }, /* 143 = old sethostid */ + { compat(2,getrlimit) }, /* 144 = old getrlimit */ + { compat(2,setrlimit) }, /* 145 = old setrlimit */ + { compat(2,killpg) }, /* 146 = old killpg */ + { 0, (sy_call_t *)setsid }, /* 147 = setsid */ + { 4, (sy_call_t *)quotactl }, /* 148 = quotactl */ + { compat(0,quota) }, /* 149 = old quota */ + { compat(3,getsockname) }, /* 150 = old getsockname */ + { 0, (sy_call_t *)nosys }, /* 151 = sem_lock */ + { 0, (sy_call_t *)nosys }, /* 152 = sem_wakeup */ + { 0, (sy_call_t *)nosys }, /* 153 = asyncdaemon */ + { 0, (sy_call_t *)nosys }, /* 154 = nosys */ #ifdef NFS - { 2, s(struct nfssvc_args), - nfssvc }, /* 155 = nfssvc */ + { 2, (sy_call_t *)nfssvc }, /* 155 = nfssvc */ #else - { 0, 0, - nosys }, /* 155 = unimplemented nfssvc */ + { 0, (sy_call_t *)nosys }, /* 155 = nosys */ #endif - { 4, s(struct compat_43_getdirentries_args), - compat_43(getdirentries) }, /* 156 = compat_43 getdirentries */ - { 2, s(struct statfs_args), - statfs }, /* 157 = statfs */ - { 2, s(struct fstatfs_args), - fstatfs }, /* 158 = fstatfs */ - { 0, 0, - nosys }, /* 159 = unimplemented */ - { 0, 0, - nosys }, /* 160 = unimplemented */ -#ifdef NFS - { 2, s(struct getfh_args), - getfh }, /* 161 = getfh */ -#else - { 0, 0, - nosys }, /* 161 = unimplemented getfh */ -#endif - { 0, 0, - nosys }, /* 162 = unimplemented getdomainname */ - { 0, 0, - nosys }, /* 163 = unimplemented setdomainname */ - { 0, 0, - nosys }, /* 164 = unimplemented */ - { 0, 0, - nosys }, /* 165 = unimplemented */ - { 0, 0, - nosys }, /* 166 = unimplemented */ - { 0, 0, - 
nosys }, /* 167 = unimplemented */ - { 0, 0, - nosys }, /* 168 = unimplemented */ - { 0, 0, - nosys }, /* 169 = unimplemented semsys */ - { 0, 0, - nosys }, /* 170 = unimplemented msgsys */ -#if defined(SYSVSHM) && !defined(alpha) - { 4, s(struct compat_43_shmsys_args), - compat_43(shmsys) }, /* 171 = compat_43 shmsys */ + { compat(4,getdirentries) }, /* 156 = old getdirentries */ + { 2, (sy_call_t *)statfs }, /* 157 = statfs */ + { 2, (sy_call_t *)fstatfs }, /* 158 = fstatfs */ + { 0, (sy_call_t *)nosys }, /* 159 = nosys */ + { 0, (sy_call_t *)nosys }, /* 160 = nosys */ +#if defined(NFS) && !defined (NFS_NOSERVER) + { 2, (sy_call_t *)getfh }, /* 161 = getfh */ #else - { 0, 0, - nosys }, /* 171 = unimplemented shmsys */ + { 0, (sy_call_t *)nosys }, /* 161 = nosys */ #endif - { 0, 0, - nosys }, /* 172 = unimplemented */ - { 0, 0, - nosys }, /* 173 = unimplemented */ - { 0, 0, - nosys }, /* 174 = unimplemented */ - { 0, 0, - nosys }, /* 175 = unimplemented */ - { 0, 0, - nosys }, /* 176 = unimplemented */ - { 0, 0, - nosys }, /* 177 = unimplemented */ - { 0, 0, - nosys }, /* 178 = unimplemented */ - { 0, 0, - nosys }, /* 179 = unimplemented */ - { 0, 0, - nosys }, /* 180 = unimplemented */ - { 1, s(struct setgid_args), - setgid }, /* 181 = setgid */ - { 1, s(struct setegid_args), - setegid }, /* 182 = setegid */ - { 1, s(struct seteuid_args), - seteuid }, /* 183 = seteuid */ + { 2, (sy_call_t *)getdomainname }, /* 162 = getdomainname */ + { 2, (sy_call_t *)setdomainname }, /* 163 = setdomainname */ + { 1, (sy_call_t *)uname }, /* 164 = uname */ + { 2, (sy_call_t *)sysarch }, /* 165 = sysarch */ + { 3, (sy_call_t *)rtprio }, /* 166 = rtprio */ + { 0, (sy_call_t *)nosys }, /* 167 = nosys */ + { 0, (sy_call_t *)nosys }, /* 168 = nosys */ + { 5, (sy_call_t *)semsys }, /* 169 = semsys */ + { 6, (sy_call_t *)msgsys }, /* 170 = msgsys */ + { 4, (sy_call_t *)shmsys }, /* 171 = shmsys */ + { 0, (sy_call_t *)nosys }, /* 172 = nosys */ + { 0, (sy_call_t *)nosys }, /* 173 = 
nosys */ + { 0, (sy_call_t *)nosys }, /* 174 = nosys */ + { 0, (sy_call_t *)nosys }, /* 175 = nosys */ + { 1, (sy_call_t *)ntp_adjtime }, /* 176 = ntp_adjtime */ + { 0, (sy_call_t *)nosys }, /* 177 = sfork */ + { 0, (sy_call_t *)nosys }, /* 178 = getdescriptor */ + { 0, (sy_call_t *)nosys }, /* 179 = setdescriptor */ + { 0, (sy_call_t *)nosys }, /* 180 = nosys */ + { 1, (sy_call_t *)setgid }, /* 181 = setgid */ + { 1, (sy_call_t *)setegid }, /* 182 = setegid */ + { 1, (sy_call_t *)seteuid }, /* 183 = seteuid */ #ifdef LFS - { 3, s(struct lfs_bmapv_args), - lfs_bmapv }, /* 184 = lfs_bmapv */ - { 3, s(struct lfs_markv_args), - lfs_markv }, /* 185 = lfs_markv */ - { 2, s(struct lfs_segclean_args), - lfs_segclean }, /* 186 = lfs_segclean */ - { 2, s(struct lfs_segwait_args), - lfs_segwait }, /* 187 = lfs_segwait */ + { 3, (sy_call_t *)lfs_bmapv }, /* 184 = lfs_bmapv */ + { 3, (sy_call_t *)lfs_markv }, /* 185 = lfs_markv */ + { 2, (sy_call_t *)lfs_segclean }, /* 186 = lfs_segclean */ + { 2, (sy_call_t *)lfs_segwait }, /* 187 = lfs_segwait */ #else - { 0, 0, - nosys }, /* 184 = unimplemented lfs_bmapv */ - { 0, 0, - nosys }, /* 185 = unimplemented lfs_markv */ - { 0, 0, - nosys }, /* 186 = unimplemented lfs_segclean */ - { 0, 0, - nosys }, /* 187 = unimplemented lfs_segwait */ -#endif - { 2, s(struct stat_args), - stat }, /* 188 = stat */ - { 2, s(struct fstat_args), - fstat }, /* 189 = fstat */ - { 2, s(struct lstat_args), - lstat }, /* 190 = lstat */ - { 2, s(struct pathconf_args), - pathconf }, /* 191 = pathconf */ - { 2, s(struct fpathconf_args), - fpathconf }, /* 192 = fpathconf */ - { 0, 0, - nosys }, /* 193 = unimplemented */ - { 2, s(struct getrlimit_args), - getrlimit }, /* 194 = getrlimit */ - { 2, s(struct setrlimit_args), - setrlimit }, /* 195 = setrlimit */ - { 4, s(struct getdirentries_args), - getdirentries }, /* 196 = getdirentries */ - { 7, s(struct mmap_args), - mmap }, /* 197 = mmap */ - { 0, 0, - nosys }, /* 198 = __syscall */ - { 4, s(struct 
lseek_args), - lseek }, /* 199 = lseek */ - { 3, s(struct truncate_args), - truncate }, /* 200 = truncate */ - { 3, s(struct ftruncate_args), - ftruncate }, /* 201 = ftruncate */ - { 6, s(struct __sysctl_args), - __sysctl }, /* 202 = __sysctl */ - { 2, s(struct mlock_args), - mlock }, /* 203 = mlock */ - { 2, s(struct munlock_args), - munlock }, /* 204 = munlock */ - { 1, s(struct undelete_args), - undelete }, /* 205 = undelete */ - { 0, 0, - nosys }, /* 206 = unimplemented */ - { 0, 0, - nosys }, /* 207 = unimplemented */ - { 0, 0, - nosys }, /* 208 = unimplemented */ - { 0, 0, - nosys }, /* 209 = unimplemented */ - { 0, 0, - nosys }, /* 210 = unimplemented */ - { 0, 0, - nosys }, /* 211 = unimplemented */ - { 0, 0, - nosys }, /* 212 = unimplemented */ - { 0, 0, - nosys }, /* 213 = unimplemented */ - { 0, 0, - nosys }, /* 214 = unimplemented */ - { 0, 0, - nosys }, /* 215 = unimplemented */ - { 0, 0, - nosys }, /* 216 = unimplemented */ - { 0, 0, - nosys }, /* 217 = unimplemented */ - { 0, 0, - nosys }, /* 218 = unimplemented */ - { 0, 0, - nosys }, /* 219 = unimplemented */ - { 0, 0, - nosys }, /* 220 = unimplemented semctl */ - { 0, 0, - nosys }, /* 221 = unimplemented semget */ - { 0, 0, - nosys }, /* 222 = unimplemented semop */ - { 0, 0, - nosys }, /* 223 = unimplemented semconfig */ - { 0, 0, - nosys }, /* 224 = unimplemented msgctl */ - { 0, 0, - nosys }, /* 225 = unimplemented msgget */ - { 0, 0, - nosys }, /* 226 = unimplemented msgsnd */ - { 0, 0, - nosys }, /* 227 = unimplemented msgrcv */ -#if defined(SYSVSHM) && 0 - { 3, s(struct shmat_args), - shmat }, /* 228 = shmat */ - { 3, s(struct shmctl_args), - shmctl }, /* 229 = shmctl */ - { 1, s(struct shmdt_args), - shmdt }, /* 230 = shmdt */ - { 3, s(struct shmget_args), - shmget }, /* 231 = shmget */ -#else - { 0, 0, - nosys }, /* 228 = unimplemented shmat */ - { 0, 0, - nosys }, /* 229 = unimplemented shmctl */ - { 0, 0, - nosys }, /* 230 = unimplemented shmdt */ - { 0, 0, - nosys }, /* 231 = 
unimplemented shmget */ + { 0, (sy_call_t *)nosys }, /* 184 = nosys */ + { 0, (sy_call_t *)nosys }, /* 185 = nosys */ + { 0, (sy_call_t *)nosys }, /* 186 = nosys */ + { 0, (sy_call_t *)nosys }, /* 187 = nosys */ #endif + { 2, (sy_call_t *)stat }, /* 188 = stat */ + { 2, (sy_call_t *)fstat }, /* 189 = fstat */ + { 2, (sy_call_t *)lstat }, /* 190 = lstat */ + { 2, (sy_call_t *)pathconf }, /* 191 = pathconf */ + { 2, (sy_call_t *)fpathconf }, /* 192 = fpathconf */ + { 0, (sy_call_t *)nosys }, /* 193 = nosys */ + { 2, (sy_call_t *)getrlimit }, /* 194 = getrlimit */ + { 2, (sy_call_t *)setrlimit }, /* 195 = setrlimit */ + { 4, (sy_call_t *)getdirentries }, /* 196 = getdirentries */ + { 8, (sy_call_t *)mmap }, /* 197 = mmap */ + { 0, (sy_call_t *)nosys }, /* 198 = __syscall */ + { 5, (sy_call_t *)lseek }, /* 199 = lseek */ + { 4, (sy_call_t *)truncate }, /* 200 = truncate */ + { 4, (sy_call_t *)ftruncate }, /* 201 = ftruncate */ + { 6, (sy_call_t *)__sysctl }, /* 202 = __sysctl */ + { 2, (sy_call_t *)mlock }, /* 203 = mlock */ + { 2, (sy_call_t *)munlock }, /* 204 = munlock */ + { 2, (sy_call_t *)utrace }, /* 205 = utrace */ + { 1, (sy_call_t *)undelete }, /* 206 = undelete */ + { 0, (sy_call_t *)nosys }, /* 207 = nosys */ + { 0, (sy_call_t *)nosys }, /* 208 = nosys */ + { 0, (sy_call_t *)nosys }, /* 209 = nosys */ + { 0, (sy_call_t *)lkmnosys }, /* 210 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 211 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 212 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 213 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 214 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 215 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 216 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 217 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 218 = lkmnosys */ + { 0, (sy_call_t *)lkmnosys }, /* 219 = lkmnosys */ + { 4, (sy_call_t *)__semctl }, /* 220 = __semctl */ + { 3, (sy_call_t *)semget }, /* 221 = semget */ + { 3, (sy_call_t *)semop }, /* 222 
= semop */ + { 1, (sy_call_t *)semconfig }, /* 223 = semconfig */ + { 3, (sy_call_t *)msgctl }, /* 224 = msgctl */ + { 2, (sy_call_t *)msgget }, /* 225 = msgget */ + { 4, (sy_call_t *)msgsnd }, /* 226 = msgsnd */ + { 5, (sy_call_t *)msgrcv }, /* 227 = msgrcv */ + { 3, (sy_call_t *)shmat }, /* 228 = shmat */ + { 3, (sy_call_t *)shmctl }, /* 229 = shmctl */ + { 1, (sy_call_t *)shmdt }, /* 230 = shmdt */ + { 3, (sy_call_t *)shmget }, /* 231 = shmget */ + { 0, (sy_call_t *)nosys }, /* 232 = nosys */ + { 0, (sy_call_t *)nosys }, /* 233 = nosys */ + { 0, (sy_call_t *)nosys }, /* 234 = nosys */ + { 0, (sy_call_t *)nosys }, /* 235 = nosys */ + { 0, (sy_call_t *)nosys }, /* 236 = nosys */ + { 0, (sy_call_t *)nosys }, /* 237 = nosys */ + { 0, (sy_call_t *)nosys }, /* 238 = nosys */ + { 0, (sy_call_t *)nosys }, /* 239 = nosys */ + { 0, (sy_call_t *)nosys }, /* 240 = nosys */ + { 0, (sy_call_t *)nosys }, /* 241 = nosys */ + { 0, (sy_call_t *)nosys }, /* 242 = nosys */ + { 0, (sy_call_t *)nosys }, /* 243 = nosys */ + { 0, (sy_call_t *)nosys }, /* 244 = nosys */ + { 0, (sy_call_t *)nosys }, /* 245 = nosys */ + { 0, (sy_call_t *)nosys }, /* 246 = nosys */ + { 0, (sy_call_t *)nosys }, /* 247 = nosys */ + { 0, (sy_call_t *)nosys }, /* 248 = nosys */ + { 0, (sy_call_t *)nosys }, /* 249 = nosys */ + { 3, (sy_call_t *)minherit }, /* 250 = minherit */ + { 1, (sy_call_t *)rfork }, /* 251 = rfork */ }; - -int nsysent= sizeof(sysent) / sizeof(sysent[0]); diff --git a/sys/kern/init_sysvec.c b/sys/kern/init_sysvec.c new file mode 100644 index 0000000..379a1bf --- /dev/null +++ b/sys/kern/init_sysvec.c @@ -0,0 +1,30 @@ +/* + * sysentvec for native FreeBSD a.out executable format. 
+ * + * $Id$ + */ + +#include <sys/types.h> +#include <sys/param.h> +#include <sys/mount.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/syscall.h> +#include <sys/signalvar.h> +#include <machine/md_var.h> + +struct sysentvec aout_sysvec = { + SYS_MAXSYSCALL, + sysent, + 0, + 0, + 0, + 0, + 0, + 0, + sendsig, + sigcode, + &szsigcode, + 0, + "FreeBSD a.out" +}; diff --git a/sys/kern/kern_acct.c b/sys/kern/kern_acct.c index a23543c..f72d2d0 100644 --- a/sys/kern/kern_acct.c +++ b/sys/kern/kern_acct.c @@ -1,4 +1,5 @@ /*- + * Copyright (c) 1994 Christopher G. Demetriou * Copyright (c) 1982, 1986, 1989, 1993 * The Regents of the University of California. All rights reserved. * (c) UNIX System Laboratories, Inc. @@ -35,91 +36,278 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * from: @(#)kern_acct.c 8.8 (Berkeley) 5/14/95 + * @(#)kern_acct.c 8.1 (Berkeley) 6/14/93 + * $Id: kern_acct.c,v 1.14 1997/03/23 03:36:17 bde Exp $ */ #include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/proc.h> #include <sys/mount.h> #include <sys/vnode.h> -#include <sys/file.h> +#include <sys/fcntl.h> #include <sys/syslog.h> #include <sys/kernel.h> +#include <sys/sysent.h> +#include <sys/sysctl.h> +#include <sys/namei.h> +#include <sys/errno.h> +#include <sys/acct.h> +#include <sys/resourcevar.h> +#include <sys/tty.h> -acct(a1, a2, a3) +/* + * The routines implemented in this file are described in: + * Leffler, et al.: The Design and Implementation of the 4.3BSD + * UNIX Operating System (Addison Welley, 1989) + * on pages 62-63. + * + * Arguably, to simplify accounting operations, this mechanism should + * be replaced by one in which an accounting log file (similar to /dev/klog) + * is read by a user process, etc. However, that has its own problems. + */ + +/* + * Internal accounting functions. 
+ * The former's operation is described in Leffler, et al., and the latter + * was provided by UCB with the 4.4BSD-Lite release + */ +static comp_t encode_comp_t __P((u_long, u_long)); +static void acctwatch __P((void *)); + +/* + * Accounting vnode pointer, and saved vnode pointer. + */ +static struct vnode *acctp; +static struct vnode *savacctp; + +/* + * Values associated with enabling and disabling accounting + */ +static int acctsuspend = 2; /* stop accounting when < 2% free space left */ +SYSCTL_INT(_kern, OID_AUTO, acct_suspend, CTLFLAG_RW, + &acctsuspend, 0, ""); + +static int acctresume = 4; /* resume when free space risen to > 4% */ +SYSCTL_INT(_kern, OID_AUTO, acct_resume, CTLFLAG_RW, + &acctresume, 0, ""); + +static int acctchkfreq = 15; /* frequency (in seconds) to check space */ +SYSCTL_INT(_kern, OID_AUTO, acct_chkfreq, CTLFLAG_RW, + &acctchkfreq, 0, ""); + +/* + * Accounting system call. Written based on the specification and + * previous implementation done by Mark Tinguely. + */ +int +acct(a1, uap, a3) struct proc *a1; struct acct_args /* { syscallarg(char *) path; - } */ *a2; + } */ *uap; int *a3; { + struct proc *p = curproc; /* XXX */ + struct nameidata nd; + int error; + + /* Make sure that the caller is root. */ + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + /* - * Body deleted. + * If accounting is to be started to a file, open that file for + * writing and make sure it's a 'normal'. */ - return (ENOSYS); -} + if (SCARG(uap, path) != NULL) { + NDINIT(&nd, LOOKUP, NOFOLLOW, UIO_USERSPACE, SCARG(uap, path), + p); + error = vn_open(&nd, FWRITE, 0); + if (error) + return (error); + VOP_UNLOCK(nd.ni_vp, 0, p); + if (nd.ni_vp->v_type != VREG) { + vn_close(nd.ni_vp, FWRITE, p->p_ucred, p); + return (EACCES); + } + } -acct_process(a1) - struct proc *a1; -{ + /* + * If accounting was previously enabled, kill the old space-watcher, + * close the file, and (if no new file was specified, leave). 
+ */ + if (acctp != NULLVP || savacctp != NULLVP) { + untimeout(acctwatch, NULL); + error = vn_close((acctp != NULLVP ? acctp : savacctp), FWRITE, + p->p_ucred, p); + acctp = savacctp = NULLVP; + } + if (SCARG(uap, path) == NULL) + return (error); /* - * Body deleted. + * Save the new accounting file vnode, and schedule the new + * free space watcher. */ - return; + acctp = nd.ni_vp; + acctwatch(NULL); + return (error); } /* - * Periodically check the file system to see if accounting - * should be turned on or off. Beware the case where the vnode - * has been vgone()'d out from underneath us, e.g. when the file - * system containing the accounting file has been forcibly unmounted. + * Write out process accounting information, on process exit. + * Data to be written out is specified in Leffler, et al. + * and are enumerated below. (They're also noted in the system + * "acct.h" header file.) */ +int +acct_process(p) + struct proc *p; +{ + struct acct acct; + struct rusage *r; + struct timeval ut, st, tmp; + int t; + struct vnode *vp; + + /* If accounting isn't enabled, don't bother */ + vp = acctp; + if (vp == NULLVP) + return (0); + + /* + * Get process accounting information. 
+ */ + + /* (1) The name of the command that ran */ + bcopy(p->p_comm, acct.ac_comm, sizeof acct.ac_comm); + + /* (2) The amount of user and system time that was used */ + calcru(p, &ut, &st, NULL); + acct.ac_utime = encode_comp_t(ut.tv_sec, ut.tv_usec); + acct.ac_stime = encode_comp_t(st.tv_sec, st.tv_usec); + + /* (3) The elapsed time the commmand ran (and its starting time) */ + acct.ac_btime = p->p_stats->p_start.tv_sec; + microtime(&tmp); + timevalsub(&tmp, &p->p_stats->p_start); + acct.ac_etime = encode_comp_t(tmp.tv_sec, tmp.tv_usec); + + /* (4) The average amount of memory used */ + r = &p->p_stats->p_ru; + tmp = ut; + timevaladd(&tmp, &st); + t = tmp.tv_sec * hz + tmp.tv_usec / tick; + if (t) + acct.ac_mem = (r->ru_ixrss + r->ru_idrss + r->ru_isrss) / t; + else + acct.ac_mem = 0; + + /* (5) The number of disk I/O operations done */ + acct.ac_io = encode_comp_t(r->ru_inblock + r->ru_oublock, 0); + + /* (6) The UID and GID of the process */ + acct.ac_uid = p->p_cred->p_ruid; + acct.ac_gid = p->p_cred->p_rgid; + + /* (7) The terminal from which the process was started */ + if ((p->p_flag & P_CONTROLT) && p->p_pgrp->pg_session->s_ttyp) + acct.ac_tty = p->p_pgrp->pg_session->s_ttyp->t_dev; + else + acct.ac_tty = NODEV; + + /* (8) The boolean flags that tell how the process terminated, etc. */ + acct.ac_flag = p->p_acflag; + + /* + * Now, just write the accounting information to the file. + */ + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + return (vn_rdwr(UIO_WRITE, vp, (caddr_t)&acct, sizeof (acct), + (off_t)0, UIO_SYSSPACE, IO_APPEND|IO_UNIT, p->p_ucred, + (int *)0, p)); +} + /* - * Values associated with enabling and disabling accounting + * Encode_comp_t converts from ticks in seconds and microseconds + * to ticks in 1/AHZ seconds. The encoding is described in + * Leffler, et al., on page 63. 
*/ -int acctsuspend = 2; /* stop accounting when < 2% free space left */ -int acctresume = 4; /* resume when free space risen to > 4% */ -int acctchkfreq = 15; /* frequency (in seconds) to check space */ + +#define MANTSIZE 13 /* 13 bit mantissa. */ +#define EXPSIZE 3 /* Base 8 (3 bit) exponent. */ +#define MAXFRACT ((1 << MANTSIZE) - 1) /* Maximum fractional value. */ + +static comp_t +encode_comp_t(s, us) + u_long s, us; +{ + int exp, rnd; + + exp = 0; + rnd = 0; + s *= AHZ; + s += us / (1000000 / AHZ); /* Maximize precision. */ + + while (s > MAXFRACT) { + rnd = s & (1 << (EXPSIZE - 1)); /* Round up? */ + s >>= EXPSIZE; /* Base 8 exponent == 3 bit shift. */ + exp++; + } + + /* If we need to round up, do it (and handle overflow correctly). */ + if (rnd && (++s > MAXFRACT)) { + s >>= EXPSIZE; + exp++; + } + + /* Clean it up and polish it off. */ + exp <<= MANTSIZE; /* Shift the exponent into place */ + exp += s; /* and add on the mantissa. */ + return (exp); +} /* - * SHOULD REPLACE THIS WITH A DRIVER THAT CAN BE READ TO SIMPLIFY. + * Periodically check the file system to see if accounting + * should be turned on or off. Beware the case where the vnode + * has been vgone()'d out from underneath us, e.g. when the file + * system containing the accounting file has been forcibly unmounted. 
*/ -struct vnode *acctp; -struct vnode *savacctp; - /* ARGSUSED */ -void +static void acctwatch(a) void *a; { struct statfs sb; - if (savacctp) { + if (savacctp != NULLVP) { if (savacctp->v_type == VBAD) { (void) vn_close(savacctp, FWRITE, NOCRED, NULL); - savacctp = NULL; + savacctp = NULLVP; return; } (void)VFS_STATFS(savacctp->v_mount, &sb, (struct proc *)0); if (sb.f_bavail > acctresume * sb.f_blocks / 100) { acctp = savacctp; - savacctp = NULL; + savacctp = NULLVP; log(LOG_NOTICE, "Accounting resumed\n"); } } else { - if (acctp == NULL) + if (acctp == NULLVP) return; if (acctp->v_type == VBAD) { (void) vn_close(acctp, FWRITE, NOCRED, NULL); - acctp = NULL; + acctp = NULLVP; return; } (void)VFS_STATFS(acctp->v_mount, &sb, (struct proc *)0); if (sb.f_bavail <= acctsuspend * sb.f_blocks / 100) { savacctp = acctp; - acctp = NULL; + acctp = NULLVP; log(LOG_NOTICE, "Accounting suspended\n"); } } diff --git a/sys/kern/kern_clock.c b/sys/kern/kern_clock.c index f42900c..171ed0e 100644 --- a/sys/kern/kern_clock.c +++ b/sys/kern/kern_clock.c @@ -36,8 +36,28 @@ * SUCH DAMAGE. * * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_clock.c,v 1.34 1997/03/22 16:52:19 mpp Exp $ */ +/* Portions of this software are covered by the following: */ +/****************************************************************************** + * * + * Copyright (c) David L. Mills 1993, 1994 * + * * + * Permission to use, copy, modify, and distribute this software and its * + * documentation for any purpose and without fee is hereby granted, provided * + * that the above copyright notice appears in all copies and that both the * + * copyright notice and this permission notice appear in supporting * + * documentation, and that the name University of Delaware not be used in * + * advertising or publicity pertaining to distribution of the software * + * without specific, written prior permission. 
The University of Delaware * + * makes no representations about the suitability this software for any * + * purpose. It is provided "as is" without express or implied warranty. * + * * + *****************************************************************************/ + +#include "opt_cpu.h" /* XXX */ + #include <sys/param.h> #include <sys/systm.h> #include <sys/dkstat.h> @@ -45,13 +65,49 @@ #include <sys/kernel.h> #include <sys/proc.h> #include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/timex.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/sysctl.h> #include <machine/cpu.h> +#define CLOCK_HAIR /* XXX */ +#include <machine/clock.h> #ifdef GPROF #include <sys/gmon.h> #endif +static void initclocks __P((void *dummy)); +SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) + +/* Exported to machdep.c. */ +struct callout *callfree, *callout; + +static struct callout calltodo; + +/* Some of these don't belong here, but it's easiest to concentrate them. */ +static long cp_time[CPUSTATES]; +long dk_seek[DK_NDRIVE]; +static long dk_time[DK_NDRIVE]; +long dk_wds[DK_NDRIVE]; +long dk_wpms[DK_NDRIVE]; +long dk_xfer[DK_NDRIVE]; + +int dk_busy; +int dk_ndrive = 0; +char dk_names[DK_NDRIVE][DK_NAMELEN]; + +long tk_cancc; +long tk_nin; +long tk_nout; +long tk_rawcc; + /* * Clock handling routines. * @@ -97,19 +153,278 @@ int stathz; int profhz; -int profprocs; +static int profprocs; int ticks; static int psdiv, pscnt; /* prof => stat divider */ -int psratio; /* ratio: prof / stat */ +int psratio; /* ratio: prof / stat */ volatile struct timeval time; volatile struct timeval mono_time; /* - * Initialize clock frequencies and start both clocks running. + * Phase/frequency-lock loop (PLL/FLL) definitions + * + * The following variables are read and set by the ntp_adjtime() system + * call. 
+ * + * time_state shows the state of the system clock, with values defined + * in the timex.h header file. + * + * time_status shows the status of the system clock, with bits defined + * in the timex.h header file. + * + * time_offset is used by the PLL/FLL to adjust the system time in small + * increments. + * + * time_constant determines the bandwidth or "stiffness" of the PLL. + * + * time_tolerance determines maximum frequency error or tolerance of the + * CPU clock oscillator and is a property of the architecture; however, + * in principle it could change as result of the presence of external + * discipline signals, for instance. + * + * time_precision is usually equal to the kernel tick variable; however, + * in cases where a precision clock counter or external clock is + * available, the resolution can be much less than this and depend on + * whether the external clock is working or not. + * + * time_maxerror is initialized by a ntp_adjtime() call and increased by + * the kernel once each second to reflect the maximum error + * bound growth. + * + * time_esterror is set and read by the ntp_adjtime() call, but + * otherwise not used by the kernel. + */ +int time_status = STA_UNSYNC; /* clock status bits */ +int time_state = TIME_OK; /* clock state */ +long time_offset = 0; /* time offset (us) */ +long time_constant = 0; /* pll time constant */ +long time_tolerance = MAXFREQ; /* frequency tolerance (scaled ppm) */ +long time_precision = 1; /* clock precision (us) */ +long time_maxerror = MAXPHASE; /* maximum error (us) */ +long time_esterror = MAXPHASE; /* estimated error (us) */ + +/* + * The following variables establish the state of the PLL/FLL and the + * residual time and frequency offset of the local clock. The scale + * factors are defined in the timex.h header file. + * + * time_phase and time_freq are the phase increment and the frequency + * increment, respectively, of the kernel time variable at each tick of + * the clock. 
+ * + * time_freq is set via ntp_adjtime() from a value stored in a file when + * the synchronization daemon is first started. Its value is retrieved + * via ntp_adjtime() and written to the file about once per hour by the + * daemon. + * + * time_adj is the adjustment added to the value of tick at each timer + * interrupt and is recomputed from time_phase and time_freq at each + * seconds rollover. + * + * time_reftime is the second's portion of the system time on the last + * call to ntp_adjtime(). It is used to adjust the time_freq variable + * and to increase the time_maxerror as the time since last update + * increases. + */ +static long time_phase = 0; /* phase offset (scaled us) */ +long time_freq = 0; /* frequency offset (scaled ppm) */ +static long time_adj = 0; /* tick adjust (scaled 1 / hz) */ +static long time_reftime = 0; /* time at last adjustment (s) */ + +#ifdef PPS_SYNC +/* + * The following variables are used only if the kernel PPS discipline + * code is configured (PPS_SYNC). The scale factors are defined in the + * timex.h header file. + * + * pps_time contains the time at each calibration interval, as read by + * microtime(). pps_count counts the seconds of the calibration + * interval, the duration of which is nominally pps_shift in powers of + * two. + * + * pps_offset is the time offset produced by the time median filter + * pps_tf[], while pps_jitter is the dispersion (jitter) measured by + * this filter. + * + * pps_freq is the frequency offset produced by the frequency median + * filter pps_ff[], while pps_stabil is the dispersion (wander) measured + * by this filter. + * + * pps_usec is latched from a high resolution counter or external clock + * at pps_time. Here we want the hardware counter contents only, not the + * contents plus the time_tv.usec as usual. + * + * pps_valid counts the number of seconds since the last PPS update. It + * is used as a watchdog timer to disable the PPS discipline should the + * PPS signal be lost. 
+ * + * pps_glitch counts the number of seconds since the beginning of an + * offset burst more than tick/2 from current nominal offset. It is used + * mainly to suppress error bursts due to priority conflicts between the + * PPS interrupt and timer interrupt. + * + * pps_intcnt counts the calibration intervals for use in the interval- + * adaptation algorithm. It's just too complicated for words. + */ +struct timeval pps_time; /* kernel time at last interval */ +long pps_offset = 0; /* pps time offset (us) */ +long pps_jitter = MAXTIME; /* pps time dispersion (jitter) (us) */ +long pps_tf[] = {0, 0, 0}; /* pps time offset median filter (us) */ +long pps_freq = 0; /* frequency offset (scaled ppm) */ +long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */ +long pps_ff[] = {0, 0, 0}; /* frequency offset median filter */ +long pps_usec = 0; /* microsec counter at last interval */ +long pps_valid = PPS_VALID; /* pps signal watchdog counter */ +int pps_glitch = 0; /* pps signal glitch counter */ +int pps_count = 0; /* calibration interval counter (s) */ +int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */ +int pps_intcnt = 0; /* intervals at current duration */ + +/* + * PPS signal quality monitors + * + * pps_jitcnt counts the seconds that have been discarded because the + * jitter measured by the time median filter exceeds the limit MAXTIME + * (100 us). + * + * pps_calcnt counts the frequency calibration intervals, which are + * variable from 4 s to 256 s. + * + * pps_errcnt counts the calibration intervals which have been discarded + * because the wander exceeds the limit MAXFREQ (100 ppm) or where the + * calibration interval jitter exceeds two ticks. + * + * pps_stbcnt counts the calibration intervals that have been discarded + * because the frequency wander exceeds the limit MAXFREQ / 4 (25 us). 
+ */ +long pps_jitcnt = 0; /* jitter limit exceeded */ +long pps_calcnt = 0; /* calibration intervals */ +long pps_errcnt = 0; /* calibration errors */ +long pps_stbcnt = 0; /* stability limit exceeded */ +#endif /* PPS_SYNC */ + +/* XXX none of this stuff works under FreeBSD */ +#ifdef EXT_CLOCK +/* + * External clock definitions + * + * The following definitions and declarations are used only if an + * external clock (HIGHBALL or TPRO) is configured on the system. + */ +#define CLOCK_INTERVAL 30 /* CPU clock update interval (s) */ + +/* + * The clock_count variable is set to CLOCK_INTERVAL at each PPS + * interrupt and decremented once each second. + */ +int clock_count = 0; /* CPU clock counter */ + +#ifdef HIGHBALL +/* + * The clock_offset and clock_cpu variables are used by the HIGHBALL + * interface. The clock_offset variable defines the offset between + * system time and the HIGBALL counters. The clock_cpu variable contains + * the offset between the system clock and the HIGHBALL clock for use in + * disciplining the kernel time variable. + */ +extern struct timeval clock_offset; /* Highball clock offset */ +long clock_cpu = 0; /* CPU clock adjust */ +#endif /* HIGHBALL */ +#endif /* EXT_CLOCK */ + +/* + * hardupdate() - local clock update + * + * This routine is called by ntp_adjtime() to update the local clock + * phase and frequency. The implementation is of an adaptive-parameter, + * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new + * time and frequency offset estimates for each call. If the kernel PPS + * discipline code is configured (PPS_SYNC), the PPS signal itself + * determines the new time offset, instead of the calling argument. + * Presumably, calls to ntp_adjtime() occur only when the caller + * believes the local clock is valid within some bound (+-128 ms with + * NTP). If the caller's time is far different than the PPS time, an + * argument will ensue, and it's not clear who will lose. 
+ * + * For uncompensated quartz crystal oscillatores and nominal update + * intervals less than 1024 s, operation should be in phase-lock mode + * (STA_FLL = 0), where the loop is disciplined to phase. For update + * intervals greater than thiss, operation should be in frequency-lock + * mode (STA_FLL = 1), where the loop is disciplined to frequency. + * + * Note: splclock() is in effect. */ void -initclocks() +hardupdate(offset) + long offset; +{ + long ltemp, mtemp; + + if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME)) + return; + ltemp = offset; +#ifdef PPS_SYNC + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + ltemp = pps_offset; +#endif /* PPS_SYNC */ + + /* + * Scale the phase adjustment and clamp to the operating range. + */ + if (ltemp > MAXPHASE) + time_offset = MAXPHASE << SHIFT_UPDATE; + else if (ltemp < -MAXPHASE) + time_offset = -(MAXPHASE << SHIFT_UPDATE); + else + time_offset = ltemp << SHIFT_UPDATE; + + /* + * Select whether the frequency is to be controlled and in which + * mode (PLL or FLL). Clamp to the operating range. Ugly + * multiply/divide should be replaced someday. + */ + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = time.tv_sec; + mtemp = time.tv_sec - time_reftime; + time_reftime = time.tv_sec; + if (time_status & STA_FLL) { + if (mtemp >= MINSEC) { + ltemp = ((time_offset / mtemp) << (SHIFT_USEC - + SHIFT_UPDATE)); + if (ltemp < 0) + time_freq -= -ltemp >> SHIFT_KH; + else + time_freq += ltemp >> SHIFT_KH; + } + } else { + if (mtemp < MAXSEC) { + ltemp *= mtemp; + if (ltemp < 0) + time_freq -= -ltemp >> (time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC); + else + time_freq += ltemp >> (time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC); + } + } + if (time_freq > time_tolerance) + time_freq = time_tolerance; + else if (time_freq < -time_tolerance) + time_freq = -time_tolerance; +} + + + +/* + * Initialize clock frequencies and start both clocks running. 
+ */ +/* ARGSUSED*/ +static void +initclocks(dummy) + void *dummy; { register int i; @@ -138,9 +453,7 @@ hardclock(frame) { register struct callout *p1; register struct proc *p; - register int delta, needsoft; - extern int tickdelta; - extern long timedelta; + register int needsoft; /* * Update real-time timeout queue. @@ -185,18 +498,181 @@ hardclock(frame) statclock(frame); /* - * Increment the time-of-day. The increment is just ``tick'' unless - * we are still adjusting the clock; see adjtime(). + * Increment the time-of-day. */ ticks++; - if (timedelta == 0) - delta = tick; - else { - delta = tick + tickdelta; - timedelta -= tickdelta; + { + int time_update; + struct timeval newtime = time; + long ltemp; + + if (timedelta == 0) { + time_update = CPU_THISTICKLEN(tick); + } else { + time_update = CPU_THISTICKLEN(tick) + tickdelta; + timedelta -= tickdelta; + } + BUMPTIME(&mono_time, time_update); + + /* + * Compute the phase adjustment. If the low-order bits + * (time_phase) of the update overflow, bump the high-order bits + * (time_update). + */ + time_phase += time_adj; + if (time_phase <= -FINEUSEC) { + ltemp = -time_phase >> SHIFT_SCALE; + time_phase += ltemp << SHIFT_SCALE; + time_update -= ltemp; + } + else if (time_phase >= FINEUSEC) { + ltemp = time_phase >> SHIFT_SCALE; + time_phase -= ltemp << SHIFT_SCALE; + time_update += ltemp; + } + + newtime.tv_usec += time_update; + /* + * On rollover of the second the phase adjustment to be used for + * the next second is calculated. Also, the maximum error is + * increased by the tolerance. If the PPS frequency discipline + * code is present, the phase is increased to compensate for the + * CPU clock oscillator frequency error. + * + * On a 32-bit machine and given parameters in the timex.h + * header file, the maximum phase adjustment is +-512 ms and + * maximum frequency offset is a tad less than) +-512 ppm. On a + * 64-bit machine, you shouldn't need to ask. 
+ */ + if (newtime.tv_usec >= 1000000) { + newtime.tv_usec -= 1000000; + newtime.tv_sec++; + time_maxerror += time_tolerance >> SHIFT_USEC; + + /* + * Compute the phase adjustment for the next second. In + * PLL mode, the offset is reduced by a fixed factor + * times the time constant. In FLL mode the offset is + * used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread + * the adjustment over not more than the number of + * seconds between updates. + */ + if (time_offset < 0) { + ltemp = -time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << + SHIFT_UPDATE; + time_offset += ltemp; + time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - + SHIFT_UPDATE); + } else { + ltemp = time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << + SHIFT_UPDATE; + time_offset -= ltemp; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - + SHIFT_UPDATE); + } + + /* + * Compute the frequency estimate and additional phase + * adjustment due to frequency error for the next + * second. When the PPS signal is engaged, gnaw on the + * watchdog counter and update the frequency computed by + * the pll and the PPS signal. + */ +#ifdef PPS_SYNC + pps_valid++; + if (pps_valid == PPS_VALID) { + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; +#else + ltemp = time_freq; +#endif /* PPS_SYNC */ + if (ltemp < 0) + time_adj -= -ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + else + time_adj += ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + +#if SHIFT_HZ == 7 + /* + * When the CPU clock oscillator frequency is not a + * power of two in Hz, the SHIFT_HZ is only an + * approximate scale factor. 
In the SunOS kernel, this + * results in a PLL gain factor of 1/1.28 = 0.78 what it + * should be. In the following code the overall gain is + * increased by a factor of 1.25, which results in a + * residual error less than 3 percent. + */ + /* Same thing applies for FreeBSD --GAW */ + if (hz == 100) { + if (time_adj < 0) + time_adj -= -time_adj >> 2; + else + time_adj += time_adj >> 2; + } +#endif /* SHIFT_HZ */ + + /* XXX - this is really bogus, but can't be fixed until + xntpd's idea of the system clock is fixed to know how + the user wants leap seconds handled; in the mean time, + we assume that users of NTP are running without proper + leap second support (this is now the default anyway) */ + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. The microtime() routine or + * external clock driver will insure that reported time + * is always monotonic. The ugly divides should be + * replaced. 
+ */ + switch (time_state) { + + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + + case TIME_INS: + if (newtime.tv_sec % 86400 == 0) { + newtime.tv_sec--; + time_state = TIME_OOP; + } + break; + + case TIME_DEL: + if ((newtime.tv_sec + 1) % 86400 == 0) { + newtime.tv_sec++; + time_state = TIME_WAIT; + } + break; + + case TIME_OOP: + time_state = TIME_WAIT; + break; + + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + } + CPU_CLOCKUPDATE(&time, &newtime); } - BUMPTIME(&time, delta); - BUMPTIME(&mono_time, delta); /* * Process callouts at a very low cpu priority, so we don't keep the @@ -256,7 +732,7 @@ softclock() */ void timeout(ftn, arg, ticks) - void (*ftn) __P((void *)); + timeout_t ftn; void *arg; register int ticks; { @@ -301,7 +777,7 @@ timeout(ftn, arg, ticks) void untimeout(ftn, arg) - void (*ftn) __P((void *)); + timeout_t ftn; void *arg; { register struct callout *p, *t; @@ -323,6 +799,17 @@ untimeout(ftn, arg) splx(s); } +void +gettime(struct timeval *tvp) +{ + int s; + + s = splclock(); + /* XXX should use microtime() iff tv_usec is used. */ + *tvp = time; + splx(s); +} + /* * Compute number of hz until specified time. Used to * compute third argument to timeout() from an absolute time. @@ -331,28 +818,54 @@ int hzto(tv) struct timeval *tv; { - register long ticks, sec; + register unsigned long ticks; + register long sec, usec; int s; /* - * If number of milliseconds will fit in 32 bit arithmetic, - * then compute number of milliseconds to time and scale to - * ticks. Otherwise just compute number of hz in time, rounding - * times greater than representible to maximum value. + * If the number of usecs in the whole seconds part of the time + * difference fits in a long, then the total number of usecs will + * fit in an unsigned long. 
Compute the total and convert it to + * ticks, rounding up and adding 1 to allow for the current tick + * to expire. Rounding also depends on unsigned long arithmetic + * to avoid overflow. + * + * Otherwise, if the number of ticks in the whole seconds part of + * the time difference fits in a long, then convert the parts to + * ticks separately and add, using similar rounding methods and + * overflow avoidance. This method would work in the previous + * case but it is slightly slower and assumes that hz is integral. * - * Delta times less than 25 days can be computed ``exactly''. - * Maximum value for any timeout in 10ms ticks is 250 days. + * Otherwise, round the time difference down to the maximum + * representable value. + * + * If ints have 32 bits, then the maximum value for any timeout in + * 10ms ticks is 248 days. */ - s = splhigh(); + s = splclock(); sec = tv->tv_sec - time.tv_sec; - if (sec <= 0x7fffffff / 1000 - 1000) - ticks = ((tv->tv_sec - time.tv_sec) * 1000 + - (tv->tv_usec - time.tv_usec) / 1000) / (tick / 1000); - else if (sec <= 0x7fffffff / hz) - ticks = sec * hz; - else - ticks = 0x7fffffff; + usec = tv->tv_usec - time.tv_usec; splx(s); + if (usec < 0) { + sec--; + usec += 1000000; + } + if (sec < 0) { +#ifdef DIAGNOSTIC + printf("hzto: negative time difference %ld sec %ld usec\n", + sec, usec); +#endif + ticks = 1; + } else if (sec <= LONG_MAX / 1000000) + ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) + / tick + 1; + else if (sec <= LONG_MAX / hz) + ticks = sec * hz + + ((unsigned long)usec + (tick - 1)) / tick + 1; + else + ticks = LONG_MAX; + if (ticks > INT_MAX) + ticks = INT_MAX; return (ticks); } @@ -399,8 +912,6 @@ stopprofclock(p) } } -int dk_ndrive = DK_NDRIVE; - /* * Statistics clock. Grab profile sample, and if divider reaches 0, * do process and kernel statistics. 
@@ -414,6 +925,10 @@ statclock(frame) #endif register struct proc *p; register int i; + struct pstats *pstats; + long rss; + struct rusage *ru; + struct vmspace *vm; if (CLKF_USERMODE(frame)) { p = curproc; @@ -505,18 +1020,29 @@ statclock(frame) if (p->p_priority >= PUSER) p->p_priority = p->p_usrpri; } + + /* Update resource usage integrals and maximums. */ + if ((pstats = p->p_stats) != NULL && + (ru = &pstats->p_ru) != NULL && + (vm = p->p_vmspace) != NULL) { + ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024; + ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024; + ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024; + rss = vm->vm_pmap.pm_stats.resident_count * + PAGE_SIZE / 1024; + if (ru->ru_maxrss < rss) + ru->ru_maxrss = rss; + } } } /* * Return information about system clocks. */ -sysctl_clockrate(where, sizep) - register char *where; - size_t *sizep; +static int +sysctl_kern_clockrate SYSCTL_HANDLER_ARGS { struct clockinfo clkinfo; - /* * Construct clockinfo structure. */ @@ -524,5 +1050,254 @@ sysctl_clockrate(where, sizep) clkinfo.tick = tick; clkinfo.profhz = profhz; clkinfo.stathz = stathz ? stathz : hz; - return (sysctl_rdstruct(where, sizep, NULL, &clkinfo, sizeof(clkinfo))); + return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); +} + +SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, + 0, 0, sysctl_kern_clockrate, "S,clockinfo",""); + +#ifdef PPS_SYNC +/* + * hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS interrupt in order to discipline + * the CPU clock oscillator to the PPS signal. It measures the PPS phase + * and leaves it in a handy spot for the hardclock() routine. It + * integrates successive PPS phase differences and calculates the + * frequency offset. This is used in hardclock() to discipline the CPU + * clock oscillator so that intrinsic frequency error is cancelled out. 
+ * The code requires the caller to capture the time and hardware counter + * value at the on-time PPS signal transition. + * + * Note that, on some Unix systems, this routine runs at an interrupt + * priority level higher than the timer interrupt routine hardclock(). + * Therefore, the variables used are distinct from the hardclock() + * variables, except for certain exceptions: The PPS frequency pps_freq + * and phase pps_offset variables are determined by this routine and + * updated atomically. The time_tolerance variable can be considered a + * constant, since it is infrequently changed, and then only when the + * PPS signal is disabled. The watchdog counter pps_valid is updated + * once per second by hardclock() and is atomically cleared in this + * routine. + */ +void +hardpps(tvp, usec) + struct timeval *tvp; /* time at PPS */ + long usec; /* hardware counter at PPS */ +{ + long u_usec, v_usec, bigtick; + long cal_sec, cal_usec; + + /* + * An occasional glitch can be produced when the PPS interrupt + * occurs in the hardclock() routine before the time variable is + * updated. Here the offset is discarded when the difference + * between it and the last one is greater than tick/2, but not + * if the interval since the first discard exceeds 30 s. + */ + time_status |= STA_PPSSIGNAL; + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + pps_valid = 0; + u_usec = -tvp->tv_usec; + if (u_usec < -500000) + u_usec += 1000000; + v_usec = pps_offset - u_usec; + if (v_usec < 0) + v_usec = -v_usec; + if (v_usec > (tick >> 1)) { + if (pps_glitch > MAXGLITCH) { + pps_glitch = 0; + pps_tf[2] = u_usec; + pps_tf[1] = u_usec; + } else { + pps_glitch++; + u_usec = pps_offset; + } + } else + pps_glitch = 0; + + /* + * A three-stage median filter is used to help deglitch the pps + * time. The median sample becomes the time offset estimate; the + * difference between the other two samples becomes the time + * dispersion (jitter) estimate. 
+ */ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = u_usec; + if (pps_tf[0] > pps_tf[1]) { + if (pps_tf[1] > pps_tf[2]) { + pps_offset = pps_tf[1]; /* 0 1 2 */ + v_usec = pps_tf[0] - pps_tf[2]; + } else if (pps_tf[2] > pps_tf[0]) { + pps_offset = pps_tf[0]; /* 2 0 1 */ + v_usec = pps_tf[2] - pps_tf[1]; + } else { + pps_offset = pps_tf[2]; /* 0 2 1 */ + v_usec = pps_tf[0] - pps_tf[1]; + } + } else { + if (pps_tf[1] < pps_tf[2]) { + pps_offset = pps_tf[1]; /* 2 1 0 */ + v_usec = pps_tf[2] - pps_tf[0]; + } else if (pps_tf[2] < pps_tf[0]) { + pps_offset = pps_tf[0]; /* 1 0 2 */ + v_usec = pps_tf[1] - pps_tf[2]; + } else { + pps_offset = pps_tf[2]; /* 1 2 0 */ + v_usec = pps_tf[1] - pps_tf[0]; + } + } + if (v_usec > MAXTIME) + pps_jitcnt++; + v_usec = (v_usec << PPS_AVG) - pps_jitter; + if (v_usec < 0) + pps_jitter -= -v_usec >> PPS_AVG; + else + pps_jitter += v_usec >> PPS_AVG; + if (pps_jitter > (MAXTIME >> 1)) + time_status |= STA_PPSJITTER; + + /* + * During the calibration interval adjust the starting time when + * the tick overflows. At the end of the interval compute the + * duration of the interval and the difference of the hardware + * counters at the beginning and end of the interval. This code + * is deliciously complicated by the fact valid differences may + * exceed the value of tick when using long calibration + * intervals and small ticks. Note that the counter can be + * greater than tick if caught at just the wrong instant, but + * the values returned and used here are correct. 
+ */ + bigtick = (long)tick << SHIFT_USEC; + pps_usec -= pps_freq; + if (pps_usec >= bigtick) + pps_usec -= bigtick; + if (pps_usec < 0) + pps_usec += bigtick; + pps_time.tv_sec++; + pps_count++; + if (pps_count < (1 << pps_shift)) + return; + pps_count = 0; + pps_calcnt++; + u_usec = usec << SHIFT_USEC; + v_usec = pps_usec - u_usec; + if (v_usec >= bigtick >> 1) + v_usec -= bigtick; + if (v_usec < -(bigtick >> 1)) + v_usec += bigtick; + if (v_usec < 0) + v_usec = -(-v_usec >> pps_shift); + else + v_usec = v_usec >> pps_shift; + pps_usec = u_usec; + cal_sec = tvp->tv_sec; + cal_usec = tvp->tv_usec; + cal_sec -= pps_time.tv_sec; + cal_usec -= pps_time.tv_usec; + if (cal_usec < 0) { + cal_usec += 1000000; + cal_sec--; + } + pps_time = *tvp; + + /* + * Check for lost interrupts, noise, excessive jitter and + * excessive frequency error. The number of timer ticks during + * the interval may vary +-1 tick. Add to this a margin of one + * tick for the PPS signal jitter and maximum frequency + * deviation. If the limits are exceeded, the calibration + * interval is reset to the minimum and we start over. + */ + u_usec = (long)tick << 1; + if (!((cal_sec == -1 && cal_usec > (1000000 - u_usec)) + || (cal_sec == 0 && cal_usec < u_usec)) + || v_usec > time_tolerance || v_usec < -time_tolerance) { + pps_errcnt++; + pps_shift = PPS_SHIFT; + pps_intcnt = 0; + time_status |= STA_PPSERROR; + return; + } + + /* + * A three-stage median filter is used to help deglitch the pps + * frequency. The median sample becomes the frequency offset + * estimate; the difference between the other two samples + * becomes the frequency dispersion (stability) estimate. 
+ */ + pps_ff[2] = pps_ff[1]; + pps_ff[1] = pps_ff[0]; + pps_ff[0] = v_usec; + if (pps_ff[0] > pps_ff[1]) { + if (pps_ff[1] > pps_ff[2]) { + u_usec = pps_ff[1]; /* 0 1 2 */ + v_usec = pps_ff[0] - pps_ff[2]; + } else if (pps_ff[2] > pps_ff[0]) { + u_usec = pps_ff[0]; /* 2 0 1 */ + v_usec = pps_ff[2] - pps_ff[1]; + } else { + u_usec = pps_ff[2]; /* 0 2 1 */ + v_usec = pps_ff[0] - pps_ff[1]; + } + } else { + if (pps_ff[1] < pps_ff[2]) { + u_usec = pps_ff[1]; /* 2 1 0 */ + v_usec = pps_ff[2] - pps_ff[0]; + } else if (pps_ff[2] < pps_ff[0]) { + u_usec = pps_ff[0]; /* 1 0 2 */ + v_usec = pps_ff[1] - pps_ff[2]; + } else { + u_usec = pps_ff[2]; /* 1 2 0 */ + v_usec = pps_ff[1] - pps_ff[0]; + } + } + + /* + * Here the frequency dispersion (stability) is updated. If it + * is less than one-fourth the maximum (MAXFREQ), the frequency + * offset is updated as well, but clamped to the tolerance. It + * will be processed later by the hardclock() routine. + */ + v_usec = (v_usec >> 1) - pps_stabil; + if (v_usec < 0) + pps_stabil -= -v_usec >> PPS_AVG; + else + pps_stabil += v_usec >> PPS_AVG; + if (pps_stabil > MAXFREQ >> 2) { + pps_stbcnt++; + time_status |= STA_PPSWANDER; + return; + } + if (time_status & STA_PPSFREQ) { + if (u_usec < 0) { + pps_freq -= -u_usec >> PPS_AVG; + if (pps_freq < -time_tolerance) + pps_freq = -time_tolerance; + u_usec = -u_usec; + } else { + pps_freq += u_usec >> PPS_AVG; + if (pps_freq > time_tolerance) + pps_freq = time_tolerance; + } + } + + /* + * Here the calibration interval is adjusted. If the maximum + * time difference is greater than tick / 4, reduce the interval + * by half. If this is not the case for four consecutive + * intervals, double the interval. 
+ */ + if (u_usec << pps_shift > bigtick >> 2) { + pps_intcnt = 0; + if (pps_shift > PPS_SHIFT) + pps_shift--; + } else if (pps_intcnt >= 4) { + pps_intcnt = 0; + if (pps_shift < PPS_SHIFTMAX) + pps_shift++; + } else + pps_intcnt++; } +#endif /* PPS_SYNC */ diff --git a/sys/kern/kern_conf.c b/sys/kern/kern_conf.c new file mode 100644 index 0000000..bee8b87 --- /dev/null +++ b/sys/kern/kern_conf.c @@ -0,0 +1,208 @@ +/*- + * Parts Copyright (c) 1995 Terrence R. Lambert + * Copyright (c) 1995 Julian R. Elischer + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Terrence R. Lambert. + * 4. The name Terrence R. Lambert may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY Julian R. Elischer ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. 
LAMBERT BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id$ + */ + +#include <sys/param.h> +#include <sys/types.h> +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/vnode.h> + +#define NUMBDEV 128 +#define NUMCDEV 256 +#define bdevsw_ALLOCSTART (NUMBDEV/2) +#define cdevsw_ALLOCSTART (NUMCDEV/2) + +struct bdevsw *bdevsw[NUMBDEV]; +int nblkdev = NUMBDEV; +struct cdevsw *cdevsw[NUMCDEV]; +int nchrdev = NUMCDEV; + + + +/* + * Routine to determine if a device is a disk. + * + * KLUDGE XXX add flags to cdevsw entries for disks XXX + * A minimal stub routine can always return 0. + */ +int +isdisk(dev, type) + dev_t dev; + int type; +{ + + switch (major(dev)) { + case 15: /* VBLK: vn, VCHR: cd */ + return (1); + case 0: /* wd */ + case 2: /* fd */ + case 4: /* sd */ + case 6: /* cd */ + case 7: /* mcd */ + case 16: /* scd */ + case 17: /* matcd */ + case 18: /* ata */ + case 19: /* wcd */ + case 20: /* od */ + case 22: /* gd */ + if (type == VBLK) + return (1); + return (0); + case 3: /* wd */ + case 9: /* fd */ + case 13: /* sd */ + case 29: /* mcd */ + case 43: /* vn */ + case 45: /* scd */ + case 46: /* matcd */ + case 69: /* wcd */ + case 70: /* od */ + case 78: /* gd */ + if (type == VCHR) + return (1); + /* fall through */ + default: + return (0); + } + /* NOTREACHED */ +} + + +/* + * Routine to convert from character to block device number. + * + * A minimal stub routine can always return NODEV. 
+ */ +dev_t +chrtoblk(dev_t dev) +{ + struct bdevsw *bd; + struct cdevsw *cd; + + if(cd = cdevsw[major(dev)]) { + if ( (bd = cd->d_bdev) ) + return(makedev(bd->d_maj,minor(dev))); + } + return(NODEV); +} + +/* + * (re)place an entry in the bdevsw or cdevsw table + * return the slot used in major(*descrip) + */ +#define ADDENTRY(TTYPE,NXXXDEV,ALLOCSTART) \ +int TTYPE##_add(dev_t *descrip, \ + struct TTYPE *newentry, \ + struct TTYPE **oldentry) \ +{ \ + int i ; \ + if ( (int)*descrip == NODEV) { /* auto (0 is valid) */ \ + /* \ + * Search the table looking for a slot... \ + */ \ + for (i = ALLOCSTART; i < NXXXDEV; i++) \ + if (TTYPE[i] == NULL) \ + break; /* found one! */ \ + /* out of allocable slots? */ \ + if (i >= NXXXDEV) { \ + return ENFILE; \ + } \ + } else { /* assign */ \ + i = major(*descrip); \ + if (i < 0 || i >= NXXXDEV) { \ + return EINVAL; \ + } \ + } \ + \ + /* maybe save old */ \ + if (oldentry) { \ + *oldentry = TTYPE[i]; \ + } \ + if (newentry) \ + newentry->d_maj = i; \ + /* replace with new */ \ + TTYPE[i] = newentry; \ + \ + /* done! let them know where we put it */ \ + *descrip = makedev(i,0); \ + return 0; \ +} \ + +ADDENTRY(bdevsw, nblkdev,bdevsw_ALLOCSTART) +ADDENTRY(cdevsw, nchrdev,cdevsw_ALLOCSTART) + +/* Maybe the author might indicate what the f*@# tehis is for? */ + +void +cdevsw_make(struct bdevsw *from) +{ + struct cdevsw *to = from->d_cdev; + + if (!to) + panic("No target cdevsw in bdevsw"); + to->d_open = from->d_open; + to->d_close = from->d_close; + to->d_read = rawread; + to->d_write = rawwrite; + to->d_ioctl = from->d_ioctl; + to->d_stop = nostop; + to->d_reset = nullreset; + to->d_devtotty = nodevtotty; + to->d_select = seltrue; + to->d_mmap = nommap; + to->d_strategy = from->d_strategy; + to->d_name = from->d_name; + to->d_bdev = from; + to->d_maj = -1; +} + +void +bdevsw_add_generic(int bdev, int cdev, struct bdevsw *bdevsw) +{ + dev_t dev; + /* + * XXX hack alert. 
+ */ + if (isdisk(makedev(bdev, 0), VBLK) && bdevsw->d_flags != D_DISK) { + printf("bdevsw_add_generic: adding D_DISK flag for device %d\n", + bdev); + bdevsw->d_flags = D_DISK; + } + cdevsw_make(bdevsw); + dev = makedev(cdev, 0); + cdevsw_add(&dev, bdevsw->d_cdev, NULL); + dev = makedev(bdev, 0); + bdevsw_add(&dev, bdevsw , NULL); +} diff --git a/sys/kern/kern_descrip.c b/sys/kern/kern_descrip.c index 3f2e424..a5c6d94 100644 --- a/sys/kern/kern_descrip.c +++ b/sys/kern/kern_descrip.c @@ -35,111 +35,105 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_descrip.c 8.8 (Berkeley) 2/14/95 + * @(#)kern_descrip.c 8.6 (Berkeley) 4/19/94 + * $Id$ */ #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/conf.h> #include <sys/filedesc.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/vnode.h> #include <sys/proc.h> #include <sys/file.h> -#include <sys/socket.h> #include <sys/socketvar.h> #include <sys/stat.h> -#include <sys/ioctl.h> +#include <sys/filio.h> +#include <sys/ttycom.h> #include <sys/fcntl.h> #include <sys/malloc.h> -#include <sys/syslog.h> #include <sys/unistd.h> #include <sys/resourcevar.h> +#include <sys/pipe.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +static d_open_t fdopen; +#define NUMFDESC 64 + +#define CDEV_MAJOR 22 +static struct cdevsw fildesc_cdevsw = + { fdopen, noclose, noread, nowrite, /*22*/ + noioc, nostop, nullreset, nodevtotty,/*fd(!=Fd)*/ + noselect, nommap, nostrat }; + +static int finishdup(struct filedesc *fdp, int old, int new, int *retval); /* * Descriptor management. */ struct filelist filehead; /* head of list of open files */ int nfiles; /* actual number of open files */ +extern int cmask; /* * System calls on descriptors. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct getdtablesize_args { + int dummy; +}; +#endif /* ARGSUSED */ int getdtablesize(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getdtablesize_args *uap; + int *retval; { - *retval = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); + *retval = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); return (0); } /* - * Duplicate a file descriptor. - */ -/* ARGSUSED */ -int -dup(p, uap, retval) - struct proc *p; - struct dup_args /* { - syscallarg(u_int) fd; - } */ *uap; - register_t *retval; -{ - register struct filedesc *fdp; - u_int old; - int new, error; - - old = SCARG(uap, fd); - /* - * XXX Compatibility - */ - if (old &~ 077) { - SCARG(uap, fd) &= 077; - return (dup2(p, uap, retval)); - } - - fdp = p->p_fd; - if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) - return (EBADF); - if (error = fdalloc(p, 0, &new)) - return (error); - return (finishdup(fdp, (int)old, new, retval)); -} - -/* * Duplicate a file descriptor to a particular value. */ +#ifndef _SYS_SYSPROTO_H_ +struct dup2_args { + u_int from; + u_int to; +}; +#endif /* ARGSUSED */ int dup2(p, uap, retval) struct proc *p; - struct dup2_args /* { - syscallarg(u_int) from; - syscallarg(u_int) to; - } */ *uap; - register_t *retval; + struct dup2_args *uap; + int *retval; { register struct filedesc *fdp = p->p_fd; - register int old = SCARG(uap, from), new = SCARG(uap, to); + register u_int old = uap->from, new = uap->to; int i, error; if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL || new >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || - new >= maxfiles) + new >= maxfilesperproc) return (EBADF); if (old == new) { *retval = new; return (0); } if (new >= fdp->fd_nfiles) { - if (error = fdalloc(p, new, &i)) + if ((error = fdalloc(p, new, &i))) return (error); if (new != i) panic("dup2: fdalloc"); @@ -155,20 +149,58 @@ dup2(p, uap, retval) } /* + * Duplicate a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct dup_args { + u_int fd; +}; +#endif +/* ARGSUSED */ +int +dup(p, uap, retval) + struct proc *p; + struct dup_args *uap; + int *retval; +{ + register struct filedesc *fdp; + u_int old; + int new, error; + + old = uap->fd; + +#if 0 + /* + * XXX Compatibility + */ + if (old &~ 077) { uap->fd &= 077; return (dup2(p, uap, retval)); } +#endif + + fdp = p->p_fd; + if (old >= fdp->fd_nfiles || fdp->fd_ofiles[old] == NULL) + return (EBADF); + if ((error = fdalloc(p, 0, &new))) + return (error); + return (finishdup(fdp, (int)old, new, retval)); +} + +/* * The file control system call. */ +#ifndef _SYS_SYSPROTO_H_ +struct fcntl_args { + int fd; + int cmd; + int arg; +}; +#endif /* ARGSUSED */ int fcntl(p, uap, retval) struct proc *p; - register struct fcntl_args /* { - syscallarg(int) fd; - syscallarg(int) cmd; - syscallarg(void *) arg; - } */ *uap; - register_t *retval; + register struct fcntl_args *uap; + int *retval; { - int fd = SCARG(uap, fd); register struct filedesc *fdp = p->p_fd; register struct file *fp; register char *pop; @@ -177,27 +209,27 @@ fcntl(p, uap, retval) struct flock fl; u_int newmin; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); - pop = &fdp->fd_ofileflags[fd]; - switch (SCARG(uap, cmd)) { + pop = &fdp->fd_ofileflags[uap->fd]; + switch (uap->cmd) { case F_DUPFD: - newmin = (long)SCARG(uap, arg); + newmin = uap->arg; if (newmin >= p->p_rlimit[RLIMIT_NOFILE].rlim_cur || - newmin >= maxfiles) + newmin >= maxfilesperproc) return (EINVAL); - if (error = fdalloc(p, newmin, &i)) + if ((error = fdalloc(p, newmin, &i))) return (error); - return (finishdup(fdp, fd, i, retval)); + return (finishdup(fdp, uap->fd, i, retval)); case F_GETFD: *retval = *pop & 1; return (0); case F_SETFD: - *pop = (*pop &~ 1) | ((long)SCARG(uap, arg) & 1); + *pop = (*pop &~ 1) | (uap->arg & 1); return (0); case 
F_GETFL: @@ -206,7 +238,7 @@ fcntl(p, uap, retval) case F_SETFL: fp->f_flag &= ~FCNTLFLAGS; - fp->f_flag |= FFLAGS((long)SCARG(uap, arg)) & FCNTLFLAGS; + fp->f_flag |= FFLAGS(uap->arg) & FCNTLFLAGS; tmp = fp->f_flag & FNONBLOCK; error = (*fp->f_ops->fo_ioctl)(fp, FIONBIO, (caddr_t)&tmp, p); if (error) @@ -232,20 +264,19 @@ fcntl(p, uap, retval) case F_SETOWN: if (fp->f_type == DTYPE_SOCKET) { - ((struct socket *)fp->f_data)->so_pgid = - (long)SCARG(uap, arg); + ((struct socket *)fp->f_data)->so_pgid = uap->arg; return (0); } - if ((long)SCARG(uap, arg) <= 0) { - SCARG(uap, arg) = (void *)(-(long)SCARG(uap, arg)); + if (uap->arg <= 0) { + uap->arg = -uap->arg; } else { - struct proc *p1 = pfind((long)SCARG(uap, arg)); + struct proc *p1 = pfind(uap->arg); if (p1 == 0) return (ESRCH); - SCARG(uap, arg) = (void *)(long)p1->p_pgrp->pg_id; + uap->arg = p1->p_pgrp->pg_id; } return ((*fp->f_ops->fo_ioctl) - (fp, TIOCSPGRP, (caddr_t)&SCARG(uap, arg), p)); + (fp, TIOCSPGRP, (caddr_t)&uap->arg, p)); case F_SETLKW: flg |= F_WAIT; @@ -256,8 +287,7 @@ fcntl(p, uap, retval) return (EBADF); vp = (struct vnode *)fp->f_data; /* Copy in the lock structure */ - error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&fl, - sizeof (fl)); + error = copyin((caddr_t)uap->arg, (caddr_t)&fl, sizeof (fl)); if (error) return (error); if (fl.l_whence == SEEK_CUR) @@ -289,16 +319,17 @@ fcntl(p, uap, retval) return (EBADF); vp = (struct vnode *)fp->f_data; /* Copy in the lock structure */ - error = copyin((caddr_t)SCARG(uap, arg), (caddr_t)&fl, - sizeof (fl)); + error = copyin((caddr_t)uap->arg, (caddr_t)&fl, sizeof (fl)); if (error) return (error); + if (fl.l_type != F_RDLCK && fl.l_type != F_WRLCK && + fl.l_type != F_UNLCK) + return (EINVAL); if (fl.l_whence == SEEK_CUR) fl.l_start += fp->f_offset; - if (error = VOP_ADVLOCK(vp, (caddr_t)p, F_GETLK, &fl, F_POSIX)) + if ((error = VOP_ADVLOCK(vp,(caddr_t)p,F_GETLK,&fl,F_POSIX))) return (error); - return (copyout((caddr_t)&fl, (caddr_t)SCARG(uap, arg), 
- sizeof (fl))); + return (copyout((caddr_t)&fl, (caddr_t)uap->arg, sizeof (fl))); default: return (EINVAL); @@ -309,11 +340,10 @@ fcntl(p, uap, retval) /* * Common code for dup, dup2, and fcntl(F_DUPFD). */ -int +static int finishdup(fdp, old, new, retval) register struct filedesc *fdp; - register int old, new; - register_t *retval; + register int old, new, *retval; { register struct file *fp; @@ -330,21 +360,24 @@ finishdup(fdp, old, new, retval) /* * Close a file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct close_args { + int fd; +}; +#endif /* ARGSUSED */ int close(p, uap, retval) struct proc *p; - struct close_args /* { - syscallarg(int) fd; - } */ *uap; - register_t *retval; + struct close_args *uap; + int *retval; { - int fd = SCARG(uap, fd); register struct filedesc *fdp = p->p_fd; register struct file *fp; + register int fd = uap->fd; register u_char *pf; - if ((u_int)fd >= fdp->fd_nfiles || + if ((unsigned)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) return (EBADF); pf = (u_char *)&fdp->fd_ofileflags[fd]; @@ -363,28 +396,31 @@ close(p, uap, retval) /* * Return status information about a file descriptor. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct ofstat_args { + int fd; + struct ostat *sb; +}; +#endif /* ARGSUSED */ int -compat_43_fstat(p, uap, retval) +ofstat(p, uap, retval) struct proc *p; - register struct compat_43_fstat_args /* { - syscallarg(int) fd; - syscallarg(struct ostat *) sb; - } */ *uap; - register_t *retval; + register struct ofstat_args *uap; + int *retval; { - int fd = SCARG(uap, fd); register struct filedesc *fdp = p->p_fd; register struct file *fp; struct stat ub; struct ostat oub; int error; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); switch (fp->f_type) { + case DTYPE_FIFO: case DTYPE_VNODE: error = vn_stat((struct vnode *)fp->f_data, &ub, p); break; @@ -393,14 +429,19 @@ compat_43_fstat(p, uap, retval) error = soo_stat((struct socket *)fp->f_data, &ub); break; +#ifndef OLD_PIPE + case DTYPE_PIPE: + error = pipe_stat((struct pipe *)fp->f_data, &ub); + break; +#endif + default: panic("ofstat"); /*NOTREACHED*/ } cvtstat(&ub, &oub); if (error == 0) - error = copyout((caddr_t)&oub, (caddr_t)SCARG(uap, sb), - sizeof (oub)); + error = copyout((caddr_t)&oub, (caddr_t)uap->sb, sizeof (oub)); return (error); } #endif /* COMPAT_43 || COMPAT_SUNOS */ @@ -408,27 +449,30 @@ compat_43_fstat(p, uap, retval) /* * Return status information about a file descriptor. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct fstat_args { + int fd; + struct stat *sb; +}; +#endif /* ARGSUSED */ int fstat(p, uap, retval) struct proc *p; - register struct fstat_args /* { - syscallarg(int) fd; - syscallarg(struct stat *) sb; - } */ *uap; - register_t *retval; + register struct fstat_args *uap; + int *retval; { - int fd = SCARG(uap, fd); register struct filedesc *fdp = p->p_fd; register struct file *fp; struct stat ub; int error; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); switch (fp->f_type) { + case DTYPE_FIFO: case DTYPE_VNODE: error = vn_stat((struct vnode *)fp->f_data, &ub, p); break; @@ -437,48 +481,59 @@ fstat(p, uap, retval) error = soo_stat((struct socket *)fp->f_data, &ub); break; +#ifndef OLD_PIPE + case DTYPE_PIPE: + error = pipe_stat((struct pipe *)fp->f_data, &ub); + break; +#endif + default: panic("fstat"); /*NOTREACHED*/ } if (error == 0) - error = copyout((caddr_t)&ub, (caddr_t)SCARG(uap, sb), - sizeof (ub)); + error = copyout((caddr_t)&ub, (caddr_t)uap->sb, sizeof (ub)); return (error); } /* * Return pathconf information about a file descriptor. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct fpathconf_args { + int fd; + int name; +}; +#endif /* ARGSUSED */ int fpathconf(p, uap, retval) struct proc *p; - register struct fpathconf_args /* { - syscallarg(int) fd; - syscallarg(int) name; - } */ *uap; - register_t *retval; + register struct fpathconf_args *uap; + int *retval; { - int fd = SCARG(uap, fd); struct filedesc *fdp = p->p_fd; struct file *fp; struct vnode *vp; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); switch (fp->f_type) { +#ifndef OLD_PIPE + case DTYPE_PIPE: +#endif case DTYPE_SOCKET: - if (SCARG(uap, name) != _PC_PIPE_BUF) + if (uap->name != _PC_PIPE_BUF) return (EINVAL); *retval = PIPE_BUF; return (0); + case DTYPE_FIFO: case DTYPE_VNODE: vp = (struct vnode *)fp->f_data; - return (VOP_PATHCONF(vp, SCARG(uap, name), retval)); + return (VOP_PATHCONF(vp, uap->name, retval)); default: panic("fpathconf"); @@ -489,7 +544,8 @@ fpathconf(p, uap, retval) /* * Allocate a file descriptor for the process. */ -int fdexpand; +static int fdexpand; +SYSCTL_INT(_debug, OID_AUTO, fdexpand, CTLFLAG_RD, &fdexpand, 0, ""); int fdalloc(p, want, result) @@ -508,7 +564,7 @@ fdalloc(p, want, result) * of want or fd_freefile. If that fails, consider * expanding the ofile array. 
*/ - lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); for (;;) { last = min(fdp->fd_nfiles, lim); if ((i = want) < fdp->fd_freefile) @@ -554,6 +610,7 @@ fdalloc(p, want, result) fdp->fd_nfiles = nfiles; fdexpand++; } + return (0); } /* @@ -567,13 +624,15 @@ fdavail(p, n) { register struct filedesc *fdp = p->p_fd; register struct file **fpp; - register int i, lim; + register int i, lim, last; - lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfiles); + lim = min((int)p->p_rlimit[RLIMIT_NOFILE].rlim_cur, maxfilesperproc); if ((i = lim - fdp->fd_nfiles) > 0 && (n -= i) <= 0) return (1); + + last = min(fdp->fd_nfiles, lim); fpp = &fdp->fd_ofiles[fdp->fd_freefile]; - for (i = fdp->fd_nfiles - fdp->fd_freefile; --i >= 0; fpp++) + for (i = last - fdp->fd_freefile; --i >= 0; fpp++) if (*fpp == NULL && --n <= 0) return (1); return (0); @@ -592,7 +651,7 @@ falloc(p, resultfp, resultfd) register struct file *fp, *fq; int error, i; - if (error = fdalloc(p, 0, &i)) + if ((error = fdalloc(p, 0, &i))) return (error); if (nfiles >= maxfiles) { tablefull("file"); @@ -607,7 +666,7 @@ falloc(p, resultfp, resultfd) nfiles++; MALLOC(fp, struct file *, sizeof(struct file), M_FILE, M_WAITOK); bzero(fp, sizeof(struct file)); - if (fq = p->p_fd->fd_ofiles[0]) { + if ((fq = p->p_fd->fd_ofiles[0])) { LIST_INSERT_AFTER(fq, fp, f_list); } else { LIST_INSERT_HEAD(&filehead, fp, f_list); @@ -615,6 +674,7 @@ falloc(p, resultfp, resultfd) p->p_fd->fd_ofiles[i] = fp; fp->f_count = 1; fp->f_cred = p->p_ucred; + fp->f_seqcount = 1; crhold(fp->f_cred); if (resultfp) *resultfp = fp; @@ -630,8 +690,6 @@ void ffree(fp) register struct file *fp; { - register struct file *fq; - LIST_REMOVE(fp, f_list); crfree(fp->f_cred); #ifdef DIAGNOSTIC @@ -642,6 +700,49 @@ ffree(fp) } /* + * Build a new filedesc structure. 
+ */ +struct filedesc * +fdinit(p) + struct proc *p; +{ + register struct filedesc0 *newfdp; + register struct filedesc *fdp = p->p_fd; + + MALLOC(newfdp, struct filedesc0 *, sizeof(struct filedesc0), + M_FILEDESC, M_WAITOK); + bzero(newfdp, sizeof(struct filedesc0)); + newfdp->fd_fd.fd_cdir = fdp->fd_cdir; + VREF(newfdp->fd_fd.fd_cdir); + newfdp->fd_fd.fd_rdir = fdp->fd_rdir; + if (newfdp->fd_fd.fd_rdir) + VREF(newfdp->fd_fd.fd_rdir); + + /* Create the file descriptor table. */ + newfdp->fd_fd.fd_refcnt = 1; + newfdp->fd_fd.fd_cmask = cmask; + newfdp->fd_fd.fd_ofiles = newfdp->fd_dfiles; + newfdp->fd_fd.fd_ofileflags = newfdp->fd_dfileflags; + newfdp->fd_fd.fd_nfiles = NDFILE; + + newfdp->fd_fd.fd_freefile = 0; + newfdp->fd_fd.fd_lastfile = 0; + + return (&newfdp->fd_fd); +} + +/* + * Share a filedesc structure. + */ +struct filedesc * +fdshare(p) + struct proc *p; +{ + p->p_fd->fd_refcnt++; + return (p->p_fd); +} + +/* * Copy a filedesc structure. */ struct filedesc * @@ -720,6 +821,34 @@ fdfree(p) } /* + * Close any files on exec? + */ +void +fdcloseexec(p) + struct proc *p; +{ + struct filedesc *fdp = p->p_fd; + struct file **fpp; + char *fdfp; + register int i; + + fpp = fdp->fd_ofiles; + fdfp = fdp->fd_ofileflags; + for (i = 0; i <= fdp->fd_lastfile; i++, fpp++, fdfp++) + if (*fpp != NULL && (*fdfp & UF_EXCLOSE)) { + if (*fdfp & UF_MAPPED) + (void) munmapfd(p, i); + (void) closef(*fpp, p); + *fpp = NULL; + *fdfp = 0; + if (i < fdp->fd_freefile) + fdp->fd_freefile = i; + } + while (fdp->fd_lastfile > 0 && fdp->fd_ofiles[fdp->fd_lastfile] == NULL) + fdp->fd_lastfile--; +} + +/* * Internal form of close. * Decrement reference count on file structure. * Note: p may be NULL when closing a file @@ -778,25 +907,26 @@ closef(fp, p) * Just attempt to get a record lock of the requested type on * the entire file (l_whence = SEEK_SET, l_start = 0, l_len = 0). 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct flock_args { + int fd; + int how; +}; +#endif /* ARGSUSED */ int flock(p, uap, retval) struct proc *p; - register struct flock_args /* { - syscallarg(int) fd; - syscallarg(int) how; - } */ *uap; - register_t *retval; + register struct flock_args *uap; + int *retval; { - int fd = SCARG(uap, fd); - int how = SCARG(uap, how); register struct filedesc *fdp = p->p_fd; register struct file *fp; struct vnode *vp; struct flock lf; - if ((u_int)fd >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[fd]) == NULL) + if ((unsigned)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) return (EBADF); if (fp->f_type != DTYPE_VNODE) return (EOPNOTSUPP); @@ -804,19 +934,19 @@ flock(p, uap, retval) lf.l_whence = SEEK_SET; lf.l_start = 0; lf.l_len = 0; - if (how & LOCK_UN) { + if (uap->how & LOCK_UN) { lf.l_type = F_UNLCK; fp->f_flag &= ~FHASLOCK; return (VOP_ADVLOCK(vp, (caddr_t)fp, F_UNLCK, &lf, F_FLOCK)); } - if (how & LOCK_EX) + if (uap->how & LOCK_EX) lf.l_type = F_WRLCK; - else if (how & LOCK_SH) + else if (uap->how & LOCK_SH) lf.l_type = F_RDLCK; else return (EBADF); fp->f_flag |= FHASLOCK; - if (how & LOCK_NB) + if (uap->how & LOCK_NB) return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK)); return (VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, F_FLOCK|F_WAIT)); } @@ -830,7 +960,7 @@ flock(p, uap, retval) * references to this file will be direct to the other driver. */ /* ARGSUSED */ -int +static int fdopen(dev, mode, type, p) dev_t dev; int mode, type; @@ -839,7 +969,7 @@ fdopen(dev, mode, type, p) /* * XXX Kludge: set curproc->p_dupfd to contain the value of the - * the file descriptor being sought for duplication. The error + * the file descriptor being sought for duplication. The error * return ensures that the vnode for this device will be released * by vn_open. Open will detect this special error and take the * actions in dupfdopen below. 
Other callers of vn_open or VOP_OPEN @@ -928,3 +1058,89 @@ dupfdopen(fdp, indx, dfd, mode, error) } /* NOTREACHED */ } + +/* + * Get file structures. + */ +static int +sysctl_kern_file SYSCTL_HANDLER_ARGS +{ + int error; + struct file *fp; + + if (!req->oldptr) { + /* + * overestimate by 10 files + */ + return (SYSCTL_OUT(req, 0, sizeof(filehead) + + (nfiles + 10) * sizeof(struct file))); + } + + error = SYSCTL_OUT(req, (caddr_t)&filehead, sizeof(filehead)); + if (error) + return (error); + + /* + * followed by an array of file structures + */ + for (fp = filehead.lh_first; fp != NULL; fp = fp->f_list.le_next) { + error = SYSCTL_OUT(req, (caddr_t)fp, sizeof (struct file)); + if (error) + return (error); + } + return (0); +} + +SYSCTL_PROC(_kern, KERN_FILE, file, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_kern_file, "S,file", ""); + +SYSCTL_INT(_kern, KERN_MAXFILESPERPROC, maxfilesperproc, + CTLFLAG_RW, &maxfilesperproc, 0, ""); + +SYSCTL_INT(_kern, KERN_MAXFILES, maxfiles, CTLFLAG_RW, &maxfiles, 0, ""); + +static fildesc_devsw_installed = 0; +#ifdef DEVFS +static void *devfs_token_stdin; +static void *devfs_token_stdout; +static void *devfs_token_stderr; +static void *devfs_token_fildesc[NUMFDESC]; +#endif + +static void fildesc_drvinit(void *unused) +{ + dev_t dev; +#ifdef DEVFS + int fd; +#endif + + if( ! 
fildesc_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&fildesc_cdevsw,NULL); + fildesc_devsw_installed = 1; +#ifdef DEVFS + for (fd = 0; fd < NUMFDESC; fd++) + devfs_token_fildesc[fd] = + devfs_add_devswf(&fildesc_cdevsw, fd, DV_CHR, + UID_BIN, GID_BIN, 0666, + "fd/%d", fd); + devfs_token_stdin = + devfs_add_devswf(&fildesc_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0666, + "stdin", fd); + devfs_token_stdout = + devfs_add_devswf(&fildesc_cdevsw, 1, DV_CHR, + UID_ROOT, GID_WHEEL, 0666, + "stdout", fd); + devfs_token_stderr = + devfs_add_devswf(&fildesc_cdevsw, 2, DV_CHR, + UID_ROOT, GID_WHEEL, 0666, + "stderr", fd); +#endif + } +} + +SYSINIT(fildescdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR, + fildesc_drvinit,NULL) + + diff --git a/sys/kern/kern_exec.c b/sys/kern/kern_exec.c index fbb4444..21049a3 100644 --- a/sys/kern/kern_exec.c +++ b/sys/kern/kern_exec.c @@ -1,11 +1,6 @@ -/*- - * Copyright (c) 1982, 1986, 1991, 1993 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. - * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. +/* + * Copyright (c) 1993, David Greenman + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -15,18 +10,11 @@ * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -35,30 +23,597 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * from: @(#)kern_exec.c 8.1 (Berkeley) 6/10/93 + * $Id$ */ #include <sys/param.h> -#include <sys/errno.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/filedesc.h> +#include <sys/fcntl.h> +#include <sys/acct.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/imgact_elf.h> +#include <sys/wait.h> #include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/namei.h> +#include <sys/sysent.h> +#include <sys/syslog.h> +#include <sys/shm.h> +#include <sys/sysctl.h> +#include <sys/vnode.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +#include <machine/reg.h> + +static int *exec_copyout_strings __P((struct image_params *)); + +static int exec_check_permissions(struct image_params *); /* - * exec system call + * XXX trouble here if sizeof(caddr_t) != sizeof(int), other parts + * of the sysctl code also assumes this, and sizeof(int) == sizeof(long). */ +static struct ps_strings *ps_strings = PS_STRINGS; +SYSCTL_INT(_kern, KERN_PS_STRINGS, ps_strings, 0, &ps_strings, 0, ""); + +static caddr_t usrstack = (caddr_t)USRSTACK; +SYSCTL_INT(_kern, KERN_USRSTACK, usrstack, 0, &usrstack, 0, ""); + +/* + * execsw_set is constructed for us by the linker. Each of the items + * is a pointer to a `const struct execsw', hence the double pointer here. + */ +static const struct execsw **execsw = + (const struct execsw **)&execsw_set.ls_items[0]; + +#ifndef _SYS_SYSPROTO_H_ struct execve_args { - char *fname; - char **argp; - char **envp; + char *fname; + char **argv; + char **envv; }; -/* ARGSUSED */ -execve(a1, a2, a3) - struct proc *a1; - struct execve_args *a2; - int *a3; +#endif + +/* + * execve() system call. 
+ */ +int +execve(p, uap, retval) + struct proc *p; + register struct execve_args *uap; + int *retval; +{ + struct nameidata nd, *ndp; + int *stack_base; + int error, len, i; + struct image_params image_params, *imgp; + struct vattr attr; + + imgp = &image_params; + + /* + * Initialize part of the common data + */ + imgp->proc = p; + imgp->uap = uap; + imgp->attr = &attr; + imgp->image_header = NULL; + imgp->argc = imgp->envc = 0; + imgp->entry_addr = 0; + imgp->vmspace_destroyed = 0; + imgp->interpreted = 0; + imgp->interpreter_name[0] = '\0'; + imgp->auxargs = NULL; + + /* + * Allocate temporary demand zeroed space for argument and + * environment strings + */ + imgp->stringbase = (char *)kmem_alloc_wait(exec_map, ARG_MAX); + if (imgp->stringbase == NULL) { + error = ENOMEM; + goto exec_fail; + } + imgp->stringp = imgp->stringbase; + imgp->stringspace = ARG_MAX; + + /* + * Translate the file name. namei() returns a vnode pointer + * in ni_vp amoung other things. + */ + ndp = &nd; + NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, + UIO_USERSPACE, uap->fname, p); + +interpret: + + error = namei(ndp); + if (error) { + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); + goto exec_fail; + } + + imgp->vp = ndp->ni_vp; + if (imgp->vp == NULL) { + error = ENOEXEC; + goto exec_fail_dealloc; + } + + /* + * Check file permissions (also 'opens' file) + */ + error = exec_check_permissions(imgp); + + /* + * Lose the lock on the vnode. It's no longer needed, and must not + * exist for the pagefault paging to work below. 
+ */ + VOP_UNLOCK(imgp->vp, 0, p); + + if (error) + goto exec_fail_dealloc; + + /* + * Map the image header (first page) of the file into + * kernel address space + */ + error = vm_mmap(exech_map, /* map */ + (vm_offset_t *)&imgp->image_header, /* address */ + PAGE_SIZE, /* size */ + VM_PROT_READ, /* protection */ + VM_PROT_READ, /* max protection */ + 0, /* flags */ + (caddr_t)imgp->vp, /* vnode */ + 0); /* offset */ + if (error) { + uprintf("mmap failed: %d\n",error); + goto exec_fail_dealloc; + } + + /* + * Loop through list of image activators, calling each one. + * If there is no match, the activator returns -1. If there + * is a match, but there was an error during the activation, + * the error is returned. Otherwise 0 means success. If the + * image is interpreted, loop back up and try activating + * the interpreter. + */ + for (i = 0; execsw[i]; ++i) { + if (execsw[i]->ex_imgact) + error = (*execsw[i]->ex_imgact)(imgp); + else + continue; + + if (error == -1) + continue; + if (error) + goto exec_fail_dealloc; + if (imgp->interpreted) { + /* free old vnode and name buffer */ + vrele(ndp->ni_vp); + FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI); + if (vm_map_remove(exech_map, (vm_offset_t)imgp->image_header, + (vm_offset_t)imgp->image_header + PAGE_SIZE)) + panic("execve: header dealloc failed (1)"); + + /* set new name to that of the interpreter */ + NDINIT(ndp, LOOKUP, LOCKLEAF | FOLLOW | SAVENAME, + UIO_SYSSPACE, imgp->interpreter_name, p); + goto interpret; + } + break; + } + /* If we made it through all the activators and none matched, exit. */ + if (error == -1) { + error = ENOEXEC; + goto exec_fail_dealloc; + } + + /* + * Copy out strings (args and env) and initialize stack base + */ + stack_base = exec_copyout_strings(imgp); + p->p_vmspace->vm_minsaddr = (char *)stack_base; + + /* + * If custom stack fixup routine present for this process + * let it do the stack setup. 
+ * Else stuff argument count as first item on stack + */ + if (p->p_sysent->sv_fixup) + (*p->p_sysent->sv_fixup)(&stack_base, imgp); + else + suword(--stack_base, imgp->argc); + + /* close files on exec */ + fdcloseexec(p); + + /* reset caught signals */ + execsigs(p); + + /* name this process - nameiexec(p, ndp) */ + len = min(ndp->ni_cnd.cn_namelen,MAXCOMLEN); + bcopy(ndp->ni_cnd.cn_nameptr, p->p_comm, len); + p->p_comm[len] = 0; + + /* + * mark as execed, wakeup the process that vforked (if any) and tell + * it that it now has it's own resources back + */ + p->p_flag |= P_EXEC; + if (p->p_pptr && (p->p_flag & P_PPWAIT)) { + p->p_flag &= ~P_PPWAIT; + wakeup((caddr_t)p->p_pptr); + } + + /* + * Implement image setuid/setgid. Disallow if the process is + * being traced. + */ + if ((attr.va_mode & (VSUID | VSGID)) && + (p->p_flag & P_TRACED) == 0) { + /* + * Turn off syscall tracing for set-id programs, except for + * root. + */ + if (p->p_tracep && suser(p->p_ucred, &p->p_acflag)) { + p->p_traceflag = 0; + vrele(p->p_tracep); + p->p_tracep = NULL; + } + /* + * Set the new credentials. + */ + p->p_ucred = crcopy(p->p_ucred); + if (attr.va_mode & VSUID) + p->p_ucred->cr_uid = attr.va_uid; + if (attr.va_mode & VSGID) + p->p_ucred->cr_groups[0] = attr.va_gid; + p->p_flag |= P_SUGID; + } else { + if (p->p_ucred->cr_uid == p->p_cred->p_ruid && + p->p_ucred->cr_gid == p->p_cred->p_rgid) + p->p_flag &= ~P_SUGID; + } + + /* + * Implement correct POSIX saved-id behavior. + */ + p->p_cred->p_svuid = p->p_ucred->cr_uid; + p->p_cred->p_svgid = p->p_ucred->cr_gid; + + /* + * Store the vp for use in procfs + */ + if (p->p_textvp) /* release old reference */ + vrele(p->p_textvp); + VREF(ndp->ni_vp); + p->p_textvp = ndp->ni_vp; + + /* + * If tracing the process, trap to debugger so breakpoints + * can be set before the program executes. 
+ */ + if (p->p_flag & P_TRACED) + psignal(p, SIGTRAP); + + /* clear "fork but no exec" flag, as we _are_ execing */ + p->p_acflag &= ~AFORK; + + /* Set entry address */ + setregs(p, imgp->entry_addr, (u_long)stack_base); + + /* + * free various allocated resources + */ + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); + if (vm_map_remove(exech_map, (vm_offset_t)imgp->image_header, + (vm_offset_t)imgp->image_header + PAGE_SIZE)) + panic("execve: header dealloc failed (2)"); + vrele(ndp->ni_vp); + FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI); + + return (0); + +exec_fail_dealloc: + if (imgp->stringbase != NULL) + kmem_free_wakeup(exec_map, (vm_offset_t)imgp->stringbase, ARG_MAX); + if (imgp->image_header && imgp->image_header != (char *)-1) + if (vm_map_remove(exech_map, (vm_offset_t)imgp->image_header, + (vm_offset_t)imgp->image_header + PAGE_SIZE)) + panic("execve: header dealloc failed (3)"); + if (ndp->ni_vp) + vrele(ndp->ni_vp); + FREE(ndp->ni_cnd.cn_pnbuf, M_NAMEI); + +exec_fail: + if (imgp->vmspace_destroyed) { + /* sorry, no more process anymore. exit gracefully */ + exit1(p, W_EXITCODE(0, SIGABRT)); + /* NOT REACHED */ + return(0); + } else { + return(error); + } +} + +/* + * Destroy old address space, and allocate a new stack + * The new stack is only SGROWSIZ large because it is grown + * automatically in trap.c. 
+ */ +int +exec_new_vmspace(imgp) + struct image_params *imgp; +{ + int error; + struct vmspace *vmspace = imgp->proc->p_vmspace; + caddr_t stack_addr = (caddr_t) (USRSTACK - SGROWSIZ); + + imgp->vmspace_destroyed = 1; + + /* Blow away entire process VM */ + if (vmspace->vm_shm) + shmexit(imgp->proc); + pmap_remove_pages(&vmspace->vm_pmap, 0, USRSTACK); + vm_map_remove(&vmspace->vm_map, 0, USRSTACK); + + /* Allocate a new stack */ + error = vm_map_find(&vmspace->vm_map, NULL, 0, (vm_offset_t *)&stack_addr, + SGROWSIZ, FALSE, VM_PROT_ALL, VM_PROT_ALL, 0); + if (error) + return(error); + + vmspace->vm_ssize = SGROWSIZ >> PAGE_SHIFT; + + /* Initialize maximum stack address */ + vmspace->vm_maxsaddr = (char *)USRSTACK - MAXSSIZ; + + return(0); +} + +/* + * Copy out argument and environment strings from the old process + * address space into the temporary string buffer. + */ +int +exec_extract_strings(imgp) + struct image_params *imgp; +{ + char **argv, **envv; + char *argp, *envp; + int error, length; + + /* + * extract arguments first + */ + + argv = imgp->uap->argv; + + if (argv) { + while ((argp = (caddr_t) fuword(argv++))) { + if (argp == (caddr_t) -1) + return (EFAULT); + if ((error = copyinstr(argp, imgp->stringp, + imgp->stringspace, &length))) { + if (error == ENAMETOOLONG) + return(E2BIG); + return (error); + } + imgp->stringspace -= length; + imgp->stringp += length; + imgp->argc++; + } + } + + /* + * extract environment strings + */ + + envv = imgp->uap->envv; + + if (envv) { + while ((envp = (caddr_t) fuword(envv++))) { + if (envp == (caddr_t) -1) + return (EFAULT); + if ((error = copyinstr(envp, imgp->stringp, + imgp->stringspace, &length))) { + if (error == ENAMETOOLONG) + return(E2BIG); + return (error); + } + imgp->stringspace -= length; + imgp->stringp += length; + imgp->envc++; + } + } + + return (0); +} + +/* + * Copy strings out to the new process address space, constructing + * new arg and env vector tables. 
Return a pointer to the base + * so that it can be used as the initial stack pointer. + */ +int * +exec_copyout_strings(imgp) + struct image_params *imgp; { + int argc, envc; + char **vectp; + char *stringp, *destp; + int *stack_base; + struct ps_strings *arginfo; + int szsigcode; + + /* + * Calculate string base and vector table pointers. + * Also deal with signal trampoline code for this exec type. + */ + arginfo = PS_STRINGS; + szsigcode = *(imgp->proc->p_sysent->sv_szsigcode); + destp = (caddr_t)arginfo - szsigcode - SPARE_USRSPACE - + roundup((ARG_MAX - imgp->stringspace), sizeof(char *)); + + /* + * install sigcode + */ + if (szsigcode) + copyout(imgp->proc->p_sysent->sv_sigcode, + ((caddr_t)arginfo - szsigcode), szsigcode); + + /* + * If we have a valid auxargs ptr, prepare some room + * on the stack. + */ + if (imgp->auxargs) + /* + * The '+ 2' is for the null pointers at the end of each of the + * arg and env vector sets, and 'AT_COUNT*2' is room for the + * ELF Auxargs data. + */ + vectp = (char **)(destp - (imgp->argc + imgp->envc + 2 + + AT_COUNT*2) * sizeof(char*)); + else + /* + * The '+ 2' is for the null pointers at the end of each of the + * arg and env vector sets + */ + vectp = (char **) + (destp - (imgp->argc + imgp->envc + 2) * sizeof(char*)); /* - * Body deleted. + * vectp also becomes our initial stack base */ - return (ENOSYS); + stack_base = (int *)vectp; + + stringp = imgp->stringbase; + argc = imgp->argc; + envc = imgp->envc; + + /* + * Copy out strings - arguments and environment. + */ + copyout(stringp, destp, ARG_MAX - imgp->stringspace); + + /* + * Fill in "ps_strings" struct for ps, w, etc. + */ + suword(&arginfo->ps_argvstr, (int)vectp); + suword(&arginfo->ps_nargvstr, argc); + + /* + * Fill in argument portion of vector table. 
+ */ + for (; argc > 0; --argc) { + suword(vectp++, (int)destp); + while (*stringp++ != 0) + destp++; + destp++; + } + + /* a null vector table pointer seperates the argp's from the envp's */ + suword(vectp++, 0); + + suword(&arginfo->ps_envstr, (int)vectp); + suword(&arginfo->ps_nenvstr, envc); + + /* + * Fill in environment portion of vector table. + */ + for (; envc > 0; --envc) { + suword(vectp++, (int)destp); + while (*stringp++ != 0) + destp++; + destp++; + } + + /* end of vector table is a null pointer */ + suword(vectp, 0); + + return (stack_base); +} + +/* + * Check permissions of file to execute. + * Return 0 for success or error code on failure. + */ +static int +exec_check_permissions(imgp) + struct image_params *imgp; +{ + struct proc *p = imgp->proc; + struct vnode *vp = imgp->vp; + struct vattr *attr = imgp->attr; + int error; + + /* + * Check number of open-for-writes on the file and deny execution + * if there are any. + */ + if (vp->v_writecount) { + return (ETXTBSY); + } + + /* Get file attributes */ + error = VOP_GETATTR(vp, attr, p->p_ucred, p); + if (error) + return (error); + + /* + * 1) Check if file execution is disabled for the filesystem that this + * file resides on. + * 2) Insure that at least one execute bit is on - otherwise root + * will always succeed, and we don't want to happen unless the + * file really is executable. + * 3) Insure that the file is a regular file. + */ + if ((vp->v_mount->mnt_flag & MNT_NOEXEC) || + ((attr->va_mode & 0111) == 0) || + (attr->va_type != VREG)) { + return (EACCES); + } + + /* + * Zero length files can't be exec'd + */ + if (attr->va_size == 0) + return (ENOEXEC); + + /* + * Disable setuid/setgid if the filesystem prohibits it or if + * the process is being traced. + */ + if ((vp->v_mount->mnt_flag & MNT_NOSUID) || (p->p_flag & P_TRACED)) + attr->va_mode &= ~(VSUID | VSGID); + + /* + * Check for execute permission to file based on current credentials. 
+ * Then call filesystem specific open routine (which does nothing + * in the general case). + */ + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + if (error) + return (error); + + error = VOP_OPEN(vp, FREAD, p->p_ucred, p); + if (error) + return (error); + + return (0); } diff --git a/sys/kern/kern_exit.c b/sys/kern/kern_exit.c index 4ed48ac..2f8074c 100644 --- a/sys/kern/kern_exit.c +++ b/sys/kern/kern_exit.c @@ -35,13 +35,16 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_exit.c 8.10 (Berkeley) 2/23/95 + * @(#)kern_exit.c 8.7 (Berkeley) 2/12/94 + * $Id: kern_exit.c,v 1.45 1997/02/22 09:39:04 peter Exp $ */ +#include "opt_ktrace.h" + #include <sys/param.h> #include <sys/systm.h> -#include <sys/map.h> -#include <sys/ioctl.h> +#include <sys/sysproto.h> +#include <sys/sysent.h> #include <sys/proc.h> #include <sys/tty.h> #include <sys/time.h> @@ -54,31 +57,48 @@ #include <sys/syslog.h> #include <sys/malloc.h> #include <sys/resourcevar.h> +#include <sys/signalvar.h> #include <sys/ptrace.h> +#include <sys/acct.h> /* for acct_process() function prototype */ +#include <sys/filedesc.h> +#include <sys/shm.h> +#include <sys/sem.h> -#include <machine/cpu.h> #ifdef COMPAT_43 #include <machine/reg.h> #include <machine/psl.h> #endif #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> #include <vm/vm_kern.h> -__dead void cpu_exit __P((struct proc *)); -__dead void exit1 __P((struct proc *, int)); +static int wait1 __P((struct proc *, struct wait_args *, int [], int)); + +/* + * callout list for things to do at exit time + */ +typedef struct exit_list_element { + struct exit_list_element *next; + exitlist_fn function; +} *ele_p; + +static ele_p exit_list; /* * exit -- * Death of process. 
*/ -struct rexit_args { - int rval; -}; -__dead void +void exit(p, uap, retval) struct proc *p; - struct rexit_args *uap; + struct rexit_args /* { + int rval; + } */ *uap; int *retval; { @@ -91,21 +111,33 @@ exit(p, uap, retval) * to zombie, and unlink proc from allproc and parent's lists. Save exit * status and rusage for wait(). Check for child processes and orphan them. */ -__dead void +void exit1(p, rv) register struct proc *p; int rv; { register struct proc *q, *nq; - register struct proc **pp; register struct vmspace *vm; + ele_p ep = exit_list; - if (p->p_pid == 1) - panic("init died (signal %d, exit %d)", + if (p->p_pid == 1) { + printf("init died (signal %d, exit %d)\n", WTERMSIG(rv), WEXITSTATUS(rv)); + panic("Going nowhere without my init!"); + } #ifdef PGINPROF vmsizmon(); #endif + /* + * Check if any LKMs need anything done at process exit. + * e.g. SYSV IPC stuff + * XXX what if one of these generates an error? + */ + while (ep) { + (*ep->function)(p); + ep = ep->next; + } + if (p->p_flag & P_PROFIL) stopprofclock(p); MALLOC(p->p_ru, struct rusage *, sizeof(struct rusage), @@ -126,12 +158,21 @@ exit1(p, rv) */ fdfree(p); + /* + * Delete select() buffers + */ + if (p->p_selbits) + free (p->p_selbits, M_SELECT); + + /* + * XXX Shutdown SYSV semaphores + */ + semexit(p); + /* The next two chunks should probably be moved to vmspace_exit. */ vm = p->p_vmspace; -#ifdef SYSVSHM if (vm->vm_shm) shmexit(p); -#endif /* * Release user portion of address space. * This releases references to vnodes, @@ -140,9 +181,12 @@ exit1(p, rv) * Can't free the entire vmspace as the kernel stack * may be mapped within that space also. 
*/ - if (vm->vm_refcnt == 1) + if (vm->vm_refcnt == 1) { + pmap_remove_pages(&vm->vm_pmap, VM_MIN_ADDRESS, + VM_MAXUSER_ADDRESS); (void) vm_map_remove(&vm->vm_map, VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS); + } if (SESS_LEADER(p)) { register struct session *sp = p->p_session; @@ -154,7 +198,7 @@ exit1(p, rv) * drain controlling terminal * and revoke access to controlling terminal. */ - if (sp->s_ttyp->t_session == sp) { + if (sp->s_ttyp && (sp->s_ttyp->t_session == sp)) { if (sp->s_ttyp->t_pgrp) pgsignal(sp->s_ttyp->t_pgrp, SIGHUP, 1); (void) ttywait(sp->s_ttyp); @@ -177,10 +221,15 @@ exit1(p, rv) sp->s_leader = NULL; } fixjobc(p, p->p_pgrp, 0); + if (p->p_limit->p_refcnt > 1 && + (p->p_limit->p_lflags & PL_SHAREMOD) == 0) { + p->p_limit->p_refcnt--; + p->p_limit = limcopy(p->p_limit); + } p->p_rlimit[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; (void)acct_process(p); #ifdef KTRACE - /* + /* * release trace file */ p->p_traceflag = 0; /* don't trace the vrele() */ @@ -244,8 +293,10 @@ exit1(p, rv) * Other substructures are freed from wait(). */ curproc = NULL; - if (--p->p_limit->p_refcnt == 0) + if (--p->p_limit->p_refcnt == 0) { FREE(p->p_limit, M_SUBPROC); + p->p_limit = NULL; + } /* * Finally, call machine-dependent code to release the remaining @@ -253,22 +304,12 @@ exit1(p, rv) * The address space is released by "vmspace_free(p->p_vmspace)"; * This is machine-dependent, as we may have to change stacks * or ensure that the current one isn't reallocated before we - * finish. cpu_exit will end with a call to cpu_swtch(), finishing + * finish. cpu_exit will end with a call to cpu_switch(), finishing * our execution (pun intended). 
*/ cpu_exit(p); } -struct wait_args { - int pid; - int *status; - int options; - struct rusage *rusage; -#ifdef COMPAT_43 - int compat; /* pseudo */ -#endif -}; - #ifdef COMPAT_43 #if defined(hp300) || defined(luna68k) #include <machine/frame.h> @@ -277,48 +318,55 @@ struct wait_args { #define GETPS(rp) (rp)[PS] #endif -compat_43_wait(p, uap, retval) +int +owait(p, uap, retval) struct proc *p; - register struct wait_args *uap; + register struct owait_args /* { + int dummy; + } */ *uap; int *retval; { + struct wait_args w; #ifdef PSL_ALLCC if ((GETPS(p->p_md.md_regs) & PSL_ALLCC) != PSL_ALLCC) { - uap->options = 0; - uap->rusage = NULL; + w.options = 0; + w.rusage = NULL; } else { - uap->options = p->p_md.md_regs[R0]; - uap->rusage = (struct rusage *)p->p_md.md_regs[R1]; + w.options = p->p_md.md_regs[R0]; + w.rusage = (struct rusage *)p->p_md.md_regs[R1]; } #else - uap->options = 0; - uap->rusage = NULL; + w.options = 0; + w.rusage = NULL; #endif - uap->pid = WAIT_ANY; - uap->status = NULL; - uap->compat = 1; - return (wait1(p, uap, retval)); + w.pid = WAIT_ANY; + w.status = NULL; + return (wait1(p, &w, retval, 1)); } +#endif /* COMPAT_43 */ +int wait4(p, uap, retval) struct proc *p; struct wait_args *uap; int *retval; { - uap->compat = 0; - return (wait1(p, uap, retval)); + return (wait1(p, uap, retval, 0)); } -#else -#define wait1 wait4 -#endif -int -wait1(q, uap, retval) +static int +wait1(q, uap, retval, compat) register struct proc *q; - register struct wait_args *uap; + register struct wait_args /* { + int pid; + int *status; + int options; + struct rusage *rusage; + } */ *uap; int retval[]; + int compat; { register int nfound; register struct proc *p, *t; @@ -338,16 +386,22 @@ loop: continue; nfound++; if (p->p_stat == SZOMB) { + /* charge childs scheduling cpu usage to parent */ + if (curproc->p_pid != 1) { + curproc->p_estcpu = min(curproc->p_estcpu + + p->p_estcpu, UCHAR_MAX); + } + retval[0] = p->p_pid; #ifdef COMPAT_43 - if (uap->compat) + if (compat) 
retval[1] = p->p_xstat; else #endif if (uap->status) { status = p->p_xstat; /* convert to int */ - if (error = copyout((caddr_t)&status, - (caddr_t)uap->status, sizeof(status))) + if ((error = copyout((caddr_t)&status, + (caddr_t)uap->status, sizeof(status)))) return (error); } if (uap->rusage && (error = copyout((caddr_t)p->p_ru, @@ -367,6 +421,7 @@ loop: p->p_xstat = 0; ruadd(&q->p_stats->p_cru, p->p_ru); FREE(p->p_ru, M_ZOMBIE); + p->p_ru = NULL; /* * Decrement the count of procs running with this uid. @@ -374,20 +429,21 @@ loop: (void)chgproccnt(p->p_cred->p_ruid, -1); /* + * Release reference to text vnode + */ + if (p->p_textvp) + vrele(p->p_textvp); + + /* * Free up credentials. */ if (--p->p_cred->p_refcnt == 0) { crfree(p->p_cred->pc_ucred); FREE(p->p_cred, M_SUBPROC); + p->p_cred = NULL; } /* - * Release reference to text vnode - */ - if (p->p_textvp) - vrele(p->p_textvp); - - /* * Finally finished with old proc entry. * Unlink it from its process group and free it. */ @@ -410,7 +466,7 @@ loop: p->p_flag |= P_WAITED; retval[0] = p->p_pid; #ifdef COMPAT_43 - if (uap->compat) { + if (compat) { retval[1] = W_STOPCODE(p->p_xstat); error = 0; } else @@ -430,7 +486,7 @@ loop: retval[0] = 0; return (0); } - if (error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0)) + if ((error = tsleep((caddr_t)q, PWAIT | PCATCH, "wait", 0))) return (error); goto loop; } @@ -451,3 +507,57 @@ proc_reparent(child, parent) LIST_INSERT_HEAD(&parent->p_children, child, p_sibling); child->p_pptr = parent; } + +/* + * The next two functions are to handle adding/deleting items on the + * exit callout list + * + * at_exit(): + * Take the arguments given and put them onto the exit callout list, + * However first make sure that it's not already there. + * returns 0 on success. 
+ */ +int +at_exit(exitlist_fn function) +{ + ele_p ep; + + /* Be noisy if the programmer has lost track of things */ + if (rm_at_exit(function)) + printf("exit callout entry already present\n"); + ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->next = exit_list; + ep->function = function; + exit_list = ep; + return (0); +} +/* + * Scan the exit callout list for the given items and remove them. + * Returns the number of items removed. + * Logically this can only be 0 or 1. + */ +int +rm_at_exit(exitlist_fn function) +{ + ele_p *epp, ep; + int count; + + count = 0; + epp = &exit_list; + ep = *epp; + while (ep) { + if (ep->function == function) { + *epp = ep->next; + free(ep, M_TEMP); + count++; + } else { + epp = &ep->next; + } + ep = *epp; + } + return (count); +} + + diff --git a/sys/kern/kern_fork.c b/sys/kern/kern_fork.c index 6c5f22f..8327b81 100644 --- a/sys/kern/kern_fork.c +++ b/sys/kern/kern_fork.c @@ -35,55 +35,104 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)kern_fork.c 8.8 (Berkeley) 2/14/95 + * @(#)kern_fork.c 8.6 (Berkeley) 4/8/94 + * $Id$ */ +#include "opt_ktrace.h" + #include <sys/param.h> #include <sys/systm.h> -#include <sys/map.h> +#include <sys/sysproto.h> #include <sys/filedesc.h> #include <sys/kernel.h> #include <sys/malloc.h> #include <sys/proc.h> #include <sys/resourcevar.h> #include <sys/vnode.h> -#include <sys/file.h> #include <sys/acct.h> #include <sys/ktrace.h> +#include <sys/unistd.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_extern.h> +#include <vm/vm_inherit.h> + +static int fork1 __P((struct proc *p, int flags, int *retval)); + +/* + * These are the stuctures used to create a callout list for things to do + * when forking a process + */ +typedef struct fork_list_element { + struct fork_list_element *next; + forklist_fn function; +} *fle_p; + +static fle_p fork_list; + +#ifndef _SYS_SYSPROTO_H_ +struct fork_args { + int dummy; +}; +#endif /* ARGSUSED */ +int fork(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct fork_args *uap; + int retval[]; { - - return (fork1(p, 0, retval)); + return (fork1(p, (RFFDG|RFPROC), retval)); } /* ARGSUSED */ +int vfork(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct vfork_args *uap; + int retval[]; { + return (fork1(p, (RFFDG|RFPROC|RFPPWAIT), retval)); +} - return (fork1(p, 1, retval)); +/* ARGSUSED */ +int +rfork(p, uap, retval) + struct proc *p; + struct rfork_args *uap; + int retval[]; +{ + return (fork1(p, uap->flags, retval)); } + int nprocs = 1; /* process 0 */ +static int nextpid = 0; -fork1(p1, isvfork, retval) +static int +fork1(p1, flags, retval) register struct proc *p1; - int isvfork; - register_t *retval; + int flags; + int retval[]; { - register struct proc *p2; + register struct proc *p2, *pptr; register uid_t uid; struct proc *newproc; - struct proc **hash; int count; - static int nextpid, 
pidchecked = 0; + static int pidchecked = 0; + fle_p ep ; + + ep = fork_list; + if ((flags & RFPROC) == 0) + return (EINVAL); + if ((flags & (RFFDG|RFCFDG)) == (RFFDG|RFCFDG)) + return (EINVAL); /* * Although process entries are dynamically created, we still keep @@ -97,6 +146,11 @@ fork1(p1, isvfork, retval) tablefull("proc"); return (EAGAIN); } + /* + * Increment the nprocs resource before blocking can occur. There + * are hard-limits as to the number of processes that can run. + */ + nprocs++; /* * Increment the count of procs running with this uid. Don't allow @@ -105,6 +159,10 @@ fork1(p1, isvfork, retval) count = chgproccnt(uid, 1); if (uid != 0 && count > p1->p_rlimit[RLIMIT_NPROC].rlim_cur) { (void)chgproccnt(uid, -1); + /* + * Back out the process count + */ + nprocs--; return (EAGAIN); } @@ -146,7 +204,7 @@ again: } if (p2->p_pid > nextpid && pidchecked > p2->p_pid) pidchecked = p2->p_pid; - if (p2->p_pgrp->pg_id > nextpid && + if (p2->p_pgrp->pg_id > nextpid && pidchecked > p2->p_pgrp->pg_id) pidchecked = p2->p_pgrp->pg_id; } @@ -157,12 +215,10 @@ again: } } - nprocs++; p2 = newproc; p2->p_stat = SIDL; /* protect against others */ p2->p_pid = nextpid; LIST_INSERT_HEAD(&allproc, p2, p_list); - p2->p_forw = p2->p_back = NULL; /* shouldn't be necessary */ LIST_INSERT_HEAD(PIDHASH(p2->p_pid), p2, p_hash); /* @@ -176,6 +232,11 @@ again: (unsigned) ((caddr_t)&p2->p_endcopy - (caddr_t)&p2->p_startcopy)); /* + * XXX: this should be done as part of the startzero above + */ + p2->p_vmspace = 0; /* XXX */ + + /* * Duplicate sub-structures as needed. * Increase reference counts on shared objects. * The p_stats and p_sigacts substructs are set in vm_fork. 
@@ -194,7 +255,13 @@ again: if (p2->p_textvp) VREF(p2->p_textvp); - p2->p_fd = fdcopy(p1); + if (flags & RFCFDG) + p2->p_fd = fdinit(p1); + else if (flags & RFFDG) + p2->p_fd = fdcopy(p1); + else + p2->p_fd = fdshare(p1); + /* * If p_limit is still copy-on-write, bump refcnt, * otherwise get a copy that won't be modified. @@ -208,13 +275,29 @@ again: p2->p_limit->p_refcnt++; } + /* + * Preserve some flags in subprocess. + */ + p2->p_flag |= p1->p_flag & P_SUGID; if (p1->p_session->s_ttyvp != NULL && p1->p_flag & P_CONTROLT) p2->p_flag |= P_CONTROLT; - if (isvfork) + if (flags & RFPPWAIT) p2->p_flag |= P_PPWAIT; LIST_INSERT_AFTER(p1, p2, p_pglist); - p2->p_pptr = p1; - LIST_INSERT_HEAD(&p1->p_children, p2, p_sibling); + + /* + * Attach the new process to its parent. + * + * If RFNOWAIT is set, the newly created process becomes a child + * of init. This effectively disassociates the child from the + * parent. + */ + if (flags & RFNOWAIT) + pptr = initproc; + else + pptr = p1; + p2->p_pptr = pptr; + LIST_INSERT_HEAD(&pptr->p_children, p2, p_sibling); LIST_INIT(&p2->p_children); #ifdef KTRACE @@ -230,10 +313,25 @@ again: #endif /* + * set priority of child to be that of parent + */ + p2->p_estcpu = p1->p_estcpu; + + /* * This begins the section where we must prevent the parent * from being swapped. */ p1->p_flag |= P_NOSWAP; + + /* + * share as much address space as possible + * XXX this should probably go in vm_fork() + */ + if (flags & RFMEM) + (void) vm_map_inherit(&p1->p_vmspace->vm_map, + VM_MIN_ADDRESS, VM_MAXUSER_ADDRESS - MAXSSIZ, + VM_INHERIT_SHARE); + /* * Set return values for child before vm_fork, * so they can be copied to child stack. @@ -244,18 +342,28 @@ again: */ retval[0] = p1->p_pid; retval[1] = 1; - if (vm_fork(p1, p2, isvfork)) { + if (vm_fork(p1, p2)) { /* * Child process. Set start time and get to work. 
*/ - (void) splclock(); - p2->p_stats->p_start = time; + microtime(&runtime); (void) spl0(); + p2->p_stats->p_start = runtime; p2->p_acflag = AFORK; return (0); } /* + * Both processes are set up, now check if any LKMs want + * to adjust anything. + * What if they have an error? XXX + */ + while (ep) { + (*ep->function)(p1, p2, flags); + ep = ep->next; + } + + /* * Make child runnable and add to run queue. */ (void) splhigh(); @@ -273,9 +381,8 @@ again: * child to exec or exit, set P_PPWAIT on child, and sleep on our * proc (in case of exit). */ - if (isvfork) - while (p2->p_flag & P_PPWAIT) - tsleep(p1, PWAIT, "ppwait", 0); + while (p2->p_flag & P_PPWAIT) + tsleep(p1, PWAIT, "ppwait", 0); /* * Return child pid to parent process, @@ -285,3 +392,58 @@ again: retval[1] = 0; return (0); } + +/* + * The next two functionms are general routines to handle adding/deleting + * items on the fork callout list. + * + * at_fork(): + * Take the arguments given and put them onto the fork callout list, + * However first make sure that it's not already there. + * Returns 0 on success or a standard error number. + */ +int +at_fork(forklist_fn function) +{ + fle_p ep; + + /* let the programmer know if he's been stupid */ + if (rm_at_fork(function)) + printf("fork callout entry already present\n"); + ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->next = fork_list; + ep->function = function; + fork_list = ep; + return (0); +} + +/* + * Scan the exit callout list for the given items and remove them. + * Returns the number of items removed. + * Theoretically this value can only be 0 or 1. 
+ */ +int +rm_at_fork(forklist_fn function) +{ + fle_p *epp, ep; + int count; + + count= 0; + epp = &fork_list; + ep = *epp; + while (ep) { + if (ep->function == function) { + *epp = ep->next; + free(ep, M_TEMP); + count++; + } else { + epp = &ep->next; + } + ep = *epp; + } + return (count); +} + + diff --git a/sys/kern/kern_ktrace.c b/sys/kern/kern_ktrace.c index b841754..f8e4e25 100644 --- a/sys/kern/kern_ktrace.c +++ b/sys/kern/kern_ktrace.c @@ -30,33 +30,40 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_ktrace.c 8.5 (Berkeley) 5/14/95 + * @(#)kern_ktrace.c 8.2 (Berkeley) 9/23/93 + * $Id: kern_ktrace.c,v 1.17 1997/02/22 09:39:05 peter Exp $ */ -#ifdef KTRACE +#include "opt_ktrace.h" #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/proc.h> -#include <sys/file.h> +#include <sys/fcntl.h> #include <sys/namei.h> #include <sys/vnode.h> #include <sys/ktrace.h> #include <sys/malloc.h> #include <sys/syslog.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> +#ifdef KTRACE +static struct ktr_header *ktrgetheader __P((int type)); +static void ktrwrite __P((struct vnode *, struct ktr_header *)); +static int ktrcanset __P((struct proc *,struct proc *)); +static int ktrsetchildren __P((struct proc *,struct proc *,int,int,struct vnode *)); +static int ktrops __P((struct proc *,struct proc *,int,int,struct vnode *)); + -struct ktr_header * +static struct ktr_header * ktrgetheader(type) int type; { register struct ktr_header *kth; struct proc *p = curproc; /* XXX */ - MALLOC(kth, struct ktr_header *, sizeof (struct ktr_header), - M_TEMP, M_WAITOK); + MALLOC(kth, struct ktr_header *, sizeof (struct ktr_header), + M_KTRACE, M_WAITOK); kth->ktr_type = type; microtime(&kth->ktr_time); kth->ktr_pid = p->p_pid; @@ -65,31 +72,29 @@ ktrgetheader(type) } void -ktrsyscall(vp, code, argsize, args) +ktrsyscall(vp, code, narg, args) struct vnode *vp; - int code, argsize; - register_t args[]; 
+ int code, narg, args[]; { struct ktr_header *kth; struct ktr_syscall *ktp; - register len = sizeof(struct ktr_syscall) + argsize; + register len = sizeof(struct ktr_syscall) + (narg * sizeof(int)); struct proc *p = curproc; /* XXX */ - register_t *argp; - int i; + int *argp, i; p->p_traceflag |= KTRFAC_ACTIVE; kth = ktrgetheader(KTR_SYSCALL); - MALLOC(ktp, struct ktr_syscall *, len, M_TEMP, M_WAITOK); + MALLOC(ktp, struct ktr_syscall *, len, M_KTRACE, M_WAITOK); ktp->ktr_code = code; - ktp->ktr_argsize = argsize; - argp = (register_t *)((char *)ktp + sizeof(struct ktr_syscall)); - for (i = 0; i < (argsize / sizeof *argp); i++) + ktp->ktr_narg = narg; + argp = (int *)((char *)ktp + sizeof(struct ktr_syscall)); + for (i = 0; i < narg; i++) *argp++ = args[i]; kth->ktr_buf = (caddr_t)ktp; kth->ktr_len = len; ktrwrite(vp, kth); - FREE(ktp, M_TEMP); - FREE(kth, M_TEMP); + FREE(ktp, M_KTRACE); + FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } @@ -112,7 +117,7 @@ ktrsysret(vp, code, error, retval) kth->ktr_len = sizeof(struct ktr_sysret); ktrwrite(vp, kth); - FREE(kth, M_TEMP); + FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } @@ -130,7 +135,7 @@ ktrnamei(vp, path) kth->ktr_buf = path; ktrwrite(vp, kth); - FREE(kth, M_TEMP); + FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } @@ -147,13 +152,13 @@ ktrgenio(vp, fd, rw, iov, len, error) register caddr_t cp; register int resid = len, cnt; struct proc *p = curproc; /* XXX */ - + if (error) return; p->p_traceflag |= KTRFAC_ACTIVE; kth = ktrgetheader(KTR_GENIO); MALLOC(ktp, struct ktr_genio *, sizeof(struct ktr_genio) + len, - M_TEMP, M_WAITOK); + M_KTRACE, M_WAITOK); ktp->ktr_fd = fd; ktp->ktr_rw = rw; cp = (caddr_t)((char *)ktp + sizeof (struct ktr_genio)); @@ -171,8 +176,8 @@ ktrgenio(vp, fd, rw, iov, len, error) ktrwrite(vp, kth); done: - FREE(kth, M_TEMP); - FREE(ktp, M_TEMP); + FREE(kth, M_KTRACE); + FREE(ktp, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } @@ -197,7 +202,7 @@ ktrpsig(vp, sig, 
action, mask, code) kth->ktr_len = sizeof (struct ktr_psig); ktrwrite(vp, kth); - FREE(kth, M_TEMP); + FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } @@ -218,33 +223,38 @@ ktrcsw(vp, out, user) kth->ktr_len = sizeof (struct ktr_csw); ktrwrite(vp, kth); - FREE(kth, M_TEMP); + FREE(kth, M_KTRACE); p->p_traceflag &= ~KTRFAC_ACTIVE; } +#endif /* Interface and common routines */ /* * ktrace system call */ +#ifndef _SYS_SYSPROTO_H_ +struct ktrace_args { + char *fname; + int ops; + int facs; + int pid; +}; +#endif /* ARGSUSED */ int ktrace(curp, uap, retval) struct proc *curp; - register struct ktrace_args /* { - syscallarg(char *) fname; - syscallarg(int) ops; - syscallarg(int) facs; - syscallarg(int) pid; - } */ *uap; - register_t *retval; + register struct ktrace_args *uap; + int *retval; { +#ifdef KTRACE register struct vnode *vp = NULL; register struct proc *p; struct pgrp *pg; - int facs = SCARG(uap, facs) & ~KTRFAC_ROOT; - int ops = KTROP(SCARG(uap, ops)); - int descend = SCARG(uap, ops) & KTRFLAG_DESCEND; + int facs = uap->facs & ~KTRFAC_ROOT; + int ops = KTROP(uap->ops); + int descend = uap->ops & KTRFLAG_DESCEND; int ret = 0; int error = 0; struct nameidata nd; @@ -254,14 +264,14 @@ ktrace(curp, uap, retval) /* * an operation which requires a file argument. 
*/ - NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, fname), - curp); - if (error = vn_open(&nd, FREAD|FWRITE, 0)) { + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, uap->fname, curp); + error = vn_open(&nd, FREAD|FWRITE, 0); + if (error) { curp->p_traceflag &= ~KTRFAC_ACTIVE; return (error); } vp = nd.ni_vp; - VOP_UNLOCK(vp, 0, p); + VOP_UNLOCK(vp, 0, curp); if (vp->v_type != VREG) { (void) vn_close(vp, FREAD|FWRITE, curp->p_ucred, curp); curp->p_traceflag &= ~KTRFAC_ACTIVE; @@ -292,14 +302,14 @@ ktrace(curp, uap, retval) error = EINVAL; goto done; } - /* + /* * do it */ - if (SCARG(uap, pid) < 0) { + if (uap->pid < 0) { /* * by process group */ - pg = pgfind(-SCARG(uap, pid)); + pg = pgfind(-uap->pid); if (pg == NULL) { error = ESRCH; goto done; @@ -307,14 +317,14 @@ ktrace(curp, uap, retval) for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) if (descend) ret |= ktrsetchildren(curp, p, ops, facs, vp); - else + else ret |= ktrops(curp, p, ops, facs, vp); - + } else { /* * by pid */ - p = pfind(SCARG(uap, pid)); + p = pfind(uap->pid); if (p == NULL) { error = ESRCH; goto done; @@ -331,9 +341,48 @@ done: (void) vn_close(vp, FWRITE, curp->p_ucred, curp); curp->p_traceflag &= ~KTRFAC_ACTIVE; return (error); +#else + return ENOSYS; +#endif } +/* + * utrace system call + */ +/* ARGSUSED */ int +utrace(curp, uap, retval) + struct proc *curp; + register struct utrace_args *uap; + int *retval; +{ +#ifdef KTRACE + struct ktr_header *kth; + struct proc *p = curproc; /* XXX */ + register caddr_t cp; + + if (!KTRPOINT(p, KTR_USER)) + return (0); + p->p_traceflag |= KTRFAC_ACTIVE; + kth = ktrgetheader(KTR_USER); + MALLOC(cp, caddr_t, uap->len, M_KTRACE, M_WAITOK); + if (!copyin(uap->addr, cp, uap->len)) { + kth->ktr_buf = cp; + kth->ktr_len = uap->len; + ktrwrite(p->p_tracep, kth); + } + FREE(kth, M_KTRACE); + FREE(cp, M_KTRACE); + p->p_traceflag &= ~KTRFAC_ACTIVE; + + return (0); +#else + return (ENOSYS); +#endif +} + +#ifdef KTRACE +static int ktrops(curp, 
p, ops, facs, vp) struct proc *p, *curp; int ops, facs; @@ -343,7 +392,7 @@ ktrops(curp, p, ops, facs, vp) if (!ktrcanset(curp, p)) return (0); if (ops == KTROP_SET) { - if (p->p_tracep != vp) { + if (p->p_tracep != vp) { /* * if trace file already in use, relinquish */ @@ -355,7 +404,7 @@ ktrops(curp, p, ops, facs, vp) p->p_traceflag |= facs; if (curp->p_ucred->cr_uid == 0) p->p_traceflag |= KTRFAC_ROOT; - } else { + } else { /* KTROP_CLEAR */ if (((p->p_traceflag &= ~facs) & KTRFAC_MASK) == 0) { /* no more tracing */ @@ -370,6 +419,7 @@ ktrops(curp, p, ops, facs, vp) return (1); } +static int ktrsetchildren(curp, top, ops, facs, vp) struct proc *curp, *top; int ops, facs; @@ -401,6 +451,7 @@ ktrsetchildren(curp, top, ops, facs, vp) /*NOTREACHED*/ } +static void ktrwrite(vp, kth) struct vnode *vp; register struct ktr_header *kth; @@ -450,11 +501,12 @@ ktrwrite(vp, kth) * Return true if caller has permission to set the ktracing state * of target. Essentially, the target can't possess any * more permissions than the caller. KTRFAC_ROOT signifies that - * root previously set the tracing status on the target process, and + * root previously set the tracing status on the target process, and * so, only root may further change it. * * TODO: check groups. use caller effective gid. */ +static int ktrcanset(callp, targetp) struct proc *callp, *targetp; { @@ -472,4 +524,4 @@ ktrcanset(callp, targetp) return (0); } -#endif +#endif /* KTRACE */ diff --git a/sys/kern/kern_lkm.c b/sys/kern/kern_lkm.c new file mode 100644 index 0000000..f371c37 --- /dev/null +++ b/sys/kern/kern_lkm.c @@ -0,0 +1,957 @@ +/*- + * Copyright (c) 1992 Terrence R. Lambert. + * Copyright (c) 1994 Christopher G. Demetriou + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Terrence R. Lambert. + * 4. The name Terrence R. Lambert may not be used to endorse or promote + * products derived from this software without specific prior written + * permission. + * + * THIS SOFTWARE IS PROVIDED BY TERRENCE R. LAMBERT ``AS IS'' AND ANY + * EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE TERRENCE R. LAMBERT BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id: kern_lkm.c,v 1.38 1997/03/23 03:36:20 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/tty.h> +#include <sys/conf.h> +#include <sys/fcntl.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#include <sys/mount.h> +#include <sys/sysent.h> +#include <sys/exec.h> +#include <sys/imgact.h> +#include <sys/lkm.h> +#include <sys/vnode.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + + +#define PAGESIZE 1024 /* kmem_alloc() allocation quantum */ + +#define LKM_ALLOC 0x01 +#define LKM_WANT 0x02 + +#define LKMS_IDLE 0x00 +#define LKMS_RESERVED 0x01 +#define LKMS_LOADING 0x02 +#define LKMS_LOADED 0x04 +#define LKMS_UNLOADING 0x08 + +static int lkm_v = 0; +static int lkm_state = LKMS_IDLE; + +#ifndef MAXLKMS +#define MAXLKMS 20 +#endif + +static struct lkm_table lkmods[MAXLKMS]; /* table of loaded modules */ +static struct lkm_table *curp; /* global for in-progress ops */ + +static int _lkm_dev __P((struct lkm_table *lkmtp, int cmd)); +static int _lkm_exec __P((struct lkm_table *lkmtp, int cmd)); +static int _lkm_vfs __P((struct lkm_table *lkmtp, int cmd)); +static int _lkm_syscall __P((struct lkm_table *lkmtp, int cmd)); +static void lkmunreserve __P((void)); + +static d_open_t lkmcopen; +static d_close_t lkmcclose; +static d_ioctl_t lkmcioctl; + +#define CDEV_MAJOR 32 +static struct cdevsw lkmc_cdevsw = + { lkmcopen, lkmcclose, noread, nowrite, /*32*/ + lkmcioctl, nostop, nullreset, nodevtotty, + noselect, nommap, NULL, "lkm", NULL, -1 }; + + +/*ARGSUSED*/ +static int +lkmcopen(dev, flag, devtype, p) + dev_t dev; + int flag; + int devtype; + struct proc *p; +{ + int error; + + if (minor(dev) != 0) + return(ENXIO); /* bad minor # */ + + /* + * Use of the loadable kernel module device must be exclusive; we + * may try to remove this restriction 
later, but it's really no + * hardship. + */ + while (lkm_v & LKM_ALLOC) { + if (flag & FNONBLOCK) /* don't hang */ + return(EBUSY); + lkm_v |= LKM_WANT; + /* + * Sleep pending unlock; we use tsleep() to allow + * an alarm out of the open. + */ + error = tsleep((caddr_t)&lkm_v, TTIPRI|PCATCH, "lkmopn", 0); + if (error) + return(error); /* leave LKM_WANT set -- no problem */ + } + lkm_v |= LKM_ALLOC; + + return(0); /* pseudo-device open */ +} + +/* + * Unreserve the memory associated with the current loaded module; done on + * a coerced close of the lkm device (close on premature exit of modload) + * or explicitly by modload as a result of a link failure. + */ +static void +lkmunreserve() +{ + + if (lkm_state == LKMS_IDLE) + return; + + /* + * Actually unreserve the memory + */ + if (curp && curp->area) { + kmem_free(kernel_map, curp->area, curp->size);/**/ + curp->area = 0; + if (curp->private.lkm_any != NULL) + curp->private.lkm_any = NULL; + } + + lkm_state = LKMS_IDLE; +} + +static int +lkmcclose(dev, flag, mode, p) + dev_t dev; + int flag; + int mode; + struct proc *p; +{ + + if (!(lkm_v & LKM_ALLOC)) { +#ifdef DEBUG + printf("LKM: close before open!\n"); +#endif /* DEBUG */ + return(EBADF); + } + + /* do this before waking the herd... */ + if (curp && !curp->used) { + /* + * If we close before setting used, we have aborted + * by way of error or by way of close-on-exit from + * a premature exit of "modload". 
+ */ + lkmunreserve(); /* coerce state to LKM_IDLE */ + } + + lkm_v &= ~LKM_ALLOC; + wakeup((caddr_t)&lkm_v); /* thundering herd "problem" here */ + + return(0); /* pseudo-device closed */ +} + +/*ARGSUSED*/ +static int +lkmcioctl(dev, cmd, data, flag, p) + dev_t dev; + int cmd; + caddr_t data; + int flag; + struct proc *p; +{ + int err = 0; + int i; + struct lmc_resrv *resrvp; + struct lmc_loadbuf *loadbufp; + struct lmc_unload *unloadp; + struct lmc_stat *statp; + char istr[MAXLKMNAME]; + + switch(cmd) { + case LMRESERV: /* reserve pages for a module */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + resrvp = (struct lmc_resrv *)data; + + /* + * Find a free slot. + */ + for (i = 0; i < MAXLKMS; i++) + if (!lkmods[i].used) + break; + if (i == MAXLKMS) { + err = ENOMEM; /* no slots available */ + break; + } + curp = &lkmods[i]; + curp->id = i; /* self reference slot offset */ + + resrvp->slot = i; /* return slot */ + + /* + * Get memory for module + */ + curp->size = resrvp->size; + + curp->area = kmem_alloc(kernel_map, curp->size);/**/ + + curp->offset = 0; /* load offset */ + + resrvp->addr = curp->area; /* ret kernel addr */ + +#ifdef DEBUG + printf("LKM: LMRESERV (actual = 0x%08x)\n", curp->area); + printf("LKM: LMRESERV (adjusted = 0x%08x)\n", + trunc_page(curp->area)); +#endif /* DEBUG */ + lkm_state = LKMS_RESERVED; + break; + + case LMLOADBUF: /* Copy in; stateful, follows LMRESERV */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + loadbufp = (struct lmc_loadbuf *)data; + i = loadbufp->cnt; + if ((lkm_state != LKMS_RESERVED && lkm_state != LKMS_LOADING) + || i < 0 + || i > MODIOBUF + || i > curp->size - curp->offset) { + err = ENOMEM; + break; + } + + /* copy in buffer full of data */ + err = copyin((caddr_t)loadbufp->data, + (caddr_t)curp->area + curp->offset, i); + if (err) + break; + + if ((curp->offset + i) < curp->size) 
{ + lkm_state = LKMS_LOADING; +#ifdef DEBUG + printf("LKM: LMLOADBUF (loading @ %d of %d, i = %d)\n", + curp->offset, curp->size, i); +#endif /* DEBUG */ + } else { + lkm_state = LKMS_LOADED; +#ifdef DEBUG + printf("LKM: LMLOADBUF (loaded)\n"); +#endif /* DEBUG */ + } + curp->offset += i; + break; + + case LMUNRESRV: /* discard reserved pages for a module */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + lkmunreserve(); /* coerce state to LKM_IDLE */ +#ifdef DEBUG + printf("LKM: LMUNRESERV\n"); +#endif /* DEBUG */ + break; + + case LMREADY: /* module loaded: call entry */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing or insecure */ + return EPERM; + + switch (lkm_state) { + case LKMS_LOADED: + break; + case LKMS_LOADING: + /* The remainder must be bss, so we clear it */ + bzero((caddr_t)curp->area + curp->offset, + curp->size - curp->offset); + break; + default: + +#ifdef DEBUG + printf("lkm_state is %02x\n", lkm_state); +#endif /* DEBUG */ + return ENXIO; + } + + /* XXX gack */ + curp->entry = (int (*) __P((struct lkm_table *, int, int))) + (*((int *)data)); + + /* call entry(load)... (assigns "private" portion) */ + err = (*(curp->entry))(curp, LKM_E_LOAD, LKM_VERSION); + if (err) { + /* + * Module may refuse loading or may have a + * version mismatch... + */ + lkm_state = LKMS_UNLOADING; /* for lkmunreserve */ + lkmunreserve(); /* free memory */ + curp->used = 0; /* free slot */ + break; + } + /* + * It's possible for a user to load a module that doesn't + * initialize itself correctly. (You can even get away with + * using it for a while.) 
Unfortunately, we are faced with + * the following problems: + * - we can't tell a good module from a bad one until + * after we've run its entry function (if the private + * section is uninitalized after we return from the + * entry, then something's fishy) + * - now that we've called the entry function, we can't + * forcibly unload the module without risking a crash + * - since we don't know what the module's entry function + * did, we can't easily clean up the mess it may have + * made, so we can't know just how unstable the system + * may be + * So, being stuck between a rock and a hard place, we + * have no choice but to do this... + */ + if (curp->private.lkm_any == NULL) + panic("loadable module initialization failed"); + + curp->used = 1; +#ifdef DEBUG + printf("LKM: LMREADY\n"); +#endif /* DEBUG */ + lkm_state = LKMS_IDLE; + break; + + case LMUNLOAD: /* unload a module */ + if ((flag & FWRITE) == 0 || securelevel > 0) + /* only allow this if writing and insecure */ + return EPERM; + + unloadp = (struct lmc_unload *)data; + + if ((i = unloadp->id) == -1) { /* unload by name */ + /* + * Copy name and lookup id from all loaded + * modules. May fail. + */ + err =copyinstr(unloadp->name, istr, MAXLKMNAME-1, NULL); + if (err) + break; + + /* + * look up id... 
+ */ + for (i = 0; i < MAXLKMS; i++) { + if (!lkmods[i].used) + continue; + if (!strcmp(istr, + lkmods[i].private.lkm_any->lkm_name)) + break; + } + } + + /* + * Range check the value; on failure, return EINVAL + */ + if (i < 0 || i >= MAXLKMS) { + err = EINVAL; + break; + } + + curp = &lkmods[i]; + + if (!curp->used) { + err = ENOENT; + break; + } + + /* call entry(unload) */ + if ((*(curp->entry))(curp, LKM_E_UNLOAD, LKM_VERSION)) { + err = EBUSY; + break; + } + + lkm_state = LKMS_UNLOADING; /* non-idle for lkmunreserve */ + lkmunreserve(); /* free memory */ + curp->used = 0; /* free slot */ + break; + + case LMSTAT: /* stat a module by id/name */ + /* allow readers and writers to stat */ + + statp = (struct lmc_stat *)data; + + if ((i = statp->id) == -1) { /* stat by name */ + /* + * Copy name and lookup id from all loaded + * modules. + */ + copystr(statp->name, istr, MAXLKMNAME-1, NULL); + /* + * look up id... + */ + for (i = 0; i < MAXLKMS; i++) { + if (!lkmods[i].used) + continue; + if (!strcmp(istr, + lkmods[i].private.lkm_any->lkm_name)) + break; + } + + if (i == MAXLKMS) { /* Not found */ + err = ENOENT; + break; + } + } + + /* + * Range check the value; on failure, return EINVAL + */ + if (i < 0 || i >= MAXLKMS) { + err = EINVAL; + break; + } + + curp = &lkmods[i]; + + if (!curp->used) { /* Not found */ + err = ENOENT; + break; + } + + /* + * Copy out stat information for this module... + */ + statp->id = curp->id; + statp->offset = curp->private.lkm_any->lkm_offset; + statp->type = curp->private.lkm_any->lkm_type; + statp->area = curp->area; + statp->size = curp->size / PAGESIZE; + statp->private = (unsigned long)curp->private.lkm_any; + statp->ver = curp->private.lkm_any->lkm_ver; + copystr(curp->private.lkm_any->lkm_name, + statp->name, + MAXLKMNAME - 2, + NULL); + + break; + + default: /* bad ioctl()... 
*/ + err = ENOTTY; + break; + } + + return (err); +} + +/* + * Acts like "nosys" but can be identified in sysent for dynamic call + * number assignment for a limited number of calls. + * + * Place holder for system call slots reserved for loadable modules. + */ +int +lkmnosys(p, args, retval) + struct proc *p; + struct nosys_args *args; + int *retval; +{ + + return(nosys(p, args, retval)); +} + +int +lkmexists(lkmtp) + struct lkm_table *lkmtp; +{ + int i; + + /* + * see if name exists... + */ + for (i = 0; i < MAXLKMS; i++) { + /* + * An unused module and the one we are testing are not + * considered. + */ + if (!lkmods[i].used || &lkmods[i] == lkmtp) + continue; + if (!strcmp(lkmtp->private.lkm_any->lkm_name, + lkmods[i].private.lkm_any->lkm_name)) + return(1); /* already loaded... */ + } + + return(0); /* module not loaded... */ +} + +/* + * For the loadable system call described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_syscall(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_syscall *args = lkmtp->private.lkm_syscall; + int i; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + if ((i = args->lkm_offset) == -1) { /* auto */ + /* + * Search the table looking for a slot... + */ + for (i = 0; i < aout_sysvec.sv_size; i++) + if (aout_sysvec.sv_table[i].sy_call == + (sy_call_t *)lkmnosys) + break; /* found it! */ + /* out of allocable slots? */ + if (i == aout_sysvec.sv_size) { + err = ENFILE; + break; + } + } else { /* assign */ + if (i < 0 || i >= aout_sysvec.sv_size) { + err = EINVAL; + break; + } + } + + /* save old */ + bcopy(&aout_sysvec.sv_table[i], + &(args->lkm_oldent), + sizeof(struct sysent)); + + /* replace with new */ + bcopy(args->lkm_sysent, + &aout_sysvec.sv_table[i], + sizeof(struct sysent)); + + /* done! 
*/ + args->lkm_offset = i; /* slot in sysent[] */ + + break; + + case LKM_E_UNLOAD: + /* current slot... */ + i = args->lkm_offset; + + /* replace current slot contents with old contents */ + bcopy(&(args->lkm_oldent), + &aout_sysvec.sv_table[i], + sizeof(struct sysent)); + + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + + return(err); +} + +/* + * For the loadable virtual file system described by the structure pointed + * to by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_vfs(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_vfs *args = lkmtp->private.lkm_vfs; + struct vfsconf *vfc = args->lkm_vfsconf; + struct vfsconf *vfsp, *prev_vfsp; + int i, maxtypenum; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + + for (vfsp = vfsconf; vfsp->vfc_next; vfsp = vfsp->vfc_next) { + if (!strcmp(vfc->vfc_name, vfsp->vfc_name)) { + return EEXIST; + } + } + + i = args->lkm_offset = vfc->vfc_typenum; + if (i < 0) { + i = maxvfsconf; + } + args->lkm_offset = vfc->vfc_typenum = i; + + if (maxvfsconf <= i) + maxvfsconf = i + 1; + + vfsp->vfc_next = vfc; + vfc->vfc_next = NULL; + + /* like in vfs_op_init */ + for(i = 0; args->lkm_vnodeops->ls_items[i]; i++) { + const struct vnodeopv_desc *opv = + args->lkm_vnodeops->ls_items[i]; + *(opv->opv_desc_vector_p) = NULL; + } + vfs_opv_init((struct vnodeopv_desc **)args->lkm_vnodeops->ls_items); + + /* + * Call init function for this VFS... + */ + (*(vfc->vfc_vfsops->vfs_init))(vfc); + + /* done! */ + break; + + case LKM_E_UNLOAD: + /* current slot... 
*/ + i = args->lkm_offset; + + prev_vfsp = NULL; + for (vfsp = vfsconf; vfsp; + prev_vfsp = vfsp, vfsp = vfsp->vfc_next) { + if (!strcmp(vfc->vfc_name, vfsp->vfc_name)) + break; + } + if (vfsp == NULL) { + return EINVAL; + } + + if (vfsp->vfc_refcount) { + return EBUSY; + } + + FREE(vfsp, M_VFSCONF); + + prev_vfsp->vfc_next = vfsp->vfc_next; + + /* + * Maintain maxvfsconf. + */ + maxtypenum = 0; + for (vfsp = vfsconf; vfsp != NULL; vfsp = vfsp->vfc_next) + if (maxtypenum < vfsp->vfc_typenum) + maxtypenum = vfsp->vfc_typenum; + maxvfsconf = maxtypenum + 1; + + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + return(err); +} + +/* + * For the loadable device driver described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_dev(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_dev *args = lkmtp->private.lkm_dev; + int i; + dev_t descrip; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + switch(args->lkm_devtype) { + case LM_DT_BLOCK: + if ((i = args->lkm_offset) == -1) + descrip = (dev_t) -1; + else + descrip = makedev(args->lkm_offset,0); + if ( err = bdevsw_add(&descrip, args->lkm_dev.bdev, + &(args->lkm_olddev.bdev))) { + break; + } + args->lkm_offset = major(descrip) ; + break; + + case LM_DT_CHAR: + if ((i = args->lkm_offset) == -1) + descrip = (dev_t) -1; + else + descrip = makedev(args->lkm_offset,0); + if ( err = cdevsw_add(&descrip, args->lkm_dev.cdev, + &(args->lkm_olddev.cdev))) { + break; + } + args->lkm_offset = major(descrip) ; + break; + + default: + err = ENODEV; + break; + } + break; + + case LKM_E_UNLOAD: + /* current slot... 
*/ + i = args->lkm_offset; + descrip = makedev(i,0); + + switch(args->lkm_devtype) { + case LM_DT_BLOCK: + /* replace current slot contents with old contents */ + bdevsw_add(&descrip, args->lkm_olddev.bdev,NULL); + break; + + case LM_DT_CHAR: + /* replace current slot contents with old contents */ + cdevsw_add(&descrip, args->lkm_olddev.cdev,NULL); + break; + + default: + err = ENODEV; + break; + } + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + + return(err); +} + +#ifdef STREAMS +/* + * For the loadable streams module described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_strmod(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_strmod *args = lkmtp->private.lkm_strmod; + int i; + int err = 0; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + break; + + case LKM_E_UNLOAD: + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + + return(err); +} +#endif /* STREAMS */ + +/* + * For the loadable execution class described by the structure pointed to + * by lkmtp, load/unload/stat it depending on the cmd requested. + */ +static int +_lkm_exec(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + struct lkm_exec *args = lkmtp->private.lkm_exec; + int i; + int err = 0; + const struct execsw **execsw = + (const struct execsw **)&execsw_set.ls_items[0]; + + switch(cmd) { + case LKM_E_LOAD: + /* don't load twice! */ + if (lkmexists(lkmtp)) + return(EEXIST); + if ((i = args->lkm_offset) == -1) { /* auto */ + /* + * Search the table looking for a slot... + */ + for (i = 0; execsw[i] != NULL; i++) + if (execsw[i]->ex_imgact == NULL) + break; /* found it! */ + /* out of allocable slots? 
*/ + if (execsw[i] == NULL) { + err = ENFILE; + break; + } + } else { /* assign */ + err = EINVAL; + break; + } + + /* save old */ + bcopy(&execsw[i], &(args->lkm_oldexec), sizeof(struct execsw*)); + + /* replace with new */ + bcopy(&(args->lkm_exec), &execsw[i], sizeof(struct execsw*)); + + /* done! */ + args->lkm_offset = i; /* slot in execsw[] */ + + break; + + case LKM_E_UNLOAD: + /* current slot... */ + i = args->lkm_offset; + + /* replace current slot contents with old contents */ + bcopy(&(args->lkm_oldexec), &execsw[i], sizeof(struct execsw*)); + + break; + + case LKM_E_STAT: /* no special handling... */ + break; + } + return(err); +} + +/* XXX: This is bogus. we should find a better method RSN! */ +static const struct execsw lkm_exec_dummy1 = { NULL, "lkm" }; +static const struct execsw lkm_exec_dummy2 = { NULL, "lkm" }; +static const struct execsw lkm_exec_dummy3 = { NULL, "lkm" }; +static const struct execsw lkm_exec_dummy4 = { NULL, "lkm" }; +TEXT_SET(execsw_set, lkm_exec_dummy1); +TEXT_SET(execsw_set, lkm_exec_dummy2); +TEXT_SET(execsw_set, lkm_exec_dummy3); +TEXT_SET(execsw_set, lkm_exec_dummy4); + +/* + * This code handles the per-module type "wiring-in" of loadable modules + * into existing kernel tables. For "LM_MISC" modules, wiring and unwiring + * is assumed to be done in their entry routines internal to the module + * itself. 
+ */ +int +lkmdispatch(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + int err = 0; /* default = success */ + + switch(lkmtp->private.lkm_any->lkm_type) { + case LM_SYSCALL: + err = _lkm_syscall(lkmtp, cmd); + break; + + case LM_VFS: + err = _lkm_vfs(lkmtp, cmd); + break; + + case LM_DEV: + err = _lkm_dev(lkmtp, cmd); + break; + +#ifdef STREAMS + case LM_STRMOD: + { + struct lkm_strmod *args = lkmtp->private.lkm_strmod; + } + break; + +#endif /* STREAMS */ + + case LM_EXEC: + err = _lkm_exec(lkmtp, cmd); + break; + + case LM_MISC: /* ignore content -- no "misc-specific" procedure */ + if (lkmexists(lkmtp)) + err = EEXIST; + break; + + default: + err = ENXIO; /* unknown type */ + break; + } + + return(err); +} + +int +lkm_nullcmd(lkmtp, cmd) + struct lkm_table *lkmtp; + int cmd; +{ + + return (0); +} + +static lkm_devsw_installed = 0; +#ifdef DEVFS +static void *lkmc_devfs_token; +#endif + +static void lkm_drvinit(void *unused) +{ + dev_t dev; + + if( ! lkm_devsw_installed ) { + dev = makedev(CDEV_MAJOR, 0); + cdevsw_add(&dev,&lkmc_cdevsw, NULL); + lkm_devsw_installed = 1; +#ifdef DEVFS + lkmc_devfs_token = devfs_add_devswf(&lkmc_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0644, + "lkm"); +#endif + } +} + +SYSINIT(lkmdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,lkm_drvinit,NULL) + + diff --git a/sys/kern/kern_lockf.c b/sys/kern/kern_lockf.c new file mode 100644 index 0000000..fb1a8a0 --- /dev/null +++ b/sys/kern/kern_lockf.c @@ -0,0 +1,796 @@ +/* + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Scooter Morris at Genentech Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_lockf.c 8.3 (Berkeley) 1/6/94 + * $Id$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/malloc.h> +#include <sys/fcntl.h> + +#include <sys/lockf.h> + +/* + * This variable controls the maximum number of processes that will + * be checked in doing deadlock detection. 
+ */ +static int maxlockdepth = MAXDEPTH; + +#ifdef LOCKF_DEBUG +#include <sys/kernel.h> +#include <sys/sysctl.h> + +#include <vm/vm.h> + +#include <ufs/ufs/quota.h> +#include <ufs/ufs/inode.h> + +int lockf_debug = 0; +SYSCTL_INT(_debug, 4, lockf_debug, CTLFLAG_RW, &lockf_debug, 0, ""); +#endif + +#define NOLOCKF (struct lockf *)0 +#define SELF 0x1 +#define OTHERS 0x2 +static int lf_clearlock __P((struct lockf *)); +static int lf_findoverlap __P((struct lockf *, + struct lockf *, int, struct lockf ***, struct lockf **)); +static struct lockf * + lf_getblock __P((struct lockf *)); +static int lf_getlock __P((struct lockf *, struct flock *)); +static int lf_setlock __P((struct lockf *)); +static void lf_split __P((struct lockf *, struct lockf *)); +static void lf_wakelock __P((struct lockf *)); + +/* + * Advisory record locking support + */ +int +lf_advlock(ap, head, size) + struct vop_advlock_args /* { + struct vnode *a_vp; + caddr_t a_id; + int a_op; + struct flock *a_fl; + int a_flags; + } */ *ap; + struct lockf **head; + u_quad_t size; +{ + register struct flock *fl = ap->a_fl; + register struct lockf *lock; + off_t start, end; + int error; + + /* + * Convert the flock structure into a start and end. + */ + switch (fl->l_whence) { + + case SEEK_SET: + case SEEK_CUR: + /* + * Caller is responsible for adding any necessary offset + * when SEEK_CUR is used. + */ + start = fl->l_start; + break; + + case SEEK_END: + start = size + fl->l_start; + break; + + default: + return (EINVAL); + } + if (start < 0) + return (EINVAL); + if (fl->l_len == 0) + end = -1; + else { + end = start + fl->l_len - 1; + if (end < start) + return (EINVAL); + } + /* + * Avoid the common case of unlocking when inode has no locks. 
+ */ + if (*head == (struct lockf *)0) { + if (ap->a_op != F_SETLK) { + fl->l_type = F_UNLCK; + return (0); + } + } + /* + * Create the lockf structure + */ + MALLOC(lock, struct lockf *, sizeof *lock, M_LOCKF, M_WAITOK); + lock->lf_start = start; + lock->lf_end = end; + lock->lf_id = ap->a_id; +/* lock->lf_inode = ip; */ /* XXX JH */ + lock->lf_type = fl->l_type; + lock->lf_head = head; + lock->lf_next = (struct lockf *)0; + TAILQ_INIT(&lock->lf_blkhd); + lock->lf_flags = ap->a_flags; + /* + * Do the requested operation. + */ + switch(ap->a_op) { + case F_SETLK: + return (lf_setlock(lock)); + + case F_UNLCK: + error = lf_clearlock(lock); + FREE(lock, M_LOCKF); + return (error); + + case F_GETLK: + error = lf_getlock(lock, fl); + FREE(lock, M_LOCKF); + return (error); + + default: + free(lock, M_LOCKF); + return (EINVAL); + } + /* NOTREACHED */ +} + +/* + * Set a byte-range lock. + */ +static int +lf_setlock(lock) + register struct lockf *lock; +{ + register struct lockf *block; + struct lockf **head = lock->lf_head; + struct lockf **prev, *overlap, *ltmp; + static char lockstr[] = "lockf"; + int ovcase, priority, needtolink, error; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_setlock", lock); +#endif /* LOCKF_DEBUG */ + + /* + * Set the priority + */ + priority = PLOCK; + if (lock->lf_type == F_WRLCK) + priority += 4; + priority |= PCATCH; + /* + * Scan lock list for this file looking for locks that would block us. + */ + while ((block = lf_getblock(lock))) { + /* + * Free the structure and return if nonblocking. + */ + if ((lock->lf_flags & F_WAIT) == 0) { + FREE(lock, M_LOCKF); + return (EAGAIN); + } + /* + * We are blocked. Since flock style locks cover + * the whole file, there is no chance for deadlock. + * For byte-range locks we must check for deadlock. + * + * Deadlock detection is done by looking through the + * wait channels to see if there are any cycles that + * involve us. 
MAXDEPTH is set just to make sure we + * do not go off into neverland. + */ + if ((lock->lf_flags & F_POSIX) && + (block->lf_flags & F_POSIX)) { + register struct proc *wproc; + register struct lockf *waitblock; + int i = 0; + + /* The block is waiting on something */ + wproc = (struct proc *)block->lf_id; + while (wproc->p_wchan && + (wproc->p_wmesg == lockstr) && + (i++ < maxlockdepth)) { + waitblock = (struct lockf *)wproc->p_wchan; + /* Get the owner of the blocking lock */ + waitblock = waitblock->lf_next; + if ((waitblock->lf_flags & F_POSIX) == 0) + break; + wproc = (struct proc *)waitblock->lf_id; + if (wproc == (struct proc *)lock->lf_id) { + free(lock, M_LOCKF); + return (EDEADLK); + } + } + } + /* + * For flock type locks, we must first remove + * any shared locks that we hold before we sleep + * waiting for an exclusive lock. + */ + if ((lock->lf_flags & F_FLOCK) && + lock->lf_type == F_WRLCK) { + lock->lf_type = F_UNLCK; + (void) lf_clearlock(lock); + lock->lf_type = F_WRLCK; + } + /* + * Add our lock to the blocked list and sleep until we're free. + * Remember who blocked us (for deadlock detection). + */ + lock->lf_next = block; + TAILQ_INSERT_TAIL(&block->lf_blkhd, lock, lf_block); +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: blocking on", block); + lf_printlist("lf_setlock", block); + } +#endif /* LOCKF_DEBUG */ + if ((error = tsleep((caddr_t)lock, priority, lockstr, 0))) { + /* + * We may have been awakened by a signal (in + * which case we must remove ourselves from the + * blocked list) and/or by another process + * releasing a lock (in which case we have already + * been removed from the blocked list and our + * lf_next field set to NOLOCKF). + */ + if (lock->lf_next) + TAILQ_REMOVE(&lock->lf_next->lf_blkhd, lock, + lf_block); + free(lock, M_LOCKF); + return (error); + } + } + /* + * No blocks!! Add the lock. Note that we will + * downgrade or upgrade any overlapping locks this + * process already owns. 
+ * + * Skip over locks owned by other processes. + * Handle any locks that overlap and are owned by ourselves. + */ + prev = head; + block = *head; + needtolink = 1; + for (;;) { + ovcase = lf_findoverlap(block, lock, SELF, &prev, &overlap); + if (ovcase) + block = overlap->lf_next; + /* + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + switch (ovcase) { + case 0: /* no overlap */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + break; + + case 1: /* overlap == lock */ + /* + * If downgrading lock, others may be + * able to acquire it. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) + lf_wakelock(overlap); + overlap->lf_type = lock->lf_type; + FREE(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + + case 2: /* overlap contains lock */ + /* + * Check for common starting point and different types. + */ + if (overlap->lf_type == lock->lf_type) { + free(lock, M_LOCKF); + lock = overlap; /* for debug output below */ + break; + } + if (overlap->lf_start == lock->lf_start) { + *prev = lock; + lock->lf_next = overlap; + overlap->lf_start = lock->lf_end + 1; + } else + lf_split(overlap, lock); + lf_wakelock(overlap); + break; + + case 3: /* lock contains overlap */ + /* + * If downgrading lock, others may be able to + * acquire it, otherwise take the list. + */ + if (lock->lf_type == F_RDLCK && + overlap->lf_type == F_WRLCK) { + lf_wakelock(overlap); + } else { + while (ltmp = overlap->lf_blkhd.tqh_first) { + TAILQ_REMOVE(&overlap->lf_blkhd, ltmp, + lf_block); + TAILQ_INSERT_TAIL(&lock->lf_blkhd, + ltmp, lf_block); + } + } + /* + * Add the new lock if necessary and delete the overlap. 
+ */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap->lf_next; + prev = &lock->lf_next; + needtolink = 0; + } else + *prev = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + /* + * Add lock after overlap on the list. + */ + lock->lf_next = overlap->lf_next; + overlap->lf_next = lock; + overlap->lf_end = lock->lf_start - 1; + prev = &lock->lf_next; + lf_wakelock(overlap); + needtolink = 0; + continue; + + case 5: /* overlap ends after lock */ + /* + * Add the new lock before overlap. + */ + if (needtolink) { + *prev = lock; + lock->lf_next = overlap; + } + overlap->lf_start = lock->lf_end + 1; + lf_wakelock(overlap); + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) { + lf_print("lf_setlock: got the lock", lock); + lf_printlist("lf_setlock", lock); + } +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Remove a byte-range lock on an inode. + * + * Generally, find the lock (or an overlap to that lock) + * and remove it (or shrink it), then wakeup anyone we can. + */ +static int +lf_clearlock(unlock) + register struct lockf *unlock; +{ + struct lockf **head = unlock->lf_head; + register struct lockf *lf = *head; + struct lockf *overlap, **prev; + int ovcase; + + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (unlock->lf_type != F_UNLCK) + panic("lf_clearlock: bad type"); + if (lockf_debug & 1) + lf_print("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + prev = head; + while ((ovcase = lf_findoverlap(lf, unlock, SELF, &prev, &overlap))) { + /* + * Wakeup the list of locks to be retried. 
+ */ + lf_wakelock(overlap); + + switch (ovcase) { + + case 1: /* overlap == lock */ + *prev = overlap->lf_next; + FREE(overlap, M_LOCKF); + break; + + case 2: /* overlap contains lock: split it */ + if (overlap->lf_start == unlock->lf_start) { + overlap->lf_start = unlock->lf_end + 1; + break; + } + lf_split(overlap, unlock); + overlap->lf_next = unlock->lf_next; + break; + + case 3: /* lock contains overlap */ + *prev = overlap->lf_next; + lf = overlap->lf_next; + free(overlap, M_LOCKF); + continue; + + case 4: /* overlap starts before lock */ + overlap->lf_end = unlock->lf_start - 1; + prev = &overlap->lf_next; + lf = overlap->lf_next; + continue; + + case 5: /* overlap ends after lock */ + overlap->lf_start = unlock->lf_end + 1; + break; + } + break; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_printlist("lf_clearlock", unlock); +#endif /* LOCKF_DEBUG */ + return (0); +} + +/* + * Check whether there is a blocking lock, + * and if so return its process identifier. + */ +static int +lf_getlock(lock, fl) + register struct lockf *lock; + register struct flock *fl; +{ + register struct lockf *block; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 1) + lf_print("lf_getlock", lock); +#endif /* LOCKF_DEBUG */ + + if ((block = lf_getblock(lock))) { + fl->l_type = block->lf_type; + fl->l_whence = SEEK_SET; + fl->l_start = block->lf_start; + if (block->lf_end == -1) + fl->l_len = 0; + else + fl->l_len = block->lf_end - block->lf_start + 1; + if (block->lf_flags & F_POSIX) + fl->l_pid = ((struct proc *)(block->lf_id))->p_pid; + else + fl->l_pid = -1; + } else { + fl->l_type = F_UNLCK; + } + return (0); +} + +/* + * Walk the list of locks for an inode and + * return the first blocking lock. 
+ */ +static struct lockf * +lf_getblock(lock) + register struct lockf *lock; +{ + struct lockf **prev, *overlap, *lf = *(lock->lf_head); + int ovcase; + + prev = lock->lf_head; + while ((ovcase = lf_findoverlap(lf, lock, OTHERS, &prev, &overlap))) { + /* + * We've found an overlap, see if it blocks us + */ + if ((lock->lf_type == F_WRLCK || overlap->lf_type == F_WRLCK)) + return (overlap); + /* + * Nope, point to the next one on the list and + * see if it blocks us + */ + lf = overlap->lf_next; + } + return (NOLOCKF); +} + +/* + * Walk the list of locks for an inode to + * find an overlapping lock (if any). + * + * NOTE: this returns only the FIRST overlapping lock. There + * may be more than one. + */ +static int +lf_findoverlap(lf, lock, type, prev, overlap) + register struct lockf *lf; + struct lockf *lock; + int type; + struct lockf ***prev; + struct lockf **overlap; +{ + off_t start, end; + + *overlap = lf; + if (lf == NOLOCKF) + return (0); +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_findoverlap: looking for overlap in", lock); +#endif /* LOCKF_DEBUG */ + start = lock->lf_start; + end = lock->lf_end; + while (lf != NOLOCKF) { + if (((type & SELF) && lf->lf_id != lock->lf_id) || + ((type & OTHERS) && lf->lf_id == lock->lf_id)) { + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("\tchecking", lf); +#endif /* LOCKF_DEBUG */ + /* + * OK, check for overlap + * + * Six cases: + * 0) no overlap + * 1) overlap == lock + * 2) overlap contains lock + * 3) lock contains overlap + * 4) overlap starts before lock + * 5) overlap ends after lock + */ + if ((lf->lf_end != -1 && start > lf->lf_end) || + (end != -1 && lf->lf_start > end)) { + /* Case 0 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("no overlap\n"); +#endif /* LOCKF_DEBUG */ + if ((type & SELF) && end != -1 && lf->lf_start > end) + return (0); + *prev = &lf->lf_next; + *overlap = lf = lf->lf_next; + continue; + } + 
if ((lf->lf_start == start) && (lf->lf_end == end)) { + /* Case 1 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap == lock\n"); +#endif /* LOCKF_DEBUG */ + return (1); + } + if ((lf->lf_start <= start) && + (end != -1) && + ((lf->lf_end >= end) || (lf->lf_end == -1))) { + /* Case 2 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap contains lock\n"); +#endif /* LOCKF_DEBUG */ + return (2); + } + if (start <= lf->lf_start && + (end == -1 || + (lf->lf_end != -1 && end >= lf->lf_end))) { + /* Case 3 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("lock contains overlap\n"); +#endif /* LOCKF_DEBUG */ + return (3); + } + if ((lf->lf_start < start) && + ((lf->lf_end >= start) || (lf->lf_end == -1))) { + /* Case 4 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap starts before lock\n"); +#endif /* LOCKF_DEBUG */ + return (4); + } + if ((lf->lf_start > start) && + (end != -1) && + ((lf->lf_end > end) || (lf->lf_end == -1))) { + /* Case 5 */ +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + printf("overlap ends after lock\n"); +#endif /* LOCKF_DEBUG */ + return (5); + } + panic("lf_findoverlap: default"); + } + return (0); +} + +/* + * Split a lock and a contained region into + * two or three locks as necessary. + */ +static void +lf_split(lock1, lock2) + register struct lockf *lock1; + register struct lockf *lock2; +{ + register struct lockf *splitlock; + +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) { + lf_print("lf_split", lock1); + lf_print("splitting from", lock2); + } +#endif /* LOCKF_DEBUG */ + /* + * Check to see if spliting into only two pieces. 
+ */ + if (lock1->lf_start == lock2->lf_start) { + lock1->lf_start = lock2->lf_end + 1; + lock2->lf_next = lock1; + return; + } + if (lock1->lf_end == lock2->lf_end) { + lock1->lf_end = lock2->lf_start - 1; + lock2->lf_next = lock1->lf_next; + lock1->lf_next = lock2; + return; + } + /* + * Make a new lock consisting of the last part of + * the encompassing lock + */ + MALLOC(splitlock, struct lockf *, sizeof *splitlock, M_LOCKF, M_WAITOK); + bcopy((caddr_t)lock1, (caddr_t)splitlock, sizeof *splitlock); + splitlock->lf_start = lock2->lf_end + 1; + TAILQ_INIT(&splitlock->lf_blkhd); + lock1->lf_end = lock2->lf_start - 1; + /* + * OK, now link it in + */ + splitlock->lf_next = lock1->lf_next; + lock2->lf_next = splitlock; + lock1->lf_next = lock2; +} + +/* + * Wakeup a blocklist + */ +static void +lf_wakelock(listhead) + struct lockf *listhead; +{ + register struct lockf *wakelock; + + while (wakelock = listhead->lf_blkhd.tqh_first) { + TAILQ_REMOVE(&listhead->lf_blkhd, wakelock, lf_block); + wakelock->lf_next = NOLOCKF; +#ifdef LOCKF_DEBUG + if (lockf_debug & 2) + lf_print("lf_wakelock: awakening", wakelock); +#endif /* LOCKF_DEBUG */ + wakeup((caddr_t)wakelock); + } +} + +#ifdef LOCKF_DEBUG +/* + * Print out a lock. + */ +void +lf_print(tag, lock) + char *tag; + register struct lockf *lock; +{ + + printf("%s: lock 0x%lx for ", tag, lock); + if (lock->lf_flags & F_POSIX) + printf("proc %d", ((struct proc *)(lock->lf_id))->p_pid); + else + printf("id 0x%x", lock->lf_id); + printf(" in ino %d on dev <%d, %d>, %s, start %d, end %d", + lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev), + lock->lf_type == F_RDLCK ? "shared" : + lock->lf_type == F_WRLCK ? "exclusive" : + lock->lf_type == F_UNLCK ? 
"unlock" : + "unknown", lock->lf_start, lock->lf_end); + if (lock->lf_blkhd.tqh_first) + printf(" block 0x%x\n", lock->lf_blkhd.tqh_first); + else + printf("\n"); +} + +void +lf_printlist(tag, lock) + char *tag; + struct lockf *lock; +{ + register struct lockf *lf, *blk; + + printf("%s: Lock list for ino %d on dev <%d, %d>:\n", + tag, lock->lf_inode->i_number, + major(lock->lf_inode->i_dev), + minor(lock->lf_inode->i_dev)); + for (lf = lock->lf_inode->i_lockf; lf; lf = lf->lf_next) { + printf("\tlock 0x%lx for ", lf); + if (lf->lf_flags & F_POSIX) + printf("proc %d", ((struct proc *)(lf->lf_id))->p_pid); + else + printf("id 0x%x", lf->lf_id); + printf(", %s, start %d, end %d", + lf->lf_type == F_RDLCK ? "shared" : + lf->lf_type == F_WRLCK ? "exclusive" : + lf->lf_type == F_UNLCK ? "unlock" : + "unknown", lf->lf_start, lf->lf_end); + for (blk = lf->lf_blkhd.tqh_first; blk; + blk = blk->lf_block.tqe_next) { + printf("\n\t\tlock request 0x%lx for ", blk); + if (blk->lf_flags & F_POSIX) + printf("proc %d", + ((struct proc *)(blk->lf_id))->p_pid); + else + printf("id 0x%x", blk->lf_id); + printf(", %s, start %d, end %d", + blk->lf_type == F_RDLCK ? "shared" : + blk->lf_type == F_WRLCK ? "exclusive" : + blk->lf_type == F_UNLCK ? "unlock" : + "unknown", blk->lf_start, blk->lf_end); + if (blk->lf_blkhd.tqh_first) + panic("lf_printlist: bad list"); + } + printf("\n"); + } +} +#endif /* LOCKF_DEBUG */ diff --git a/sys/kern/kern_malloc.c b/sys/kern/kern_malloc.c index 363cde5..94c6b4e 100644 --- a/sys/kern/kern_malloc.c +++ b/sys/kern/kern_malloc.c @@ -30,19 +30,27 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)kern_malloc.c 8.4 (Berkeley) 5/20/95 + * @(#)kern_malloc.c 8.3 (Berkeley) 1/4/94 + * $Id$ */ #include <sys/param.h> +#include <sys/systm.h> #include <sys/proc.h> -#include <sys/map.h> #include <sys/kernel.h> #include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/vmmeter.h> #include <vm/vm.h> +#include <vm/vm_param.h> #include <vm/vm_kern.h> +#include <vm/vm_extern.h> -struct kmembuckets bucket[MINBUCKET + 16]; +static void kmeminit __P((void *)); +SYSINIT(kmem, SI_SUB_KMEM, SI_ORDER_FIRST, kmeminit, NULL) + +static struct kmembuckets bucket[MINBUCKET + 16]; struct kmemstats kmemstats[M_LAST]; struct kmemusage *kmemusage; char *kmembase, *kmemlimit; @@ -52,7 +60,7 @@ char *memname[] = INITKMEMNAMES; /* * This structure provides a set of masks to catch unaligned frees. */ -long addrmask[] = { 0, +static long addrmask[] = { 0, 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, @@ -63,8 +71,8 @@ long addrmask[] = { 0, * The WEIRD_ADDR is used as known text to copy into free objects so * that modifications after frees can be detected. */ -#define WEIRD_ADDR 0xdeadbeef -#define MAX_COPY 32 +#define WEIRD_ADDR 0xdeadc0de +#define MAX_COPY 64 /* * Normally the first word of the structure is used to hold the list @@ -103,9 +111,6 @@ malloc(size, type, flags) int copysize; char *savedtype; #endif -#ifdef DEBUG - extern int simplelockrecurse; -#endif #ifdef KMEMSTATS register struct kmemstats *ksp = &kmemstats[type]; @@ -114,7 +119,7 @@ malloc(size, type, flags) #endif indx = BUCKETINDX(size); kbp = &bucket[indx]; - s = splimp(); + s = splhigh(); #ifdef KMEMSTATS while (ksp->ks_memuse >= ksp->ks_limit) { if (flags & M_NOWAIT) { @@ -130,25 +135,16 @@ malloc(size, type, flags) #ifdef DIAGNOSTIC copysize = 1 << indx < MAX_COPY ? 
1 << indx : MAX_COPY; #endif -#ifdef DEBUG - if (flags & M_NOWAIT) - simplelockrecurse++; -#endif if (kbp->kb_next == NULL) { kbp->kb_last = NULL; if (size > MAXALLOCSAVE) - allocsize = roundup(size, CLBYTES); + allocsize = roundup(size, PAGE_SIZE); else allocsize = 1 << indx; - npg = clrnd(btoc(allocsize)); - va = (caddr_t) kmem_malloc(kmem_map, (vm_size_t)ctob(npg), - !(flags & M_NOWAIT)); + npg = btoc(allocsize); + va = (caddr_t) kmem_malloc(kmem_map, (vm_size_t)ctob(npg), flags); if (va == NULL) { splx(s); -#ifdef DEBUG - if (flags & M_NOWAIT) - simplelockrecurse--; -#endif return ((void *) NULL); } #ifdef KMEMSTATS @@ -175,7 +171,7 @@ malloc(size, type, flags) * bucket, don't assume the list is still empty. */ savedlist = kbp->kb_next; - kbp->kb_next = cp = va + (npg * NBPG) - allocsize; + kbp->kb_next = cp = va + (npg * PAGE_SIZE) - allocsize; for (;;) { freep = (struct freelist *)cp; #ifdef DIAGNOSTIC @@ -205,7 +201,7 @@ malloc(size, type, flags) memname[freep->type] : "???"; if (kbp->kb_next && !kernacc(kbp->kb_next, sizeof(struct freelist), 0)) { - printf("%s of object 0x%x size %d %s %s (invalid addr 0x%x)\n", + printf("%s of object %p size %ld %s %s (invalid addr %p)\n", "Data modified on freelist: word 2.5", va, size, "previous type", savedtype, kbp->kb_next); kbp->kb_next = NULL; @@ -224,7 +220,7 @@ malloc(size, type, flags) for (lp = (long *)va; lp < end; lp++) { if (*lp == WEIRD_ADDR) continue; - printf("%s %d of object 0x%x size %d %s %s (0x%x != 0x%x)\n", + printf("%s %d of object %p size %ld %s %s (0x%lx != 0x%x)\n", "Data modified on freelist: word", lp - (long *)va, va, size, "previous type", savedtype, *lp, WEIRD_ADDR); break; @@ -250,10 +246,6 @@ out: out: #endif splx(s); -#ifdef DEBUG - if (flags & M_NOWAIT) - simplelockrecurse--; -#endif return ((void *) va); } @@ -271,34 +263,42 @@ free(addr, type) long size; int s; #ifdef DIAGNOSTIC - caddr_t cp; + struct freelist *fp; long *end, *lp, alloc, copysize; #endif #ifdef KMEMSTATS register 
struct kmemstats *ksp = &kmemstats[type]; #endif +#ifdef DIAGNOSTIC + if ((char *)addr < kmembase || (char *)addr >= kmemlimit) { + panic("free: address 0x%x out of range", addr); + } + if ((u_long)type > M_LAST) { + panic("free: type %d out of range", type); + } +#endif kup = btokup(addr); size = 1 << kup->ku_indx; kbp = &bucket[kup->ku_indx]; - s = splimp(); + s = splhigh(); #ifdef DIAGNOSTIC /* * Check for returns of data that do not point to the * beginning of the allocation. */ - if (size > NBPG * CLSIZE) - alloc = addrmask[BUCKETINDX(NBPG * CLSIZE)]; + if (size > PAGE_SIZE) + alloc = addrmask[BUCKETINDX(PAGE_SIZE)]; else alloc = addrmask[kup->ku_indx]; if (((u_long)addr & alloc) != 0) - panic("free: unaligned addr 0x%x, size %d, type %s, mask %d\n", + panic("free: unaligned addr 0x%x, size %d, type %s, mask %d", addr, size, memname[type], alloc); #endif /* DIAGNOSTIC */ if (size > MAXALLOCSAVE) { kmem_free(kmem_map, (vm_offset_t)addr, ctob(kup->ku_pagecnt)); #ifdef KMEMSTATS - size = kup->ku_pagecnt << PGSHIFT; + size = kup->ku_pagecnt << PAGE_SHIFT; ksp->ks_memuse -= size; kup->ku_indx = 0; kup->ku_pagecnt = 0; @@ -318,11 +318,16 @@ free(addr, type) * it looks free before laboriously searching the freelist. 
*/ if (freep->spare0 == WEIRD_ADDR) { - for (cp = kbp->kb_next; cp; cp = *(caddr_t *)cp) { - if (addr != cp) - continue; - printf("multiply freed item 0x%x\n", addr); - panic("free: duplicated free"); + fp = (struct freelist *)kbp->kb_next; + while (fp) { + if (fp->spare0 != WEIRD_ADDR) { + printf("trashed free item %p\n", fp); + panic("free: free item modified"); + } else if (addr == (caddr_t)fp) { + printf("multiple freed item %p\n", addr); + panic("free: multiple free"); + } + fp = (struct freelist *)fp->next; } } /* @@ -351,46 +356,75 @@ free(addr, type) wakeup((caddr_t)ksp); ksp->ks_inuse--; #endif +#ifdef OLD_MALLOC_MEMORY_POLICY if (kbp->kb_next == NULL) kbp->kb_next = addr; else ((struct freelist *)kbp->kb_last)->next = addr; freep->next = NULL; kbp->kb_last = addr; +#else + /* + * Return memory to the head of the queue for quick reuse. This + * can improve performance by improving the probability of the + * item being in the cache when it is reused. + */ + if (kbp->kb_next == NULL) { + kbp->kb_next = addr; + kbp->kb_last = addr; + freep->next = NULL; + } else { + freep->next = kbp->kb_next; + kbp->kb_next = addr; + } +#endif splx(s); } /* * Initialize the kernel memory allocator */ -kmeminit() +/* ARGSUSED*/ +static void +kmeminit(dummy) + void *dummy; { register long indx; int npg; #if ((MAXALLOCSAVE & (MAXALLOCSAVE - 1)) != 0) - ERROR!_kmeminit:_MAXALLOCSAVE_not_power_of_2 +#error "kmeminit: MAXALLOCSAVE not power of 2" #endif #if (MAXALLOCSAVE > MINALLOCSIZE * 32768) - ERROR!_kmeminit:_MAXALLOCSAVE_too_big +#error "kmeminit: MAXALLOCSAVE too big" #endif -#if (MAXALLOCSAVE < CLBYTES) - ERROR!_kmeminit:_MAXALLOCSAVE_too_small +#if (MAXALLOCSAVE < PAGE_SIZE) +#error "kmeminit: MAXALLOCSAVE too small" #endif - npg = VM_KMEM_SIZE/ NBPG; + npg = (nmbufs * MSIZE + nmbclusters * MCLBYTES + VM_KMEM_SIZE) + / PAGE_SIZE; + kmemusage = (struct kmemusage *) kmem_alloc(kernel_map, (vm_size_t)(npg * sizeof(struct kmemusage))); kmem_map = kmem_suballoc(kernel_map, 
(vm_offset_t *)&kmembase, - (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * NBPG), FALSE); + (vm_offset_t *)&kmemlimit, (vm_size_t)(npg * PAGE_SIZE), + FALSE); #ifdef KMEMSTATS for (indx = 0; indx < MINBUCKET + 16; indx++) { - if (1 << indx >= CLBYTES) + if (1 << indx >= PAGE_SIZE) bucket[indx].kb_elmpercl = 1; else - bucket[indx].kb_elmpercl = CLBYTES / (1 << indx); + bucket[indx].kb_elmpercl = PAGE_SIZE / (1 << indx); bucket[indx].kb_highwat = 5 * bucket[indx].kb_elmpercl; } - for (indx = 0; indx < M_LAST; indx++) - kmemstats[indx].ks_limit = npg * NBPG * 6 / 10; + /* + * Limit maximum memory for each type to 60% of malloc area size or + * 60% of physical memory, whichever is smaller. + */ + for (indx = 0; indx < M_LAST; indx++) { + kmemstats[indx].ks_limit = min(cnt.v_page_count * PAGE_SIZE, + (npg * PAGE_SIZE - nmbclusters * MCLBYTES + - nmbufs * MSIZE)) * 6 / 10; + } #endif } diff --git a/sys/kern/kern_mib.c b/sys/kern/kern_mib.c new file mode 100644 index 0000000..8105aa4 --- /dev/null +++ b/sys/kern/kern_mib.c @@ -0,0 +1,167 @@ +/*- + * Copyright (c) 1982, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * Mike Karels at Berkeley Software Design, Inc. + * + * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD + * project, to make these variables more userfriendly. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 + * $Id: kern_mib.c,v 1.7 1997/03/03 12:58:19 bde Exp $ + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/unistd.h> + +SYSCTL_NODE(, 0, sysctl, CTLFLAG_RW, 0, + "Sysctl internal magic"); +SYSCTL_NODE(, CTL_KERN, kern, CTLFLAG_RW, 0, + "High kernel, proc, limits &c"); +SYSCTL_NODE(, CTL_VM, vm, CTLFLAG_RW, 0, + "Virtual memory"); +SYSCTL_NODE(, CTL_VFS, vfs, CTLFLAG_RW, 0, + "File system"); +SYSCTL_NODE(, CTL_NET, net, CTLFLAG_RW, 0, + "Network, (see socket.h)"); +SYSCTL_NODE(, CTL_DEBUG, debug, CTLFLAG_RW, 0, + "Debugging"); +SYSCTL_NODE(, CTL_HW, hw, CTLFLAG_RW, 0, + "hardware"); +SYSCTL_NODE(, CTL_MACHDEP, machdep, CTLFLAG_RW, 0, + "machine dependent"); +SYSCTL_NODE(, CTL_USER, user, CTLFLAG_RW, 0, + "user-level"); + +SYSCTL_STRING(_kern, KERN_OSRELEASE, osrelease, CTLFLAG_RD, osrelease, 0, ""); + +SYSCTL_INT(_kern, KERN_OSREV, osrevision, CTLFLAG_RD, 0, BSD, ""); + +SYSCTL_STRING(_kern, KERN_VERSION, version, CTLFLAG_RD, version, 0, ""); + +SYSCTL_STRING(_kern, KERN_OSTYPE, ostype, CTLFLAG_RD, ostype, 0, ""); + +extern int osreldate; +SYSCTL_INT(_kern, KERN_OSRELDATE, osreldate, CTLFLAG_RD, &osreldate, 0, ""); + +SYSCTL_INT(_kern, KERN_MAXPROC, maxproc, CTLFLAG_RW, &maxproc, 0, ""); + +SYSCTL_INT(_kern, KERN_MAXPROCPERUID, maxprocperuid, + CTLFLAG_RW, &maxprocperuid, 0, ""); + +SYSCTL_INT(_kern, KERN_ARGMAX, argmax, CTLFLAG_RD, 0, ARG_MAX, ""); + +SYSCTL_INT(_kern, KERN_POSIX1, posix1version, CTLFLAG_RD, 0, _POSIX_VERSION, ""); + +SYSCTL_INT(_kern, KERN_NGROUPS, ngroups, CTLFLAG_RD, 0, NGROUPS_MAX, ""); + +SYSCTL_INT(_kern, KERN_JOB_CONTROL, job_control, CTLFLAG_RD, 0, 1, ""); + +#ifdef _POSIX_SAVED_IDS +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 1, ""); +#else +SYSCTL_INT(_kern, KERN_SAVED_IDS, saved_ids, CTLFLAG_RD, 0, 0, ""); +#endif + +char kernelname[MAXPATHLEN] = "/kernel"; /* XXX bloat */ + 
+SYSCTL_STRING(_kern, KERN_BOOTFILE, bootfile, + CTLFLAG_RW, kernelname, sizeof kernelname, ""); + +SYSCTL_INT(_hw, HW_NCPU, ncpu, CTLFLAG_RD, 0, 1, ""); + +SYSCTL_INT(_hw, HW_BYTEORDER, byteorder, CTLFLAG_RD, 0, BYTE_ORDER, ""); + +SYSCTL_INT(_hw, HW_PAGESIZE, pagesize, CTLFLAG_RD, 0, PAGE_SIZE, ""); + +char hostname[MAXHOSTNAMELEN]; + +SYSCTL_STRING(_kern, KERN_HOSTNAME, hostname, CTLFLAG_RW, + hostname, sizeof(hostname), ""); + +int securelevel = -1; + +static int +sysctl_kern_securelvl SYSCTL_HANDLER_ARGS +{ + int error, level; + + level = securelevel; + error = sysctl_handle_int(oidp, &level, 0, req); + if (error || !req->newptr) + return (error); + if (level < securelevel && req->p->p_pid != 1) + return (EPERM); + securelevel = level; + return (error); +} + +SYSCTL_PROC(_kern, KERN_SECURELVL, securelevel, CTLTYPE_INT|CTLFLAG_RW, + 0, 0, sysctl_kern_securelvl, "I", ""); + +char domainname[MAXHOSTNAMELEN]; +SYSCTL_STRING(_kern, KERN_NISDOMAINNAME, domainname, CTLFLAG_RW, + &domainname, sizeof(domainname), ""); + +long hostid; +/* Some trouble here, if sizeof (int) != sizeof (long) */ +SYSCTL_INT(_kern, KERN_HOSTID, hostid, CTLFLAG_RW, &hostid, 0, ""); + +/* + * This is really cheating. These actually live in the libc, something + * which I'm not quite sure is a good idea anyway, but in order for + * getnext and friends to actually work, we define dummies here. 
+ */ +SYSCTL_STRING(_user, USER_CS_PATH, cs_path, CTLFLAG_RD, "", 0, ""); +SYSCTL_INT(_user, USER_BC_BASE_MAX, bc_base_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_BC_DIM_MAX, bc_dim_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_BC_SCALE_MAX, bc_scale_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_BC_STRING_MAX, bc_string_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_COLL_WEIGHTS_MAX, coll_weights_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_EXPR_NEST_MAX, expr_nest_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_LINE_MAX, line_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_RE_DUP_MAX, re_dup_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_VERSION, posix2_version, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_C_BIND, posix2_c_bind, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_C_DEV, posix2_c_dev, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_CHAR_TERM, posix2_char_term, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_FORT_DEV, posix2_fort_dev, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_FORT_RUN, posix2_fort_run, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_LOCALEDEF, posix2_localedef, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_SW_DEV, posix2_sw_dev, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_POSIX2_UPE, posix2_upe, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_STREAM_MAX, stream_max, CTLFLAG_RD, 0, 0, ""); +SYSCTL_INT(_user, USER_TZNAME_MAX, tzname_max, CTLFLAG_RD, 0, 0, ""); diff --git a/sys/kern/kern_ntptime.c b/sys/kern/kern_ntptime.c new file mode 100644 index 0000000..88ba077b --- /dev/null +++ b/sys/kern/kern_ntptime.c @@ -0,0 +1,269 @@ +/****************************************************************************** + * * + * Copyright (c) David L. 
Mills 1993, 1994 * + * * + * Permission to use, copy, modify, and distribute this software and its * + * documentation for any purpose and without fee is hereby granted, provided * + * that the above copyright notice appears in all copies and that both the * + * copyright notice and this permission notice appear in supporting * + * documentation, and that the name University of Delaware not be used in * + * advertising or publicity pertaining to distribution of the software * + * without specific, written prior permission. The University of Delaware * + * makes no representations about the suitability this software for any * + * purpose. It is provided "as is" without express or implied warranty. * + * * + ******************************************************************************/ + +/* + * Modification history kern_ntptime.c + * + * 24 Sep 94 David L. Mills + * Tightened code at exits. + * + * 24 Mar 94 David L. Mills + * Revised syscall interface to include new variables for PPS + * time discipline. + * + * 14 Feb 94 David L. Mills + * Added code for external clock + * + * 28 Nov 93 David L. Mills + * Revised frequency scaling to conform with adjusted parameters + * + * 17 Sep 93 David L. Mills + * Created file + */ +/* + * ntp_gettime(), ntp_adjtime() - precision time interface for SunOS + * V4.1.1 and V4.1.3 + * + * These routines consitute the Network Time Protocol (NTP) interfaces + * for user and daemon application programs. The ntp_gettime() routine + * provides the time, maximum error (synch distance) and estimated error + * (dispersion) to client user application programs. The ntp_adjtime() + * routine is used by the NTP daemon to adjust the system clock to an + * externally derived time. The time offset and related variables set by + * this routine are used by hardclock() to adjust the phase and + * frequency of the phase-lock loop which controls the system clock. 
+ */ +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/timex.h> +#include <sys/sysctl.h> + +/* + * The following variables are used by the hardclock() routine in the + * kern_clock.c module and are described in that module. + */ +extern int time_state; /* clock state */ +extern int time_status; /* clock status bits */ +extern long time_offset; /* time adjustment (us) */ +extern long time_freq; /* frequency offset (scaled ppm) */ +extern long time_maxerror; /* maximum error (us) */ +extern long time_esterror; /* estimated error (us) */ +extern long time_constant; /* pll time constant */ +extern long time_precision; /* clock precision (us) */ +extern long time_tolerance; /* frequency tolerance (scaled ppm) */ + +#ifdef PPS_SYNC +/* + * The following variables are used only if the PPS signal discipline + * is configured in the kernel. + */ +extern int pps_shift; /* interval duration (s) (shift) */ +extern long pps_freq; /* pps frequency offset (scaled ppm) */ +extern long pps_jitter; /* pps jitter (us) */ +extern long pps_stabil; /* pps stability (scaled ppm) */ +extern long pps_jitcnt; /* jitter limit exceeded */ +extern long pps_calcnt; /* calibration intervals */ +extern long pps_errcnt; /* calibration errors */ +extern long pps_stbcnt; /* stability limit exceeded */ +#endif /* PPS_SYNC */ + +static int +ntp_sysctl SYSCTL_HANDLER_ARGS +{ + struct timeval atv; + struct ntptimeval ntv; + int s; + + s = splclock(); +#ifdef EXT_CLOCK + /* + * The microtime() external clock routine returns a + * status code. If less than zero, we declare an error + * in the clock status word and return the kernel + * (software) time variable. While there are other + * places that call microtime(), this is the only place + * that matters from an application point of view. 
+ */ + if (microtime(&atv) < 0) { + time_status |= STA_CLOCKERR; + ntv.time = time; + } else { + time_status &= ~STA_CLOCKERR; + } +#else /* EXT_CLOCK */ + microtime(&atv); +#endif /* EXT_CLOCK */ + ntv.time = atv; + ntv.maxerror = time_maxerror; + ntv.esterror = time_esterror; + splx(s); + + ntv.time_state = time_state; + + /* + * Status word error decode. If any of these conditions + * occur, an error is returned, instead of the status + * word. Most applications will care only about the fact + * the system clock may not be trusted, not about the + * details. + * + * Hardware or software error + */ + if (time_status & (STA_UNSYNC | STA_CLOCKERR)) { + ntv.time_state = TIME_ERROR; + } + + /* + * PPS signal lost when either time or frequency + * synchronization requested + */ + if (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) { + ntv.time_state = TIME_ERROR; + } + + /* + * PPS jitter exceeded when time synchronization + * requested + */ + if (time_status & STA_PPSTIME && + time_status & STA_PPSJITTER) { + ntv.time_state = TIME_ERROR; + } + + /* + * PPS wander exceeded or calibration error when + * frequency synchronization requested + */ + if (time_status & STA_PPSFREQ && + time_status & (STA_PPSWANDER | STA_PPSERROR)) { + ntv.time_state = TIME_ERROR; + } + return (sysctl_handle_opaque(oidp, &ntv, sizeof ntv, req)); +} + +SYSCTL_NODE(_kern, KERN_NTP_PLL, ntp_pll, CTLFLAG_RW, 0, + "NTP kernel PLL related stuff"); +SYSCTL_PROC(_kern_ntp_pll, NTP_PLL_GETTIME, gettime, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, sizeof(struct ntptimeval) , ntp_sysctl, "S,ntptimeval", ""); + +/* + * ntp_adjtime() - NTP daemon application interface + */ +#ifndef _SYS_SYSPROTO_H_ +struct ntp_adjtime_args { + struct timex *tp; +}; +#endif + +int +ntp_adjtime(struct proc *p, struct ntp_adjtime_args *uap, int *retval) +{ + struct timex ntv; + int modes; + int s; + int error; + + error = copyin((caddr_t)uap->tp, (caddr_t)&ntv, sizeof(ntv)); + if (error) + return error; + 
+ /* + * Update selected clock variables - only the superuser can + * change anything. Note that there is no error checking here on + * the assumption the superuser should know what it is doing. + */ + modes = ntv.modes; + if ((modes != 0) + && (error = suser(p->p_cred->pc_ucred, &p->p_acflag))) + return error; + + s = splclock(); + if (modes & MOD_FREQUENCY) +#ifdef PPS_SYNC + time_freq = ntv.freq - pps_freq; +#else /* PPS_SYNC */ + time_freq = ntv.freq; +#endif /* PPS_SYNC */ + if (modes & MOD_MAXERROR) + time_maxerror = ntv.maxerror; + if (modes & MOD_ESTERROR) + time_esterror = ntv.esterror; + if (modes & MOD_STATUS) { + time_status &= STA_RONLY; + time_status |= ntv.status & ~STA_RONLY; + } + if (modes & MOD_TIMECONST) + time_constant = ntv.constant; + if (modes & MOD_OFFSET) + hardupdate(ntv.offset); + + /* + * Retrieve all clock variables + */ + if (time_offset < 0) + ntv.offset = -(-time_offset >> SHIFT_UPDATE); + else + ntv.offset = time_offset >> SHIFT_UPDATE; +#ifdef PPS_SYNC + ntv.freq = time_freq + pps_freq; +#else /* PPS_SYNC */ + ntv.freq = time_freq; +#endif /* PPS_SYNC */ + ntv.maxerror = time_maxerror; + ntv.esterror = time_esterror; + ntv.status = time_status; + ntv.constant = time_constant; + ntv.precision = time_precision; + ntv.tolerance = time_tolerance; +#ifdef PPS_SYNC + ntv.shift = pps_shift; + ntv.ppsfreq = pps_freq; + ntv.jitter = pps_jitter >> PPS_AVG; + ntv.stabil = pps_stabil; + ntv.calcnt = pps_calcnt; + ntv.errcnt = pps_errcnt; + ntv.jitcnt = pps_jitcnt; + ntv.stbcnt = pps_stbcnt; +#endif /* PPS_SYNC */ + (void)splx(s); + + error = copyout((caddr_t)&ntv, (caddr_t)uap->tp, sizeof(ntv)); + if (!error) { + /* + * Status word error decode. See comments in + * ntp_gettime() routine. 
+ */ + retval[0] = time_state; + if (time_status & (STA_UNSYNC | STA_CLOCKERR)) + retval[0] = TIME_ERROR; + if (time_status & (STA_PPSFREQ | STA_PPSTIME) && + !(time_status & STA_PPSSIGNAL)) + retval[0] = TIME_ERROR; + if (time_status & STA_PPSTIME && + time_status & STA_PPSJITTER) + retval[0] = TIME_ERROR; + if (time_status & STA_PPSFREQ && + time_status & (STA_PPSWANDER | STA_PPSERROR)) + retval[0] = TIME_ERROR; + } + return error; +} + + diff --git a/sys/kern/kern_opt.c b/sys/kern/kern_opt.c new file mode 100644 index 0000000..08b04b2 --- /dev/null +++ b/sys/kern/kern_opt.c @@ -0,0 +1,49 @@ +/*- + * Copyright (c) 1997 Bruce D. Evans + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $Id$ + */ + +#include "opt_defunct.h" + +#ifdef ARP_PROXYALL +#warning "obsolete option ARP_PROXYALL - use `sysctl -w net.link.ether.inet.proxyall=1'" +#endif + +#ifdef CHILD_MAX +#warning "obsolete option CHILD_MAX - use /etc/login.conf" +#endif + +#ifdef EXTRAVNODES +#warning "obsolete option EXTRAVNODES - use `sysctl -w kern.maxvnodes=value'" +#endif + +#ifdef GATEWAY +#warning "obsolete option GATEWAY - use `sysctl -w net.inet.ip.forwarding=1'" +#endif + +#ifdef OPEN_MAX +#warning "obsolete option OPEN_MAX - use /etc/login.conf" +#endif diff --git a/sys/kern/kern_physio.c b/sys/kern/kern_physio.c index 1eaae35..42d1d21 100644 --- a/sys/kern/kern_physio.c +++ b/sys/kern/kern_physio.c @@ -1,41 +1,22 @@ -/*- - * Copyright (c) 1982, 1986, 1990, 1993 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. - * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. +/* + * Copyright (c) 1994 John S. Dyson + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. 
All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. 
* - * from: @(#)kern_physio.c 8.1 (Berkeley) 6/10/93 + * $Id$ */ #include <sys/param.h> @@ -43,51 +24,176 @@ #include <sys/buf.h> #include <sys/conf.h> #include <sys/proc.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_extern.h> + +static void physwakeup __P((struct buf *bp)); -physio(a1, a2, a3, a4, a5, a6) - int (*a1)(); - struct buf *a2; - dev_t a3; - int a4; - u_int (*a5)(); - struct uio *a6; +int +physio(strategy, bp, dev, rw, minp, uio) + d_strategy_t *strategy; + struct buf *bp; + dev_t dev; + int rw; + u_int (*minp) __P((struct buf *bp)); + struct uio *uio; { + int i; + int bufflags = rw?B_READ:0; + int error; + int spl; + caddr_t sa; + int bp_alloc = (bp == 0); + struct buf *bpa; + +/* + * keep the process from being swapped + */ + curproc->p_flag |= P_PHYSIO; + + /* create and build a buffer header for a transfer */ + bpa = (struct buf *)getpbuf(); + if (!bp_alloc) { + spl = splbio(); + while (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + tsleep((caddr_t)bp, PRIBIO, "physbw", 0); + } + bp->b_flags |= B_BUSY; + splx(spl); + } else { + bp = bpa; + } /* - * Body deleted. + * get a copy of the kva from the physical buffer */ - return (EIO); + sa = bpa->b_data; + bp->b_proc = curproc; + bp->b_dev = dev; + error = bp->b_error = 0; + + for(i=0;i<uio->uio_iovcnt;i++) { + while( uio->uio_iov[i].iov_len) { + + bp->b_bcount = uio->uio_iov[i].iov_len; + bp->b_flags = B_BUSY | B_PHYS | B_CALL | bufflags; + bp->b_iodone = physwakeup; + bp->b_data = uio->uio_iov[i].iov_base; + bp->b_bcount = minp( bp); + if( minp != minphys) + bp->b_bcount = minphys( bp); + bp->b_bufsize = bp->b_bcount; + /* + * pass in the kva from the physical buffer + * for the temporary kernel mapping. 
+ */ + bp->b_saveaddr = sa; + bp->b_blkno = btodb(uio->uio_offset); + + + if (uio->uio_segflg == UIO_USERSPACE) { + if (rw && !useracc(bp->b_data, bp->b_bufsize, B_WRITE)) { + error = EFAULT; + goto doerror; + } + if (!rw && !useracc(bp->b_data, bp->b_bufsize, B_READ)) { + error = EFAULT; + goto doerror; + } + + /* bring buffer into kernel space */ + vmapbuf(bp); + } + + /* perform transfer */ + (*strategy)(bp); + + spl = splbio(); + while ((bp->b_flags & B_DONE) == 0) + tsleep((caddr_t)bp, PRIBIO, "physstr", 0); + splx(spl); + + /* release mapping into kernel space */ + if (uio->uio_segflg == UIO_USERSPACE) + vunmapbuf(bp); + + /* + * update the uio data + */ + { + int iolen = bp->b_bcount - bp->b_resid; + + if (iolen == 0 && !(bp->b_flags & B_ERROR)) + goto doerror; /* EOF */ + uio->uio_iov[i].iov_len -= iolen; + uio->uio_iov[i].iov_base += iolen; + uio->uio_resid -= iolen; + uio->uio_offset += iolen; + } + + /* + * check for an error + */ + if( bp->b_flags & B_ERROR) { + error = bp->b_error; + goto doerror; + } + } + } + + +doerror: + relpbuf(bpa); + if (!bp_alloc) { + bp->b_flags &= ~(B_BUSY|B_PHYS); + if( bp->b_flags & B_WANTED) { + bp->b_flags &= ~B_WANTED; + wakeup((caddr_t)bp); + } + } +/* + * allow the process to be swapped + */ + curproc->p_flag &= ~P_PHYSIO; + + return (error); } u_int -minphys(a1) - struct buf *a1; +minphys(struct buf *bp) { + u_int maxphys = MAXPHYS; - /* - * Body deleted. - */ - return (0); + if( ((vm_offset_t) bp->b_data) & PAGE_MASK) { + maxphys = MAXPHYS - PAGE_SIZE; + } + + if( bp->b_bcount > maxphys) { + bp->b_bcount = maxphys; + } + return bp->b_bcount; } -/* - * Do a read on a device for a user process. 
- */ -rawread(dev, uio) - dev_t dev; - struct uio *uio; +int +rawread(dev_t dev, struct uio *uio, int ioflag) { - return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL, - dev, B_READ, minphys, uio)); + return (physio(cdevsw[major(dev)]->d_strategy, (struct buf *)NULL, + dev, 1, minphys, uio)); } -/* - * Do a write on a device for a user process. - */ -rawwrite(dev, uio) - dev_t dev; - struct uio *uio; +int +rawwrite(dev_t dev, struct uio *uio, int ioflag) +{ + return (physio(cdevsw[major(dev)]->d_strategy, (struct buf *)NULL, + dev, 0, minphys, uio)); +} + +static void +physwakeup(bp) + struct buf *bp; { - return (physio(cdevsw[major(dev)].d_strategy, (struct buf *)NULL, - dev, B_WRITE, minphys, uio)); + wakeup((caddr_t) bp); + bp->b_flags &= ~B_CALL; } diff --git a/sys/kern/kern_proc.c b/sys/kern/kern_proc.c index 6701793..cecf89f 100644 --- a/sys/kern/kern_proc.c +++ b/sys/kern/kern_proc.c @@ -31,12 +31,13 @@ * SUCH DAMAGE. * * @(#)kern_proc.c 8.7 (Berkeley) 2/14/95 + * $Id: kern_proc.c,v 1.25 1997/02/22 09:39:08 peter Exp $ */ #include <sys/param.h> #include <sys/systm.h> -#include <sys/map.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/proc.h> #include <sys/buf.h> #include <sys/acct.h> @@ -46,8 +47,21 @@ #include <sys/uio.h> #include <sys/malloc.h> #include <sys/mbuf.h> -#include <sys/ioctl.h> #include <sys/tty.h> +#include <sys/signalvar.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/user.h> + +struct prochd qs[NQS]; /* as good a place as any... */ +struct prochd rtqs[NQS]; /* Space for REALTIME queues too */ +struct prochd idqs[NQS]; /* Space for IDLE queues too */ + +static void pgdelete __P((struct pgrp *)); /* * Structure associated with user cacheing. 
@@ -59,7 +73,9 @@ struct uidinfo { }; #define UIHASH(uid) (&uihashtbl[(uid) & uihash]) LIST_HEAD(uihashhead, uidinfo) *uihashtbl; -u_long uihash; /* size of hash table - 1 */ +static u_long uihash; /* size of hash table - 1 */ + +static void orphanpg __P((struct pgrp *pg)); /* * Other process lists @@ -126,6 +142,7 @@ chgproccnt(uid, diff) /* * Is p an inferior of the current process? */ +int inferior(p) register struct proc *p; { @@ -263,12 +280,12 @@ leavepgrp(p) /* * delete a process group */ -void +static void pgdelete(pgrp) register struct pgrp *pgrp; { - if (pgrp->pg_session->s_ttyp != NULL && + if (pgrp->pg_session->s_ttyp != NULL && pgrp->pg_session->s_ttyp->t_pgrp == pgrp) pgrp->pg_session->s_ttyp->t_pgrp = NULL; LIST_REMOVE(pgrp, pg_hash); @@ -277,8 +294,6 @@ pgdelete(pgrp) FREE(pgrp, M_PGRP); } -static void orphanpg(); - /* * Adjust pgrp jobc counters when specified process changes process group. * We count the number of processes in each process group that "qualify" @@ -324,7 +339,7 @@ fixjobc(p, pgrp, entering) orphanpg(hispgrp); } -/* +/* * A process group has become orphaned; * if there are any stopped processes in the group, * hang-up all process in that group. @@ -347,8 +362,11 @@ orphanpg(pg) } } -#ifdef DEBUG -pgrpdump() +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(pgrpdump, pgrpdump) { register struct pgrp *pgrp; register struct proc *p; @@ -371,4 +389,204 @@ pgrpdump() } } } -#endif /* DEBUG */ +#endif /* DDB */ + +/* + * Fill in an eproc structure for the specified process. 
+ */ +void +fill_eproc(p, ep) + register struct proc *p; + register struct eproc *ep; +{ + register struct tty *tp; + + bzero(ep, sizeof(*ep)); + + ep->e_paddr = p; + if (p->p_cred) { + ep->e_pcred = *p->p_cred; + if (p->p_ucred) + ep->e_ucred = *p->p_ucred; + } + if (p->p_stat != SIDL && p->p_stat != SZOMB && p->p_vmspace != NULL) { + register struct vmspace *vm = p->p_vmspace; + +#ifdef pmap_resident_count + ep->e_vm.vm_rssize = pmap_resident_count(&vm->vm_pmap); /*XXX*/ +#else + ep->e_vm.vm_rssize = vm->vm_rssize; +#endif + ep->e_vm.vm_tsize = vm->vm_tsize; + ep->e_vm.vm_dsize = vm->vm_dsize; + ep->e_vm.vm_ssize = vm->vm_ssize; +#ifndef sparc + ep->e_vm.vm_pmap = vm->vm_pmap; +#endif + } + if (p->p_pptr) + ep->e_ppid = p->p_pptr->p_pid; + if (p->p_pgrp) { + ep->e_pgid = p->p_pgrp->pg_id; + ep->e_jobc = p->p_pgrp->pg_jobc; + ep->e_sess = p->p_pgrp->pg_session; + + if (ep->e_sess) { + bcopy(ep->e_sess->s_login, ep->e_login, sizeof(ep->e_login)); + if (ep->e_sess->s_ttyvp) + ep->e_flag = EPROC_CTTY; + if (p->p_session && SESS_LEADER(p)) + ep->e_flag |= EPROC_SLEADER; + } + } + if ((p->p_flag & P_CONTROLT) && + (ep->e_sess != NULL) && + ((tp = ep->e_sess->s_ttyp) != NULL)) { + ep->e_tdev = tp->t_dev; + ep->e_tpgid = tp->t_pgrp ? 
tp->t_pgrp->pg_id : NO_PID; + ep->e_tsess = tp->t_session; + } else + ep->e_tdev = NODEV; + if (p->p_wmesg) { + strncpy(ep->e_wmesg, p->p_wmesg, WMESGLEN); + ep->e_wmesg[WMESGLEN] = 0; + } +} + +static struct proc * +zpfind(pid_t pid) +{ + struct proc *p; + + for (p = zombproc.lh_first; p != 0; p = p->p_list.le_next) + if (p->p_pid == pid) + return (p); + return (NULL); +} + + +static int +sysctl_out_proc(struct proc *p, struct sysctl_req *req, int doingzomb) +{ + struct eproc eproc; + int error; + pid_t pid = p->p_pid; + + fill_eproc(p, &eproc); + error = SYSCTL_OUT(req,(caddr_t)p, sizeof(struct proc)); + if (error) + return (error); + error = SYSCTL_OUT(req,(caddr_t)&eproc, sizeof(eproc)); + if (error) + return (error); + if (!doingzomb && pid && (pfind(pid) != p)) + return EAGAIN; + if (doingzomb && zpfind(pid) != p) + return EAGAIN; + return (0); +} + +static int +sysctl_kern_proc SYSCTL_HANDLER_ARGS +{ + int *name = (int*) arg1; + u_int namelen = arg2; + struct proc *p; + int doingzomb; + int error = 0; + + if (oidp->oid_number == KERN_PROC_PID) { + if (namelen != 1) + return (EINVAL); + p = pfind((pid_t)name[0]); + if (!p) + return (0); + error = sysctl_out_proc(p, req, 0); + return (error); + } + if (oidp->oid_number == KERN_PROC_ALL && !namelen) + ; + else if (oidp->oid_number != KERN_PROC_ALL && namelen == 1) + ; + else + return (EINVAL); + + if (!req->oldptr) { + /* overestimate by 5 procs */ + error = SYSCTL_OUT(req, 0, sizeof (struct kinfo_proc) * 5); + if (error) + return (error); + } + for (doingzomb=0 ; doingzomb < 2 ; doingzomb++) { + if (!doingzomb) + p = allproc.lh_first; + else + p = zombproc.lh_first; + for (; p != 0; p = p->p_list.le_next) { + /* + * Skip embryonic processes. + */ + if (p->p_stat == SIDL) + continue; + /* + * TODO - make more efficient (see notes below). + * do by session. 
+ */ + switch (oidp->oid_number) { + + case KERN_PROC_PGRP: + /* could do this by traversing pgrp */ + if (p->p_pgrp == NULL || + p->p_pgrp->pg_id != (pid_t)name[0]) + continue; + break; + + case KERN_PROC_TTY: + if ((p->p_flag & P_CONTROLT) == 0 || + p->p_session == NULL || + p->p_session->s_ttyp == NULL || + p->p_session->s_ttyp->t_dev != (dev_t)name[0]) + continue; + break; + + case KERN_PROC_UID: + if (p->p_ucred == NULL || + p->p_ucred->cr_uid != (uid_t)name[0]) + continue; + break; + + case KERN_PROC_RUID: + if (p->p_ucred == NULL || + p->p_cred->p_ruid != (uid_t)name[0]) + continue; + break; + } + + error = sysctl_out_proc(p, req, doingzomb); + if (error) + return (error); + } + } + return (0); +} + + +SYSCTL_NODE(_kern, KERN_PROC, proc, CTLFLAG_RD, 0, "Process table"); + +SYSCTL_PROC(_kern_proc, KERN_PROC_ALL, all, CTLFLAG_RD|CTLTYPE_STRUCT, + 0, 0, sysctl_kern_proc, "S,proc", ""); + +SYSCTL_NODE(_kern_proc, KERN_PROC_PGRP, pgrp, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_TTY, tty, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_UID, uid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_RUID, ruid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); + +SYSCTL_NODE(_kern_proc, KERN_PROC_PID, pid, CTLFLAG_RD, + sysctl_kern_proc, "Process table"); diff --git a/sys/kern/kern_prot.c b/sys/kern/kern_prot.c index 29e4c67..5c2ec5b 100644 --- a/sys/kern/kern_prot.c +++ b/sys/kern/kern_prot.c @@ -35,7 +35,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)kern_prot.c 8.9 (Berkeley) 2/14/95 + * @(#)kern_prot.c 8.6 (Berkeley) 1/21/94 + * $Id: kern_prot.c,v 1.25 1997/03/03 22:46:16 ache Exp $ */ /* @@ -45,21 +46,26 @@ #include <sys/param.h> #include <sys/acct.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/ucred.h> #include <sys/proc.h> #include <sys/timeb.h> #include <sys/times.h> #include <sys/malloc.h> +#include <sys/unistd.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> +#ifndef _SYS_SYSPROTO_H_ +struct getpid_args { + int dummy; +}; +#endif /* ARGSUSED */ int getpid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getpid_args *uap; + int *retval; { *retval = p->p_pid; @@ -69,12 +75,17 @@ getpid(p, uap, retval) return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct getppid_args { + int dummy; +}; +#endif /* ARGSUSED */ int getppid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getppid_args *uap; + int *retval; { *retval = p->p_pptr->p_pid; @@ -82,23 +93,35 @@ getppid(p, uap, retval) } /* Get process group ID; note that POSIX getpgrp takes no parameter */ +#ifndef _SYS_SYSPROTO_H_ +struct getpgrp_args { + int dummy; +}; +#endif + int getpgrp(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getpgrp_args *uap; + int *retval; { *retval = p->p_pgrp->pg_id; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct getuid_args { + int dummy; +}; +#endif + /* ARGSUSED */ int getuid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getuid_args *uap; + int *retval; { *retval = p->p_cred->p_ruid; @@ -108,24 +131,36 @@ getuid(p, uap, retval) return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct geteuid_args { + int dummy; +}; +#endif + /* ARGSUSED */ int geteuid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct geteuid_args *uap; + int *retval; { *retval = p->p_ucred->cr_uid; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct getgid_args { + int dummy; +}; +#endif + /* ARGSUSED */ 
int getgid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getgid_args *uap; + int *retval; { *retval = p->p_cred->p_rgid; @@ -140,51 +175,66 @@ getgid(p, uap, retval) * via getgroups. This syscall exists because it is somewhat painful to do * correctly in a library function. */ +#ifndef _SYS_SYSPROTO_H_ +struct getegid_args { + int dummy; +}; +#endif + /* ARGSUSED */ int getegid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct getegid_args *uap; + int *retval; { *retval = p->p_ucred->cr_groups[0]; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct getgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +#endif int getgroups(p, uap, retval) struct proc *p; - register struct getgroups_args /* { - syscallarg(u_int) gidsetsize; - syscallarg(gid_t *) gidset; - } */ *uap; - register_t *retval; + register struct getgroups_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; register u_int ngrp; int error; - if ((ngrp = SCARG(uap, gidsetsize)) == 0) { + if ((ngrp = uap->gidsetsize) == 0) { *retval = pc->pc_ucred->cr_ngroups; return (0); } if (ngrp < pc->pc_ucred->cr_ngroups) return (EINVAL); ngrp = pc->pc_ucred->cr_ngroups; - if (error = copyout((caddr_t)pc->pc_ucred->cr_groups, - (caddr_t)SCARG(uap, gidset), ngrp * sizeof(gid_t))) + if ((error = copyout((caddr_t)pc->pc_ucred->cr_groups, + (caddr_t)uap->gidset, ngrp * sizeof(gid_t)))) return (error); *retval = ngrp; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct setsid_args { + int dummy; +}; +#endif + /* ARGSUSED */ int setsid(p, uap, retval) register struct proc *p; - void *uap; - register_t *retval; + struct setsid_args *uap; + int *retval; { if (p->p_pgid == p->p_pid || pgfind(p->p_pid)) { @@ -209,23 +259,28 @@ setsid(p, uap, retval) * there must exist some pid in same session having pgid (EPERM) * pid must not be session leader (EPERM) */ +#ifndef _SYS_SYSPROTO_H_ +struct setpgid_args { + int pid; /* target process id */ + int pgid; /* target pgrp 
id */ +}; +#endif /* ARGSUSED */ int setpgid(curp, uap, retval) struct proc *curp; - register struct setpgid_args /* { - syscallarg(int) pid; - syscallarg(int) pgid; - } */ *uap; - register_t *retval; + register struct setpgid_args *uap; + int *retval; { register struct proc *targp; /* target process */ register struct pgrp *pgrp; /* target pgrp */ - if (SCARG(uap, pid) != 0 && SCARG(uap, pid) != curp->p_pid) { - if ((targp = pfind(SCARG(uap, pid))) == 0 || !inferior(targp)) + if (uap->pgid < 0) + return (EINVAL); + if (uap->pid != 0 && uap->pid != curp->p_pid) { + if ((targp = pfind(uap->pid)) == 0 || !inferior(targp)) return (ESRCH); - if (targp->p_session != curp->p_session) + if (targp->p_pgrp == NULL || targp->p_session != curp->p_session) return (EPERM); if (targp->p_flag & P_EXEC) return (EACCES); @@ -233,30 +288,36 @@ setpgid(curp, uap, retval) targp = curp; if (SESS_LEADER(targp)) return (EPERM); - if (SCARG(uap, pgid) == 0) - SCARG(uap, pgid) = targp->p_pid; - else if (SCARG(uap, pgid) != targp->p_pid) - if ((pgrp = pgfind(SCARG(uap, pgid))) == 0 || + if (uap->pgid == 0) + uap->pgid = targp->p_pid; + else if (uap->pgid != targp->p_pid) + if ((pgrp = pgfind(uap->pgid)) == 0 || pgrp->pg_session != curp->p_session) return (EPERM); - return (enterpgrp(targp, SCARG(uap, pgid), 0)); + return (enterpgrp(targp, uap->pgid, 0)); } +#ifndef _SYS_SYSPROTO_H_ +struct setuid_args { + uid_t uid; +}; +#endif /* ARGSUSED */ int setuid(p, uap, retval) struct proc *p; - struct setuid_args /* { - syscallarg(uid_t) uid; - } */ *uap; - register_t *retval; + struct setuid_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; register uid_t uid; int error; - uid = SCARG(uap, uid); + uid = uap->uid; if (uid != pc->p_ruid && +#ifdef _POSIX_SAVED_IDS + uid != pc->p_svuid && +#endif (error = suser(pc->pc_ucred, &p->p_acflag))) return (error); /* @@ -264,30 +325,45 @@ setuid(p, uap, retval) * Transfer proc count to new user. 
* Copy credentials so other references do not see our changes. */ - (void)chgproccnt(pc->p_ruid, -1); - (void)chgproccnt(uid, 1); + if ( +#ifdef _POSIX_SAVED_IDS + pc->pc_ucred->cr_uid == 0 && +#endif + uid != pc->p_ruid) { + (void)chgproccnt(pc->p_ruid, -1); + (void)chgproccnt(uid, 1); + } pc->pc_ucred = crcopy(pc->pc_ucred); +#ifdef _POSIX_SAVED_IDS + if (pc->pc_ucred->cr_uid == 0) { +#endif + pc->p_ruid = uid; + pc->p_svuid = uid; +#ifdef _POSIX_SAVED_IDS + } +#endif pc->pc_ucred->cr_uid = uid; - pc->p_ruid = uid; - pc->p_svuid = uid; p->p_flag |= P_SUGID; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct seteuid_args { + uid_t euid; +}; +#endif /* ARGSUSED */ int seteuid(p, uap, retval) struct proc *p; - struct seteuid_args /* { - syscallarg(uid_t) euid; - } */ *uap; - register_t *retval; + struct seteuid_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; register uid_t euid; int error; - euid = SCARG(uap, euid); + euid = uap->euid; if (euid != pc->p_ruid && euid != pc->p_svuid && (error = suser(pc->pc_ucred, &p->p_acflag))) return (error); @@ -301,44 +377,60 @@ seteuid(p, uap, retval) return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct setgid_args { + gid_t gid; +}; +#endif /* ARGSUSED */ int setgid(p, uap, retval) struct proc *p; - struct setgid_args /* { - syscallarg(gid_t) gid; - } */ *uap; - register_t *retval; + struct setgid_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; register gid_t gid; int error; - gid = SCARG(uap, gid); - if (gid != pc->p_rgid && (error = suser(pc->pc_ucred, &p->p_acflag))) + gid = uap->gid; + if (gid != pc->p_rgid && +#ifdef _POSIX_SAVED_IDS + gid != pc->p_svgid && +#endif + (error = suser(pc->pc_ucred, &p->p_acflag))) return (error); pc->pc_ucred = crcopy(pc->pc_ucred); pc->pc_ucred->cr_groups[0] = gid; - pc->p_rgid = gid; - pc->p_svgid = gid; /* ??? 
*/ +#ifdef _POSIX_SAVED_IDS + if (pc->pc_ucred->cr_uid == 0) { +#endif + pc->p_rgid = gid; + pc->p_svgid = gid; +#ifdef _POSIX_SAVED_IDS + } +#endif p->p_flag |= P_SUGID; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct setegid_args { + gid_t egid; +}; +#endif /* ARGSUSED */ int setegid(p, uap, retval) struct proc *p; - struct setegid_args /* { - syscallarg(gid_t) egid; - } */ *uap; - register_t *retval; + struct setegid_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; register gid_t egid; int error; - egid = SCARG(uap, egid); + egid = uap->egid; if (egid != pc->p_rgid && egid != pc->p_svgid && (error = suser(pc->pc_ucred, &p->p_acflag))) return (error); @@ -348,113 +440,109 @@ setegid(p, uap, retval) return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct setgroups_args { + u_int gidsetsize; + gid_t *gidset; +}; +#endif /* ARGSUSED */ int setgroups(p, uap, retval) struct proc *p; - struct setgroups_args /* { - syscallarg(u_int) gidsetsize; - syscallarg(gid_t *) gidset; - } */ *uap; - register_t *retval; + struct setgroups_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; register u_int ngrp; int error; - if (error = suser(pc->pc_ucred, &p->p_acflag)) + if ((error = suser(pc->pc_ucred, &p->p_acflag))) return (error); - ngrp = SCARG(uap, gidsetsize); + ngrp = uap->gidsetsize; if (ngrp < 1 || ngrp > NGROUPS) return (EINVAL); pc->pc_ucred = crcopy(pc->pc_ucred); - if (error = copyin((caddr_t)SCARG(uap, gidset), - (caddr_t)pc->pc_ucred->cr_groups, ngrp * sizeof(gid_t))) + if ((error = copyin((caddr_t)uap->gidset, + (caddr_t)pc->pc_ucred->cr_groups, ngrp * sizeof(gid_t)))) return (error); pc->pc_ucred->cr_ngroups = ngrp; p->p_flag |= P_SUGID; return (0); } -#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct setreuid_args { + uid_t ruid; + uid_t euid; +}; +#endif /* ARGSUSED */ int -compat_43_setreuid(p, uap, retval) +setreuid(p, uap, retval) register struct proc *p; - struct compat_43_setreuid_args /* { - 
syscallarg(int) ruid; - syscallarg(int) euid; - } */ *uap; - register_t *retval; + struct setreuid_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; - union { - struct setuid_args sa; - struct seteuid_args ea; - } args; + register uid_t ruid, euid; + int error; - /* - * If ruid == euid then setreuid is being used to emulate setuid, - * just do it. - */ - if (SCARG(uap, ruid) != -1 && SCARG(uap, ruid) == SCARG(uap, euid)) { - SCARG(&args.sa, uid) = SCARG(uap, ruid); - return (setuid(p, &args.sa, retval)); + ruid = uap->ruid; + euid = uap->euid; + if ((ruid != (uid_t)-1 && ruid != pc->p_ruid && ruid != pc->p_svuid || + euid != (uid_t)-1 && euid != pc->p_ruid && euid != pc->p_svuid) && + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + + pc->pc_ucred = crcopy(pc->pc_ucred); + if (euid != (uid_t)-1) + pc->pc_ucred->cr_uid = euid; + if (ruid != (uid_t)-1 && ruid != pc->p_ruid) { + (void)chgproccnt(pc->p_ruid, -1); + (void)chgproccnt(ruid, 1); + pc->p_ruid = ruid; } - /* - * Otherwise we assume that the intent of setting ruid is to be - * able to get back ruid priviledge (i.e. swapping ruid and euid). - * So we make sure that we will be able to do so, but do not - * actually set the ruid. 
- */ - if (SCARG(uap, ruid) != (uid_t)-1 && SCARG(uap, ruid) != pc->p_ruid && - SCARG(uap, ruid) != pc->p_svuid) - return (EPERM); - if (SCARG(uap, euid) == (uid_t)-1) - return (0); - SCARG(&args.ea, euid) = SCARG(uap, euid); - return (seteuid(p, &args.ea, retval)); + if (ruid != (uid_t)-1 || pc->pc_ucred->cr_uid != pc->p_ruid) + pc->p_svuid = pc->pc_ucred->cr_uid; + p->p_flag |= P_SUGID; + return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct setregid_args { + gid_t rgid; + gid_t egid; +}; +#endif /* ARGSUSED */ int -compat_43_setregid(p, uap, retval) +setregid(p, uap, retval) register struct proc *p; - struct compat_43_setregid_args /* { - syscallarg(int) rgid; - syscallarg(int) egid; - } */ *uap; - register_t *retval; + struct setregid_args *uap; + int *retval; { register struct pcred *pc = p->p_cred; - union { - struct setgid_args sa; - struct setegid_args ea; - } args; + register gid_t rgid, egid; + int error; - /* - * If rgid == egid then setreuid is being used to emulate setgid, - * just do it. - */ - if (SCARG(uap, rgid) != -1 && SCARG(uap, rgid) == SCARG(uap, egid)) { - SCARG(&args.sa, gid) = SCARG(uap, rgid); - return (setgid(p, &args.sa, retval)); - } - /* - * Otherwise we assume that the intent of setting rgid is to be - * able to get back rgid priviledge (i.e. swapping rgid and egid). - * So we make sure that we will be able to do so, but do not - * actually set the rgid. 
- */ - if (SCARG(uap, rgid) != (gid_t)-1 && SCARG(uap, rgid) != pc->p_rgid && - SCARG(uap, rgid) != pc->p_svgid) - return (EPERM); - if (SCARG(uap, egid) == (gid_t)-1) - return (0); - SCARG(&args.ea, egid) = SCARG(uap, egid); - return (setegid(p, &args.ea, retval)); + rgid = uap->rgid; + egid = uap->egid; + if ((rgid != (gid_t)-1 && rgid != pc->p_rgid && rgid != pc->p_svgid || + egid != (gid_t)-1 && egid != pc->p_rgid && egid != pc->p_svgid) && + (error = suser(pc->pc_ucred, &p->p_acflag))) + return (error); + + pc->pc_ucred = crcopy(pc->pc_ucred); + if (egid != (gid_t)-1) + pc->pc_ucred->cr_groups[0] = egid; + if (rgid != (gid_t)-1) + pc->p_rgid = rgid; + if (rgid != (gid_t)-1 || pc->pc_ucred->cr_groups[0] != pc->p_rgid) + pc->p_svgid = pc->pc_ucred->cr_groups[0]; + p->p_flag |= P_SUGID; + return (0); } -#endif /* defined(COMPAT_43) || defined(COMPAT_SUNOS) */ /* * Check if gid is a member of the group set. @@ -559,43 +647,52 @@ crdup(cr) /* * Get login name, if available. */ +#ifndef _SYS_SYSPROTO_H_ +struct getlogin_args { + char *namebuf; + u_int namelen; +}; +#endif /* ARGSUSED */ int getlogin(p, uap, retval) struct proc *p; - struct getlogin_args /* { - syscallarg(char *) namebuf; - syscallarg(u_int) namelen; - } */ *uap; - register_t *retval; + struct getlogin_args *uap; + int *retval; { - if (SCARG(uap, namelen) > sizeof (p->p_pgrp->pg_session->s_login)) - SCARG(uap, namelen) = sizeof (p->p_pgrp->pg_session->s_login); + if (uap->namelen > MAXLOGNAME) + uap->namelen = MAXLOGNAME; return (copyout((caddr_t) p->p_pgrp->pg_session->s_login, - (caddr_t) SCARG(uap, namebuf), SCARG(uap, namelen))); + (caddr_t) uap->namebuf, uap->namelen)); } /* * Set login name. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct setlogin_args { + char *namebuf; +}; +#endif /* ARGSUSED */ int setlogin(p, uap, retval) struct proc *p; - struct setlogin_args /* { - syscallarg(char *) namebuf; - } */ *uap; - register_t *retval; + struct setlogin_args *uap; + int *retval; { int error; + char logintmp[MAXLOGNAME]; - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); - error = copyinstr((caddr_t) SCARG(uap, namebuf), - (caddr_t) p->p_pgrp->pg_session->s_login, - sizeof (p->p_pgrp->pg_session->s_login) - 1, (u_int *)0); + error = copyinstr((caddr_t) uap->namebuf, (caddr_t) logintmp, + sizeof(logintmp), (u_int *)0); if (error == ENAMETOOLONG) error = EINVAL; + else if (!error) + (void) memcpy(p->p_pgrp->pg_session->s_login, logintmp, + sizeof(logintmp)); return (error); } diff --git a/sys/kern/kern_random.c b/sys/kern/kern_random.c new file mode 100644 index 0000000..64c215c --- /dev/null +++ b/sys/kern/kern_random.c @@ -0,0 +1,515 @@ +/* + * random_machdep.c -- A strong random number generator + * + * $Id$ + * + * Version 0.95, last modified 18-Oct-95 + * + * Copyright Theodore Ts'o, 1994, 1995. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, and the entire permission notice in its entirety, + * including the disclaimer of warranties. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote + * products derived from this software without specific prior + * written permission. 
+ * + * ALTERNATIVELY, this product may be distributed under the terms of + * the GNU Public License, in which case the provisions of the GPL are + * required INSTEAD OF the above restrictions. (This clause is + * necessary due to a potential bad interaction between the GPL and + * the restrictions contained in a BSD-style copyright.) + * + * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, + * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES + * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED + * OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_cpu.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/select.h> +#include <sys/fcntl.h> + +#include <machine/clock.h> +#include <machine/random.h> + +#include <i386/isa/icu.h> +#ifdef PC98 +#include <pc98/pc98/pc98.h> +#else +#include <i386/isa/isa.h> +#endif +#include <i386/isa/timerreg.h> + +#define MAX_BLKDEV 4 + +/* + * The pool is stirred with a primitive polynomial of degree 128 + * over GF(2), namely x^128 + x^99 + x^59 + x^31 + x^9 + x^7 + 1. + * For a pool of size 64, try x^64+x^62+x^38+x^10+x^6+x+1. 
+ */ +#define POOLWORDS 128 /* Power of 2 - note that this is 32-bit words */ +#define POOLBITS (POOLWORDS*32) + +#if POOLWORDS == 128 +#define TAP1 99 /* The polynomial taps */ +#define TAP2 59 +#define TAP3 31 +#define TAP4 9 +#define TAP5 7 +#elif POOLWORDS == 64 +#define TAP1 62 /* The polynomial taps */ +#define TAP2 38 +#define TAP3 10 +#define TAP4 6 +#define TAP5 1 +#else +#error No primitive polynomial available for chosen POOLWORDS +#endif + +#define WRITEBUFFER 512 /* size in bytes */ + +/* There is actually only one of these, globally. */ +struct random_bucket { + u_int add_ptr; + u_int entropy_count; + int input_rotate; + u_int32_t *pool; + struct selinfo rsel; +}; + +/* There is one of these per entropy source */ +struct timer_rand_state { + u_long last_time; + int last_delta; + int nbits; +}; + +static struct random_bucket random_state; +static u_int32_t random_pool[POOLWORDS]; +static struct timer_rand_state keyboard_timer_state; +static struct timer_rand_state extract_timer_state; +static struct timer_rand_state irq_timer_state[ICU_LEN]; +#ifdef notyet +static struct timer_rand_state blkdev_timer_state[MAX_BLKDEV]; +#endif +static struct wait_queue *random_wait; + +inthand2_t *sec_intr_handler[ICU_LEN]; +int sec_intr_unit[ICU_LEN]; + +#ifndef MIN +#define MIN(a,b) (((a) < (b)) ? (a) : (b)) +#endif + +void +rand_initialize(void) +{ + random_state.add_ptr = 0; + random_state.entropy_count = 0; + random_state.pool = random_pool; + random_wait = NULL; + random_state.rsel.si_flags = 0; + random_state.rsel.si_pid = 0; +} + +/* + * This function adds an int into the entropy "pool". It does not + * update the entropy estimate. The caller must do this if appropriate. + * + * The pool is stirred with a primitive polynomial of degree 128 + * over GF(2), namely x^128 + x^99 + x^59 + x^31 + x^9 + x^7 + 1. + * For a pool of size 64, try x^64+x^62+x^38+x^10+x^6+x+1. 
+ * + * We rotate the input word by a changing number of bits, to help + * assure that all bits in the entropy get toggled. Otherwise, if we + * consistently feed the entropy pool small numbers (like ticks and + * scancodes, for example), the upper bits of the entropy pool don't + * get affected. --- TYT, 10/11/95 + */ +static inline void +add_entropy_word(struct random_bucket *r, const u_int32_t input) +{ + u_int i; + u_int32_t w; + + w = (input << r->input_rotate) | (input >> (32 - r->input_rotate)); + i = r->add_ptr = (r->add_ptr - 1) & (POOLWORDS-1); + if (i) + r->input_rotate = (r->input_rotate + 7) & 31; + else + /* + * At the beginning of the pool, add an extra 7 bits + * rotation, so that successive passes spread the + * input bits across the pool evenly. + */ + r->input_rotate = (r->input_rotate + 14) & 31; + + /* XOR in the various taps */ + w ^= r->pool[(i+TAP1)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP2)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP3)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP4)&(POOLWORDS-1)]; + w ^= r->pool[(i+TAP5)&(POOLWORDS-1)]; + w ^= r->pool[i]; + /* Rotate w left 1 bit (stolen from SHA) and store */ + r->pool[i] = (w << 1) | (w >> 31); +} + +/* + * This function adds entropy to the entropy "pool" by using timing + * delays. It uses the timer_rand_state structure to make an estimate + * of how any bits of entropy this call has added to the pool. + * + * The number "num" is also added to the pool - it should somehow describe + * the type of event which just happened. This is currently 0-255 for + * keyboard scan codes, and 256 upwards for interrupts. + * On the i386, this is assumed to be at most 16 bits, and the high bits + * are used for a high-resolution timer. 
+ */ +static void +add_timer_randomness(struct random_bucket *r, struct timer_rand_state *state, + u_int num) +{ + int delta, delta2; + u_int nbits; + u_int32_t time; + +#if defined(I586_CPU) || defined(I686_CPU) + if (i586_ctr_freq != 0) { + num ^= (u_int32_t) rdtsc() << 16; + r->entropy_count += 2; + } else { +#endif + disable_intr(); + outb(TIMER_MODE, TIMER_SEL0 | TIMER_LATCH); + num ^= inb(TIMER_CNTR0) << 16; + num ^= inb(TIMER_CNTR0) << 24; + enable_intr(); + r->entropy_count += 2; +#if defined(I586_CPU) || defined(I686_CPU) + } +#endif + + time = ticks; + + add_entropy_word(r, (u_int32_t) num); + add_entropy_word(r, time); + + /* + * Calculate number of bits of randomness we probably + * added. We take into account the first and second order + * deltas in order to make our estimate. + */ + delta = time - state->last_time; + state->last_time = time; + + delta2 = delta - state->last_delta; + state->last_delta = delta; + + if (delta < 0) delta = -delta; + if (delta2 < 0) delta2 = -delta2; + delta = MIN(delta, delta2) >> 1; + for (nbits = 0; delta; nbits++) + delta >>= 1; + + r->entropy_count += nbits; + + /* Prevent overflow */ + if (r->entropy_count > POOLBITS) + r->entropy_count = POOLBITS; + + if (r->entropy_count >= 8) + selwakeup(&random_state.rsel); +} + +void +add_keyboard_randomness(u_char scancode) +{ + add_timer_randomness(&random_state, &keyboard_timer_state, scancode); +} + +void +add_interrupt_randomness(int irq) +{ + (sec_intr_handler[irq])(sec_intr_unit[irq]); + add_timer_randomness(&random_state, &irq_timer_state[irq], irq); +} + +#ifdef notused +void +add_blkdev_randomness(int major) +{ + if (major >= MAX_BLKDEV) + return; + + add_timer_randomness(&random_state, &blkdev_timer_state[major], + 0x200+major); +} +#endif /* notused */ + +/* + * MD5 transform algorithm, taken from code written by Colin Plumb, + * and put into the public domain + * + * QUESTION: Replace this with SHA, which as generally received better + * reviews from the 
cryptographic community? + */ + +/* The four core functions - F1 is optimized somewhat */ + +/* #define F1(x, y, z) (x & y | ~x & z) */ +#define F1(x, y, z) (z ^ (x & (y ^ z))) +#define F2(x, y, z) F1(z, x, y) +#define F3(x, y, z) (x ^ y ^ z) +#define F4(x, y, z) (y ^ (x | ~z)) + +/* This is the central step in the MD5 algorithm. */ +#define MD5STEP(f, w, x, y, z, data, s) \ + ( w += f(x, y, z) + data, w = w<<s | w>>(32-s), w += x ) + +/* + * The core of the MD5 algorithm, this alters an existing MD5 hash to + * reflect the addition of 16 longwords of new data. MD5Update blocks + * the data and converts bytes into longwords for this routine. + */ +static void +MD5Transform(u_int32_t buf[4], + u_int32_t const in[16]) +{ + u_int32_t a, b, c, d; + + a = buf[0]; + b = buf[1]; + c = buf[2]; + d = buf[3]; + + MD5STEP(F1, a, b, c, d, in[ 0]+0xd76aa478, 7); + MD5STEP(F1, d, a, b, c, in[ 1]+0xe8c7b756, 12); + MD5STEP(F1, c, d, a, b, in[ 2]+0x242070db, 17); + MD5STEP(F1, b, c, d, a, in[ 3]+0xc1bdceee, 22); + MD5STEP(F1, a, b, c, d, in[ 4]+0xf57c0faf, 7); + MD5STEP(F1, d, a, b, c, in[ 5]+0x4787c62a, 12); + MD5STEP(F1, c, d, a, b, in[ 6]+0xa8304613, 17); + MD5STEP(F1, b, c, d, a, in[ 7]+0xfd469501, 22); + MD5STEP(F1, a, b, c, d, in[ 8]+0x698098d8, 7); + MD5STEP(F1, d, a, b, c, in[ 9]+0x8b44f7af, 12); + MD5STEP(F1, c, d, a, b, in[10]+0xffff5bb1, 17); + MD5STEP(F1, b, c, d, a, in[11]+0x895cd7be, 22); + MD5STEP(F1, a, b, c, d, in[12]+0x6b901122, 7); + MD5STEP(F1, d, a, b, c, in[13]+0xfd987193, 12); + MD5STEP(F1, c, d, a, b, in[14]+0xa679438e, 17); + MD5STEP(F1, b, c, d, a, in[15]+0x49b40821, 22); + + MD5STEP(F2, a, b, c, d, in[ 1]+0xf61e2562, 5); + MD5STEP(F2, d, a, b, c, in[ 6]+0xc040b340, 9); + MD5STEP(F2, c, d, a, b, in[11]+0x265e5a51, 14); + MD5STEP(F2, b, c, d, a, in[ 0]+0xe9b6c7aa, 20); + MD5STEP(F2, a, b, c, d, in[ 5]+0xd62f105d, 5); + MD5STEP(F2, d, a, b, c, in[10]+0x02441453, 9); + MD5STEP(F2, c, d, a, b, in[15]+0xd8a1e681, 14); + MD5STEP(F2, b, c, d, a, in[ 
4]+0xe7d3fbc8, 20); + MD5STEP(F2, a, b, c, d, in[ 9]+0x21e1cde6, 5); + MD5STEP(F2, d, a, b, c, in[14]+0xc33707d6, 9); + MD5STEP(F2, c, d, a, b, in[ 3]+0xf4d50d87, 14); + MD5STEP(F2, b, c, d, a, in[ 8]+0x455a14ed, 20); + MD5STEP(F2, a, b, c, d, in[13]+0xa9e3e905, 5); + MD5STEP(F2, d, a, b, c, in[ 2]+0xfcefa3f8, 9); + MD5STEP(F2, c, d, a, b, in[ 7]+0x676f02d9, 14); + MD5STEP(F2, b, c, d, a, in[12]+0x8d2a4c8a, 20); + + MD5STEP(F3, a, b, c, d, in[ 5]+0xfffa3942, 4); + MD5STEP(F3, d, a, b, c, in[ 8]+0x8771f681, 11); + MD5STEP(F3, c, d, a, b, in[11]+0x6d9d6122, 16); + MD5STEP(F3, b, c, d, a, in[14]+0xfde5380c, 23); + MD5STEP(F3, a, b, c, d, in[ 1]+0xa4beea44, 4); + MD5STEP(F3, d, a, b, c, in[ 4]+0x4bdecfa9, 11); + MD5STEP(F3, c, d, a, b, in[ 7]+0xf6bb4b60, 16); + MD5STEP(F3, b, c, d, a, in[10]+0xbebfbc70, 23); + MD5STEP(F3, a, b, c, d, in[13]+0x289b7ec6, 4); + MD5STEP(F3, d, a, b, c, in[ 0]+0xeaa127fa, 11); + MD5STEP(F3, c, d, a, b, in[ 3]+0xd4ef3085, 16); + MD5STEP(F3, b, c, d, a, in[ 6]+0x04881d05, 23); + MD5STEP(F3, a, b, c, d, in[ 9]+0xd9d4d039, 4); + MD5STEP(F3, d, a, b, c, in[12]+0xe6db99e5, 11); + MD5STEP(F3, c, d, a, b, in[15]+0x1fa27cf8, 16); + MD5STEP(F3, b, c, d, a, in[ 2]+0xc4ac5665, 23); + + MD5STEP(F4, a, b, c, d, in[ 0]+0xf4292244, 6); + MD5STEP(F4, d, a, b, c, in[ 7]+0x432aff97, 10); + MD5STEP(F4, c, d, a, b, in[14]+0xab9423a7, 15); + MD5STEP(F4, b, c, d, a, in[ 5]+0xfc93a039, 21); + MD5STEP(F4, a, b, c, d, in[12]+0x655b59c3, 6); + MD5STEP(F4, d, a, b, c, in[ 3]+0x8f0ccc92, 10); + MD5STEP(F4, c, d, a, b, in[10]+0xffeff47d, 15); + MD5STEP(F4, b, c, d, a, in[ 1]+0x85845dd1, 21); + MD5STEP(F4, a, b, c, d, in[ 8]+0x6fa87e4f, 6); + MD5STEP(F4, d, a, b, c, in[15]+0xfe2ce6e0, 10); + MD5STEP(F4, c, d, a, b, in[ 6]+0xa3014314, 15); + MD5STEP(F4, b, c, d, a, in[13]+0x4e0811a1, 21); + MD5STEP(F4, a, b, c, d, in[ 4]+0xf7537e82, 6); + MD5STEP(F4, d, a, b, c, in[11]+0xbd3af235, 10); + MD5STEP(F4, c, d, a, b, in[ 2]+0x2ad7d2bb, 15); + MD5STEP(F4, b, c, d, a, in[ 
9]+0xeb86d391, 21); + + buf[0] += a; + buf[1] += b; + buf[2] += c; + buf[3] += d; +} + +#undef F1 +#undef F2 +#undef F3 +#undef F4 +#undef MD5STEP + + +#if POOLWORDS % 16 +#error extract_entropy() assumes that POOLWORDS is a multiple of 16 words. +#endif +/* + * This function extracts randomness from the "entropy pool", and + * returns it in a buffer. This function computes how many remaining + * bits of entropy are left in the pool, but it does not restrict the + * number of bytes that are actually obtained. + */ +static inline int +extract_entropy(struct random_bucket *r, char *buf, int nbytes) +{ + int ret, i; + u_int32_t tmp[4]; + + add_timer_randomness(r, &extract_timer_state, nbytes); + + /* Redundant, but just in case... */ + if (r->entropy_count > POOLBITS) + r->entropy_count = POOLBITS; + /* Why is this here? Left in from Ted Ts'o. Perhaps to limit time. */ + if (nbytes > 32768) + nbytes = 32768; + + ret = nbytes; + if (r->entropy_count / 8 >= nbytes) + r->entropy_count -= nbytes*8; + else + r->entropy_count = 0; + + while (nbytes) { + /* Hash the pool to get the output */ + tmp[0] = 0x67452301; + tmp[1] = 0xefcdab89; + tmp[2] = 0x98badcfe; + tmp[3] = 0x10325476; + for (i = 0; i < POOLWORDS; i += 16) + MD5Transform(tmp, r->pool+i); + /* Modify pool so next hash will produce different results */ + add_entropy_word(r, tmp[0]); + add_entropy_word(r, tmp[1]); + add_entropy_word(r, tmp[2]); + add_entropy_word(r, tmp[3]); + /* + * Run the MD5 Transform one more time, since we want + * to add at least minimal obscuring of the inputs to + * add_entropy_word(). --- TYT + */ + MD5Transform(tmp, r->pool); + + /* Copy data to destination buffer */ + i = MIN(nbytes, 16); + bcopy(tmp, buf, i); + nbytes -= i; + buf += i; + } + + /* Wipe data from memory */ + bzero(tmp, sizeof(tmp)); + + return ret; +} + +#ifdef notused /* XXX NOT the exported kernel interface */ +/* + * This function is the exported kernel interface. 
It returns some + * number of good random numbers, suitable for seeding TCP sequence + * numbers, etc. + */ +void +get_random_bytes(void *buf, u_int nbytes) +{ + extract_entropy(&random_state, (char *) buf, nbytes); +} +#endif /* notused */ + +u_int +read_random(char *buf, u_int nbytes) +{ + if ((nbytes * 8) > random_state.entropy_count) + nbytes = random_state.entropy_count / 8; + + return extract_entropy(&random_state, buf, nbytes); +} + +u_int +read_random_unlimited(char *buf, u_int nbytes) +{ + return extract_entropy(&random_state, buf, nbytes); +} + +#ifdef notused +u_int +write_random(const char *buf, u_int nbytes) +{ + u_int i; + u_int32_t word, *p; + + for (i = nbytes, p = (u_int32_t *)buf; + i >= sizeof(u_int32_t); + i-= sizeof(u_int32_t), p++) + add_entropy_word(&random_state, *p); + if (i) { + word = 0; + bcopy(p, &word, i); + add_entropy_word(&random_state, word); + } + return nbytes; +} +#endif /* notused */ + +int +random_select(dev_t dev, int rw, struct proc *p) +{ + int s, ret; + + if (rw == FWRITE) + return 1; /* heh. */ + + s = splhigh(); + if (random_state.entropy_count >= 8) + ret = 1; + else { + selrecord(p, &random_state.rsel); + ret = 0; + } + splx(s); + + return ret; +} + diff --git a/sys/kern/kern_resource.c b/sys/kern/kern_resource.c index 569b9d9..fe50cf9 100644 --- a/sys/kern/kern_resource.c +++ b/sys/kern/kern_resource.c @@ -35,21 +35,27 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)kern_resource.c 8.8 (Berkeley) 2/14/95 + * @(#)kern_resource.c 8.5 (Berkeley) 1/21/94 + * $Id$ */ +#include "opt_rlimit.h" + #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/kernel.h> #include <sys/file.h> #include <sys/resourcevar.h> #include <sys/malloc.h> #include <sys/proc.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> - #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> int donice __P((struct proc *curp, struct proc *chgp, int n)); int dosetrlimit __P((struct proc *p, u_int which, struct rlimit *limp)); @@ -58,25 +64,28 @@ int dosetrlimit __P((struct proc *p, u_int which, struct rlimit *limp)); * Resource controls and accounting. */ +#ifndef _SYS_SYSPROTO_H_ +struct getpriority_args { + int which; + int who; +}; +#endif int getpriority(curp, uap, retval) struct proc *curp; - register struct getpriority_args /* { - syscallarg(int) which; - syscallarg(int) who; - } */ *uap; - register_t *retval; + register struct getpriority_args *uap; + int *retval; { register struct proc *p; register int low = PRIO_MAX + 1; - switch (SCARG(uap, which)) { + switch (uap->which) { case PRIO_PROCESS: - if (SCARG(uap, who) == 0) + if (uap->who == 0) p = curp; else - p = pfind(SCARG(uap, who)); + p = pfind(uap->who); if (p == 0) break; low = p->p_nice; @@ -85,9 +94,9 @@ getpriority(curp, uap, retval) case PRIO_PGRP: { register struct pgrp *pg; - if (SCARG(uap, who) == 0) + if (uap->who == 0) pg = curp->p_pgrp; - else if ((pg = pgfind(SCARG(uap, who))) == NULL) + else if ((pg = pgfind(uap->who)) == NULL) break; for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) { @@ -98,10 +107,10 @@ getpriority(curp, uap, retval) } case PRIO_USER: - if (SCARG(uap, who) == 0) - SCARG(uap, who) = curp->p_ucred->cr_uid; + if (uap->who == 0) + uap->who = curp->p_ucred->cr_uid; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) - if 
(p->p_ucred->cr_uid == SCARG(uap, who) && + if (p->p_ucred->cr_uid == uap->who && p->p_nice < low) low = p->p_nice; break; @@ -115,54 +124,57 @@ getpriority(curp, uap, retval) return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct setpriority_args { + int which; + int who; + int prio; +}; +#endif /* ARGSUSED */ int setpriority(curp, uap, retval) struct proc *curp; - register struct setpriority_args /* { - syscallarg(int) which; - syscallarg(int) who; - syscallarg(int) prio; - } */ *uap; - register_t *retval; + register struct setpriority_args *uap; + int *retval; { register struct proc *p; int found = 0, error = 0; - switch (SCARG(uap, which)) { + switch (uap->which) { case PRIO_PROCESS: - if (SCARG(uap, who) == 0) + if (uap->who == 0) p = curp; else - p = pfind(SCARG(uap, who)); + p = pfind(uap->who); if (p == 0) break; - error = donice(curp, p, SCARG(uap, prio)); + error = donice(curp, p, uap->prio); found++; break; case PRIO_PGRP: { register struct pgrp *pg; - - if (SCARG(uap, who) == 0) + + if (uap->who == 0) pg = curp->p_pgrp; - else if ((pg = pgfind(SCARG(uap, who))) == NULL) + else if ((pg = pgfind(uap->who)) == NULL) break; for (p = pg->pg_members.lh_first; p != 0; p = p->p_pglist.le_next) { - error = donice(curp, p, SCARG(uap, prio)); + error = donice(curp, p, uap->prio); found++; } break; } case PRIO_USER: - if (SCARG(uap, who) == 0) - SCARG(uap, who) = curp->p_ucred->cr_uid; + if (uap->who == 0) + uap->who = curp->p_ucred->cr_uid; for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) - if (p->p_ucred->cr_uid == SCARG(uap, who)) { - error = donice(curp, p, SCARG(uap, prio)); + if (p->p_ucred->cr_uid == uap->who) { + error = donice(curp, p, uap->prio); found++; } break; @@ -197,71 +209,150 @@ donice(curp, chgp, n) return (0); } +/* rtprio system call */ +#ifndef _SYS_SYSPROTO_H_ +struct rtprio_args { + int function; + pid_t pid; + struct rtprio *rtp; +}; +#endif + +/* + * Set realtime priority + */ + +/* ARGSUSED */ +int +rtprio(curp, uap, retval) + struct 
proc *curp; + register struct rtprio_args *uap; + int *retval; +{ + register struct proc *p; + register struct pcred *pcred = curp->p_cred; + struct rtprio rtp; + int error; + + error = copyin(uap->rtp, &rtp, sizeof(struct rtprio)); + if (error) + return (error); + + if (uap->pid == 0) + p = curp; + else + p = pfind(uap->pid); + + if (p == 0) + return (ESRCH); + + switch (uap->function) { + case RTP_LOOKUP: + return (copyout(&p->p_rtprio, uap->rtp, sizeof(struct rtprio))); + case RTP_SET: + if (pcred->pc_ucred->cr_uid && pcred->p_ruid && + pcred->pc_ucred->cr_uid != p->p_ucred->cr_uid && + pcred->p_ruid != p->p_ucred->cr_uid) + return (EPERM); + /* disallow setting rtprio in most cases if not superuser */ + if (suser(pcred->pc_ucred, &curp->p_acflag)) { + /* can't set someone else's */ + if (uap->pid) + return (EPERM); + /* can't set realtime priority */ + if (rtp.type == RTP_PRIO_REALTIME) + return (EPERM); + } + switch (rtp.type) { + case RTP_PRIO_REALTIME: + case RTP_PRIO_NORMAL: + case RTP_PRIO_IDLE: + if (rtp.prio > RTP_PRIO_MAX) + return (EINVAL); + p->p_rtprio = rtp; + return (0); + default: + return (EINVAL); + } + + default: + return (EINVAL); + } +} + #if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct osetrlimit_args { + u_int which; + struct orlimit *rlp; +}; +#endif /* ARGSUSED */ int -compat_43_setrlimit(p, uap, retval) +osetrlimit(p, uap, retval) struct proc *p; - struct compat_43_setrlimit_args /* { - syscallarg(u_int) which; - syscallarg(struct ogetrlimit *) rlp; - } */ *uap; - register_t *retval; + register struct osetrlimit_args *uap; + int *retval; { struct orlimit olim; struct rlimit lim; int error; - if (error = copyin((caddr_t)SCARG(uap, rlp), (caddr_t)&olim, - sizeof (struct orlimit))) + if ((error = + copyin((caddr_t)uap->rlp, (caddr_t)&olim, sizeof(struct orlimit)))) return (error); lim.rlim_cur = olim.rlim_cur; lim.rlim_max = olim.rlim_max; - return (dosetrlimit(p, SCARG(uap, which), &lim)); + return 
(dosetrlimit(p, uap->which, &lim)); } +#ifndef _SYS_SYSPROTO_H_ +struct ogetrlimit_args { + u_int which; + struct orlimit *rlp; +}; +#endif /* ARGSUSED */ int -compat_43_getrlimit(p, uap, retval) +ogetrlimit(p, uap, retval) struct proc *p; - register struct compat_43_getrlimit_args /* { - syscallarg(u_int) which; - syscallarg(struct ogetrlimit *) rlp; - } */ *uap; - register_t *retval; + register struct ogetrlimit_args *uap; + int *retval; { struct orlimit olim; - if (SCARG(uap, which) >= RLIM_NLIMITS) + if (uap->which >= RLIM_NLIMITS) return (EINVAL); - olim.rlim_cur = p->p_rlimit[SCARG(uap, which)].rlim_cur; + olim.rlim_cur = p->p_rlimit[uap->which].rlim_cur; if (olim.rlim_cur == -1) olim.rlim_cur = 0x7fffffff; - olim.rlim_max = p->p_rlimit[SCARG(uap, which)].rlim_max; + olim.rlim_max = p->p_rlimit[uap->which].rlim_max; if (olim.rlim_max == -1) olim.rlim_max = 0x7fffffff; - return (copyout((caddr_t)&olim, (caddr_t)SCARG(uap, rlp), - sizeof(olim))); + return (copyout((caddr_t)&olim, (caddr_t)uap->rlp, sizeof(olim))); } #endif /* COMPAT_43 || COMPAT_SUNOS */ +#ifndef _SYS_SYSPROTO_H_ +struct __setrlimit_args { + u_int which; + struct rlimit *rlp; +}; +#endif /* ARGSUSED */ int setrlimit(p, uap, retval) struct proc *p; - register struct setrlimit_args /* { - syscallarg(u_int) which; - syscallarg(struct rlimit *) rlp; - } */ *uap; - register_t *retval; + register struct __setrlimit_args *uap; + int *retval; { struct rlimit alim; int error; - if (error = copyin((caddr_t)SCARG(uap, rlp), (caddr_t)&alim, - sizeof (struct rlimit))) + if ((error = + copyin((caddr_t)uap->rlp, (caddr_t)&alim, sizeof (struct rlimit)))) return (error); - return (dosetrlimit(p, SCARG(uap, which), &alim)); + return (dosetrlimit(p, uap->which, &alim)); } int @@ -271,15 +362,23 @@ dosetrlimit(p, which, limp) struct rlimit *limp; { register struct rlimit *alimp; - extern unsigned maxdmap; int error; if (which >= RLIM_NLIMITS) return (EINVAL); alimp = &p->p_rlimit[which]; - if (limp->rlim_cur > 
alimp->rlim_max || + + /* + * Preserve historical bugs by treating negative limits as unsigned. + */ + if (limp->rlim_cur < 0) + limp->rlim_cur = RLIM_INFINITY; + if (limp->rlim_max < 0) + limp->rlim_max = RLIM_INFINITY; + + if (limp->rlim_cur > alimp->rlim_max || limp->rlim_max > alimp->rlim_max) - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); if (limp->rlim_cur > limp->rlim_max) limp->rlim_cur = limp->rlim_max; @@ -293,17 +392,17 @@ dosetrlimit(p, which, limp) switch (which) { case RLIMIT_DATA: - if (limp->rlim_cur > maxdmap) - limp->rlim_cur = maxdmap; - if (limp->rlim_max > maxdmap) - limp->rlim_max = maxdmap; + if (limp->rlim_cur > MAXDSIZ) + limp->rlim_cur = MAXDSIZ; + if (limp->rlim_max > MAXDSIZ) + limp->rlim_max = MAXDSIZ; break; case RLIMIT_STACK: - if (limp->rlim_cur > maxdmap) - limp->rlim_cur = maxdmap; - if (limp->rlim_max > maxdmap) - limp->rlim_max = maxdmap; + if (limp->rlim_cur > MAXSSIZ) + limp->rlim_cur = MAXSSIZ; + if (limp->rlim_max > MAXSSIZ) + limp->rlim_max = MAXSSIZ; /* * Stack is allocated to the max at exec time with only * "rlim_cur" bytes accessible. 
If stack limit is going @@ -331,38 +430,41 @@ dosetrlimit(p, which, limp) break; case RLIMIT_NOFILE: - if (limp->rlim_cur > maxfiles) - limp->rlim_cur = maxfiles; - if (limp->rlim_max > maxfiles) - limp->rlim_max = maxfiles; + if (limp->rlim_cur > maxfilesperproc) + limp->rlim_cur = maxfilesperproc; + if (limp->rlim_max > maxfilesperproc) + limp->rlim_max = maxfilesperproc; break; case RLIMIT_NPROC: - if (limp->rlim_cur > maxproc) - limp->rlim_cur = maxproc; - if (limp->rlim_max > maxproc) - limp->rlim_max = maxproc; + if (limp->rlim_cur > maxprocperuid) + limp->rlim_cur = maxprocperuid; + if (limp->rlim_max > maxprocperuid) + limp->rlim_max = maxprocperuid; break; } *alimp = *limp; return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct __getrlimit_args { + u_int which; + struct rlimit *rlp; +}; +#endif /* ARGSUSED */ int getrlimit(p, uap, retval) struct proc *p; - register struct getrlimit_args /* { - syscallarg(u_int) which; - syscallarg(struct rlimit *) rlp; - } */ *uap; - register_t *retval; + register struct __getrlimit_args *uap; + int *retval; { - if (SCARG(uap, which) >= RLIM_NLIMITS) + if (uap->which >= RLIM_NLIMITS) return (EINVAL); - return (copyout((caddr_t)&p->p_rlimit[SCARG(uap, which)], - (caddr_t)SCARG(uap, rlp), sizeof (struct rlimit))); + return (copyout((caddr_t)&p->p_rlimit[uap->which], (caddr_t)uap->rlp, + sizeof (struct rlimit))); } /* @@ -371,14 +473,15 @@ getrlimit(p, uap, retval) */ void calcru(p, up, sp, ip) - register struct proc *p; - register struct timeval *up; - register struct timeval *sp; - register struct timeval *ip; + struct proc *p; + struct timeval *up; + struct timeval *sp; + struct timeval *ip; { - register u_quad_t u, st, ut, it, tot; - register u_long sec, usec; - register int s; + quad_t totusec; + u_quad_t u, st, ut, it, tot; + long sec, usec; + int s; struct timeval tv; s = splstatclock(); @@ -389,11 +492,8 @@ calcru(p, up, sp, ip) tot = st + ut + it; if (tot == 0) { - up->tv_sec = up->tv_usec = 0; - sp->tv_sec = sp->tv_usec = 
0; - if (ip != NULL) - ip->tv_sec = ip->tv_usec = 0; - return; + st = 1; + tot = 1; } sec = p->p_rtime.tv_sec; @@ -408,7 +508,13 @@ calcru(p, up, sp, ip) sec += tv.tv_sec - runtime.tv_sec; usec += tv.tv_usec - runtime.tv_usec; } - u = sec * 1000000 + usec; + totusec = (quad_t)sec * 1000000 + usec; + if (totusec < 0) { + /* XXX no %qd in kernel. Truncate. */ + printf("calcru: negative time: %ld usec\n", (long)totusec); + totusec = 0; + } + u = totusec; st = (u * st) / tot; sp->tv_sec = st / 1000000; sp->tv_usec = st % 1000000; @@ -422,19 +528,22 @@ calcru(p, up, sp, ip) } } +#ifndef _SYS_SYSPROTO_H_ +struct getrusage_args { + int who; + struct rusage *rusage; +}; +#endif /* ARGSUSED */ int getrusage(p, uap, retval) register struct proc *p; - register struct getrusage_args /* { - syscallarg(int) who; - syscallarg(struct rusage *) rusage; - } */ *uap; - register_t *retval; + register struct getrusage_args *uap; + int *retval; { register struct rusage *rup; - switch (SCARG(uap, who)) { + switch (uap->who) { case RUSAGE_SELF: rup = &p->p_stats->p_ru; @@ -448,7 +557,7 @@ getrusage(p, uap, retval) default: return (EINVAL); } - return (copyout((caddr_t)rup, (caddr_t)SCARG(uap, rusage), + return (copyout((caddr_t)rup, (caddr_t)uap->rusage, sizeof (struct rusage))); } diff --git a/sys/kern/kern_shutdown.c b/sys/kern/kern_shutdown.c new file mode 100644 index 0000000..c4922d0 --- /dev/null +++ b/sys/kern/kern_shutdown.c @@ -0,0 +1,445 @@ +/*- + * Copyright (c) 1986, 1988, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)kern_shutdown.c 8.3 (Berkeley) 1/21/94 + * $Id$ + */ + +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/reboot.h> +#include <sys/msgbuf.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/tty.h> +#include <sys/tprintf.h> +#include <sys/syslog.h> +#include <sys/malloc.h> +#include <sys/kernel.h> +#include <sys/mount.h> +#include <sys/sysctl.h> +#include <sys/conf.h> +#include <sys/sysproto.h> + +#include <machine/pcb.h> +#include <machine/clock.h> +#include <machine/cons.h> +#include <machine/md_var.h> + +#include <sys/utsname.h> +#include <sys/signalvar.h> + +#ifndef PANIC_REBOOT_WAIT_TIME +#define PANIC_REBOOT_WAIT_TIME 15 /* default to 15 seconds */ +#endif + +/* + * Note that stdarg.h and the ANSI style va_start macro is used for both + * ANSI and traditional C compilers. + */ +#include <machine/stdarg.h> + +#if defined(DDB) +#ifdef DDB_UNATTENDED + static int debugger_on_panic = 0; +#else + static int debugger_on_panic = 1; +#endif + +SYSCTL_INT(_debug, OID_AUTO, debugger_on_panic, CTLFLAG_RW, + &debugger_on_panic, 0, ""); +#endif + + +/* + * Variable panicstr contains argument to first call to panic; used as flag + * to indicate that the kernel has already called panic. + */ +const char *panicstr; + +/* + * callout list for things to do a shutdown + */ +typedef struct shutdown_list_element { + struct shutdown_list_element *next; + bootlist_fn function; + void *arg; +} *sle_p; + +/* + * there are two shutdown lists. Some things need to be shut down + * Earlier than others. 
+ */ +static sle_p shutdown_list1; +static sle_p shutdown_list2; + + +static void dumpsys(void); + +#ifndef _SYS_SYSPROTO_H_ +struct reboot_args { + int opt; +}; +#endif +/* ARGSUSED */ + +/* + * The system call that results in a reboot + */ +int +reboot(p, uap, retval) + struct proc *p; + struct reboot_args *uap; + int *retval; +{ + int error; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + + boot(uap->opt); + return (0); +} + +/* + * Called by events that want to shut down.. e.g <CTL><ALT><DEL> on a PC + */ +void +shutdown_nice(void) +{ + /* Send a signal to init(8) and have it shutdown the world */ + if (initproc != NULL) { + psignal(initproc, SIGINT); + } else { + /* No init(8) running, so simply reboot */ + boot(RB_NOSYNC); + } + return; +} +static int waittime = -1; +static struct pcb dumppcb; + +/* + * Go through the rigmarole of shutting down.. + * this used to be in machdep.c but I'll be dammned if I could see + * anything machine dependant in it. + */ +void +boot(howto) + int howto; +{ + sle_p ep; + + ep = shutdown_list1; + while (ep) { + shutdown_list1 = ep->next; + (*ep->function)(howto, ep->arg); + ep = ep->next; + } + if (!cold && (howto & RB_NOSYNC) == 0 && waittime < 0) { + register struct buf *bp; + int iter, nbusy; + + waittime = 0; + printf("\nsyncing disks... "); + + sync(&proc0, NULL, NULL); + + for (iter = 0; iter < 20; iter++) { + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) { + nbusy++; + } + } + if (nbusy == 0) + break; + printf("%d ", nbusy); + DELAY(40000 * iter); + } + if (nbusy) { + /* + * Failed to sync all blocks. Indicate this and don't + * unmount filesystems (thus forcing an fsck on reboot). 
+ */ + printf("giving up\n"); +#ifdef SHOW_BUSYBUFS + nbusy = 0; + for (bp = &buf[nbuf]; --bp >= buf; ) { + if ((bp->b_flags & (B_BUSY | B_INVAL)) == B_BUSY) { + nbusy++; + printf("%d: dev:%08x, flags:%08x, blkno:%d, lblkno:%d\n", nbusy, bp->b_dev, bp->b_flags, bp->b_blkno, bp->b_lblkno); + } + } + DELAY(5000000); /* 5 seconds */ +#endif + } else { + printf("done\n"); + /* + * Unmount filesystems + */ + if (panicstr == 0) + vfs_unmountall(); + } + DELAY(100000); /* wait for console output to finish */ + } + ep = shutdown_list2; + while (ep) { + shutdown_list2 = ep->next; + (*ep->function)(howto, ep->arg); + ep = ep->next; + } + splhigh(); + if (howto & RB_HALT) { + printf("\n"); + printf("The operating system has halted.\n"); + printf("Please press any key to reboot.\n\n"); + switch (cngetc()) { + case -1: /* No console, just die */ + cpu_halt(); + /* NOTREACHED */ + default: + break; + } + } else { + if (howto & RB_DUMP) { + if (!cold) { + savectx(&dumppcb); + dumppcb.pcb_cr3 = rcr3(); + dumpsys(); + } + + if (PANIC_REBOOT_WAIT_TIME != 0) { + if (PANIC_REBOOT_WAIT_TIME != -1) { + int loop; + printf("Automatic reboot in %d seconds - press a key on the console to abort\n", + PANIC_REBOOT_WAIT_TIME); + for (loop = PANIC_REBOOT_WAIT_TIME * 10; loop > 0; --loop) { + DELAY(1000 * 100); /* 1/10th second */ + /* Did user type a key? 
*/ + if (cncheckc() != -1) + break; + } + if (!loop) + goto die; + } + } else { /* zero time specified - reboot NOW */ + goto die; + } + printf("--> Press a key on the console to reboot <--\n"); + cngetc(); + } + } +die: + printf("Rebooting...\n"); + DELAY(1000000); /* wait 1 sec for printf's to complete and be read */ + /* cpu_boot(howto); */ /* doesn't do anything at the moment */ + cpu_reset(); + for(;;) ; + /* NOTREACHED */ +} + +/* + * Magic number for savecore + * + * exported (symorder) and used at least by savecore(8) + * + */ +static u_long const dumpmag = 0x8fca0101UL; + +static int dumpsize = 0; /* also for savecore */ + +static int dodump = 1; +SYSCTL_INT(_machdep, OID_AUTO, do_dump, CTLFLAG_RW, &dodump, 0, ""); + +/* + * Doadump comes here after turning off memory management and + * getting on the dump stack, either when called above, or by + * the auto-restart code. + */ +static void +dumpsys(void) +{ + + if (!dodump) + return; + if (dumpdev == NODEV) + return; + if ((minor(dumpdev)&07) != 1) + return; + if (!(bdevsw[major(dumpdev)])) + return; + if (!(bdevsw[major(dumpdev)]->d_dump)) + return; + dumpsize = Maxmem; + printf("\ndumping to dev %lx, offset %ld\n", dumpdev, dumplo); + printf("dump "); + switch ((*bdevsw[major(dumpdev)]->d_dump)(dumpdev)) { + + case ENXIO: + printf("device bad\n"); + break; + + case EFAULT: + printf("device not ready\n"); + break; + + case EINVAL: + printf("area improper\n"); + break; + + case EIO: + printf("i/o error\n"); + break; + + case EINTR: + printf("aborted from console\n"); + break; + + default: + printf("succeeded\n"); + break; + } +} + +/* + * Panic is called on unresolvable fatal errors. It prints "panic: mesg", + * and then reboots. If we are called twice, then we avoid trying to sync + * the disks as this often leads to recursive panics. + */ +void +panic(const char *fmt, ...) 
+{ + int bootopt; + va_list ap; + + bootopt = RB_AUTOBOOT | RB_DUMP; + if (panicstr) + bootopt |= RB_NOSYNC; + else + panicstr = fmt; + + printf("panic: "); + va_start(ap, fmt); + vprintf(fmt, ap); + va_end(ap); + printf("\n"); + +#if defined(DDB) + if (debugger_on_panic) + Debugger ("panic"); +#endif + boot(bootopt); +} + +/* + * Two routines to handle adding/deleting items on the + * shutdown callout lists + * + * at_shutdown(): + * Take the arguments given and put them onto the shutdown callout list. + * However first make sure that it's not already there. + * returns 0 on success. + */ +int +at_shutdown(bootlist_fn function, void *arg, int position) +{ + sle_p ep, *epp; + + switch(position) { + case SHUTDOWN_PRE_SYNC: + epp = &shutdown_list1; + break; + case SHUTDOWN_POST_SYNC: + epp = &shutdown_list2; + break; + default: + printf("bad exit callout list specified\n"); + return (EINVAL); + } + if (rm_at_shutdown(function, arg)) + printf("exit callout entry already present\n"); + ep = malloc(sizeof(*ep), M_TEMP, M_NOWAIT); + if (ep == NULL) + return (ENOMEM); + ep->next = *epp; + ep->function = function; + ep->arg = arg; + *epp = ep; + return (0); +} + +/* + * Scan the exit callout lists for the given items and remove them. + * Returns the number of items removed. 
+ */ +int +rm_at_shutdown(bootlist_fn function, void *arg) +{ + sle_p *epp, ep; + int count; + + count = 0; + epp = &shutdown_list1; + ep = *epp; + while (ep) { + if ((ep->function == function) && (ep->arg == arg)) { + *epp = ep->next; + free(ep, M_TEMP); + count++; + } else { + epp = &ep->next; + } + ep = *epp; + } + epp = &shutdown_list2; + ep = *epp; + while (ep) { + if ((ep->function == function) && (ep->arg == arg)) { + *epp = ep->next; + free(ep, M_TEMP); + count++; + } else { + epp = &ep->next; + } + ep = *epp; + } + return (count); +} + diff --git a/sys/kern/kern_sig.c b/sys/kern/kern_sig.c index 5683b9c..e0b28e0 100644 --- a/sys/kern/kern_sig.c +++ b/sys/kern/kern_sig.c @@ -35,11 +35,15 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_sig.c 8.14 (Berkeley) 5/14/95 + * @(#)kern_sig.c 8.7 (Berkeley) 4/18/94 + * $Id: kern_sig.c,v 1.30 1997/02/22 09:39:11 peter Exp $ */ +#include "opt_ktrace.h" + #define SIGPROP /* include signal properties table */ #include <sys/param.h> +#include <sys/sysproto.h> #include <sys/signalvar.h> #include <sys/resourcevar.h> #include <sys/namei.h> @@ -50,22 +54,27 @@ #include <sys/times.h> #include <sys/buf.h> #include <sys/acct.h> -#include <sys/file.h> +#include <sys/fcntl.h> #include <sys/kernel.h> #include <sys/wait.h> #include <sys/ktrace.h> #include <sys/syslog.h> #include <sys/stat.h> - -#include <sys/mount.h> -#include <sys/syscallargs.h> +#include <sys/sysent.h> #include <machine/cpu.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> #include <sys/user.h> /* for coredump */ -void stop __P((struct proc *p)); +static int coredump __P((struct proc *p)); +static int killpg1 __P((struct proc *cp, int signum, int pgid, int all)); +static void stop __P((struct proc *)); /* * Can process p, with pcred pc, send the signal signum to process q? 
@@ -78,16 +87,19 @@ void stop __P((struct proc *p)); (pc)->pc_ucred->cr_uid == (q)->p_ucred->cr_uid || \ ((signum) == SIGCONT && (q)->p_session == (p)->p_session)) +#ifndef _SYS_SYSPROTO_H_ +struct sigaction_args { + int signum; + struct sigaction *nsa; + struct sigaction *osa; +}; +#endif /* ARGSUSED */ int sigaction(p, uap, retval) struct proc *p; - register struct sigaction_args /* { - syscallarg(int) signum; - syscallarg(struct sigaction *) nsa; - syscallarg(struct sigaction *) osa; - } */ *uap; - register_t *retval; + register struct sigaction_args *uap; + int *retval; { struct sigaction vec; register struct sigaction *sa; @@ -95,12 +107,11 @@ sigaction(p, uap, retval) register int signum; int bit, error; - signum = SCARG(uap, signum); - if (signum <= 0 || signum >= NSIG || - signum == SIGKILL || signum == SIGSTOP) + signum = uap->signum; + if (signum <= 0 || signum >= NSIG) return (EINVAL); sa = &vec; - if (SCARG(uap, osa)) { + if (uap->osa) { sa->sa_handler = ps->ps_sigact[signum]; sa->sa_mask = ps->ps_catchmask[signum]; bit = sigmask(signum); @@ -109,16 +120,23 @@ sigaction(p, uap, retval) sa->sa_flags |= SA_ONSTACK; if ((ps->ps_sigintr & bit) == 0) sa->sa_flags |= SA_RESTART; - if (p->p_flag & P_NOCLDSTOP) + if ((ps->ps_sigreset & bit) != 0) + sa->sa_flags |= SA_RESETHAND; + if ((ps->ps_signodefer & bit) != 0) + sa->sa_flags |= SA_NODEFER; + if (signum == SIGCHLD && p->p_flag & P_NOCLDSTOP) sa->sa_flags |= SA_NOCLDSTOP; - if (error = copyout((caddr_t)sa, (caddr_t)SCARG(uap, osa), - sizeof (vec))) + if ((error = copyout((caddr_t)sa, (caddr_t)uap->osa, + sizeof (vec)))) return (error); } - if (SCARG(uap, nsa)) { - if (error = copyin((caddr_t)SCARG(uap, nsa), (caddr_t)sa, - sizeof (vec))) + if (uap->nsa) { + if ((error = copyin((caddr_t)uap->nsa, (caddr_t)sa, + sizeof (vec)))) return (error); + if ((signum == SIGKILL || signum == SIGSTOP) && + sa->sa_handler != SIG_DFL) + return (EINVAL); setsigvec(p, signum, sa); } return (0); @@ -148,6 +166,14 @@ 
setsigvec(p, signum, sa) ps->ps_sigonstack |= bit; else ps->ps_sigonstack &= ~bit; + if (sa->sa_flags & SA_RESETHAND) + ps->ps_sigreset |= bit; + else + ps->ps_sigreset &= ~bit; + if (sa->sa_flags & SA_NODEFER) + ps->ps_signodefer |= bit; + else + ps->ps_signodefer &= ~bit; #ifdef COMPAT_SUNOS if (sa->sa_flags & SA_USERTRAMP) ps->ps_usertramp |= bit; @@ -227,9 +253,9 @@ execsigs(p) * Reset stack state to the user stack. * Clear set of signals caught on the signal stack. */ - ps->ps_sigstk.ss_flags = SA_DISABLE; + ps->ps_sigstk.ss_flags = SS_DISABLE; ps->ps_sigstk.ss_size = 0; - ps->ps_sigstk.ss_base = 0; + ps->ps_sigstk.ss_sp = 0; ps->ps_flags = 0; } @@ -239,33 +265,36 @@ execsigs(p) * and return old mask as return value; * the library stub does the rest. */ +#ifndef _SYS_SYSPROTO_H_ +struct sigprocmask_args { + int how; + sigset_t mask; +}; +#endif int sigprocmask(p, uap, retval) register struct proc *p; - struct sigprocmask_args /* { - syscallarg(int) how; - syscallarg(sigset_t) mask; - } */ *uap; - register_t *retval; + struct sigprocmask_args *uap; + int *retval; { int error = 0; *retval = p->p_sigmask; (void) splhigh(); - switch (SCARG(uap, how)) { + switch (uap->how) { case SIG_BLOCK: - p->p_sigmask |= SCARG(uap, mask) &~ sigcantmask; + p->p_sigmask |= uap->mask &~ sigcantmask; break; case SIG_UNBLOCK: - p->p_sigmask &= ~SCARG(uap, mask); + p->p_sigmask &= ~uap->mask; break; case SIG_SETMASK: - p->p_sigmask = SCARG(uap, mask) &~ sigcantmask; + p->p_sigmask = uap->mask &~ sigcantmask; break; - + default: error = EINVAL; break; @@ -274,12 +303,17 @@ sigprocmask(p, uap, retval) return (error); } +#ifndef _SYS_SYSPROTO_H_ +struct sigpending_args { + int dummy; +}; +#endif /* ARGSUSED */ int sigpending(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct sigpending_args *uap; + int *retval; { *retval = p->p_siglist; @@ -290,16 +324,19 @@ sigpending(p, uap, retval) /* * Generalized interface signal handler, 4.3-compatible. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct osigvec_args { + int signum; + struct sigvec *nsv; + struct sigvec *osv; +}; +#endif /* ARGSUSED */ int -compat_43_sigvec(p, uap, retval) +osigvec(p, uap, retval) struct proc *p; - register struct compat_43_sigvec_args /* { - syscallarg(int) signum; - syscallarg(struct sigvec *) nsv; - syscallarg(struct sigvec *) osv; - } */ *uap; - register_t *retval; + register struct osigvec_args *uap; + int *retval; { struct sigvec vec; register struct sigacts *ps = p->p_sigacts; @@ -307,12 +344,11 @@ compat_43_sigvec(p, uap, retval) register int signum; int bit, error; - signum = SCARG(uap, signum); - if (signum <= 0 || signum >= NSIG || - signum == SIGKILL || signum == SIGSTOP) + signum = uap->signum; + if (signum <= 0 || signum >= NSIG) return (EINVAL); sv = &vec; - if (SCARG(uap, osv)) { + if (uap->osv) { *(sig_t *)&sv->sv_handler = ps->ps_sigact[signum]; sv->sv_mask = ps->ps_catchmask[signum]; bit = sigmask(signum); @@ -321,26 +357,26 @@ compat_43_sigvec(p, uap, retval) sv->sv_flags |= SV_ONSTACK; if ((ps->ps_sigintr & bit) != 0) sv->sv_flags |= SV_INTERRUPT; + if ((ps->ps_sigreset & bit) != 0) + sv->sv_flags |= SV_RESETHAND; + if ((ps->ps_signodefer & bit) != 0) + sv->sv_flags |= SV_NODEFER; #ifndef COMPAT_SUNOS - if (p->p_flag & P_NOCLDSTOP) - sv->sv_flags |= SA_NOCLDSTOP; + if (signum == SIGCHLD && p->p_flag & P_NOCLDSTOP) + sv->sv_flags |= SV_NOCLDSTOP; #endif - if (error = copyout((caddr_t)sv, (caddr_t)SCARG(uap, osv), - sizeof (vec))) + if ((error = copyout((caddr_t)sv, (caddr_t)uap->osv, + sizeof (vec)))) return (error); } - if (SCARG(uap, nsv)) { - if (error = copyin((caddr_t)SCARG(uap, nsv), (caddr_t)sv, - sizeof (vec))) + if (uap->nsv) { + if ((error = copyin((caddr_t)uap->nsv, (caddr_t)sv, + sizeof (vec)))) return (error); -#ifdef COMPAT_SUNOS - /* - * SunOS uses this bit (4, aka SA_DISABLE) as SV_RESETHAND, - * `reset to SIG_DFL on delivery'. We have no such option - * now or ever! 
- */ - if (sv->sv_flags & SA_DISABLE) + if ((signum == SIGKILL || signum == SIGSTOP) && + sv->sv_handler != SIG_DFL) return (EINVAL); +#ifdef COMPAT_SUNOS sv->sv_flags |= SA_USERTRAMP; #endif sv->sv_flags ^= SA_RESTART; /* opposite of SV_INTERRUPT */ @@ -349,34 +385,40 @@ compat_43_sigvec(p, uap, retval) return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct osigblock_args { + int mask; +}; +#endif int -compat_43_sigblock(p, uap, retval) +osigblock(p, uap, retval) register struct proc *p; - struct compat_43_sigblock_args /* { - syscallarg(int) mask; - } */ *uap; - register_t *retval; + struct osigblock_args *uap; + int *retval; { (void) splhigh(); *retval = p->p_sigmask; - p->p_sigmask |= SCARG(uap, mask) &~ sigcantmask; + p->p_sigmask |= uap->mask &~ sigcantmask; (void) spl0(); return (0); } +#ifndef _SYS_SYSPROTO_H_ +struct osigsetmask_args { + int mask; +}; +#endif int -compat_43_sigsetmask(p, uap, retval) +osigsetmask(p, uap, retval) struct proc *p; - struct compat_43_sigsetmask_args /* { - syscallarg(int) mask; - } */ *uap; - register_t *retval; + struct osigsetmask_args *uap; + int *retval; { (void) splhigh(); *retval = p->p_sigmask; - p->p_sigmask = SCARG(uap, mask) &~ sigcantmask; + p->p_sigmask = uap->mask &~ sigcantmask; (void) spl0(); return (0); } @@ -387,14 +429,17 @@ compat_43_sigsetmask(p, uap, retval) * in the meantime. Note nonstandard calling convention: * libc stub passes mask, not pointer, to save a copyin. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct sigsuspend_args { + sigset_t mask; +}; +#endif /* ARGSUSED */ int sigsuspend(p, uap, retval) register struct proc *p; - struct sigsuspend_args /* { - syscallarg(int) mask; - } */ *uap; - register_t *retval; + struct sigsuspend_args *uap; + int *retval; { register struct sigacts *ps = p->p_sigacts; @@ -407,7 +452,7 @@ sigsuspend(p, uap, retval) */ ps->ps_oldmask = p->p_sigmask; ps->ps_flags |= SAS_OLDMASK; - p->p_sigmask = SCARG(uap, mask) &~ sigcantmask; + p->p_sigmask = uap->mask &~ sigcantmask; while (tsleep((caddr_t) ps, PPAUSE|PCATCH, "pause", 0) == 0) /* void */; /* always return EINTR rather than ERESTART... */ @@ -415,46 +460,52 @@ sigsuspend(p, uap, retval) } #if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct osigstack_args { + struct sigstack *nss; + struct sigstack *oss; +}; +#endif /* ARGSUSED */ int -compat_43_sigstack(p, uap, retval) +osigstack(p, uap, retval) struct proc *p; - register struct compat_43_sigstack_args /* { - syscallarg(struct sigstack *) nss; - syscallarg(struct sigstack *) oss; - } */ *uap; - register_t *retval; + register struct osigstack_args *uap; + int *retval; { struct sigstack ss; struct sigacts *psp; int error = 0; psp = p->p_sigacts; - ss.ss_sp = psp->ps_sigstk.ss_base; - ss.ss_onstack = psp->ps_sigstk.ss_flags & SA_ONSTACK; - if (SCARG(uap, oss) && (error = copyout((caddr_t)&ss, - (caddr_t)SCARG(uap, oss), sizeof (struct sigstack)))) + ss.ss_sp = psp->ps_sigstk.ss_sp; + ss.ss_onstack = psp->ps_sigstk.ss_flags & SS_ONSTACK; + if (uap->oss && (error = copyout((caddr_t)&ss, (caddr_t)uap->oss, + sizeof (struct sigstack)))) return (error); - if (SCARG(uap, nss) && (error = copyin((caddr_t)SCARG(uap, nss), - (caddr_t)&ss, sizeof (ss))) == 0) { - psp->ps_sigstk.ss_base = ss.ss_sp; + if (uap->nss && (error = copyin((caddr_t)uap->nss, (caddr_t)&ss, + sizeof (ss))) == 0) { + psp->ps_sigstk.ss_sp = ss.ss_sp; psp->ps_sigstk.ss_size = 0; - psp->ps_sigstk.ss_flags |= 
ss.ss_onstack & SA_ONSTACK; + psp->ps_sigstk.ss_flags |= ss.ss_onstack & SS_ONSTACK; psp->ps_flags |= SAS_ALTSTACK; } return (error); } #endif /* COMPAT_43 || COMPAT_SUNOS */ +#ifndef _SYS_SYSPROTO_H_ +struct sigaltstack_args { + struct sigaltstack *nss; + struct sigaltstack *oss; +}; +#endif /* ARGSUSED */ int sigaltstack(p, uap, retval) struct proc *p; - register struct sigaltstack_args /* { - syscallarg(struct sigaltstack *) nss; - syscallarg(struct sigaltstack *) oss; - } */ *uap; - register_t *retval; + register struct sigaltstack_args *uap; + int *retval; { struct sigacts *psp; struct sigaltstack ss; @@ -462,17 +513,16 @@ sigaltstack(p, uap, retval) psp = p->p_sigacts; if ((psp->ps_flags & SAS_ALTSTACK) == 0) - psp->ps_sigstk.ss_flags |= SA_DISABLE; - if (SCARG(uap, oss) && (error = copyout((caddr_t)&psp->ps_sigstk, - (caddr_t)SCARG(uap, oss), sizeof (struct sigaltstack)))) + psp->ps_sigstk.ss_flags |= SS_DISABLE; + if (uap->oss && (error = copyout((caddr_t)&psp->ps_sigstk, + (caddr_t)uap->oss, sizeof (struct sigaltstack)))) return (error); - if (SCARG(uap, nss) == 0) + if (uap->nss == 0) return (0); - if (error = copyin((caddr_t)SCARG(uap, nss), (caddr_t)&ss, - sizeof (ss))) + if ((error = copyin((caddr_t)uap->nss, (caddr_t)&ss, sizeof (ss)))) return (error); - if (ss.ss_flags & SA_DISABLE) { - if (psp->ps_sigstk.ss_flags & SA_ONSTACK) + if (ss.ss_flags & SS_DISABLE) { + if (psp->ps_sigstk.ss_flags & SS_ONSTACK) return (EINVAL); psp->ps_flags &= ~SAS_ALTSTACK; psp->ps_sigstk.ss_flags = ss.ss_flags; @@ -485,60 +535,6 @@ sigaltstack(p, uap, retval) return (0); } -/* ARGSUSED */ -int -kill(cp, uap, retval) - register struct proc *cp; - register struct kill_args /* { - syscallarg(int) pid; - syscallarg(int) signum; - } */ *uap; - register_t *retval; -{ - register struct proc *p; - register struct pcred *pc = cp->p_cred; - - if ((u_int)SCARG(uap, signum) >= NSIG) - return (EINVAL); - if (SCARG(uap, pid) > 0) { - /* kill single process */ - if ((p = 
pfind(SCARG(uap, pid))) == NULL) - return (ESRCH); - if (!CANSIGNAL(cp, pc, p, SCARG(uap, signum))) - return (EPERM); - if (SCARG(uap, signum)) - psignal(p, SCARG(uap, signum)); - return (0); - } - switch (SCARG(uap, pid)) { - case -1: /* broadcast signal */ - return (killpg1(cp, SCARG(uap, signum), 0, 1)); - case 0: /* signal own process group */ - return (killpg1(cp, SCARG(uap, signum), 0, 0)); - default: /* negative explicit process group */ - return (killpg1(cp, SCARG(uap, signum), -SCARG(uap, pid), 0)); - } - /* NOTREACHED */ -} - -#if defined(COMPAT_43) || defined(COMPAT_SUNOS) -/* ARGSUSED */ -int -compat_43_killpg(p, uap, retval) - struct proc *p; - register struct compat_43_killpg_args /* { - syscallarg(int) pgid; - syscallarg(int) signum; - } */ *uap; - register_t *retval; -{ - - if ((u_int)SCARG(uap, signum) >= NSIG) - return (EINVAL); - return (killpg1(p, SCARG(uap, signum), SCARG(uap, pgid), 0)); -} -#endif /* COMPAT_43 || COMPAT_SUNOS */ - /* * Common code for kill process group/broadcast kill. * cp is calling process. @@ -552,13 +548,13 @@ killpg1(cp, signum, pgid, all) register struct pcred *pc = cp->p_cred; struct pgrp *pgrp; int nfound = 0; - - if (all) - /* - * broadcast + + if (all) + /* + * broadcast */ for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { - if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || + if (p->p_pid <= 1 || p->p_flag & P_SYSTEM || p == cp || !CANSIGNAL(cp, pc, p, signum)) continue; nfound++; @@ -566,8 +562,8 @@ killpg1(cp, signum, pgid, all) psignal(p, signum); } else { - if (pgid == 0) - /* + if (pgid == 0) + /* * zero pgid means send to my process group. */ pgrp = cp->p_pgrp; @@ -590,6 +586,66 @@ killpg1(cp, signum, pgid, all) return (nfound ? 
0 : ESRCH); } +#ifndef _SYS_SYSPROTO_H_ +struct kill_args { + int pid; + int signum; +}; +#endif +/* ARGSUSED */ +int +kill(cp, uap, retval) + register struct proc *cp; + register struct kill_args *uap; + int *retval; +{ + register struct proc *p; + register struct pcred *pc = cp->p_cred; + + if ((u_int)uap->signum >= NSIG) + return (EINVAL); + if (uap->pid > 0) { + /* kill single process */ + if ((p = pfind(uap->pid)) == NULL) + return (ESRCH); + if (!CANSIGNAL(cp, pc, p, uap->signum)) + return (EPERM); + if (uap->signum) + psignal(p, uap->signum); + return (0); + } + switch (uap->pid) { + case -1: /* broadcast signal */ + return (killpg1(cp, uap->signum, 0, 1)); + case 0: /* signal own process group */ + return (killpg1(cp, uap->signum, 0, 0)); + default: /* negative explicit process group */ + return (killpg1(cp, uap->signum, -uap->pid, 0)); + } + /* NOTREACHED */ +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct okillpg_args { + int pgid; + int signum; +}; +#endif +/* ARGSUSED */ +int +okillpg(p, uap, retval) + struct proc *p; + register struct okillpg_args *uap; + int *retval; +{ + + if ((u_int)uap->signum >= NSIG) + return (EINVAL); + return (killpg1(p, uap->signum, uap->pgid, 0)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + /* * Send a signal to a process group. */ @@ -641,11 +697,22 @@ trapsignal(p, signum, code) p->p_stats->p_ru.ru_nsignals++; #ifdef KTRACE if (KTRPOINT(p, KTR_PSIG)) - ktrpsig(p->p_tracep, signum, ps->ps_sigact[signum], + ktrpsig(p->p_tracep, signum, ps->ps_sigact[signum], p->p_sigmask, code); #endif - sendsig(ps->ps_sigact[signum], signum, p->p_sigmask, code); - p->p_sigmask |= ps->ps_catchmask[signum] | mask; + (*p->p_sysent->sv_sendsig)(ps->ps_sigact[signum], signum, + p->p_sigmask, code); + p->p_sigmask |= ps->ps_catchmask[signum] | + (mask & ~ps->ps_signodefer); + if ((ps->ps_sigreset & mask) != 0) { + /* + * See setsigvec() for origin of this code. 
+ */ + p->p_sigcatch &= ~mask; + if (signum != SIGCONT && sigprop[signum] & SA_IGNORE) + p->p_sigignore |= mask; + ps->ps_sigact[signum] = SIG_DFL; + } } else { ps->ps_code = code; /* XXX for core dump/debugger */ ps->ps_sig = signum; /* XXX to verify code */ @@ -719,7 +786,7 @@ psignal(p, signum) */ if (prop & SA_TTYSTOP && p->p_pgrp->pg_jobc == 0 && action == SIG_DFL) - return; + return; p->p_siglist &= ~contsigmask; } p->p_siglist |= mask; @@ -900,17 +967,8 @@ issignal(p) /* * If traced, always stop, and stay * stopped until released by the parent. - * - * Note that we must clear the pending signal - * before we call trace_req since that routine - * might cause a fault, calling tsleep and - * leading us back here again with the same signal. - * Then we would be deadlocked because the tracer - * would still be blocked on the ipc struct from - * the initial request. */ p->p_xstat = signum; - p->p_siglist &= ~mask; psignal(p->p_pptr, SIGCHLD); do { stop(p); @@ -918,10 +976,19 @@ issignal(p) } while (!trace_req(p) && p->p_flag & P_TRACED); /* + * If the traced bit got turned off, go back up + * to the top to rescan signals. This ensures + * that p_sig* and ps_sigact are consistent. + */ + if ((p->p_flag & P_TRACED) == 0) + continue; + + /* * If parent wants us to take the signal, * then it will leave it in p->p_xstat; * otherwise we just look for signals again. */ + p->p_siglist &= ~mask; /* clear the old signal */ signum = p->p_xstat; if (signum == 0) continue; @@ -934,14 +1001,6 @@ issignal(p) p->p_siglist |= mask; if (p->p_sigmask & mask) continue; - - /* - * If the traced bit got turned off, go back up - * to the top to rescan signals. This ensures - * that p_sig* and ps_sigact are consistent. - */ - if ((p->p_flag & P_TRACED) == 0) - continue; } /* @@ -949,9 +1008,9 @@ issignal(p) * Return the signal's number, or fall through * to clear it from the pending mask. 
*/ - switch ((long)p->p_sigacts->ps_sigact[signum]) { + switch ((int)p->p_sigacts->ps_sigact[signum]) { - case (long)SIG_DFL: + case (int)SIG_DFL: /* * Don't take default actions on system processes. */ @@ -961,8 +1020,8 @@ issignal(p) * Are you sure you want to ignore SIGSEGV * in init? XXX */ - printf("Process (pid %d) got signal %d\n", - p->p_pid, signum); + printf("Process (pid %lu) got signal %d\n", + (u_long)p->p_pid, signum); #endif break; /* == ignore */ } @@ -994,7 +1053,7 @@ issignal(p) return (signum); /*NOTREACHED*/ - case (long)SIG_IGN: + case (int)SIG_IGN: /* * Masking above should prevent us ever trying * to take action on an ignored signal other @@ -1043,8 +1102,7 @@ postsig(signum) register struct proc *p = curproc; register struct sigacts *ps = p->p_sigacts; register sig_t action; - u_long code; - int mask, returnmask; + int code, mask, returnmask; #ifdef DIAGNOSTIC if (signum == 0) @@ -1089,7 +1147,17 @@ postsig(signum) ps->ps_flags &= ~SAS_OLDMASK; } else returnmask = p->p_sigmask; - p->p_sigmask |= ps->ps_catchmask[signum] | mask; + p->p_sigmask |= ps->ps_catchmask[signum] | + (mask & ~ps->ps_signodefer); + if ((ps->ps_sigreset & mask) != 0) { + /* + * See setsigvec() for origin of this code. + */ + p->p_sigcatch &= ~mask; + if (signum != SIGCONT && sigprop[signum] & SA_IGNORE) + p->p_sigignore |= mask; + ps->ps_sigact[signum] = SIG_DFL; + } (void) spl0(); p->p_stats->p_ru.ru_nsignals++; if (ps->ps_sig != signum) { @@ -1099,7 +1167,7 @@ postsig(signum) ps->ps_code = 0; ps->ps_sig = 0; } - sendsig(action, signum, returnmask, code); + (*p->p_sysent->sv_sendsig)(action, signum, returnmask, code); } } @@ -1111,9 +1179,8 @@ killproc(p, why) struct proc *p; char *why; { - - log(LOG_ERR, "pid %d was killed: %s\n", p->p_pid, why); - uprintf("sorry, pid %d was killed: %s\n", p->p_pid, why); + log(LOG_ERR, "pid %d (%s), uid %d, was killed: %s\n", p->p_pid, p->p_comm, + p->p_cred && p->p_ucred ? 
p->p_ucred->cr_uid : -1, why); psignal(p, SIGKILL); } @@ -1134,8 +1201,19 @@ sigexit(p, signum) p->p_acflag |= AXSIG; if (sigprop[signum] & SA_CORE) { p->p_sigacts->ps_sig = signum; + /* + * Log signals which would cause core dumps + * (Log as LOG_INFO to appease those who don't want + * these messages.) + * XXX : Todo, as well as euid, write out ruid too + */ if (coredump(p) == 0) signum |= WCOREFLAG; + log(LOG_INFO, "pid %d (%s), uid %d: exited on signal %d%s\n", + p->p_pid, p->p_comm, + p->p_cred && p->p_ucred ? p->p_ucred->cr_uid : -1, + signum &~ WCOREFLAG, + signum & WCOREFLAG ? " (core dumped)" : ""); } exit1(p, W_EXITCODE(0, signum)); /* NOTREACHED */ @@ -1145,28 +1223,27 @@ sigexit(p, signum) * Dump core, into a file named "progname.core", unless the process was * setuid/setgid. */ -int +static int coredump(p) register struct proc *p; { register struct vnode *vp; - register struct pcred *pcred = p->p_cred; - register struct ucred *cred = pcred->pc_ucred; + register struct ucred *cred = p->p_cred->pc_ucred; register struct vmspace *vm = p->p_vmspace; struct nameidata nd; struct vattr vattr; int error, error1; char name[MAXCOMLEN+6]; /* progname.core */ - if (pcred->p_svuid != pcred->p_ruid || pcred->p_svgid != pcred->p_rgid) + if (p->p_flag & P_SUGID) return (EFAULT); if (ctob(UPAGES + vm->vm_dsize + vm->vm_ssize) >= p->p_rlimit[RLIMIT_CORE].rlim_cur) return (EFAULT); sprintf(name, "%s.core", p->p_comm); NDINIT(&nd, LOOKUP, FOLLOW, UIO_SYSSPACE, name, p); - if (error = vn_open(&nd, - O_CREAT | FWRITE, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH)) + if ((error = vn_open(&nd, + O_CREAT | FWRITE, S_IRUSR | S_IWUSR))) return (error); vp = nd.ni_vp; @@ -1206,14 +1283,19 @@ out: * Nonexistent system call-- signal process (may want to handle it). * Flag error in case process won't see signal immediately (blocked or ignored). 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct nosys_args { + int dummy; +}; +#endif /* ARGSUSED */ int nosys(p, args, retval) struct proc *p; - void *args; - register_t *retval; + struct nosys_args *args; + int *retval; { psignal(p, SIGSYS); - return (ENOSYS); + return (EINVAL); } diff --git a/sys/kern/kern_subr.c b/sys/kern/kern_subr.c index df83710..d0097df 100644 --- a/sys/kern/kern_subr.c +++ b/sys/kern/kern_subr.c @@ -35,7 +35,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_subr.c 8.4 (Berkeley) 2/14/95 + * @(#)kern_subr.c 8.3 (Berkeley) 1/21/94 + * $Id$ */ #include <sys/param.h> @@ -52,7 +53,7 @@ uiomove(cp, n, uio) { register struct iovec *iov; u_int cnt; - int error = 0; + int error; #ifdef DIAGNOSTIC if (uio->uio_rw != UIO_READ && uio->uio_rw != UIO_WRITE) @@ -70,6 +71,7 @@ uiomove(cp, n, uio) } if (cnt > n) cnt = n; + switch (uio->uio_segflg) { case UIO_USERSPACE: @@ -88,6 +90,8 @@ uiomove(cp, n, uio) else bcopy(iov->iov_base, (caddr_t)cp, cnt); break; + case UIO_NOCOPY: + break; } iov->iov_base += cnt; iov->iov_len -= cnt; @@ -96,7 +100,7 @@ uiomove(cp, n, uio) cp += cnt; n -= cnt; } - return (error); + return (0); } /* @@ -109,13 +113,11 @@ ureadc(c, uio) { register struct iovec *iov; - if (uio->uio_resid <= 0) - panic("ureadc: non-positive resid"); again: - if (uio->uio_iovcnt <= 0) - panic("ureadc: non-positive iovcnt"); + if (uio->uio_iovcnt == 0 || uio->uio_resid == 0) + panic("ureadc"); iov = uio->uio_iov; - if (iov->iov_len <= 0) { + if (iov->iov_len == 0) { uio->uio_iovcnt--; uio->uio_iov++; goto again; @@ -135,6 +137,8 @@ again: if (suibyte(iov->iov_base, c) < 0) return (EFAULT); break; + case UIO_NOCOPY: + break; } iov->iov_base++; iov->iov_len--; @@ -158,7 +162,7 @@ uwritec(uio) return (-1); again: if (uio->uio_iovcnt <= 0) - panic("uwritec: non-positive iovcnt"); + panic("uwritec"); iov = uio->uio_iov; if (iov->iov_len == 0) { uio->uio_iov++; @@ -203,7 +207,7 @@ hashinit(elements, type, 
hashmask) int i; if (elements <= 0) - panic("hashinit: bad cnt"); + panic("hashinit: bad elements"); for (hashsize = 1; hashsize <= elements; hashsize <<= 1) continue; hashsize >>= 1; @@ -213,3 +217,36 @@ hashinit(elements, type, hashmask) *hashmask = hashsize - 1; return (hashtbl); } + +#define NPRIMES 27 +static int primes[] = { 1, 13, 31, 61, 127, 251, 509, 761, 1021, 1531, 2039, + 2557, 3067, 3583, 4093, 4603, 5119, 5623, 6143, 6653, + 7159, 7673, 8191, 12281, 16381, 24571, 32749 }; + +/* + * General routine to allocate a prime number sized hash table. + */ +void * +phashinit(elements, type, nentries) + int elements, type; + u_long *nentries; +{ + long hashsize; + LIST_HEAD(generic, generic) *hashtbl; + int i; + + if (elements <= 0) + panic("phashinit: bad elements"); + for (i = 1, hashsize = primes[1]; hashsize <= elements;) { + i++; + if (i == NPRIMES) + break; + hashsize = primes[i]; + } + hashsize = primes[i - 1]; + hashtbl = malloc((u_long)hashsize * sizeof(*hashtbl), type, M_WAITOK); + for (i = 0; i < hashsize; i++) + LIST_INIT(&hashtbl[i]); + *nentries = hashsize; + return (hashtbl); +} diff --git a/sys/kern/kern_synch.c b/sys/kern/kern_synch.c index 6c82027..04339cd 100644 --- a/sys/kern/kern_synch.c +++ b/sys/kern/kern_synch.c @@ -36,8 +36,11 @@ * SUCH DAMAGE. 
* * @(#)kern_synch.c 8.9 (Berkeley) 5/19/95 + * $Id: kern_synch.c,v 1.29 1997/02/22 09:39:12 peter Exp $ */ +#include "opt_ktrace.h" + #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> @@ -45,16 +48,26 @@ #include <sys/buf.h> #include <sys/signalvar.h> #include <sys/resourcevar.h> +#include <sys/signalvar.h> #include <sys/vmmeter.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_extern.h> #ifdef KTRACE #include <sys/ktrace.h> #endif #include <machine/cpu.h> +static void rqinit __P((void *)); +SYSINIT(runqueue, SI_SUB_RUN_QUEUE, SI_ORDER_FIRST, rqinit, NULL) + u_char curpriority; /* usrpri of curproc */ int lbolt; /* once a second sleep address */ +extern void endtsleep __P((void *)); +extern void updatepri __P((struct proc *p)); + /* * Force switch among equal priority processes every 100ms. */ @@ -75,7 +88,7 @@ roundrobin(arg) * Note that, as ps(1) mentions, this can let percentages * total over 100% (I've seen 137.9% for 3 processes). * - * Note that hardclock updates p_estcpu and p_cpticks independently. + * Note that statclock updates p_estcpu and p_cpticks independently. * * We wish to decay away 90% of p_estcpu in (5 * loadavg) seconds. * That is, the system wants to compute a value of decay such @@ -104,7 +117,7 @@ roundrobin(arg) * We now need to prove two things: * 1) Given factor ** (5 * loadavg) ~= .1, prove factor == b/(b+1) * 2) Given b/(b+1) ** power ~= .1, prove power == (5 * loadavg) - * + * * Facts: * For x close to zero, exp(x) =~ 1 + x, since * exp(x) = 0! + x**1/1! + x**2/2! + ... . @@ -183,7 +196,7 @@ schedcpu(arg) */ if (p->p_slptime > 1) continue; - s = splstatclock(); /* prevent state changes */ + s = splhigh(); /* prevent state changes and protect run queue */ /* * p_pctcpu is only for ps. */ @@ -215,8 +228,6 @@ schedcpu(arg) splx(s); } vmmeter(); - if (bclnlist != NULL) - wakeup((caddr_t)pageproc); timeout(schedcpu, (void *)0, hz); } @@ -249,11 +260,8 @@ updatepri(p) * of 2. Shift right by 8, i.e. 
drop the bottom 256 worth. */ #define TABLESIZE 128 +TAILQ_HEAD(slpquehead, proc) slpque[TABLESIZE]; #define LOOKUP(x) (((long)(x) >> 8) & (TABLESIZE - 1)) -struct slpque { - struct proc *sq_head; - struct proc **sq_tailp; -} slpque[TABLESIZE]; /* * During autoconfiguration or after a panic, a sleep will simply @@ -266,6 +274,15 @@ struct slpque { */ int safepri; +void +sleepinit() +{ + int i; + + for (i = 0; i < TABLESIZE; i++) + TAILQ_INIT(&slpque[i]); +} + /* * General sleep call. Suspends the current process until a wakeup is * performed on the specified identifier. The process will then be made @@ -283,12 +300,8 @@ tsleep(ident, priority, wmesg, timo) int priority, timo; char *wmesg; { - register struct proc *p = curproc; - register struct slpque *qp; - register s; - int sig, catch = priority & PCATCH; - extern int cold; - void endtsleep __P((void *)); + struct proc *p = curproc; + int s, sig, catch = priority & PCATCH; #ifdef KTRACE if (KTRPOINT(p, KTR_CSW)) @@ -307,19 +320,14 @@ tsleep(ident, priority, wmesg, timo) return (0); } #ifdef DIAGNOSTIC - if (ident == NULL || p->p_stat != SRUN || p->p_back) + if (ident == NULL || p->p_stat != SRUN) panic("tsleep"); #endif p->p_wchan = ident; p->p_wmesg = wmesg; p->p_slptime = 0; p->p_priority = priority & PRIMASK; - qp = &slpque[LOOKUP(ident)]; - if (qp->sq_head == 0) - qp->sq_head = p; - else - *qp->sq_tailp = p; - *(qp->sq_tailp = &p->p_forw) = 0; + TAILQ_INSERT_TAIL(&slpque[LOOKUP(ident)], p, p_procq); if (timo) timeout(endtsleep, (void *)p, timo); /* @@ -333,7 +341,7 @@ tsleep(ident, priority, wmesg, timo) */ if (catch) { p->p_flag |= P_SINTR; - if (sig = CURSIG(p)) { + if ((sig = CURSIG(p))) { if (p->p_wchan) unsleep(p); p->p_stat = SRUN; @@ -405,85 +413,17 @@ endtsleep(arg) } /* - * Short-term, non-interruptable sleep. 
- */ -void -sleep(ident, priority) - void *ident; - int priority; -{ - register struct proc *p = curproc; - register struct slpque *qp; - register s; - extern int cold; - -#ifdef DIAGNOSTIC - if (priority > PZERO) { - printf("sleep called with priority %d > PZERO, wchan: %x\n", - priority, ident); - panic("old sleep"); - } -#endif - s = splhigh(); - if (cold || panicstr) { - /* - * After a panic, or during autoconfiguration, - * just give interrupts a chance, then just return; - * don't run any other procs or panic below, - * in case this is the idle process and already asleep. - */ - splx(safepri); - splx(s); - return; - } -#ifdef DIAGNOSTIC - if (ident == NULL || p->p_stat != SRUN || p->p_back) - panic("sleep"); -#endif - p->p_wchan = ident; - p->p_wmesg = NULL; - p->p_slptime = 0; - p->p_priority = priority; - qp = &slpque[LOOKUP(ident)]; - if (qp->sq_head == 0) - qp->sq_head = p; - else - *qp->sq_tailp = p; - *(qp->sq_tailp = &p->p_forw) = 0; - p->p_stat = SSLEEP; - p->p_stats->p_ru.ru_nvcsw++; -#ifdef KTRACE - if (KTRPOINT(p, KTR_CSW)) - ktrcsw(p->p_tracep, 1, 0); -#endif - mi_switch(); -#ifdef KTRACE - if (KTRPOINT(p, KTR_CSW)) - ktrcsw(p->p_tracep, 0, 0); -#endif - curpriority = p->p_usrpri; - splx(s); -} - -/* * Remove a process from its wait queue */ void unsleep(p) register struct proc *p; { - register struct slpque *qp; - register struct proc **hp; int s; s = splhigh(); if (p->p_wchan) { - hp = &(qp = &slpque[LOOKUP(p->p_wchan)])->sq_head; - while (*hp != p) - hp = &(*hp)->p_forw; - *hp = p->p_forw; - if (qp->sq_tailp == &p->p_forw) - qp->sq_tailp = hp; + TAILQ_REMOVE(&slpque[LOOKUP(p->p_wchan)], p, p_procq); p->p_wchan = 0; } splx(s); @@ -496,45 +436,83 @@ void wakeup(ident) register void *ident; { - register struct slpque *qp; - register struct proc *p, **q; + register struct slpquehead *qp; + register struct proc *p; int s; s = splhigh(); qp = &slpque[LOOKUP(ident)]; restart: - for (q = &qp->sq_head; p = *q; ) { + for (p = qp->tqh_first; p != NULL; p 
= p->p_procq.tqe_next) { #ifdef DIAGNOSTIC - if (p->p_back || p->p_stat != SSLEEP && p->p_stat != SSTOP) + if (p->p_stat != SSLEEP && p->p_stat != SSTOP) panic("wakeup"); #endif if (p->p_wchan == ident) { + TAILQ_REMOVE(qp, p, p_procq); p->p_wchan = 0; - *q = p->p_forw; - if (qp->sq_tailp == &p->p_forw) - qp->sq_tailp = q; if (p->p_stat == SSLEEP) { /* OPTIMIZED EXPANSION OF setrunnable(p); */ if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; p->p_stat = SRUN; - if (p->p_flag & P_INMEM) + if (p->p_flag & P_INMEM) { setrunqueue(p); - /* - * Since curpriority is a user priority, - * p->p_priority is always better than - * curpriority. - */ - if ((p->p_flag & P_INMEM) == 0) - wakeup((caddr_t)&proc0); - else need_resched(); + } else { + p->p_flag |= P_SWAPINREQ; + wakeup((caddr_t)&proc0); + } /* END INLINE EXPANSION */ goto restart; } - } else - q = &p->p_forw; + } + } + splx(s); +} + +/* + * Make a process sleeping on the specified identifier runnable. + * May wake more than one process if a target prcoess is currently + * swapped out. 
+ */ +void +wakeup_one(ident) + register void *ident; +{ + register struct slpquehead *qp; + register struct proc *p; + int s; + + s = splhigh(); + qp = &slpque[LOOKUP(ident)]; + + for (p = qp->tqh_first; p != NULL; p = p->p_procq.tqe_next) { +#ifdef DIAGNOSTIC + if (p->p_stat != SSLEEP && p->p_stat != SSTOP) + panic("wakeup_one"); +#endif + if (p->p_wchan == ident) { + TAILQ_REMOVE(qp, p, p_procq); + p->p_wchan = 0; + if (p->p_stat == SSLEEP) { + /* OPTIMIZED EXPANSION OF setrunnable(p); */ + if (p->p_slptime > 1) + updatepri(p); + p->p_slptime = 0; + p->p_stat = SRUN; + if (p->p_flag & P_INMEM) { + setrunqueue(p); + need_resched(); + break; + } else { + p->p_flag |= P_SWAPINREQ; + wakeup((caddr_t)&proc0); + } + /* END INLINE EXPANSION */ + } + } } splx(s); } @@ -549,11 +527,31 @@ mi_switch() register struct proc *p = curproc; /* XXX */ register struct rlimit *rlim; register long s, u; + int x; struct timeval tv; -#ifdef DEBUG + /* + * XXX this spl is almost unnecessary. It is partly to allow for + * sloppy callers that don't do it (issignal() via CURSIG() is the + * main offender). It is partly to work around a bug in the i386 + * cpu_switch() (the ipl is not preserved). We ran for years + * without it. I think there was only a interrupt latency problem. + * The main caller, tsleep(), does an splx() a couple of instructions + * after calling here. The buggy caller, issignal(), usually calls + * here at spl0() and sometimes returns at splhigh(). The process + * then runs for a little too long at splhigh(). The ipl gets fixed + * when the process returns to user mode (or earlier). + * + * It would probably be better to always call here at spl0(). Callers + * are prepared to give up control to another process, so they must + * be prepared to be interrupted. The clock stuff here may not + * actually need splstatclock(). 
+ */ + x = splstatclock(); + +#ifdef SIMPLELOCK_DEBUG if (p->p_simple_locks) - panic("sleep: holding simple lock"); + printf("sleep: holding simple lock"); #endif /* * Compute the amount of time during which the current @@ -574,23 +572,20 @@ mi_switch() /* * Check if the process exceeds its cpu resource allocation. - * If over max, kill it. In any case, if it has run for more - * than 10 minutes, reduce priority to give others a chance. + * If over max, kill it. */ - rlim = &p->p_rlimit[RLIMIT_CPU]; - if (s >= rlim->rlim_cur) { - if (s >= rlim->rlim_max) - psignal(p, SIGKILL); - else { - psignal(p, SIGXCPU); - if (rlim->rlim_cur < rlim->rlim_max) - rlim->rlim_cur += 5; + if (p->p_stat != SZOMB) { + rlim = &p->p_rlimit[RLIMIT_CPU]; + if (s >= rlim->rlim_cur) { + if (s >= rlim->rlim_max) + killproc(p, "exceeded maximum CPU limit"); + else { + psignal(p, SIGXCPU); + if (rlim->rlim_cur < rlim->rlim_max) + rlim->rlim_cur += 5; + } } } - if (s > 10 * 60 && p->p_ucred->cr_uid && p->p_nice == NZERO) { - p->p_nice = NZERO + 4; - resetpriority(p); - } /* * Pick a new current process and record its start time. @@ -598,19 +593,25 @@ mi_switch() cnt.v_swtch++; cpu_switch(p); microtime(&runtime); + splx(x); } /* * Initialize the (doubly-linked) run queues * to be empty. 
*/ -void -rqinit() +/* ARGSUSED*/ +static void +rqinit(dummy) + void *dummy; { register int i; - for (i = 0; i < NQS; i++) + for (i = 0; i < NQS; i++) { qs[i].ph_link = qs[i].ph_rlink = (struct proc *)&qs[i]; + rtqs[i].ph_link = rtqs[i].ph_rlink = (struct proc *)&rtqs[i]; + idqs[i].ph_link = idqs[i].ph_rlink = (struct proc *)&idqs[i]; + } } /* @@ -646,8 +647,10 @@ setrunnable(p) if (p->p_slptime > 1) updatepri(p); p->p_slptime = 0; - if ((p->p_flag & P_INMEM) == 0) + if ((p->p_flag & P_INMEM) == 0) { + p->p_flag |= P_SWAPINREQ; wakeup((caddr_t)&proc0); + } else if (p->p_priority < curpriority) need_resched(); } @@ -663,9 +666,13 @@ resetpriority(p) { register unsigned int newpriority; - newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice; - newpriority = min(newpriority, MAXPRI); - p->p_usrpri = newpriority; - if (newpriority < curpriority) + if (p->p_rtprio.type == RTP_PRIO_NORMAL) { + newpriority = PUSER + p->p_estcpu / 4 + 2 * p->p_nice; + newpriority = min(newpriority, MAXPRI); + p->p_usrpri = newpriority; + if (newpriority < curpriority) + need_resched(); + } else { need_resched(); + } } diff --git a/sys/kern/kern_sysctl.c b/sys/kern/kern_sysctl.c index b178da3..fb07f18 100644 --- a/sys/kern/kern_sysctl.c +++ b/sys/kern/kern_sysctl.c @@ -5,6 +5,9 @@ * This code is derived from software contributed to Berkeley by * Mike Karels at Berkeley Software Design, Inc. * + * Quite extensively rewritten by Poul-Henning Kamp of the FreeBSD + * project, to make these variables more userfriendly. + * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: @@ -33,39 +36,20 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_sysctl.c 8.9 (Berkeley) 5/20/95 - */ - -/* - * sysctl system call. 
+ * @(#)kern_sysctl.c 8.4 (Berkeley) 4/14/94 + * $Id$ */ #include <sys/param.h> -#include <sys/systm.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/malloc.h> #include <sys/proc.h> -#include <sys/file.h> -#include <sys/vnode.h> -#include <sys/unistd.h> -#include <sys/buf.h> -#include <sys/ioctl.h> -#include <sys/tty.h> +#include <sys/systm.h> +#include <sys/sysproto.h> #include <vm/vm.h> -#include <sys/sysctl.h> - -#include <sys/mount.h> -#include <sys/syscallargs.h> - -sysctlfn kern_sysctl; -sysctlfn hw_sysctl; -#ifdef DEBUG -sysctlfn debug_sysctl; -#endif -extern sysctlfn vm_sysctl; -extern sysctlfn vfs_sysctl; -extern sysctlfn net_sysctl; -extern sysctlfn cpu_sysctl; +#include <vm/vm_extern.h> +#include <sys/vnode.h> /* * Locking and stats @@ -76,634 +60,818 @@ static struct sysctl_lock { int sl_locked; } memlock; -int -__sysctl(p, uap, retval) - struct proc *p; - register struct __sysctl_args /* { - syscallarg(int *) name; - syscallarg(u_int) namelen; - syscallarg(void *) old; - syscallarg(size_t *) oldlenp; - syscallarg(void *) new; - syscallarg(size_t) newlen; - } */ *uap; - register_t *retval; +static int sysctl_root SYSCTL_HANDLER_ARGS; + +extern struct linker_set sysctl_; + +/* + * Initialization of the MIB tree. + * + * Order by number in each linker_set. 
+ */ + +static int +sysctl_order_cmp(const void *a, const void *b) { - int error, dolock = 1; - size_t savelen, oldlen = 0; - sysctlfn *fn; - int name[CTL_MAXNAME]; + struct sysctl_oid const * const *pa; + struct sysctl_oid const * const *pb; - if (SCARG(uap, new) != NULL && - (error = suser(p->p_ucred, &p->p_acflag))) - return (error); - /* - * all top-level sysctl names are non-terminal - */ - if (SCARG(uap, namelen) > CTL_MAXNAME || SCARG(uap, namelen) < 2) - return (EINVAL); - if (error = - copyin(SCARG(uap, name), &name, SCARG(uap, namelen) * sizeof(int))) - return (error); + pa = (struct sysctl_oid const * const *)a; + pb = (struct sysctl_oid const * const *)b; + if (*pa == NULL) + return (1); + if (*pb == NULL) + return (-1); + return ((*pa)->oid_number - (*pb)->oid_number); +} - switch (name[0]) { - case CTL_KERN: - fn = kern_sysctl; - if (name[2] == KERN_VNODE) /* XXX */ - dolock = 0; - break; - case CTL_HW: - fn = hw_sysctl; - break; - case CTL_VM: - fn = vm_sysctl; - break; - case CTL_NET: - fn = net_sysctl; - break; - case CTL_VFS: - fn = vfs_sysctl; - break; - case CTL_MACHDEP: - fn = cpu_sysctl; - break; -#ifdef DEBUG - case CTL_DEBUG: - fn = debug_sysctl; - break; -#endif - default: - return (EOPNOTSUPP); - } +static void +sysctl_order(void *arg) +{ + int j, k; + struct linker_set *l = (struct linker_set *) arg; + struct sysctl_oid **oidpp; - if (SCARG(uap, oldlenp) && - (error = copyin(SCARG(uap, oldlenp), &oldlen, sizeof(oldlen)))) - return (error); - if (SCARG(uap, old) != NULL) { - if (!useracc(SCARG(uap, old), oldlen, B_WRITE)) - return (EFAULT); - while (memlock.sl_lock) { - memlock.sl_want = 1; - sleep((caddr_t)&memlock, PRIBIO+1); - memlock.sl_locked++; + /* First, find the highest oid we have */ + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + for (k = 0; j--; oidpp++) { + if ((*oidpp)->oid_arg1 == arg) { + *oidpp = 0; + continue; } - memlock.sl_lock = 1; - if (dolock) - vslock(SCARG(uap, old), oldlen); - savelen = oldlen; 
+ if (*oidpp && (*oidpp)->oid_number > k) + k = (*oidpp)->oid_number; } - error = (*fn)(name + 1, SCARG(uap, namelen) - 1, SCARG(uap, old), - &oldlen, SCARG(uap, new), SCARG(uap, newlen), p); - if (SCARG(uap, old) != NULL) { - if (dolock) - vsunlock(SCARG(uap, old), savelen, B_WRITE); - memlock.sl_lock = 0; - if (memlock.sl_want) { - memlock.sl_want = 0; - wakeup((caddr_t)&memlock); - } + + /* Next, replace all OID_AUTO oids with new numbers */ + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + k += 100; + for (; j--; oidpp++) + if (*oidpp && (*oidpp)->oid_number == OID_AUTO) + (*oidpp)->oid_number = k++; + + /* Finally: sort by oid */ + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + for (; j--; oidpp++) { + if (!*oidpp) + continue; + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) + if (!(*oidpp)->oid_handler) + sysctl_order((*oidpp)->oid_arg1); } - if (error) - return (error); - if (SCARG(uap, oldlenp)) - error = copyout(&oldlen, SCARG(uap, oldlenp), sizeof(oldlen)); - *retval = oldlen; - return (0); + qsort(l->ls_items, l->ls_length, sizeof l->ls_items[0], + sysctl_order_cmp); } -/* - * Attributes stored in the kernel. - */ -char hostname[MAXHOSTNAMELEN]; -int hostnamelen; -long hostid; -int securelevel; +SYSINIT(sysctl, SI_SUB_KMEM, SI_ORDER_ANY, sysctl_order, &sysctl_); /* - * kernel related system variables. + * "Staff-functions" + * + * These functions implement a presently undocumented interface + * used by the sysctl program to walk the tree, and get the type + * so it can print the value. + * This interface is under work and consideration, and should probably + * be killed with a big axe by the first person who can find the time. + * (be aware though, that the proper interface isn't as obvious as it + * may seem, there are various conflicting requirements. + * + * {0,0} printf the entire MIB-tree. + * {0,1,...} return the name of the "..." OID. + * {0,2,...} return the next OID. 
+ * {0,3} return the OID of the name in "new" + * {0,4,...} return the kind & format info for the "..." OID. */ -kern_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; + +static void +sysctl_sysctl_debug_dump_node(struct linker_set *l, int i) { - int error, level, inthostid; - extern char ostype[], osrelease[], version[]; - - /* all sysctl names at this level are terminal */ - if (namelen != 1 && !(name[0] == KERN_PROC || name[0] == KERN_PROF)) - return (ENOTDIR); /* overloaded */ - - switch (name[0]) { - case KERN_OSTYPE: - return (sysctl_rdstring(oldp, oldlenp, newp, ostype)); - case KERN_OSRELEASE: - return (sysctl_rdstring(oldp, oldlenp, newp, osrelease)); - case KERN_OSREV: - return (sysctl_rdint(oldp, oldlenp, newp, BSD)); - case KERN_VERSION: - return (sysctl_rdstring(oldp, oldlenp, newp, version)); - case KERN_MAXVNODES: - return(sysctl_int(oldp, oldlenp, newp, newlen, &desiredvnodes)); - case KERN_MAXPROC: - return (sysctl_int(oldp, oldlenp, newp, newlen, &maxproc)); - case KERN_MAXFILES: - return (sysctl_int(oldp, oldlenp, newp, newlen, &maxfiles)); - case KERN_ARGMAX: - return (sysctl_rdint(oldp, oldlenp, newp, ARG_MAX)); - case KERN_SECURELVL: - level = securelevel; - if ((error = sysctl_int(oldp, oldlenp, newp, newlen, &level)) || - newp == NULL) - return (error); - if (level < securelevel && p->p_pid != 1) - return (EPERM); - securelevel = level; - return (0); - case KERN_HOSTNAME: - error = sysctl_string(oldp, oldlenp, newp, newlen, - hostname, sizeof(hostname)); - if (newp && !error) - hostnamelen = newlen; - return (error); - case KERN_HOSTID: - inthostid = hostid; /* XXX assumes sizeof long <= sizeof int */ - error = sysctl_int(oldp, oldlenp, newp, newlen, &inthostid); - hostid = inthostid; - return (error); - case KERN_CLOCKRATE: - return (sysctl_clockrate(oldp, oldlenp)); - case KERN_BOOTTIME: - return (sysctl_rdstruct(oldp, 
oldlenp, newp, &boottime, - sizeof(struct timeval))); - case KERN_VNODE: - return (sysctl_vnode(oldp, oldlenp, p)); - case KERN_PROC: - return (sysctl_doproc(name + 1, namelen - 1, oldp, oldlenp)); - case KERN_FILE: - return (sysctl_file(oldp, oldlenp)); -#ifdef GPROF - case KERN_PROF: - return (sysctl_doprof(name + 1, namelen - 1, oldp, oldlenp, - newp, newlen)); -#endif - case KERN_POSIX1: - return (sysctl_rdint(oldp, oldlenp, newp, _POSIX_VERSION)); - case KERN_NGROUPS: - return (sysctl_rdint(oldp, oldlenp, newp, NGROUPS_MAX)); - case KERN_JOB_CONTROL: - return (sysctl_rdint(oldp, oldlenp, newp, 1)); - case KERN_SAVED_IDS: -#ifdef _POSIX_SAVED_IDS - return (sysctl_rdint(oldp, oldlenp, newp, 1)); -#else - return (sysctl_rdint(oldp, oldlenp, newp, 0)); -#endif - default: - return (EOPNOTSUPP); + int j, k; + struct sysctl_oid **oidpp; + + j = l->ls_length; + oidpp = (struct sysctl_oid **) l->ls_items; + for (; j--; oidpp++) { + + if (!*oidpp) + continue; + + for (k=0; k<i; k++) + printf(" "); + + printf("%d %s ", (*oidpp)->oid_number, (*oidpp)->oid_name); + + printf("%c%c", + (*oidpp)->oid_kind & CTLFLAG_RD ? 'R':' ', + (*oidpp)->oid_kind & CTLFLAG_WR ? 'W':' '); + + if ((*oidpp)->oid_handler) + printf(" *Handler"); + + switch ((*oidpp)->oid_kind & CTLTYPE) { + case CTLTYPE_NODE: + printf(" Node\n"); + if (!(*oidpp)->oid_handler) { + sysctl_sysctl_debug_dump_node( + (*oidpp)->oid_arg1, i+2); + } + break; + case CTLTYPE_INT: printf(" Int\n"); break; + case CTLTYPE_STRING: printf(" String\n"); break; + case CTLTYPE_QUAD: printf(" Quad\n"); break; + case CTLTYPE_OPAQUE: printf(" Opaque/struct\n"); break; + default: printf("\n"); + } + } - /* NOTREACHED */ } -/* - * hardware related system variables. 
- */ -hw_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +static int +sysctl_sysctl_debug SYSCTL_HANDLER_ARGS { - extern char machine[], cpu_model[]; - - /* all sysctl names at this level are terminal */ - if (namelen != 1) - return (ENOTDIR); /* overloaded */ - - switch (name[0]) { - case HW_MACHINE: - return (sysctl_rdstring(oldp, oldlenp, newp, machine)); - case HW_MODEL: - return (sysctl_rdstring(oldp, oldlenp, newp, cpu_model)); - case HW_NCPU: - return (sysctl_rdint(oldp, oldlenp, newp, 1)); /* XXX */ - case HW_BYTEORDER: - return (sysctl_rdint(oldp, oldlenp, newp, BYTE_ORDER)); - case HW_PHYSMEM: - return (sysctl_rdint(oldp, oldlenp, newp, ctob(physmem))); - case HW_USERMEM: - return (sysctl_rdint(oldp, oldlenp, newp, - ctob(physmem - cnt.v_wire_count))); - case HW_PAGESIZE: - return (sysctl_rdint(oldp, oldlenp, newp, PAGE_SIZE)); - default: - return (EOPNOTSUPP); + sysctl_sysctl_debug_dump_node(&sysctl_, 0); + return ENOENT; +} + +SYSCTL_PROC(_sysctl, 0, debug, CTLTYPE_STRING|CTLFLAG_RD, + 0, 0, sysctl_sysctl_debug, "-", ""); + +static int +sysctl_sysctl_name SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1; + u_int namelen = arg2; + int i, j, error = 0; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + char buf[10]; + + while (namelen) { + if (!lsp) { + sprintf(buf,"%d",*name); + if (req->oldidx) + error = SYSCTL_OUT(req, ".", 1); + if (!error) + error = SYSCTL_OUT(req, buf, strlen(buf)); + if (error) + return (error); + namelen--; + name++; + continue; + } + oidpp = (struct sysctl_oid **) lsp->ls_items; + j = lsp->ls_length; + lsp = 0; + for (i = 0; i < j; i++, oidpp++) { + if (*oidpp && ((*oidpp)->oid_number != *name)) + continue; + + if (req->oldidx) + error = SYSCTL_OUT(req, ".", 1); + if (!error) + error = SYSCTL_OUT(req, (*oidpp)->oid_name, + strlen((*oidpp)->oid_name)); + if (error) + return (error); + + namelen--; 
+ name++; + + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + break; + + if ((*oidpp)->oid_handler) + break; + + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + break; + } } - /* NOTREACHED */ + return (SYSCTL_OUT(req, "", 1)); } -#ifdef DEBUG -/* - * Debugging related system variables. - */ -struct ctldebug debug0, debug1, debug2, debug3, debug4; -struct ctldebug debug5, debug6, debug7, debug8, debug9; -struct ctldebug debug10, debug11, debug12, debug13, debug14; -struct ctldebug debug15, debug16, debug17, debug18, debug19; -static struct ctldebug *debugvars[CTL_DEBUG_MAXID] = { - &debug0, &debug1, &debug2, &debug3, &debug4, - &debug5, &debug6, &debug7, &debug8, &debug9, - &debug10, &debug11, &debug12, &debug13, &debug14, - &debug15, &debug16, &debug17, &debug18, &debug19, -}; -int -debug_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +SYSCTL_NODE(_sysctl, 1, name, CTLFLAG_RD, sysctl_sysctl_name, ""); + +static int +sysctl_sysctl_next_ls (struct linker_set *lsp, int *name, u_int namelen, + int *next, int *len, int level, struct sysctl_oid **oidp) { - struct ctldebug *cdp; + int i, j; + struct sysctl_oid **oidpp; - /* all sysctl names at this level are name and field */ - if (namelen != 2) - return (ENOTDIR); /* overloaded */ - cdp = debugvars[name[0]]; - if (name[0] >= CTL_DEBUG_MAXID || cdp->debugname == 0) - return (EOPNOTSUPP); - switch (name[1]) { - case CTL_DEBUG_NAME: - return (sysctl_rdstring(oldp, oldlenp, newp, cdp->debugname)); - case CTL_DEBUG_VALUE: - return (sysctl_int(oldp, oldlenp, newp, newlen, cdp->debugvar)); - default: - return (EOPNOTSUPP); + oidpp = (struct sysctl_oid **) lsp->ls_items; + j = lsp->ls_length; + *len = level; + for (i = 0; i < j; i++, oidpp++) { + if (!*oidpp) + continue; + + *next = (*oidpp)->oid_number; + *oidp = *oidpp; + + if (!namelen) { + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + return 
0; + if ((*oidpp)->oid_handler) + /* We really should call the handler here...*/ + return 0; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + if (!sysctl_sysctl_next_ls (lsp, 0, 0, next+1, + len, level+1, oidp)) + return 0; + goto next; + } + + if ((*oidpp)->oid_number < *name) + continue; + + if ((*oidpp)->oid_number > *name) { + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + return 0; + if ((*oidpp)->oid_handler) + return 0; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + if (!sysctl_sysctl_next_ls (lsp, name+1, namelen-1, + next+1, len, level+1, oidp)) + return (0); + goto next; + } + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + continue; + + if ((*oidpp)->oid_handler) + continue; + + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + if (!sysctl_sysctl_next_ls (lsp, name+1, namelen-1, next+1, + len, level+1, oidp)) + return (0); + next: + namelen = 1; + *len = level; } - /* NOTREACHED */ + return 1; } -#endif /* DEBUG */ -/* - * Validate parameters and get old / set new parameters - * for an integer-valued sysctl function. 
- */ -sysctl_int(oldp, oldlenp, newp, newlen, valp) - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - int *valp; +static int +sysctl_sysctl_next SYSCTL_HANDLER_ARGS { - int error = 0; + int *name = (int *) arg1; + u_int namelen = arg2; + int i, j, error; + struct sysctl_oid *oid; + struct linker_set *lsp = &sysctl_; + int newoid[CTL_MAXNAME]; - if (oldp && *oldlenp < sizeof(int)) - return (ENOMEM); - if (newp && newlen != sizeof(int)) - return (EINVAL); - *oldlenp = sizeof(int); - if (oldp) - error = copyout(valp, oldp, sizeof(int)); - if (error == 0 && newp) - error = copyin(newp, valp, sizeof(int)); + i = sysctl_sysctl_next_ls (lsp, name, namelen, newoid, &j, 1, &oid); + if (i) + return ENOENT; + error = SYSCTL_OUT(req, newoid, j * sizeof (int)); return (error); } +SYSCTL_NODE(_sysctl, 2, next, CTLFLAG_RD, sysctl_sysctl_next, ""); + +static int +name2oid (char *name, int *oid, int *len, struct sysctl_oid **oidp) +{ + int i, j; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + char *p; + + if (!*name) + return ENOENT; + + p = name + strlen(name) - 1 ; + if (*p == '.') + *p = '\0'; + + *len = 0; + + for (p = name; *p && *p != '.'; p++) + ; + i = *p; + if (i == '.') + *p = '\0'; + + j = lsp->ls_length; + oidpp = (struct sysctl_oid **) lsp->ls_items; + + while (j-- && *len < CTL_MAXNAME) { + if (!*oidpp) + continue; + if (strcmp(name, (*oidpp)->oid_name)) { + oidpp++; + continue; + } + *oid++ = (*oidpp)->oid_number; + (*len)++; + + if (!i) { + if (oidp) + *oidp = *oidpp; + return (0); + } + + if (((*oidpp)->oid_kind & CTLTYPE) != CTLTYPE_NODE) + break; + + if ((*oidpp)->oid_handler) + break; + + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + j = lsp->ls_length; + oidpp = (struct sysctl_oid **)lsp->ls_items; + name = p+1; + for (p = name; *p && *p != '.'; p++) + ; + i = *p; + if (i == '.') + *p = '\0'; + } + return ENOENT; +} + +static int +sysctl_sysctl_name2oid SYSCTL_HANDLER_ARGS +{ + char *p; + int error, oid[CTL_MAXNAME], len; + 
struct sysctl_oid *op = 0; + + if (!req->newlen) + return ENOENT; + + p = malloc(req->newlen+1, M_SYSCTL, M_WAITOK); + + error = SYSCTL_IN(req, p, req->newlen); + if (error) { + free(p, M_SYSCTL); + return (error); + } + + p [req->newlen] = '\0'; + + error = name2oid(p, oid, &len, &op); + + free(p, M_SYSCTL); + + if (error) + return (error); + + error = SYSCTL_OUT(req, oid, len * sizeof *oid); + return (error); +} + +SYSCTL_PROC(_sysctl, 3, name2oid, CTLFLAG_RW|CTLFLAG_ANYBODY, 0, 0, + sysctl_sysctl_name2oid, "I", ""); + +static int +sysctl_sysctl_oidfmt SYSCTL_HANDLER_ARGS +{ + int *name = (int *) arg1, error; + u_int namelen = arg2; + int indx, j; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; + + j = lsp->ls_length; + oidpp = (struct sysctl_oid **) lsp->ls_items; + + indx = 0; + while (j-- && indx < CTL_MAXNAME) { + if (*oidpp && ((*oidpp)->oid_number == name[indx])) { + indx++; + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + if ((*oidpp)->oid_handler) + goto found; + if (indx == namelen) + goto found; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + j = lsp->ls_length; + oidpp = (struct sysctl_oid **)lsp->ls_items; + } else { + if (indx != namelen) + return EISDIR; + goto found; + } + } else { + oidpp++; + } + } + return ENOENT; +found: + if (!(*oidpp)->oid_fmt) + return ENOENT; + error = SYSCTL_OUT(req, + &(*oidpp)->oid_kind, sizeof((*oidpp)->oid_kind)); + if (!error) + error = SYSCTL_OUT(req, (*oidpp)->oid_fmt, + strlen((*oidpp)->oid_fmt)+1); + return (error); +} + + +SYSCTL_NODE(_sysctl, 4, oidfmt, CTLFLAG_RD, sysctl_sysctl_oidfmt, ""); + +/* + * Default "handler" functions. + */ + /* - * As above, but read-only. + * Handle an integer, signed or unsigned. + * Two cases: + * a variable: point arg1 at it. + * a constant: pass it in arg2. 
*/ -sysctl_rdint(oldp, oldlenp, newp, val) - void *oldp; - size_t *oldlenp; - void *newp; - int val; + +int +sysctl_handle_int SYSCTL_HANDLER_ARGS { int error = 0; - if (oldp && *oldlenp < sizeof(int)) - return (ENOMEM); - if (newp) - return (EPERM); - *oldlenp = sizeof(int); - if (oldp) - error = copyout((caddr_t)&val, oldp, sizeof(int)); + if (arg1) + error = SYSCTL_OUT(req, arg1, sizeof(int)); + else + error = SYSCTL_OUT(req, &arg2, sizeof(int)); + + if (error || !req->newptr) + return (error); + + if (!arg1) + error = EPERM; + else + error = SYSCTL_IN(req, arg1, sizeof(int)); return (error); } /* - * Validate parameters and get old / set new parameters - * for a string-valued sysctl function. + * Handle our generic '\0' terminated 'C' string. + * Two cases: + * a variable string: point arg1 at it, arg2 is max length. + * a constant string: point arg1 at it, arg2 is zero. */ -sysctl_string(oldp, oldlenp, newp, newlen, str, maxlen) - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - char *str; - int maxlen; + +int +sysctl_handle_string SYSCTL_HANDLER_ARGS { - int len, error = 0; + int error=0; - len = strlen(str) + 1; - if (oldp && *oldlenp < len) - return (ENOMEM); - if (newp && newlen >= maxlen) - return (EINVAL); - if (oldp) { - *oldlenp = len; - error = copyout(str, oldp, len); - } - if (error == 0 && newp) { - error = copyin(newp, str, newlen); - str[newlen] = 0; + error = SYSCTL_OUT(req, arg1, strlen((char *)arg1)+1); + + if (error || !req->newptr || !arg2) + return (error); + + if ((req->newlen - req->newidx) > arg2) { + error = E2BIG; + } else { + arg2 = (req->newlen - req->newidx); + error = SYSCTL_IN(req, arg1, arg2); + ((char *)arg1)[arg2] = '\0'; } + return (error); } /* - * As above, but read-only. + * Handle any kind of opaque data. + * arg1 points to it, arg2 is the size. 
*/ -sysctl_rdstring(oldp, oldlenp, newp, str) - void *oldp; - size_t *oldlenp; - void *newp; - char *str; + +int +sysctl_handle_opaque SYSCTL_HANDLER_ARGS { - int len, error = 0; + int error; + + error = SYSCTL_OUT(req, arg1, arg2); + + if (error || !req->newptr) + return (error); + + error = SYSCTL_IN(req, arg1, arg2); - len = strlen(str) + 1; - if (oldp && *oldlenp < len) - return (ENOMEM); - if (newp) - return (EPERM); - *oldlenp = len; - if (oldp) - error = copyout(str, oldp, len); return (error); } /* - * Validate parameters and get old / set new parameters - * for a structure oriented sysctl function. + * Transfer functions to/from kernel space. + * XXX: rather untested at this point */ -sysctl_struct(oldp, oldlenp, newp, newlen, sp, len) - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - void *sp; - int len; +static int +sysctl_old_kernel(struct sysctl_req *req, const void *p, int l) { - int error = 0; + int i = 0; - if (oldp && *oldlenp < len) + if (req->oldptr) { + i = min(req->oldlen - req->oldidx, l); + if (i > 0) + bcopy(p, (char *)req->oldptr + req->oldidx, i); + } + req->oldidx += l; + if (req->oldptr && i != l) return (ENOMEM); - if (newp && newlen > len) + return (0); +} + +static int +sysctl_new_kernel(struct sysctl_req *req, void *p, int l) +{ + if (!req->newptr) + return 0; + if (req->newlen - req->newidx < l) return (EINVAL); - if (oldp) { - *oldlenp = len; - error = copyout(sp, oldp, len); + bcopy((char *)req->newptr + req->newidx, p, l); + req->newidx += l; + return (0); +} + +int +kernel_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, void *new, size_t newlen, int *retval) +{ + int error = 0; + struct sysctl_req req; + + bzero(&req, sizeof req); + + req.p = p; + + if (oldlenp) { + req.oldlen = *oldlenp; + } + + if (old) { + req.oldptr= old; + } + + if (newlen) { + req.newlen = newlen; + req.newptr = new; + } + + req.oldfunc = sysctl_old_kernel; + req.newfunc = sysctl_new_kernel; + req.lock = 1; + + 
/* XXX this should probably be done in a general way */ + while (memlock.sl_lock) { + memlock.sl_want = 1; + (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0); + memlock.sl_locked++; + } + memlock.sl_lock = 1; + + error = sysctl_root(0, name, namelen, &req); + + if (req.lock == 2) + vsunlock(req.oldptr, req.oldlen, B_WRITE); + + memlock.sl_lock = 0; + + if (memlock.sl_want) { + memlock.sl_want = 0; + wakeup((caddr_t)&memlock); + } + + if (error && error != ENOMEM) + return (error); + + if (retval) { + if (req.oldptr && req.oldidx > req.oldlen) + *retval = req.oldlen; + else + *retval = req.oldidx; } - if (error == 0 && newp) - error = copyin(newp, sp, len); return (error); } /* - * Validate parameters and get old parameters - * for a structure oriented sysctl function. + * Transfer function to/from user space. */ -sysctl_rdstruct(oldp, oldlenp, newp, sp, len) - void *oldp; - size_t *oldlenp; - void *newp, *sp; - int len; +static int +sysctl_old_user(struct sysctl_req *req, const void *p, int l) { - int error = 0; + int error = 0, i = 0; - if (oldp && *oldlenp < len) + if (req->lock == 1 && req->oldptr) { + vslock(req->oldptr, req->oldlen); + req->lock = 2; + } + if (req->oldptr) { + i = min(req->oldlen - req->oldidx, l); + if (i > 0) + error = copyout(p, (char *)req->oldptr + req->oldidx, + i); + } + req->oldidx += l; + if (error) + return (error); + if (req->oldptr && i < l) return (ENOMEM); - if (newp) - return (EPERM); - *oldlenp = len; - if (oldp) - error = copyout(sp, oldp, len); + return (0); +} + +static int +sysctl_new_user(struct sysctl_req *req, void *p, int l) +{ + int error; + + if (!req->newptr) + return 0; + if (req->newlen - req->newidx < l) + return (EINVAL); + error = copyin((char *)req->newptr + req->newidx, p, l); + req->newidx += l; return (error); } /* - * Get file structures. + * Traverse our tree, and find the right node, execute whatever it points + * at, and return the resulting error code. 
*/ -sysctl_file(where, sizep) - char *where; - size_t *sizep; + +int +sysctl_root SYSCTL_HANDLER_ARGS { - int buflen, error; - struct file *fp; - char *start = where; + int *name = (int *) arg1; + u_int namelen = arg2; + int indx, i, j; + struct sysctl_oid **oidpp; + struct linker_set *lsp = &sysctl_; - buflen = *sizep; - if (where == NULL) { - /* - * overestimate by 10 files - */ - *sizep = sizeof(filehead) + (nfiles + 10) * sizeof(struct file); - return (0); - } + j = lsp->ls_length; + oidpp = (struct sysctl_oid **) lsp->ls_items; - /* - * first copyout filehead - */ - if (buflen < sizeof(filehead)) { - *sizep = 0; - return (0); - } - if (error = copyout((caddr_t)&filehead, where, sizeof(filehead))) - return (error); - buflen -= sizeof(filehead); - where += sizeof(filehead); - - /* - * followed by an array of file structures - */ - for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) { - if (buflen < sizeof(struct file)) { - *sizep = where - start; - return (ENOMEM); + indx = 0; + while (j-- && indx < CTL_MAXNAME) { + if (*oidpp && ((*oidpp)->oid_number == name[indx])) { + indx++; + if ((*oidpp)->oid_kind & CTLFLAG_NOLOCK) + req->lock = 0; + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + if ((*oidpp)->oid_handler) + goto found; + if (indx == namelen) + return ENOENT; + lsp = (struct linker_set*)(*oidpp)->oid_arg1; + j = lsp->ls_length; + oidpp = (struct sysctl_oid **)lsp->ls_items; + } else { + if (indx != namelen) + return EISDIR; + goto found; + } + } else { + oidpp++; } - if (error = copyout((caddr_t)fp, where, sizeof (struct file))) - return (error); - buflen -= sizeof(struct file); - where += sizeof(struct file); } - *sizep = where - start; - return (0); + return ENOENT; +found: + /* If writing isn't allowed */ + if (req->newptr && !((*oidpp)->oid_kind & CTLFLAG_WR)) + return (EPERM); + + /* Most likely only root can write */ + if (!((*oidpp)->oid_kind & CTLFLAG_ANYBODY) && + req->newptr && req->p && + (i = suser(req->p->p_ucred, 
&req->p->p_acflag))) + return (i); + + if (!(*oidpp)->oid_handler) + return EINVAL; + + if (((*oidpp)->oid_kind & CTLTYPE) == CTLTYPE_NODE) { + i = ((*oidpp)->oid_handler) (*oidpp, + name + indx, namelen - indx, + req); + } else { + i = ((*oidpp)->oid_handler) (*oidpp, + (*oidpp)->oid_arg1, (*oidpp)->oid_arg2, + req); + } + return (i); } -/* - * try over estimating by 5 procs - */ -#define KERN_PROCSLOP (5 * sizeof (struct kinfo_proc)) +#ifndef _SYS_SYSPROTO_H_ +struct sysctl_args { + int *name; + u_int namelen; + void *old; + size_t *oldlenp; + void *new; + size_t newlen; +}; +#endif -sysctl_doproc(name, namelen, where, sizep) - int *name; - u_int namelen; - char *where; - size_t *sizep; +int +__sysctl(struct proc *p, struct sysctl_args *uap, int *retval) { - register struct proc *p; - register struct kinfo_proc *dp = (struct kinfo_proc *)where; - register int needed = 0; - int buflen = where != NULL ? *sizep : 0; - int doingzomb; - struct eproc eproc; - int error = 0; + int error, i, j, name[CTL_MAXNAME]; - if (namelen != 2 && !(namelen == 1 && name[0] == KERN_PROC_ALL)) + if (uap->namelen > CTL_MAXNAME || uap->namelen < 2) return (EINVAL); - p = allproc.lh_first; - doingzomb = 0; -again: - for (; p != 0; p = p->p_list.le_next) { - /* - * Skip embryonic processes. - */ - if (p->p_stat == SIDL) - continue; - /* - * TODO - make more efficient (see notes below). - * do by session. 
- */ - switch (name[0]) { - case KERN_PROC_PID: - /* could do this with just a lookup */ - if (p->p_pid != (pid_t)name[1]) - continue; - break; + error = copyin(uap->name, &name, uap->namelen * sizeof(int)); + if (error) + return (error); - case KERN_PROC_PGRP: - /* could do this by traversing pgrp */ - if (p->p_pgrp->pg_id != (pid_t)name[1]) - continue; - break; + error = userland_sysctl(p, name, uap->namelen, + uap->old, uap->oldlenp, 0, + uap->new, uap->newlen, &j); + if (error && error != ENOMEM) + return (error); + if (uap->oldlenp) { + i = copyout(&j, uap->oldlenp, sizeof(j)); + if (i) + return (i); + } + return (error); +} - case KERN_PROC_TTY: - if ((p->p_flag & P_CONTROLT) == 0 || - p->p_session->s_ttyp == NULL || - p->p_session->s_ttyp->t_dev != (dev_t)name[1]) - continue; - break; +/* + * This is used from various compatibility syscalls too. That's why name + * must be in kernel space. + */ +int +userland_sysctl(struct proc *p, int *name, u_int namelen, void *old, size_t *oldlenp, int inkernel, void *new, size_t newlen, int *retval) +{ + int error = 0; + struct sysctl_req req, req2; - case KERN_PROC_UID: - if (p->p_ucred->cr_uid != (uid_t)name[1]) - continue; - break; + bzero(&req, sizeof req); - case KERN_PROC_RUID: - if (p->p_cred->p_ruid != (uid_t)name[1]) - continue; - break; - } - if (buflen >= sizeof(struct kinfo_proc)) { - fill_eproc(p, &eproc); - if (error = copyout((caddr_t)p, &dp->kp_proc, - sizeof(struct proc))) - return (error); - if (error = copyout((caddr_t)&eproc, &dp->kp_eproc, - sizeof(eproc))) + req.p = p; + + if (oldlenp) { + if (inkernel) { + req.oldlen = *oldlenp; + } else { + error = copyin(oldlenp, &req.oldlen, sizeof(*oldlenp)); + if (error) return (error); - dp++; - buflen -= sizeof(struct kinfo_proc); } - needed += sizeof(struct kinfo_proc); } - if (doingzomb == 0) { - p = zombproc.lh_first; - doingzomb++; - goto again; + + if (old) { + if (!useracc(old, req.oldlen, B_WRITE)) + return (EFAULT); + req.oldptr= old; + } + + if 
(newlen) { + if (!useracc(new, req.newlen, B_READ)) + return (EFAULT); + req.newlen = newlen; + req.newptr = new; } - if (where != NULL) { - *sizep = (caddr_t)dp - where; - if (needed > *sizep) - return (ENOMEM); - } else { - needed += KERN_PROCSLOP; - *sizep = needed; + + req.oldfunc = sysctl_old_user; + req.newfunc = sysctl_new_user; + req.lock = 1; + + /* XXX this should probably be done in a general way */ + while (memlock.sl_lock) { + memlock.sl_want = 1; + (void) tsleep((caddr_t)&memlock, PRIBIO+1, "sysctl", 0); + memlock.sl_locked++; } - return (0); -} + memlock.sl_lock = 1; -/* - * Fill in an eproc structure for the specified process. - */ -void -fill_eproc(p, ep) - register struct proc *p; - register struct eproc *ep; -{ - register struct tty *tp; - - ep->e_paddr = p; - ep->e_sess = p->p_pgrp->pg_session; - ep->e_pcred = *p->p_cred; - ep->e_ucred = *p->p_ucred; - if (p->p_stat == SIDL || p->p_stat == SZOMB) { - ep->e_vm.vm_rssize = 0; - ep->e_vm.vm_tsize = 0; - ep->e_vm.vm_dsize = 0; - ep->e_vm.vm_ssize = 0; -#ifndef sparc - /* ep->e_vm.vm_pmap = XXX; */ -#endif - } else { - register struct vmspace *vm = p->p_vmspace; + do { + req2 = req; + error = sysctl_root(0, name, namelen, &req2); + } while (error == EAGAIN); -#ifdef pmap_resident_count - ep->e_vm.vm_rssize = pmap_resident_count(&vm->vm_pmap); /*XXX*/ -#else - ep->e_vm.vm_rssize = vm->vm_rssize; -#endif - ep->e_vm.vm_tsize = vm->vm_tsize; - ep->e_vm.vm_dsize = vm->vm_dsize; - ep->e_vm.vm_ssize = vm->vm_ssize; -#ifndef sparc - ep->e_vm.vm_pmap = vm->vm_pmap; -#endif + req = req2; + if (req.lock == 2) + vsunlock(req.oldptr, req.oldlen, B_WRITE); + + memlock.sl_lock = 0; + + if (memlock.sl_want) { + memlock.sl_want = 0; + wakeup((caddr_t)&memlock); } - if (p->p_pptr) - ep->e_ppid = p->p_pptr->p_pid; - else - ep->e_ppid = 0; - ep->e_pgid = p->p_pgrp->pg_id; - ep->e_jobc = p->p_pgrp->pg_jobc; - if ((p->p_flag & P_CONTROLT) && - (tp = ep->e_sess->s_ttyp)) { - ep->e_tdev = tp->t_dev; - ep->e_tpgid = 
tp->t_pgrp ? tp->t_pgrp->pg_id : NO_PID; - ep->e_tsess = tp->t_session; - } else - ep->e_tdev = NODEV; - ep->e_flag = ep->e_sess->s_ttyvp ? EPROC_CTTY : 0; - if (SESS_LEADER(p)) - ep->e_flag |= EPROC_SLEADER; - if (p->p_wmesg) - strncpy(ep->e_wmesg, p->p_wmesg, WMESGLEN); - ep->e_xsize = ep->e_xrssize = 0; - ep->e_xccount = ep->e_xswrss = 0; + + if (error && error != ENOMEM) + return (error); + + if (retval) { + if (req.oldptr && req.oldidx > req.oldlen) + *retval = req.oldlen; + else + *retval = req.oldidx; + } + return (error); } #ifdef COMPAT_43 #include <sys/socket.h> +#include <vm/vm_param.h> + #define KINFO_PROC (0<<8) #define KINFO_RT (1<<8) #define KINFO_VNODE (2<<8) @@ -712,81 +880,197 @@ fill_eproc(p, ep) #define KINFO_LOADAVG (5<<8) #define KINFO_CLOCKRATE (6<<8) -compat_43_getkerninfo(p, uap, retval) - struct proc *p; - register struct compat_43_getkerninfo_args /* { - syscallarg(int) op; - syscallarg(char *) where; - syscallarg(int *) size; - syscallarg(int) arg; - } */ *uap; - register_t *retval; -{ - int error, name[5]; - size_t size; +/* Non-standard BSDI extension - only present on their 4.3 net-2 releases */ +#define KINFO_BSDI_SYSINFO (101<<8) - if (SCARG(uap, size) && (error = copyin((caddr_t)SCARG(uap, size), - (caddr_t)&size, sizeof(size)))) - return (error); +/* + * XXX this is bloat, but I hope it's better here than on the potentially + * limited kernel stack... 
-Peter + */ + +static struct { + int bsdi_machine; /* "i386" on BSD/386 */ +/* ^^^ this is an offset to the string, relative to the struct start */ + char *pad0; + long pad1; + long pad2; + long pad3; + u_long pad4; + u_long pad5; + u_long pad6; + + int bsdi_ostype; /* "BSD/386" on BSD/386 */ + int bsdi_osrelease; /* "1.1" on BSD/386 */ + long pad7; + long pad8; + char *pad9; + + long pad10; + long pad11; + int pad12; + long pad13; + quad_t pad14; + long pad15; + + struct timeval pad16; + /* we dont set this, because BSDI's uname used gethostname() instead */ + int bsdi_hostname; /* hostname on BSD/386 */ - switch (SCARG(uap, op) & 0xff00) { + /* the actual string data is appended here */ + +} bsdi_si; +/* + * this data is appended to the end of the bsdi_si structure during copyout. + * The "char *" offsets are relative to the base of the bsdi_si struct. + * This contains "FreeBSD\02.0-BUILT-nnnnnn\0i386\0", and these strings + * should not exceed the length of the buffer here... (or else!! :-) + */ +static char bsdi_strings[80]; /* It had better be less than this! 
*/ + +#ifndef _SYS_SYSPROTO_H_ +struct getkerninfo_args { + int op; + char *where; + int *size; + int arg; +}; +#endif + +int +ogetkerninfo(struct proc *p, struct getkerninfo_args *uap, int *retval) +{ + int error, name[6]; + u_int size; + + switch (uap->op & 0xff00) { case KINFO_RT: - name[0] = PF_ROUTE; - name[1] = 0; - name[2] = (SCARG(uap, op) & 0xff0000) >> 16; - name[3] = SCARG(uap, op) & 0xff; - name[4] = SCARG(uap, arg); - error = - net_sysctl(name, 5, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_NET; + name[1] = PF_ROUTE; + name[2] = 0; + name[3] = (uap->op & 0xff0000) >> 16; + name[4] = uap->op & 0xff; + name[5] = uap->arg; + error = userland_sysctl(p, name, 6, uap->where, uap->size, + 0, 0, 0, &size); break; case KINFO_VNODE: - name[0] = KERN_VNODE; - error = - kern_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_KERN; + name[1] = KERN_VNODE; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); break; case KINFO_PROC: - name[0] = KERN_PROC; - name[1] = SCARG(uap, op) & 0xff; - name[2] = SCARG(uap, arg); - error = - kern_sysctl(name, 3, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_KERN; + name[1] = KERN_PROC; + name[2] = uap->op & 0xff; + name[3] = uap->arg; + error = userland_sysctl(p, name, 4, uap->where, uap->size, + 0, 0, 0, &size); break; case KINFO_FILE: - name[0] = KERN_FILE; - error = - kern_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_KERN; + name[1] = KERN_FILE; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); break; case KINFO_METER: - name[0] = VM_METER; - error = - vm_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_VM; + name[1] = VM_METER; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); break; case KINFO_LOADAVG: - name[0] = VM_LOADAVG; - error = - vm_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_VM; + name[1] = VM_LOADAVG; + error = 
userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); break; case KINFO_CLOCKRATE: - name[0] = KERN_CLOCKRATE; - error = - kern_sysctl(name, 1, SCARG(uap, where), &size, NULL, 0, p); + name[0] = CTL_KERN; + name[1] = KERN_CLOCKRATE; + error = userland_sysctl(p, name, 2, uap->where, uap->size, + 0, 0, 0, &size); break; + case KINFO_BSDI_SYSINFO: { + /* + * this is pretty crude, but it's just enough for uname() + * from BSDI's 1.x libc to work. + * + * In particular, it doesn't return the same results when + * the supplied buffer is too small. BSDI's version apparently + * will return the amount copied, and set the *size to how + * much was needed. The emulation framework here isn't capable + * of that, so we just set both to the amount copied. + * BSDI's 2.x product apparently fails with ENOMEM in this + * scenario. + */ + + u_int needed; + u_int left; + char *s; + + bzero((char *)&bsdi_si, sizeof(bsdi_si)); + bzero(bsdi_strings, sizeof(bsdi_strings)); + + s = bsdi_strings; + + bsdi_si.bsdi_ostype = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, ostype); + s += strlen(s) + 1; + + bsdi_si.bsdi_osrelease = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, osrelease); + s += strlen(s) + 1; + + bsdi_si.bsdi_machine = (s - bsdi_strings) + sizeof(bsdi_si); + strcpy(s, machine); + s += strlen(s) + 1; + + needed = sizeof(bsdi_si) + (s - bsdi_strings); + + if (uap->where == NULL) { + /* process is asking how much buffer to supply.. */ + size = needed; + error = 0; + break; + } + + + /* if too much buffer supplied, trim it down */ + if (size > needed) + size = needed; + + /* how much of the buffer is remaining */ + left = size; + + if ((error = copyout((char *)&bsdi_si, uap->where, left)) != 0) + break; + + /* is there any point in continuing? 
*/ + if (left > sizeof(bsdi_si)) { + left -= sizeof(bsdi_si); + error = copyout(&bsdi_strings, + uap->where + sizeof(bsdi_si), left); + } + break; + } + default: return (EOPNOTSUPP); } if (error) return (error); *retval = size; - if (SCARG(uap, size)) - error = copyout((caddr_t)&size, (caddr_t)SCARG(uap, size), + if (uap->size) + error = copyout((caddr_t)&size, (caddr_t)uap->size, sizeof(size)); return (error); } diff --git a/sys/kern/kern_tc.c b/sys/kern/kern_tc.c new file mode 100644 index 0000000..171ed0e --- /dev/null +++ b/sys/kern/kern_tc.c @@ -0,0 +1,1303 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_clock.c,v 1.34 1997/03/22 16:52:19 mpp Exp $ + */ + +/* Portions of this software are covered by the following: */ +/****************************************************************************** + * * + * Copyright (c) David L. Mills 1993, 1994 * + * * + * Permission to use, copy, modify, and distribute this software and its * + * documentation for any purpose and without fee is hereby granted, provided * + * that the above copyright notice appears in all copies and that both the * + * copyright notice and this permission notice appear in supporting * + * documentation, and that the name University of Delaware not be used in * + * advertising or publicity pertaining to distribution of the software * + * without specific, written prior permission. The University of Delaware * + * makes no representations about the suitability this software for any * + * purpose. It is provided "as is" without express or implied warranty. 
* + * * + *****************************************************************************/ + +#include "opt_cpu.h" /* XXX */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/dkstat.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/timex.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/sysctl.h> + +#include <machine/cpu.h> +#define CLOCK_HAIR /* XXX */ +#include <machine/clock.h> + +#ifdef GPROF +#include <sys/gmon.h> +#endif + +static void initclocks __P((void *dummy)); +SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) + +/* Exported to machdep.c. */ +struct callout *callfree, *callout; + +static struct callout calltodo; + +/* Some of these don't belong here, but it's easiest to concentrate them. */ +static long cp_time[CPUSTATES]; +long dk_seek[DK_NDRIVE]; +static long dk_time[DK_NDRIVE]; +long dk_wds[DK_NDRIVE]; +long dk_wpms[DK_NDRIVE]; +long dk_xfer[DK_NDRIVE]; + +int dk_busy; +int dk_ndrive = 0; +char dk_names[DK_NDRIVE][DK_NAMELEN]; + +long tk_cancc; +long tk_nin; +long tk_nout; +long tk_rawcc; + +/* + * Clock handling routines. + * + * This code is written to operate with two timers that run independently of + * each other. The main clock, running hz times per second, is used to keep + * track of real time. The second timer handles kernel and user profiling, + * and does resource use estimation. If the second timer is programmable, + * it is randomized to avoid aliasing between the two clocks. For example, + * the randomization prevents an adversary from always giving up the cpu + * just before its quantum expires. Otherwise, it would never accumulate + * cpu ticks. The mean frequency of the second timer is stathz. 
 *
 * If no second timer exists, stathz will be zero; in this case we drive
 * profiling and statistics off the main clock.  This WILL NOT be accurate;
 * do not do it unless absolutely necessary.
 *
 * The statistics clock may (or may not) be run at a higher rate while
 * profiling.  This profile clock runs at profhz.  We require that profhz
 * be an integral multiple of stathz.
 *
 * If the statistics clock is running fast, it must be divided by the ratio
 * profhz/stathz for statistics.  (For profiling, every tick counts.)
 */

/*
 * TODO:
 *	allocate more timeout table slots when table overflows.
 */

/*
 * Bump a timeval by a small number of usec's.
 *
 * NOTE(review): the update is not atomic with respect to readers; callers
 * (hardclock) run at clock interrupt priority -- confirm all readers of the
 * bumped timeval mask the clock interrupt while sampling.
 */
#define BUMPTIME(t, usec) { \
	register volatile struct timeval *tp = (t); \
	register long us; \
 \
	tp->tv_usec = us = tp->tv_usec + (usec); \
	if (us >= 1000000) { \
		tp->tv_usec = us - 1000000; \
		tp->tv_sec++; \
	} \
}

int	stathz;			/* statistics clock rate (Hz); 0 if none */
int	profhz;			/* profiling clock rate (Hz) */
static int profprocs;		/* number of processes using the prof clock */
int	ticks;			/* hardclock() ticks elapsed since boot */
static int psdiv, pscnt;	/* prof => stat divider */
int	psratio;		/* ratio: prof / stat */

volatile struct	timeval time;		/* wall-clock time of day */
volatile struct	timeval mono_time;	/* monotonic time (never stepped back) */

/*
 * Phase/frequency-lock loop (PLL/FLL) definitions
 *
 * The following variables are read and set by the ntp_adjtime() system
 * call.
 *
 * time_state shows the state of the system clock, with values defined
 * in the timex.h header file.
 *
 * time_status shows the status of the system clock, with bits defined
 * in the timex.h header file.
 *
 * time_offset is used by the PLL/FLL to adjust the system time in small
 * increments.
 *
 * time_constant determines the bandwidth or "stiffness" of the PLL.
 *
 * time_tolerance determines maximum frequency error or tolerance of the
 * CPU clock oscillator and is a property of the architecture; however,
 * in principle it could change as result of the presence of external
 * discipline signals, for instance.
 *
 * time_precision is usually equal to the kernel tick variable; however,
 * in cases where a precision clock counter or external clock is
 * available, the resolution can be much less than this and depend on
 * whether the external clock is working or not.
 *
 * time_maxerror is initialized by a ntp_adjtime() call and increased by
 * the kernel once each second to reflect the maximum error bound
 * growth.
 *
 * time_esterror is set and read by the ntp_adjtime() call, but
 * otherwise not used by the kernel.
 */
int time_status = STA_UNSYNC;	/* clock status bits */
int time_state = TIME_OK;	/* clock state */
long time_offset = 0;		/* time offset (us) */
long time_constant = 0;		/* pll time constant */
long time_tolerance = MAXFREQ;	/* frequency tolerance (scaled ppm) */
long time_precision = 1;	/* clock precision (us) */
long time_maxerror = MAXPHASE;	/* maximum error (us) */
long time_esterror = MAXPHASE;	/* estimated error (us) */

/*
 * The following variables establish the state of the PLL/FLL and the
 * residual time and frequency offset of the local clock. The scale
 * factors are defined in the timex.h header file.
 *
 * time_phase and time_freq are the phase increment and the frequency
 * increment, respectively, of the kernel time variable at each tick of
 * the clock.
 *
 * time_freq is set via ntp_adjtime() from a value stored in a file when
 * the synchronization daemon is first started. Its value is retrieved
 * via ntp_adjtime() and written to the file about once per hour by the
 * daemon.
 *
 * time_adj is the adjustment added to the value of tick at each timer
 * interrupt and is recomputed from time_phase and time_freq at each
 * seconds rollover.
 *
 * time_reftime is the second's portion of the system time on the last
 * call to ntp_adjtime(). It is used to adjust the time_freq variable
 * and to increase the time_maxerror as the time since last update
 * increases.
 */
static long time_phase = 0;	/* phase offset (scaled us) */
long time_freq = 0;		/* frequency offset (scaled ppm) */
static long time_adj = 0;	/* tick adjust (scaled 1 / hz) */
static long time_reftime = 0;	/* time at last adjustment (s) */

#ifdef PPS_SYNC
/*
 * The following variables are used only if the kernel PPS discipline
 * code is configured (PPS_SYNC). The scale factors are defined in the
 * timex.h header file.
 *
 * pps_time contains the time at each calibration interval, as read by
 * microtime(). pps_count counts the seconds of the calibration
 * interval, the duration of which is nominally pps_shift in powers of
 * two.
 *
 * pps_offset is the time offset produced by the time median filter
 * pps_tf[], while pps_jitter is the dispersion (jitter) measured by
 * this filter.
 *
 * pps_freq is the frequency offset produced by the frequency median
 * filter pps_ff[], while pps_stabil is the dispersion (wander) measured
 * by this filter.
 *
 * pps_usec is latched from a high resolution counter or external clock
 * at pps_time. Here we want the hardware counter contents only, not the
 * contents plus the time_tv.usec as usual.
 *
 * pps_valid counts the number of seconds since the last PPS update. It
 * is used as a watchdog timer to disable the PPS discipline should the
 * PPS signal be lost.
 *
 * pps_glitch counts the number of seconds since the beginning of an
 * offset burst more than tick/2 from current nominal offset. It is used
 * mainly to suppress error bursts due to priority conflicts between the
 * PPS interrupt and timer interrupt.
 *
 * pps_intcnt counts the calibration intervals for use in the interval-
 * adaptation algorithm. It's just too complicated for words.
 */
struct timeval pps_time;	/* kernel time at last interval */
long pps_offset = 0;		/* pps time offset (us) */
long pps_jitter = MAXTIME;	/* pps time dispersion (jitter) (us) */
long pps_tf[] = {0, 0, 0};	/* pps time offset median filter (us) */
long pps_freq = 0;		/* frequency offset (scaled ppm) */
long pps_stabil = MAXFREQ;	/* frequency dispersion (scaled ppm) */
long pps_ff[] = {0, 0, 0};	/* frequency offset median filter */
long pps_usec = 0;		/* microsec counter at last interval */
long pps_valid = PPS_VALID;	/* pps signal watchdog counter */
int pps_glitch = 0;		/* pps signal glitch counter */
int pps_count = 0;		/* calibration interval counter (s) */
int pps_shift = PPS_SHIFT;	/* interval duration (s) (shift) */
int pps_intcnt = 0;		/* intervals at current duration */

/*
 * PPS signal quality monitors
 *
 * pps_jitcnt counts the seconds that have been discarded because the
 * jitter measured by the time median filter exceeds the limit MAXTIME
 * (100 us).
 *
 * pps_calcnt counts the frequency calibration intervals, which are
 * variable from 4 s to 256 s.
 *
 * pps_errcnt counts the calibration intervals which have been discarded
 * because the wander exceeds the limit MAXFREQ (100 ppm) or where the
 * calibration interval jitter exceeds two ticks.
 *
 * pps_stbcnt counts the calibration intervals that have been discarded
 * because the frequency wander exceeds the limit MAXFREQ / 4 (25 us).
 */
long pps_jitcnt = 0;		/* jitter limit exceeded */
long pps_calcnt = 0;		/* calibration intervals */
long pps_errcnt = 0;		/* calibration errors */
long pps_stbcnt = 0;		/* stability limit exceeded */
#endif /* PPS_SYNC */

/* XXX none of this stuff works under FreeBSD */
#ifdef EXT_CLOCK
/*
 * External clock definitions
 *
 * The following definitions and declarations are used only if an
 * external clock (HIGHBALL or TPRO) is configured on the system.
 */
#define CLOCK_INTERVAL 30	/* CPU clock update interval (s) */

/*
 * The clock_count variable is set to CLOCK_INTERVAL at each PPS
 * interrupt and decremented once each second.
 */
int clock_count = 0;		/* CPU clock counter */

#ifdef HIGHBALL
/*
 * The clock_offset and clock_cpu variables are used by the HIGHBALL
 * interface. The clock_offset variable defines the offset between
 * system time and the HIGHBALL counters. The clock_cpu variable contains
 * the offset between the system clock and the HIGHBALL clock for use in
 * disciplining the kernel time variable.
 */
extern struct timeval clock_offset;	/* Highball clock offset */
long clock_cpu = 0;		/* CPU clock adjust */
#endif /* HIGHBALL */
#endif /* EXT_CLOCK */

/*
 * hardupdate() - local clock update
 *
 * This routine is called by ntp_adjtime() to update the local clock
 * phase and frequency. The implementation is of an adaptive-parameter,
 * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new
 * time and frequency offset estimates for each call. If the kernel PPS
 * discipline code is configured (PPS_SYNC), the PPS signal itself
 * determines the new time offset, instead of the calling argument.
 * Presumably, calls to ntp_adjtime() occur only when the caller
 * believes the local clock is valid within some bound (+-128 ms with
 * NTP). If the caller's time is far different than the PPS time, an
 * argument will ensue, and it's not clear who will lose.
 *
 * For uncompensated quartz crystal oscillators and nominal update
 * intervals less than 1024 s, operation should be in phase-lock mode
 * (STA_FLL = 0), where the loop is disciplined to phase. For update
 * intervals greater than this, operation should be in frequency-lock
 * mode (STA_FLL = 1), where the loop is disciplined to frequency.
 *
 * Note: splclock() is in effect.
 */
void
hardupdate(offset)
	long offset;
{
	long ltemp, mtemp;

	/* Nothing to do unless PLL or PPS-time discipline is enabled. */
	if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME))
		return;
	ltemp = offset;
#ifdef PPS_SYNC
	/* A live PPS signal overrides the caller-supplied offset. */
	if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL)
		ltemp = pps_offset;
#endif /* PPS_SYNC */

	/*
	 * Scale the phase adjustment and clamp to the operating range.
	 */
	if (ltemp > MAXPHASE)
		time_offset = MAXPHASE << SHIFT_UPDATE;
	else if (ltemp < -MAXPHASE)
		time_offset = -(MAXPHASE << SHIFT_UPDATE);
	else
		time_offset = ltemp << SHIFT_UPDATE;

	/*
	 * Select whether the frequency is to be controlled and in which
	 * mode (PLL or FLL). Clamp to the operating range. Ugly
	 * multiply/divide should be replaced someday.
	 */
	if (time_status & STA_FREQHOLD || time_reftime == 0)
		time_reftime = time.tv_sec;
	mtemp = time.tv_sec - time_reftime;	/* seconds since last update */
	time_reftime = time.tv_sec;
	if (time_status & STA_FLL) {
		if (mtemp >= MINSEC) {
			ltemp = ((time_offset / mtemp) << (SHIFT_USEC -
			    SHIFT_UPDATE));
			if (ltemp < 0)
				time_freq -= -ltemp >> SHIFT_KH;
			else
				time_freq += ltemp >> SHIFT_KH;
		}
	} else {
		if (mtemp < MAXSEC) {
			ltemp *= mtemp;
			if (ltemp < 0)
				time_freq -= -ltemp >> (time_constant +
				    time_constant + SHIFT_KF -
				    SHIFT_USEC);
			else
				time_freq += ltemp >> (time_constant +
				    time_constant + SHIFT_KF -
				    SHIFT_USEC);
		}
	}
	/* Clamp the frequency estimate to the oscillator tolerance. */
	if (time_freq > time_tolerance)
		time_freq = time_tolerance;
	else if (time_freq < -time_tolerance)
		time_freq = -time_tolerance;
}


/*
 * Initialize clock frequencies and start both clocks running.
 * Run once at boot via SYSINIT; the dummy argument is unused.
 */
/* ARGSUSED*/
static void
initclocks(dummy)
	void *dummy;
{
	register int i;

	/*
	 * Set divisors to 1 (normal case) and let the machine-specific
	 * code do its bit.
	 */
	psdiv = pscnt = 1;
	cpu_initclocks();

	/*
	 * Compute profhz/stathz, and fix profhz if needed.
	 */
	i = stathz ? stathz : hz;
	if (profhz == 0)
		profhz = i;
	psratio = profhz / i;
}

/*
 * The real-time timer, interrupting hz times per second.
 */
void
hardclock(frame)
	register struct clockframe *frame;
{
	register struct callout *p1;
	register struct proc *p;
	register int needsoft;

	/*
	 * Update real-time timeout queue.
	 * At front of queue are some number of events which are ``due''.
	 * The time to these is <= 0 and if negative represents the
	 * number of ticks which have passed since it was supposed to happen.
	 * The rest of the q elements (times > 0) are events yet to happen,
	 * where the time for each is given as a delta from the previous.
	 * Decrementing just the first of these serves to decrement the time
	 * to all events.
	 */
	needsoft = 0;
	for (p1 = calltodo.c_next; p1 != NULL; p1 = p1->c_next) {
		if (--p1->c_time > 0)
			break;
		needsoft = 1;
		if (p1->c_time == 0)
			break;
	}

	p = curproc;
	if (p) {
		register struct pstats *pstats;

		/*
		 * Run current process's virtual and profile time, as needed.
		 * ITIMER_VIRTUAL ticks only while in user mode; ITIMER_PROF
		 * ticks in both modes.
		 */
		pstats = p->p_stats;
		if (CLKF_USERMODE(frame) &&
		    timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0)
			psignal(p, SIGVTALRM);
		if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) &&
		    itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0)
			psignal(p, SIGPROF);
	}

	/*
	 * If no separate statistics clock is available, run it from here.
	 */
	if (stathz == 0)
		statclock(frame);

	/*
	 * Increment the time-of-day.
	 */
	ticks++;
	{
		int time_update;
		struct timeval newtime = time;
		long ltemp;

		/*
		 * timedelta/tickdelta are presumably maintained by adjtime()
		 * elsewhere (kern_time.c) -- NOTE(review): confirm.
		 */
		if (timedelta == 0) {
			time_update = CPU_THISTICKLEN(tick);
		} else {
			time_update = CPU_THISTICKLEN(tick) + tickdelta;
			timedelta -= tickdelta;
		}
		BUMPTIME(&mono_time, time_update);

		/*
		 * Compute the phase adjustment. If the low-order bits
		 * (time_phase) of the update overflow, bump the high-order bits
		 * (time_update).
		 */
		time_phase += time_adj;
		if (time_phase <= -FINEUSEC) {
			ltemp = -time_phase >> SHIFT_SCALE;
			time_phase += ltemp << SHIFT_SCALE;
			time_update -= ltemp;
		}
		else if (time_phase >= FINEUSEC) {
			ltemp = time_phase >> SHIFT_SCALE;
			time_phase -= ltemp << SHIFT_SCALE;
			time_update += ltemp;
		}

		newtime.tv_usec += time_update;
		/*
		 * On rollover of the second the phase adjustment to be used for
		 * the next second is calculated. Also, the maximum error is
		 * increased by the tolerance. If the PPS frequency discipline
		 * code is present, the phase is increased to compensate for the
		 * CPU clock oscillator frequency error.
		 *
		 * On a 32-bit machine and given parameters in the timex.h
		 * header file, the maximum phase adjustment is +-512 ms and
		 * maximum frequency offset is a tad less than) +-512 ppm. On a
		 * 64-bit machine, you shouldn't need to ask.
		 */
		if (newtime.tv_usec >= 1000000) {
			newtime.tv_usec -= 1000000;
			newtime.tv_sec++;
			time_maxerror += time_tolerance >> SHIFT_USEC;

			/*
			 * Compute the phase adjustment for the next second. In
			 * PLL mode, the offset is reduced by a fixed factor
			 * times the time constant. In FLL mode the offset is
			 * used directly. In either mode, the maximum phase
			 * adjustment for each second is clamped so as to spread
			 * the adjustment over not more than the number of
			 * seconds between updates.
			 */
			if (time_offset < 0) {
				ltemp = -time_offset;
				if (!(time_status & STA_FLL))
					ltemp >>= SHIFT_KG + time_constant;
				if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
					ltemp = (MAXPHASE / MINSEC) <<
					    SHIFT_UPDATE;
				time_offset += ltemp;
				time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ -
				    SHIFT_UPDATE);
			} else {
				ltemp = time_offset;
				if (!(time_status & STA_FLL))
					ltemp >>= SHIFT_KG + time_constant;
				if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE)
					ltemp = (MAXPHASE / MINSEC) <<
					    SHIFT_UPDATE;
				time_offset -= ltemp;
				time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ -
				    SHIFT_UPDATE);
			}

			/*
			 * Compute the frequency estimate and additional phase
			 * adjustment due to frequency error for the next
			 * second. When the PPS signal is engaged, gnaw on the
			 * watchdog counter and update the frequency computed by
			 * the pll and the PPS signal.
			 */
#ifdef PPS_SYNC
			pps_valid++;
			if (pps_valid == PPS_VALID) {
				/* PPS signal lost: reset the PPS discipline. */
				pps_jitter = MAXTIME;
				pps_stabil = MAXFREQ;
				time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER |
				    STA_PPSWANDER | STA_PPSERROR);
			}
			ltemp = time_freq + pps_freq;
#else
			ltemp = time_freq;
#endif /* PPS_SYNC */
			if (ltemp < 0)
				time_adj -= -ltemp >>
				    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);
			else
				time_adj += ltemp >>
				    (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE);

#if SHIFT_HZ == 7
			/*
			 * When the CPU clock oscillator frequency is not a
			 * power of two in Hz, the SHIFT_HZ is only an
			 * approximate scale factor. In the SunOS kernel, this
			 * results in a PLL gain factor of 1/1.28 = 0.78 what it
			 * should be. In the following code the overall gain is
			 * increased by a factor of 1.25, which results in a
			 * residual error less than 3 percent.
			 */
			/* Same thing applies for FreeBSD --GAW */
			if (hz == 100) {
				if (time_adj < 0)
					time_adj -= -time_adj >> 2;
				else
					time_adj += time_adj >> 2;
			}
#endif /* SHIFT_HZ */

			/* XXX - this is really bogus, but can't be fixed until
			   xntpd's idea of the system clock is fixed to know how
			   the user wants leap seconds handled; in the mean time,
			   we assume that users of NTP are running without proper
			   leap second support (this is now the default anyway) */
			/*
			 * Leap second processing. If in leap-insert state at
			 * the end of the day, the system clock is set back one
			 * second; if in leap-delete state, the system clock is
			 * set ahead one second. The microtime() routine or
			 * external clock driver will ensure that reported time
			 * is always monotonic. The ugly divides should be
			 * replaced.
			 */
			switch (time_state) {

			case TIME_OK:
				if (time_status & STA_INS)
					time_state = TIME_INS;
				else if (time_status & STA_DEL)
					time_state = TIME_DEL;
				break;

			case TIME_INS:
				if (newtime.tv_sec % 86400 == 0) {
					newtime.tv_sec--;
					time_state = TIME_OOP;
				}
				break;

			case TIME_DEL:
				if ((newtime.tv_sec + 1) % 86400 == 0) {
					newtime.tv_sec++;
					time_state = TIME_WAIT;
				}
				break;

			case TIME_OOP:
				time_state = TIME_WAIT;
				break;

			case TIME_WAIT:
				if (!(time_status & (STA_INS | STA_DEL)))
					time_state = TIME_OK;
			}
		}
		CPU_CLOCKUPDATE(&time, &newtime);
	}

	/*
	 * Process callouts at a very low cpu priority, so we don't keep the
	 * relatively high clock interrupt priority any longer than necessary.
	 */
	if (needsoft) {
		if (CLKF_BASEPRI(frame)) {
			/*
			 * Save the overhead of a software interrupt;
			 * it will happen as soon as we return, so do it now.
			 */
			(void)splsoftclock();
			softclock();
		} else
			setsoftclock();
	}
}

/*
 * Software (low priority) clock interrupt.
 * Run periodic events from timeout queue.
 */
/*ARGSUSED*/
void
softclock()
{
	register struct callout *c;
	register void *arg;
	register int s;
	register void (*func) __P((void *));

	/*
	 * Pop each expired entry (c_time <= 0) off the front of the queue,
	 * return it to the free list, and invoke its handler with interrupts
	 * unblocked (splx before the call, splhigh again after).
	 */
	s = splhigh();
	while ((c = calltodo.c_next) != NULL && c->c_time <= 0) {
		func = c->c_func;
		arg = c->c_arg;
		calltodo.c_next = c->c_next;
		c->c_next = callfree;
		callfree = c;
		splx(s);
		(*func)(arg);
		(void) splhigh();
	}
	splx(s);
}

/*
 * timeout --
 *	Execute a function after a specified length of time.
 *
 * untimeout --
 *	Cancel previous timeout function call.
 *
 *	See AT&T BCI Driver Reference Manual for specification.  This
 *	implementation differs from that one in that no identification
 *	value is returned from timeout, rather, the original arguments
 *	to timeout are used to identify entries for untimeout.
 */
void
timeout(ftn, arg, ticks)
	timeout_t ftn;
	void *arg;
	register int ticks;
{
	register struct callout *new, *p, *t;
	register int s;

	if (ticks <= 0)
		ticks = 1;

	/* Lock out the clock. */
	s = splhigh();

	/* Fill in the next free callout structure. */
	if (callfree == NULL)
		panic("timeout table full");
	new = callfree;
	callfree = new->c_next;
	new->c_arg = arg;
	new->c_func = ftn;

	/*
	 * The time for each event is stored as a difference from the time
	 * of the previous event on the queue.  Walk the queue, correcting
	 * the ticks argument for queue entries passed.  Correct the ticks
	 * value for the queue entry immediately after the insertion point
	 * as well.  Watch out for negative c_time values; these represent
	 * overdue events.
	 */
	for (p = &calltodo;
	    (t = p->c_next) != NULL && ticks > t->c_time; p = t)
		if (t->c_time > 0)
			ticks -= t->c_time;
	new->c_time = ticks;
	if (t != NULL)
		t->c_time -= ticks;

	/* Insert the new entry into the queue. */
	p->c_next = new;
	new->c_next = t;
	splx(s);
}

void
untimeout(ftn, arg)
	timeout_t ftn;
	void *arg;
{
	register struct callout *p, *t;
	register int s;

	/* Cancel only the first matching (ftn, arg) entry, if any. */
	s = splhigh();
	for (p = &calltodo; (t = p->c_next) != NULL; p = t)
		if (t->c_func == ftn && t->c_arg == arg) {
			/* Increment next entry's tick count. */
			if (t->c_next && t->c_time > 0)
				t->c_next->c_time += t->c_time;

			/* Move entry from callout queue to callfree queue. */
			p->c_next = t->c_next;
			t->c_next = callfree;
			callfree = t;
			break;
		}
	splx(s);
}

/*
 * Return a consistent snapshot of the current time-of-day, sampled at
 * splclock().  Resolution is one tick.
 */
void
gettime(struct timeval *tvp)
{
	int s;

	s = splclock();
	/* XXX should use microtime() iff tv_usec is used. */
	*tvp = time;
	splx(s);
}

/*
 * Compute number of hz until specified time.  Used to
 * compute third argument to timeout() from an absolute time.
 */
int
hzto(tv)
	struct timeval *tv;
{
	register unsigned long ticks;
	register long sec, usec;
	int s;

	/*
	 * If the number of usecs in the whole seconds part of the time
	 * difference fits in a long, then the total number of usecs will
	 * fit in an unsigned long.  Compute the total and convert it to
	 * ticks, rounding up and adding 1 to allow for the current tick
	 * to expire.  Rounding also depends on unsigned long arithmetic
	 * to avoid overflow.
	 *
	 * Otherwise, if the number of ticks in the whole seconds part of
	 * the time difference fits in a long, then convert the parts to
	 * ticks separately and add, using similar rounding methods and
	 * overflow avoidance.  This method would work in the previous
	 * case but it is slightly slower and assumes that hz is integral.
	 *
	 * Otherwise, round the time difference down to the maximum
	 * representable value.
	 *
	 * If ints have 32 bits, then the maximum value for any timeout in
	 * 10ms ticks is 248 days.
	 */
	s = splclock();
	sec = tv->tv_sec - time.tv_sec;
	usec = tv->tv_usec - time.tv_usec;
	splx(s);
	if (usec < 0) {
		sec--;
		usec += 1000000;
	}
	if (sec < 0) {
#ifdef DIAGNOSTIC
		printf("hzto: negative time difference %ld sec %ld usec\n",
		    sec, usec);
#endif
		ticks = 1;
	} else if (sec <= LONG_MAX / 1000000)
		ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1))
		    / tick + 1;
	else if (sec <= LONG_MAX / hz)
		ticks = sec * hz
		    + ((unsigned long)usec + (tick - 1)) / tick + 1;
	else
		ticks = LONG_MAX;
	/* Result is returned as int; clamp so the caller can't overflow. */
	if (ticks > INT_MAX)
		ticks = INT_MAX;
	return (ticks);
}

/*
 * Start profiling on a process.
 *
 * Kernel profiling passes proc0 which never exits and hence
 * keeps the profile clock running constantly.
 */
void
startprofclock(p)
	register struct proc *p;
{
	int s;

	if ((p->p_flag & P_PROFIL) == 0) {
		p->p_flag |= P_PROFIL;
		/* First profiled process: speed the stat clock up to profhz. */
		if (++profprocs == 1 && stathz != 0) {
			s = splstatclock();
			psdiv = pscnt = psratio;
			setstatclockrate(profhz);
			splx(s);
		}
	}
}

/*
 * Stop profiling on a process.
 */
void
stopprofclock(p)
	register struct proc *p;
{
	int s;

	if (p->p_flag & P_PROFIL) {
		p->p_flag &= ~P_PROFIL;
		/* Last profiled process: drop the stat clock back to stathz. */
		if (--profprocs == 0 && stathz != 0) {
			s = splstatclock();
			psdiv = pscnt = 1;
			setstatclockrate(stathz);
			splx(s);
		}
	}
}

/*
 * Statistics clock.  Grab profile sample, and if divider reaches 0,
 * do process and kernel statistics.
 */
void
statclock(frame)
	register struct clockframe *frame;
{
#ifdef GPROF
	register struct gmonparam *g;
#endif
	register struct proc *p;
	register int i;
	struct pstats *pstats;
	long rss;
	struct rusage *ru;
	struct vmspace *vm;

	if (CLKF_USERMODE(frame)) {
		p = curproc;
		if (p->p_flag & P_PROFIL)
			addupc_intr(p, CLKF_PC(frame), 1);
		if (--pscnt > 0)
			return;
		/*
		 * Came from user mode; CPU was in user state.
		 * If this process is being profiled record the tick.
		 */
		p->p_uticks++;
		if (p->p_nice > NZERO)
			cp_time[CP_NICE]++;
		else
			cp_time[CP_USER]++;
	} else {
#ifdef GPROF
		/*
		 * Kernel statistics are just like addupc_intr, only easier.
		 */
		g = &_gmonparam;
		if (g->state == GMON_PROF_ON) {
			i = CLKF_PC(frame) - g->lowpc;
			if (i < g->textsize) {
				i /= HISTFRACTION * sizeof(*g->kcount);
				g->kcount[i]++;
			}
		}
#endif
		if (--pscnt > 0)
			return;
		/*
		 * Came from kernel mode, so we were:
		 * - handling an interrupt,
		 * - doing syscall or trap work on behalf of the current
		 *   user process, or
		 * - spinning in the idle loop.
		 * Whichever it is, charge the time as appropriate.
		 * Note that we charge interrupts to the current process,
		 * regardless of whether they are ``for'' that process,
		 * so that we know how much of its real time was spent
		 * in ``non-process'' (i.e., interrupt) work.
		 */
		p = curproc;
		if (CLKF_INTR(frame)) {
			if (p != NULL)
				p->p_iticks++;
			cp_time[CP_INTR]++;
		} else if (p != NULL) {
			p->p_sticks++;
			cp_time[CP_SYS]++;
		} else
			cp_time[CP_IDLE]++;
	}
	pscnt = psdiv;

	/*
	 * We maintain statistics shown by user-level statistics
	 * programs:  the amount of time in each cpu state, and
	 * the amount of time each of DK_NDRIVE ``drives'' is busy.
	 *
	 * XXX	should either run linked list of drives, or (better)
	 *	grab timestamps in the start & done code.
	 */
	for (i = 0; i < DK_NDRIVE; i++)
		if (dk_busy & (1 << i))
			dk_time[i]++;

	/*
	 * We adjust the priority of the current process.  The priority of
	 * a process gets worse as it accumulates CPU time.  The cpu usage
	 * estimator (p_estcpu) is increased here.  The formula for computing
	 * priorities (in kern_synch.c) will compute a different value each
	 * time p_estcpu increases by 4.  The cpu usage estimator ramps up
	 * quite quickly when the process is running (linearly), and decays
	 * away exponentially, at a rate which is proportionally slower when
	 * the system is busy.  The basic principle is that the system will
	 * 90% forget that the process used a lot of CPU time in 5 * loadav
	 * seconds.  This causes the system to favor processes which haven't
	 * run much recently, and to round-robin among other processes.
	 */
	if (p != NULL) {
		p->p_cpticks++;
		/* ++p->p_estcpu == 0 means the counter wrapped; pin at max. */
		if (++p->p_estcpu == 0)
			p->p_estcpu--;
		if ((p->p_estcpu & 3) == 0) {
			resetpriority(p);
			if (p->p_priority >= PUSER)
				p->p_priority = p->p_usrpri;
		}

		/* Update resource usage integrals and maximums. */
		if ((pstats = p->p_stats) != NULL &&
		    (ru = &pstats->p_ru) != NULL &&
		    (vm = p->p_vmspace) != NULL) {
			ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024;
			ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024;
			ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024;
			rss = vm->vm_pmap.pm_stats.resident_count *
			    PAGE_SIZE / 1024;
			if (ru->ru_maxrss < rss)
				ru->ru_maxrss = rss;
		}
	}
}

/*
 * Return information about system clocks.
 */
static int
sysctl_kern_clockrate SYSCTL_HANDLER_ARGS
{
	struct clockinfo clkinfo;
	/*
	 * Construct clockinfo structure.
	 */
	clkinfo.hz = hz;
	clkinfo.tick = tick;
	clkinfo.profhz = profhz;
	clkinfo.stathz = stathz ? stathz : hz;
	return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req));
}

SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD,
	0, 0, sysctl_kern_clockrate, "S,clockinfo","");

#ifdef PPS_SYNC
/*
 * hardpps() - discipline CPU clock oscillator to external PPS signal
 *
 * This routine is called at each PPS interrupt in order to discipline
 * the CPU clock oscillator to the PPS signal. It measures the PPS phase
 * and leaves it in a handy spot for the hardclock() routine. It
 * integrates successive PPS phase differences and calculates the
 * frequency offset. This is used in hardclock() to discipline the CPU
 * clock oscillator so that intrinsic frequency error is cancelled out.
+ * The code requires the caller to capture the time and hardware counter + * value at the on-time PPS signal transition. + * + * Note that, on some Unix systems, this routine runs at an interrupt + * priority level higher than the timer interrupt routine hardclock(). + * Therefore, the variables used are distinct from the hardclock() + * variables, except for certain exceptions: The PPS frequency pps_freq + * and phase pps_offset variables are determined by this routine and + * updated atomically. The time_tolerance variable can be considered a + * constant, since it is infrequently changed, and then only when the + * PPS signal is disabled. The watchdog counter pps_valid is updated + * once per second by hardclock() and is atomically cleared in this + * routine. + */ +void +hardpps(tvp, usec) + struct timeval *tvp; /* time at PPS */ + long usec; /* hardware counter at PPS */ +{ + long u_usec, v_usec, bigtick; + long cal_sec, cal_usec; + + /* + * An occasional glitch can be produced when the PPS interrupt + * occurs in the hardclock() routine before the time variable is + * updated. Here the offset is discarded when the difference + * between it and the last one is greater than tick/2, but not + * if the interval since the first discard exceeds 30 s. + */ + time_status |= STA_PPSSIGNAL; + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + pps_valid = 0; + u_usec = -tvp->tv_usec; + if (u_usec < -500000) + u_usec += 1000000; + v_usec = pps_offset - u_usec; + if (v_usec < 0) + v_usec = -v_usec; + if (v_usec > (tick >> 1)) { + if (pps_glitch > MAXGLITCH) { + pps_glitch = 0; + pps_tf[2] = u_usec; + pps_tf[1] = u_usec; + } else { + pps_glitch++; + u_usec = pps_offset; + } + } else + pps_glitch = 0; + + /* + * A three-stage median filter is used to help deglitch the pps + * time. The median sample becomes the time offset estimate; the + * difference between the other two samples becomes the time + * dispersion (jitter) estimate. 
+ */ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = u_usec; + if (pps_tf[0] > pps_tf[1]) { + if (pps_tf[1] > pps_tf[2]) { + pps_offset = pps_tf[1]; /* 0 1 2 */ + v_usec = pps_tf[0] - pps_tf[2]; + } else if (pps_tf[2] > pps_tf[0]) { + pps_offset = pps_tf[0]; /* 2 0 1 */ + v_usec = pps_tf[2] - pps_tf[1]; + } else { + pps_offset = pps_tf[2]; /* 0 2 1 */ + v_usec = pps_tf[0] - pps_tf[1]; + } + } else { + if (pps_tf[1] < pps_tf[2]) { + pps_offset = pps_tf[1]; /* 2 1 0 */ + v_usec = pps_tf[2] - pps_tf[0]; + } else if (pps_tf[2] < pps_tf[0]) { + pps_offset = pps_tf[0]; /* 1 0 2 */ + v_usec = pps_tf[1] - pps_tf[2]; + } else { + pps_offset = pps_tf[2]; /* 1 2 0 */ + v_usec = pps_tf[1] - pps_tf[0]; + } + } + if (v_usec > MAXTIME) + pps_jitcnt++; + v_usec = (v_usec << PPS_AVG) - pps_jitter; + if (v_usec < 0) + pps_jitter -= -v_usec >> PPS_AVG; + else + pps_jitter += v_usec >> PPS_AVG; + if (pps_jitter > (MAXTIME >> 1)) + time_status |= STA_PPSJITTER; + + /* + * During the calibration interval adjust the starting time when + * the tick overflows. At the end of the interval compute the + * duration of the interval and the difference of the hardware + * counters at the beginning and end of the interval. This code + * is deliciously complicated by the fact valid differences may + * exceed the value of tick when using long calibration + * intervals and small ticks. Note that the counter can be + * greater than tick if caught at just the wrong instant, but + * the values returned and used here are correct. 
+ */ + bigtick = (long)tick << SHIFT_USEC; + pps_usec -= pps_freq; + if (pps_usec >= bigtick) + pps_usec -= bigtick; + if (pps_usec < 0) + pps_usec += bigtick; + pps_time.tv_sec++; + pps_count++; + if (pps_count < (1 << pps_shift)) + return; + pps_count = 0; + pps_calcnt++; + u_usec = usec << SHIFT_USEC; + v_usec = pps_usec - u_usec; + if (v_usec >= bigtick >> 1) + v_usec -= bigtick; + if (v_usec < -(bigtick >> 1)) + v_usec += bigtick; + if (v_usec < 0) + v_usec = -(-v_usec >> pps_shift); + else + v_usec = v_usec >> pps_shift; + pps_usec = u_usec; + cal_sec = tvp->tv_sec; + cal_usec = tvp->tv_usec; + cal_sec -= pps_time.tv_sec; + cal_usec -= pps_time.tv_usec; + if (cal_usec < 0) { + cal_usec += 1000000; + cal_sec--; + } + pps_time = *tvp; + + /* + * Check for lost interrupts, noise, excessive jitter and + * excessive frequency error. The number of timer ticks during + * the interval may vary +-1 tick. Add to this a margin of one + * tick for the PPS signal jitter and maximum frequency + * deviation. If the limits are exceeded, the calibration + * interval is reset to the minimum and we start over. + */ + u_usec = (long)tick << 1; + if (!((cal_sec == -1 && cal_usec > (1000000 - u_usec)) + || (cal_sec == 0 && cal_usec < u_usec)) + || v_usec > time_tolerance || v_usec < -time_tolerance) { + pps_errcnt++; + pps_shift = PPS_SHIFT; + pps_intcnt = 0; + time_status |= STA_PPSERROR; + return; + } + + /* + * A three-stage median filter is used to help deglitch the pps + * frequency. The median sample becomes the frequency offset + * estimate; the difference between the other two samples + * becomes the frequency dispersion (stability) estimate. 
+ */ + pps_ff[2] = pps_ff[1]; + pps_ff[1] = pps_ff[0]; + pps_ff[0] = v_usec; + if (pps_ff[0] > pps_ff[1]) { + if (pps_ff[1] > pps_ff[2]) { + u_usec = pps_ff[1]; /* 0 1 2 */ + v_usec = pps_ff[0] - pps_ff[2]; + } else if (pps_ff[2] > pps_ff[0]) { + u_usec = pps_ff[0]; /* 2 0 1 */ + v_usec = pps_ff[2] - pps_ff[1]; + } else { + u_usec = pps_ff[2]; /* 0 2 1 */ + v_usec = pps_ff[0] - pps_ff[1]; + } + } else { + if (pps_ff[1] < pps_ff[2]) { + u_usec = pps_ff[1]; /* 2 1 0 */ + v_usec = pps_ff[2] - pps_ff[0]; + } else if (pps_ff[2] < pps_ff[0]) { + u_usec = pps_ff[0]; /* 1 0 2 */ + v_usec = pps_ff[1] - pps_ff[2]; + } else { + u_usec = pps_ff[2]; /* 1 2 0 */ + v_usec = pps_ff[1] - pps_ff[0]; + } + } + + /* + * Here the frequency dispersion (stability) is updated. If it + * is less than one-fourth the maximum (MAXFREQ), the frequency + * offset is updated as well, but clamped to the tolerance. It + * will be processed later by the hardclock() routine. + */ + v_usec = (v_usec >> 1) - pps_stabil; + if (v_usec < 0) + pps_stabil -= -v_usec >> PPS_AVG; + else + pps_stabil += v_usec >> PPS_AVG; + if (pps_stabil > MAXFREQ >> 2) { + pps_stbcnt++; + time_status |= STA_PPSWANDER; + return; + } + if (time_status & STA_PPSFREQ) { + if (u_usec < 0) { + pps_freq -= -u_usec >> PPS_AVG; + if (pps_freq < -time_tolerance) + pps_freq = -time_tolerance; + u_usec = -u_usec; + } else { + pps_freq += u_usec >> PPS_AVG; + if (pps_freq > time_tolerance) + pps_freq = time_tolerance; + } + } + + /* + * Here the calibration interval is adjusted. If the maximum + * time difference is greater than tick / 4, reduce the interval + * by half. If this is not the case for four consecutive + * intervals, double the interval. 
+ */ + if (u_usec << pps_shift > bigtick >> 2) { + pps_intcnt = 0; + if (pps_shift > PPS_SHIFT) + pps_shift--; + } else if (pps_intcnt >= 4) { + pps_intcnt = 0; + if (pps_shift < PPS_SHIFTMAX) + pps_shift++; + } else + pps_intcnt++; +} +#endif /* PPS_SYNC */ diff --git a/sys/kern/kern_time.c b/sys/kern/kern_time.c index f4facf6..797ea2c 100644 --- a/sys/kern/kern_time.c +++ b/sys/kern/kern_time.c @@ -30,22 +30,22 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_time.c 8.4 (Berkeley) 5/26/95 + * @(#)kern_time.c 8.1 (Berkeley) 6/10/93 + * $Id: kern_time.c,v 1.21 1997/02/22 09:39:13 peter Exp $ */ #include <sys/param.h> +#include <sys/sysproto.h> #include <sys/resourcevar.h> +#include <sys/signalvar.h> #include <sys/kernel.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/vnode.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> +struct timezone tz; -#include <machine/cpu.h> - -/* +/* * Time of day and interval timer support. * * These routines provide the kernel entry points to get and set @@ -55,81 +55,97 @@ * timers when they expire. 
*/ +static void timevalfix __P((struct timeval *)); + +#ifndef _SYS_SYSPROTO_H_ +struct gettimeofday_args { + struct timeval *tp; + struct timezone *tzp; +}; +#endif /* ARGSUSED */ int gettimeofday(p, uap, retval) struct proc *p; - register struct gettimeofday_args /* { - syscallarg(struct timeval *) tp; - syscallarg(struct timezone *) tzp; - } */ *uap; - register_t *retval; + register struct gettimeofday_args *uap; + int *retval; { struct timeval atv; int error = 0; - if (SCARG(uap, tp)) { + if (uap->tp) { microtime(&atv); - if (error = copyout((caddr_t)&atv, (caddr_t)SCARG(uap, tp), - sizeof (atv))) + if ((error = copyout((caddr_t)&atv, (caddr_t)uap->tp, + sizeof (atv)))) return (error); } - if (SCARG(uap, tzp)) - error = copyout((caddr_t)&tz, (caddr_t)SCARG(uap, tzp), + if (uap->tzp) + error = copyout((caddr_t)&tz, (caddr_t)uap->tzp, sizeof (tz)); return (error); } +#ifndef _SYS_SYSPROTO_H_ +struct settimeofday_args { + struct timeval *tv; + struct timezone *tzp; +}; +#endif /* ARGSUSED */ int settimeofday(p, uap, retval) struct proc *p; - struct settimeofday_args /* { - syscallarg(struct timeval *) tv; - syscallarg(struct timezone *) tzp; - } */ *uap; - register_t *retval; + struct settimeofday_args *uap; + int *retval; { struct timeval atv, delta; struct timezone atz; int error, s; - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); /* Verify all parameters before changing time. 
*/ - if (SCARG(uap, tv) && (error = copyin((caddr_t)SCARG(uap, tv), - (caddr_t)&atv, sizeof(atv)))) + if (uap->tv && + (error = copyin((caddr_t)uap->tv, (caddr_t)&atv, sizeof(atv)))) return (error); - if (SCARG(uap, tzp) && (error = copyin((caddr_t)SCARG(uap, tzp), - (caddr_t)&atz, sizeof(atz)))) + if (atv.tv_usec < 0 || atv.tv_usec >= 1000000) + return (EINVAL); + if (uap->tzp && + (error = copyin((caddr_t)uap->tzp, (caddr_t)&atz, sizeof(atz)))) return (error); - if (SCARG(uap, tv)) { + if (uap->tv) { + s = splclock(); /* - * If the system is secure, we do not allow the time to be - * set to an earlier value (it may be slowed using adjtime, - * but not set back). This feature prevent interlopers from - * setting arbitrary time stamps on files. + * Calculate delta directly to minimize clock interrupt + * latency. Fix it after the ipl has been lowered. */ - if (securelevel > 0 && timercmp(&atv, &time, <)) - return (EPERM); - /* WHAT DO WE DO ABOUT PENDING REAL-TIME TIMEOUTS??? */ - s = splclock(); - /* nb. delta.tv_usec may be < 0, but this is OK here */ delta.tv_sec = atv.tv_sec - time.tv_sec; delta.tv_usec = atv.tv_usec - time.tv_usec; time = atv; + /* + * XXX should arrange for microtime() to agree with atv if + * it is called now. As it is, it may add up to about + * `tick' unwanted usec. + * Another problem is that clock interrupts may occur at + * other than multiples of `tick'. It's not worth fixing + * this here, since the problem is also caused by tick + * adjustments. 
+ */ (void) splsoftclock(); + timevalfix(&delta); timevaladd(&boottime, &delta); - timevalfix(&boottime); timevaladd(&runtime, &delta); - timevalfix(&runtime); + /* re-use 'p' */ + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) + if (timerisset(&p->p_realtimer.it_value)) + timevaladd(&p->p_realtimer.it_value, &delta); # ifdef NFS lease_updatetime(delta.tv_sec); # endif splx(s); resettodr(); } - if (SCARG(uap, tzp)) + if (uap->tzp) tz = atz; return (0); } @@ -137,26 +153,29 @@ settimeofday(p, uap, retval) extern int tickadj; /* "standard" clock skew, us./tick */ int tickdelta; /* current clock skew, us. per tick */ long timedelta; /* unapplied time correction, us. */ -long bigadj = 1000000; /* use 10x skew above bigadj us. */ +static long bigadj = 1000000; /* use 10x skew above bigadj us. */ +#ifndef _SYS_SYSPROTO_H_ +struct adjtime_args { + struct timeval *delta; + struct timeval *olddelta; +}; +#endif /* ARGSUSED */ int adjtime(p, uap, retval) struct proc *p; - register struct adjtime_args /* { - syscallarg(struct timeval *) delta; - syscallarg(struct timeval *) olddelta; - } */ *uap; - register_t *retval; + register struct adjtime_args *uap; + int *retval; { struct timeval atv; register long ndelta, ntickdelta, odelta; int s, error; - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); - if (error = copyin((caddr_t)SCARG(uap, delta), (caddr_t)&atv, - sizeof(struct timeval))) + if ((error = + copyin((caddr_t)uap->delta, (caddr_t)&atv, sizeof(struct timeval)))) return (error); /* @@ -167,7 +186,7 @@ adjtime(p, uap, retval) * overshoot and start taking us away from the desired final time. 
*/ ndelta = atv.tv_sec * 1000000 + atv.tv_usec; - if (ndelta > bigadj) + if (ndelta > bigadj || ndelta < -bigadj) ntickdelta = 10 * tickadj; else ntickdelta = tickadj; @@ -187,10 +206,10 @@ adjtime(p, uap, retval) tickdelta = ntickdelta; splx(s); - if (SCARG(uap, olddelta)) { + if (uap->olddelta) { atv.tv_sec = odelta / 1000000; atv.tv_usec = odelta % 1000000; - (void) copyout((caddr_t)&atv, (caddr_t)SCARG(uap, olddelta), + (void) copyout((caddr_t)&atv, (caddr_t)uap->olddelta, sizeof(struct timeval)); } return (0); @@ -217,25 +236,28 @@ adjtime(p, uap, retval) * real time timers .it_interval. Rather, we compute the next time in * absolute time the timer should go off. */ +#ifndef _SYS_SYSPROTO_H_ +struct getitimer_args { + u_int which; + struct itimerval *itv; +}; +#endif /* ARGSUSED */ int getitimer(p, uap, retval) struct proc *p; - register struct getitimer_args /* { - syscallarg(u_int) which; - syscallarg(struct itimerval *) itv; - } */ *uap; - register_t *retval; + register struct getitimer_args *uap; + int *retval; { struct itimerval aitv; int s; - if (SCARG(uap, which) > ITIMER_PROF) + if (uap->which > ITIMER_PROF) return (EINVAL); s = splclock(); - if (SCARG(uap, which) == ITIMER_REAL) { + if (uap->which == ITIMER_REAL) { /* - * Convert from absolute to relative time in .it_value + * Convert from absoulte to relative time in .it_value * part of real time timer. If time for real time timer * has passed return 0, else return difference between * current time and time for the timer to go off. 
@@ -245,53 +267,54 @@ getitimer(p, uap, retval) if (timercmp(&aitv.it_value, &time, <)) timerclear(&aitv.it_value); else - timevalsub(&aitv.it_value, - (struct timeval *)&time); + timevalsub(&aitv.it_value, &time); } else - aitv = p->p_stats->p_timer[SCARG(uap, which)]; + aitv = p->p_stats->p_timer[uap->which]; splx(s); - return (copyout((caddr_t)&aitv, (caddr_t)SCARG(uap, itv), + return (copyout((caddr_t)&aitv, (caddr_t)uap->itv, sizeof (struct itimerval))); } +#ifndef _SYS_SYSPROTO_H_ +struct setitimer_args { + u_int which; + struct itimerval *itv, *oitv; +}; +#endif /* ARGSUSED */ int setitimer(p, uap, retval) struct proc *p; - register struct setitimer_args /* { - syscallarg(u_int) which; - syscallarg(struct itimerval *) itv; - syscallarg(struct itimerval *) oitv; - } */ *uap; - register_t *retval; + register struct setitimer_args *uap; + int *retval; { struct itimerval aitv; register struct itimerval *itvp; int s, error; - if (SCARG(uap, which) > ITIMER_PROF) + if (uap->which > ITIMER_PROF) return (EINVAL); - itvp = SCARG(uap, itv); + itvp = uap->itv; if (itvp && (error = copyin((caddr_t)itvp, (caddr_t)&aitv, sizeof(struct itimerval)))) return (error); - if ((SCARG(uap, itv) = SCARG(uap, oitv)) && - (error = getitimer(p, uap, retval))) + if ((uap->itv = uap->oitv) && + (error = getitimer(p, (struct getitimer_args *)uap, retval))) return (error); if (itvp == 0) return (0); if (itimerfix(&aitv.it_value) || itimerfix(&aitv.it_interval)) return (EINVAL); s = splclock(); - if (SCARG(uap, which) == ITIMER_REAL) { + if (uap->which == ITIMER_REAL) { untimeout(realitexpire, (caddr_t)p); if (timerisset(&aitv.it_value)) { - timevaladd(&aitv.it_value, (struct timeval *)&time); + timevaladd(&aitv.it_value, &time); timeout(realitexpire, (caddr_t)p, hzto(&aitv.it_value)); } p->p_realtimer = aitv; } else - p->p_stats->p_timer[SCARG(uap, which)] = aitv; + p->p_stats->p_timer[uap->which] = aitv; splx(s); return (0); } @@ -303,6 +326,10 @@ setitimer(p, uap, retval) * Else 
compute next time timer should go off which is > current time. * This is where delay in processing this timeout causes multiple * SIGALRM calls to be compressed into one. + * hzto() always adds 1 to allow for the time until the next clock + * interrupt being strictly less than 1 clock tick, but we don't want + * that here since we want to appear to be in sync with the clock + * interrupt even when we're delayed. */ void realitexpire(arg) @@ -323,7 +350,7 @@ realitexpire(arg) &p->p_realtimer.it_interval); if (timercmp(&p->p_realtimer.it_value, &time, >)) { timeout(realitexpire, (caddr_t)p, - hzto(&p->p_realtimer.it_value)); + hzto(&p->p_realtimer.it_value) - 1); splx(s); return; } @@ -400,6 +427,7 @@ expire: * it just gets very confused in this case. * Caveat emptor. */ +void timevaladd(t1, t2) struct timeval *t1, *t2; { @@ -409,6 +437,7 @@ timevaladd(t1, t2) timevalfix(t1); } +void timevalsub(t1, t2) struct timeval *t1, *t2; { @@ -418,6 +447,7 @@ timevalsub(t1, t2) timevalfix(t1); } +static void timevalfix(t1) struct timeval *t1; { diff --git a/sys/kern/kern_timeout.c b/sys/kern/kern_timeout.c new file mode 100644 index 0000000..171ed0e --- /dev/null +++ b/sys/kern/kern_timeout.c @@ -0,0 +1,1303 @@ +/*- + * Copyright (c) 1982, 1986, 1991, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)kern_clock.c 8.5 (Berkeley) 1/21/94 + * $Id: kern_clock.c,v 1.34 1997/03/22 16:52:19 mpp Exp $ + */ + +/* Portions of this software are covered by the following: */ +/****************************************************************************** + * * + * Copyright (c) David L. 
Mills 1993, 1994 * + * * + * Permission to use, copy, modify, and distribute this software and its * + * documentation for any purpose and without fee is hereby granted, provided * + * that the above copyright notice appears in all copies and that both the * + * copyright notice and this permission notice appear in supporting * + * documentation, and that the name University of Delaware not be used in * + * advertising or publicity pertaining to distribution of the software * + * without specific, written prior permission. The University of Delaware * + * makes no representations about the suitability this software for any * + * purpose. It is provided "as is" without express or implied warranty. * + * * + *****************************************************************************/ + +#include "opt_cpu.h" /* XXX */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/dkstat.h> +#include <sys/callout.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/timex.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <sys/sysctl.h> + +#include <machine/cpu.h> +#define CLOCK_HAIR /* XXX */ +#include <machine/clock.h> + +#ifdef GPROF +#include <sys/gmon.h> +#endif + +static void initclocks __P((void *dummy)); +SYSINIT(clocks, SI_SUB_CLOCKS, SI_ORDER_FIRST, initclocks, NULL) + +/* Exported to machdep.c. */ +struct callout *callfree, *callout; + +static struct callout calltodo; + +/* Some of these don't belong here, but it's easiest to concentrate them. */ +static long cp_time[CPUSTATES]; +long dk_seek[DK_NDRIVE]; +static long dk_time[DK_NDRIVE]; +long dk_wds[DK_NDRIVE]; +long dk_wpms[DK_NDRIVE]; +long dk_xfer[DK_NDRIVE]; + +int dk_busy; +int dk_ndrive = 0; +char dk_names[DK_NDRIVE][DK_NAMELEN]; + +long tk_cancc; +long tk_nin; +long tk_nout; +long tk_rawcc; + +/* + * Clock handling routines. 
+ * + * This code is written to operate with two timers that run independently of + * each other. The main clock, running hz times per second, is used to keep + * track of real time. The second timer handles kernel and user profiling, + * and does resource use estimation. If the second timer is programmable, + * it is randomized to avoid aliasing between the two clocks. For example, + * the randomization prevents an adversary from always giving up the cpu + * just before its quantum expires. Otherwise, it would never accumulate + * cpu ticks. The mean frequency of the second timer is stathz. + * + * If no second timer exists, stathz will be zero; in this case we drive + * profiling and statistics off the main clock. This WILL NOT be accurate; + * do not do it unless absolutely necessary. + * + * The statistics clock may (or may not) be run at a higher rate while + * profiling. This profile clock runs at profhz. We require that profhz + * be an integral multiple of stathz. + * + * If the statistics clock is running fast, it must be divided by the ratio + * profhz/stathz for statistics. (For profiling, every tick counts.) + */ + +/* + * TODO: + * allocate more timeout table slots when table overflows. + */ + +/* + * Bump a timeval by a small number of usec's. + */ +#define BUMPTIME(t, usec) { \ + register volatile struct timeval *tp = (t); \ + register long us; \ + \ + tp->tv_usec = us = tp->tv_usec + (usec); \ + if (us >= 1000000) { \ + tp->tv_usec = us - 1000000; \ + tp->tv_sec++; \ + } \ +} + +int stathz; +int profhz; +static int profprocs; +int ticks; +static int psdiv, pscnt; /* prof => stat divider */ +int psratio; /* ratio: prof / stat */ + +volatile struct timeval time; +volatile struct timeval mono_time; + +/* + * Phase/frequency-lock loop (PLL/FLL) definitions + * + * The following variables are read and set by the ntp_adjtime() system + * call. + * + * time_state shows the state of the system clock, with values defined + * in the timex.h header file. 
+ * + * time_status shows the status of the system clock, with bits defined + * in the timex.h header file. + * + * time_offset is used by the PLL/FLL to adjust the system time in small + * increments. + * + * time_constant determines the bandwidth or "stiffness" of the PLL. + * + * time_tolerance determines maximum frequency error or tolerance of the + * CPU clock oscillator and is a property of the architecture; however, + * in principle it could change as result of the presence of external + * discipline signals, for instance. + * + * time_precision is usually equal to the kernel tick variable; however, + * in cases where a precision clock counter or external clock is + * available, the resolution can be much less than this and depend on + * whether the external clock is working or not. + * + * time_maxerror is initialized by a ntp_adjtime() call and increased by + * the kernel once each second to reflect the maximum error + * bound growth. + * + * time_esterror is set and read by the ntp_adjtime() call, but + * otherwise not used by the kernel. + */ +int time_status = STA_UNSYNC; /* clock status bits */ +int time_state = TIME_OK; /* clock state */ +long time_offset = 0; /* time offset (us) */ +long time_constant = 0; /* pll time constant */ +long time_tolerance = MAXFREQ; /* frequency tolerance (scaled ppm) */ +long time_precision = 1; /* clock precision (us) */ +long time_maxerror = MAXPHASE; /* maximum error (us) */ +long time_esterror = MAXPHASE; /* estimated error (us) */ + +/* + * The following variables establish the state of the PLL/FLL and the + * residual time and frequency offset of the local clock. The scale + * factors are defined in the timex.h header file. + * + * time_phase and time_freq are the phase increment and the frequency + * increment, respectively, of the kernel time variable at each tick of + * the clock. + * + * time_freq is set via ntp_adjtime() from a value stored in a file when + * the synchronization daemon is first started. 
Its value is retrieved + * via ntp_adjtime() and written to the file about once per hour by the + * daemon. + * + * time_adj is the adjustment added to the value of tick at each timer + * interrupt and is recomputed from time_phase and time_freq at each + * seconds rollover. + * + * time_reftime is the second's portion of the system time on the last + * call to ntp_adjtime(). It is used to adjust the time_freq variable + * and to increase the time_maxerror as the time since last update + * increases. + */ +static long time_phase = 0; /* phase offset (scaled us) */ +long time_freq = 0; /* frequency offset (scaled ppm) */ +static long time_adj = 0; /* tick adjust (scaled 1 / hz) */ +static long time_reftime = 0; /* time at last adjustment (s) */ + +#ifdef PPS_SYNC +/* + * The following variables are used only if the kernel PPS discipline + * code is configured (PPS_SYNC). The scale factors are defined in the + * timex.h header file. + * + * pps_time contains the time at each calibration interval, as read by + * microtime(). pps_count counts the seconds of the calibration + * interval, the duration of which is nominally pps_shift in powers of + * two. + * + * pps_offset is the time offset produced by the time median filter + * pps_tf[], while pps_jitter is the dispersion (jitter) measured by + * this filter. + * + * pps_freq is the frequency offset produced by the frequency median + * filter pps_ff[], while pps_stabil is the dispersion (wander) measured + * by this filter. + * + * pps_usec is latched from a high resolution counter or external clock + * at pps_time. Here we want the hardware counter contents only, not the + * contents plus the time_tv.usec as usual. + * + * pps_valid counts the number of seconds since the last PPS update. It + * is used as a watchdog timer to disable the PPS discipline should the + * PPS signal be lost. 
+ * + * pps_glitch counts the number of seconds since the beginning of an + * offset burst more than tick/2 from current nominal offset. It is used + * mainly to suppress error bursts due to priority conflicts between the + * PPS interrupt and timer interrupt. + * + * pps_intcnt counts the calibration intervals for use in the interval- + * adaptation algorithm. It's just too complicated for words. + */ +struct timeval pps_time; /* kernel time at last interval */ +long pps_offset = 0; /* pps time offset (us) */ +long pps_jitter = MAXTIME; /* pps time dispersion (jitter) (us) */ +long pps_tf[] = {0, 0, 0}; /* pps time offset median filter (us) */ +long pps_freq = 0; /* frequency offset (scaled ppm) */ +long pps_stabil = MAXFREQ; /* frequency dispersion (scaled ppm) */ +long pps_ff[] = {0, 0, 0}; /* frequency offset median filter */ +long pps_usec = 0; /* microsec counter at last interval */ +long pps_valid = PPS_VALID; /* pps signal watchdog counter */ +int pps_glitch = 0; /* pps signal glitch counter */ +int pps_count = 0; /* calibration interval counter (s) */ +int pps_shift = PPS_SHIFT; /* interval duration (s) (shift) */ +int pps_intcnt = 0; /* intervals at current duration */ + +/* + * PPS signal quality monitors + * + * pps_jitcnt counts the seconds that have been discarded because the + * jitter measured by the time median filter exceeds the limit MAXTIME + * (100 us). + * + * pps_calcnt counts the frequency calibration intervals, which are + * variable from 4 s to 256 s. + * + * pps_errcnt counts the calibration intervals which have been discarded + * because the wander exceeds the limit MAXFREQ (100 ppm) or where the + * calibration interval jitter exceeds two ticks. + * + * pps_stbcnt counts the calibration intervals that have been discarded + * because the frequency wander exceeds the limit MAXFREQ / 4 (25 us). 
+ */ +long pps_jitcnt = 0; /* jitter limit exceeded */ +long pps_calcnt = 0; /* calibration intervals */ +long pps_errcnt = 0; /* calibration errors */ +long pps_stbcnt = 0; /* stability limit exceeded */ +#endif /* PPS_SYNC */ + +/* XXX none of this stuff works under FreeBSD */ +#ifdef EXT_CLOCK +/* + * External clock definitions + * + * The following definitions and declarations are used only if an + * external clock (HIGHBALL or TPRO) is configured on the system. + */ +#define CLOCK_INTERVAL 30 /* CPU clock update interval (s) */ + +/* + * The clock_count variable is set to CLOCK_INTERVAL at each PPS + * interrupt and decremented once each second. + */ +int clock_count = 0; /* CPU clock counter */ + +#ifdef HIGHBALL +/* + * The clock_offset and clock_cpu variables are used by the HIGHBALL + * interface. The clock_offset variable defines the offset between + * system time and the HIGBALL counters. The clock_cpu variable contains + * the offset between the system clock and the HIGHBALL clock for use in + * disciplining the kernel time variable. + */ +extern struct timeval clock_offset; /* Highball clock offset */ +long clock_cpu = 0; /* CPU clock adjust */ +#endif /* HIGHBALL */ +#endif /* EXT_CLOCK */ + +/* + * hardupdate() - local clock update + * + * This routine is called by ntp_adjtime() to update the local clock + * phase and frequency. The implementation is of an adaptive-parameter, + * hybrid phase/frequency-lock loop (PLL/FLL). The routine computes new + * time and frequency offset estimates for each call. If the kernel PPS + * discipline code is configured (PPS_SYNC), the PPS signal itself + * determines the new time offset, instead of the calling argument. + * Presumably, calls to ntp_adjtime() occur only when the caller + * believes the local clock is valid within some bound (+-128 ms with + * NTP). If the caller's time is far different than the PPS time, an + * argument will ensue, and it's not clear who will lose. 
+ * + * For uncompensated quartz crystal oscillatores and nominal update + * intervals less than 1024 s, operation should be in phase-lock mode + * (STA_FLL = 0), where the loop is disciplined to phase. For update + * intervals greater than thiss, operation should be in frequency-lock + * mode (STA_FLL = 1), where the loop is disciplined to frequency. + * + * Note: splclock() is in effect. + */ +void +hardupdate(offset) + long offset; +{ + long ltemp, mtemp; + + if (!(time_status & STA_PLL) && !(time_status & STA_PPSTIME)) + return; + ltemp = offset; +#ifdef PPS_SYNC + if (time_status & STA_PPSTIME && time_status & STA_PPSSIGNAL) + ltemp = pps_offset; +#endif /* PPS_SYNC */ + + /* + * Scale the phase adjustment and clamp to the operating range. + */ + if (ltemp > MAXPHASE) + time_offset = MAXPHASE << SHIFT_UPDATE; + else if (ltemp < -MAXPHASE) + time_offset = -(MAXPHASE << SHIFT_UPDATE); + else + time_offset = ltemp << SHIFT_UPDATE; + + /* + * Select whether the frequency is to be controlled and in which + * mode (PLL or FLL). Clamp to the operating range. Ugly + * multiply/divide should be replaced someday. + */ + if (time_status & STA_FREQHOLD || time_reftime == 0) + time_reftime = time.tv_sec; + mtemp = time.tv_sec - time_reftime; + time_reftime = time.tv_sec; + if (time_status & STA_FLL) { + if (mtemp >= MINSEC) { + ltemp = ((time_offset / mtemp) << (SHIFT_USEC - + SHIFT_UPDATE)); + if (ltemp < 0) + time_freq -= -ltemp >> SHIFT_KH; + else + time_freq += ltemp >> SHIFT_KH; + } + } else { + if (mtemp < MAXSEC) { + ltemp *= mtemp; + if (ltemp < 0) + time_freq -= -ltemp >> (time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC); + else + time_freq += ltemp >> (time_constant + + time_constant + SHIFT_KF - + SHIFT_USEC); + } + } + if (time_freq > time_tolerance) + time_freq = time_tolerance; + else if (time_freq < -time_tolerance) + time_freq = -time_tolerance; +} + + + +/* + * Initialize clock frequencies and start both clocks running. 
+ */ +/* ARGSUSED*/ +static void +initclocks(dummy) + void *dummy; +{ + register int i; + + /* + * Set divisors to 1 (normal case) and let the machine-specific + * code do its bit. + */ + psdiv = pscnt = 1; + cpu_initclocks(); + + /* + * Compute profhz/stathz, and fix profhz if needed. + */ + i = stathz ? stathz : hz; + if (profhz == 0) + profhz = i; + psratio = profhz / i; +} + +/* + * The real-time timer, interrupting hz times per second. + */ +void +hardclock(frame) + register struct clockframe *frame; +{ + register struct callout *p1; + register struct proc *p; + register int needsoft; + + /* + * Update real-time timeout queue. + * At front of queue are some number of events which are ``due''. + * The time to these is <= 0 and if negative represents the + * number of ticks which have passed since it was supposed to happen. + * The rest of the q elements (times > 0) are events yet to happen, + * where the time for each is given as a delta from the previous. + * Decrementing just the first of these serves to decrement the time + * to all events. + */ + needsoft = 0; + for (p1 = calltodo.c_next; p1 != NULL; p1 = p1->c_next) { + if (--p1->c_time > 0) + break; + needsoft = 1; + if (p1->c_time == 0) + break; + } + + p = curproc; + if (p) { + register struct pstats *pstats; + + /* + * Run current process's virtual and profile time, as needed. + */ + pstats = p->p_stats; + if (CLKF_USERMODE(frame) && + timerisset(&pstats->p_timer[ITIMER_VIRTUAL].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_VIRTUAL], tick) == 0) + psignal(p, SIGVTALRM); + if (timerisset(&pstats->p_timer[ITIMER_PROF].it_value) && + itimerdecr(&pstats->p_timer[ITIMER_PROF], tick) == 0) + psignal(p, SIGPROF); + } + + /* + * If no separate statistics clock is available, run it from here. + */ + if (stathz == 0) + statclock(frame); + + /* + * Increment the time-of-day. 
+ */ + ticks++; + { + int time_update; + struct timeval newtime = time; + long ltemp; + + if (timedelta == 0) { + time_update = CPU_THISTICKLEN(tick); + } else { + time_update = CPU_THISTICKLEN(tick) + tickdelta; + timedelta -= tickdelta; + } + BUMPTIME(&mono_time, time_update); + + /* + * Compute the phase adjustment. If the low-order bits + * (time_phase) of the update overflow, bump the high-order bits + * (time_update). + */ + time_phase += time_adj; + if (time_phase <= -FINEUSEC) { + ltemp = -time_phase >> SHIFT_SCALE; + time_phase += ltemp << SHIFT_SCALE; + time_update -= ltemp; + } + else if (time_phase >= FINEUSEC) { + ltemp = time_phase >> SHIFT_SCALE; + time_phase -= ltemp << SHIFT_SCALE; + time_update += ltemp; + } + + newtime.tv_usec += time_update; + /* + * On rollover of the second the phase adjustment to be used for + * the next second is calculated. Also, the maximum error is + * increased by the tolerance. If the PPS frequency discipline + * code is present, the phase is increased to compensate for the + * CPU clock oscillator frequency error. + * + * On a 32-bit machine and given parameters in the timex.h + * header file, the maximum phase adjustment is +-512 ms and + * maximum frequency offset is a tad less than) +-512 ppm. On a + * 64-bit machine, you shouldn't need to ask. + */ + if (newtime.tv_usec >= 1000000) { + newtime.tv_usec -= 1000000; + newtime.tv_sec++; + time_maxerror += time_tolerance >> SHIFT_USEC; + + /* + * Compute the phase adjustment for the next second. In + * PLL mode, the offset is reduced by a fixed factor + * times the time constant. In FLL mode the offset is + * used directly. In either mode, the maximum phase + * adjustment for each second is clamped so as to spread + * the adjustment over not more than the number of + * seconds between updates. 
+ */ + if (time_offset < 0) { + ltemp = -time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << + SHIFT_UPDATE; + time_offset += ltemp; + time_adj = -ltemp << (SHIFT_SCALE - SHIFT_HZ - + SHIFT_UPDATE); + } else { + ltemp = time_offset; + if (!(time_status & STA_FLL)) + ltemp >>= SHIFT_KG + time_constant; + if (ltemp > (MAXPHASE / MINSEC) << SHIFT_UPDATE) + ltemp = (MAXPHASE / MINSEC) << + SHIFT_UPDATE; + time_offset -= ltemp; + time_adj = ltemp << (SHIFT_SCALE - SHIFT_HZ - + SHIFT_UPDATE); + } + + /* + * Compute the frequency estimate and additional phase + * adjustment due to frequency error for the next + * second. When the PPS signal is engaged, gnaw on the + * watchdog counter and update the frequency computed by + * the pll and the PPS signal. + */ +#ifdef PPS_SYNC + pps_valid++; + if (pps_valid == PPS_VALID) { + pps_jitter = MAXTIME; + pps_stabil = MAXFREQ; + time_status &= ~(STA_PPSSIGNAL | STA_PPSJITTER | + STA_PPSWANDER | STA_PPSERROR); + } + ltemp = time_freq + pps_freq; +#else + ltemp = time_freq; +#endif /* PPS_SYNC */ + if (ltemp < 0) + time_adj -= -ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + else + time_adj += ltemp >> + (SHIFT_USEC + SHIFT_HZ - SHIFT_SCALE); + +#if SHIFT_HZ == 7 + /* + * When the CPU clock oscillator frequency is not a + * power of two in Hz, the SHIFT_HZ is only an + * approximate scale factor. In the SunOS kernel, this + * results in a PLL gain factor of 1/1.28 = 0.78 what it + * should be. In the following code the overall gain is + * increased by a factor of 1.25, which results in a + * residual error less than 3 percent. 
+ */ + /* Same thing applies for FreeBSD --GAW */ + if (hz == 100) { + if (time_adj < 0) + time_adj -= -time_adj >> 2; + else + time_adj += time_adj >> 2; + } +#endif /* SHIFT_HZ */ + + /* XXX - this is really bogus, but can't be fixed until + xntpd's idea of the system clock is fixed to know how + the user wants leap seconds handled; in the mean time, + we assume that users of NTP are running without proper + leap second support (this is now the default anyway) */ + /* + * Leap second processing. If in leap-insert state at + * the end of the day, the system clock is set back one + * second; if in leap-delete state, the system clock is + * set ahead one second. The microtime() routine or + * external clock driver will insure that reported time + * is always monotonic. The ugly divides should be + * replaced. + */ + switch (time_state) { + + case TIME_OK: + if (time_status & STA_INS) + time_state = TIME_INS; + else if (time_status & STA_DEL) + time_state = TIME_DEL; + break; + + case TIME_INS: + if (newtime.tv_sec % 86400 == 0) { + newtime.tv_sec--; + time_state = TIME_OOP; + } + break; + + case TIME_DEL: + if ((newtime.tv_sec + 1) % 86400 == 0) { + newtime.tv_sec++; + time_state = TIME_WAIT; + } + break; + + case TIME_OOP: + time_state = TIME_WAIT; + break; + + case TIME_WAIT: + if (!(time_status & (STA_INS | STA_DEL))) + time_state = TIME_OK; + } + } + CPU_CLOCKUPDATE(&time, &newtime); + } + + /* + * Process callouts at a very low cpu priority, so we don't keep the + * relatively high clock interrupt priority any longer than necessary. + */ + if (needsoft) { + if (CLKF_BASEPRI(frame)) { + /* + * Save the overhead of a software interrupt; + * it will happen as soon as we return, so do it now. + */ + (void)splsoftclock(); + softclock(); + } else + setsoftclock(); + } +} + +/* + * Software (low priority) clock interrupt. + * Run periodic events from timeout queue. 
+ */ +/*ARGSUSED*/ +void +softclock() +{ + register struct callout *c; + register void *arg; + register void (*func) __P((void *)); + register int s; + + s = splhigh(); + while ((c = calltodo.c_next) != NULL && c->c_time <= 0) { + func = c->c_func; + arg = c->c_arg; + calltodo.c_next = c->c_next; + c->c_next = callfree; + callfree = c; + splx(s); + (*func)(arg); + (void) splhigh(); + } + splx(s); +} + +/* + * timeout -- + * Execute a function after a specified length of time. + * + * untimeout -- + * Cancel previous timeout function call. + * + * See AT&T BCI Driver Reference Manual for specification. This + * implementation differs from that one in that no identification + * value is returned from timeout, rather, the original arguments + * to timeout are used to identify entries for untimeout. + */ +void +timeout(ftn, arg, ticks) + timeout_t ftn; + void *arg; + register int ticks; +{ + register struct callout *new, *p, *t; + register int s; + + if (ticks <= 0) + ticks = 1; + + /* Lock out the clock. */ + s = splhigh(); + + /* Fill in the next free callout structure. */ + if (callfree == NULL) + panic("timeout table full"); + new = callfree; + callfree = new->c_next; + new->c_arg = arg; + new->c_func = ftn; + + /* + * The time for each event is stored as a difference from the time + * of the previous event on the queue. Walk the queue, correcting + * the ticks argument for queue entries passed. Correct the ticks + * value for the queue entry immediately after the insertion point + * as well. Watch out for negative c_time values; these represent + * overdue events. + */ + for (p = &calltodo; + (t = p->c_next) != NULL && ticks > t->c_time; p = t) + if (t->c_time > 0) + ticks -= t->c_time; + new->c_time = ticks; + if (t != NULL) + t->c_time -= ticks; + + /* Insert the new entry into the queue. 
*/ + p->c_next = new; + new->c_next = t; + splx(s); +} + +void +untimeout(ftn, arg) + timeout_t ftn; + void *arg; +{ + register struct callout *p, *t; + register int s; + + s = splhigh(); + for (p = &calltodo; (t = p->c_next) != NULL; p = t) + if (t->c_func == ftn && t->c_arg == arg) { + /* Increment next entry's tick count. */ + if (t->c_next && t->c_time > 0) + t->c_next->c_time += t->c_time; + + /* Move entry from callout queue to callfree queue. */ + p->c_next = t->c_next; + t->c_next = callfree; + callfree = t; + break; + } + splx(s); +} + +void +gettime(struct timeval *tvp) +{ + int s; + + s = splclock(); + /* XXX should use microtime() iff tv_usec is used. */ + *tvp = time; + splx(s); +} + +/* + * Compute number of hz until specified time. Used to + * compute third argument to timeout() from an absolute time. + */ +int +hzto(tv) + struct timeval *tv; +{ + register unsigned long ticks; + register long sec, usec; + int s; + + /* + * If the number of usecs in the whole seconds part of the time + * difference fits in a long, then the total number of usecs will + * fit in an unsigned long. Compute the total and convert it to + * ticks, rounding up and adding 1 to allow for the current tick + * to expire. Rounding also depends on unsigned long arithmetic + * to avoid overflow. + * + * Otherwise, if the number of ticks in the whole seconds part of + * the time difference fits in a long, then convert the parts to + * ticks separately and add, using similar rounding methods and + * overflow avoidance. This method would work in the previous + * case but it is slightly slower and assumes that hz is integral. + * + * Otherwise, round the time difference down to the maximum + * representable value. + * + * If ints have 32 bits, then the maximum value for any timeout in + * 10ms ticks is 248 days. 
+ */ + s = splclock(); + sec = tv->tv_sec - time.tv_sec; + usec = tv->tv_usec - time.tv_usec; + splx(s); + if (usec < 0) { + sec--; + usec += 1000000; + } + if (sec < 0) { +#ifdef DIAGNOSTIC + printf("hzto: negative time difference %ld sec %ld usec\n", + sec, usec); +#endif + ticks = 1; + } else if (sec <= LONG_MAX / 1000000) + ticks = (sec * 1000000 + (unsigned long)usec + (tick - 1)) + / tick + 1; + else if (sec <= LONG_MAX / hz) + ticks = sec * hz + + ((unsigned long)usec + (tick - 1)) / tick + 1; + else + ticks = LONG_MAX; + if (ticks > INT_MAX) + ticks = INT_MAX; + return (ticks); +} + +/* + * Start profiling on a process. + * + * Kernel profiling passes proc0 which never exits and hence + * keeps the profile clock running constantly. + */ +void +startprofclock(p) + register struct proc *p; +{ + int s; + + if ((p->p_flag & P_PROFIL) == 0) { + p->p_flag |= P_PROFIL; + if (++profprocs == 1 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = psratio; + setstatclockrate(profhz); + splx(s); + } + } +} + +/* + * Stop profiling on a process. + */ +void +stopprofclock(p) + register struct proc *p; +{ + int s; + + if (p->p_flag & P_PROFIL) { + p->p_flag &= ~P_PROFIL; + if (--profprocs == 0 && stathz != 0) { + s = splstatclock(); + psdiv = pscnt = 1; + setstatclockrate(stathz); + splx(s); + } + } +} + +/* + * Statistics clock. Grab profile sample, and if divider reaches 0, + * do process and kernel statistics. + */ +void +statclock(frame) + register struct clockframe *frame; +{ +#ifdef GPROF + register struct gmonparam *g; +#endif + register struct proc *p; + register int i; + struct pstats *pstats; + long rss; + struct rusage *ru; + struct vmspace *vm; + + if (CLKF_USERMODE(frame)) { + p = curproc; + if (p->p_flag & P_PROFIL) + addupc_intr(p, CLKF_PC(frame), 1); + if (--pscnt > 0) + return; + /* + * Came from user mode; CPU was in user state. + * If this process is being profiled record the tick. 
+ */ + p->p_uticks++; + if (p->p_nice > NZERO) + cp_time[CP_NICE]++; + else + cp_time[CP_USER]++; + } else { +#ifdef GPROF + /* + * Kernel statistics are just like addupc_intr, only easier. + */ + g = &_gmonparam; + if (g->state == GMON_PROF_ON) { + i = CLKF_PC(frame) - g->lowpc; + if (i < g->textsize) { + i /= HISTFRACTION * sizeof(*g->kcount); + g->kcount[i]++; + } + } +#endif + if (--pscnt > 0) + return; + /* + * Came from kernel mode, so we were: + * - handling an interrupt, + * - doing syscall or trap work on behalf of the current + * user process, or + * - spinning in the idle loop. + * Whichever it is, charge the time as appropriate. + * Note that we charge interrupts to the current process, + * regardless of whether they are ``for'' that process, + * so that we know how much of its real time was spent + * in ``non-process'' (i.e., interrupt) work. + */ + p = curproc; + if (CLKF_INTR(frame)) { + if (p != NULL) + p->p_iticks++; + cp_time[CP_INTR]++; + } else if (p != NULL) { + p->p_sticks++; + cp_time[CP_SYS]++; + } else + cp_time[CP_IDLE]++; + } + pscnt = psdiv; + + /* + * We maintain statistics shown by user-level statistics + * programs: the amount of time in each cpu state, and + * the amount of time each of DK_NDRIVE ``drives'' is busy. + * + * XXX should either run linked list of drives, or (better) + * grab timestamps in the start & done code. + */ + for (i = 0; i < DK_NDRIVE; i++) + if (dk_busy & (1 << i)) + dk_time[i]++; + + /* + * We adjust the priority of the current process. The priority of + * a process gets worse as it accumulates CPU time. The cpu usage + * estimator (p_estcpu) is increased here. The formula for computing + * priorities (in kern_synch.c) will compute a different value each + * time p_estcpu increases by 4. The cpu usage estimator ramps up + * quite quickly when the process is running (linearly), and decays + * away exponentially, at a rate which is proportionally slower when + * the system is busy. 
The basic principal is that the system will + * 90% forget that the process used a lot of CPU time in 5 * loadav + * seconds. This causes the system to favor processes which haven't + * run much recently, and to round-robin among other processes. + */ + if (p != NULL) { + p->p_cpticks++; + if (++p->p_estcpu == 0) + p->p_estcpu--; + if ((p->p_estcpu & 3) == 0) { + resetpriority(p); + if (p->p_priority >= PUSER) + p->p_priority = p->p_usrpri; + } + + /* Update resource usage integrals and maximums. */ + if ((pstats = p->p_stats) != NULL && + (ru = &pstats->p_ru) != NULL && + (vm = p->p_vmspace) != NULL) { + ru->ru_ixrss += vm->vm_tsize * PAGE_SIZE / 1024; + ru->ru_idrss += vm->vm_dsize * PAGE_SIZE / 1024; + ru->ru_isrss += vm->vm_ssize * PAGE_SIZE / 1024; + rss = vm->vm_pmap.pm_stats.resident_count * + PAGE_SIZE / 1024; + if (ru->ru_maxrss < rss) + ru->ru_maxrss = rss; + } + } +} + +/* + * Return information about system clocks. + */ +static int +sysctl_kern_clockrate SYSCTL_HANDLER_ARGS +{ + struct clockinfo clkinfo; + /* + * Construct clockinfo structure. + */ + clkinfo.hz = hz; + clkinfo.tick = tick; + clkinfo.profhz = profhz; + clkinfo.stathz = stathz ? stathz : hz; + return (sysctl_handle_opaque(oidp, &clkinfo, sizeof clkinfo, req)); +} + +SYSCTL_PROC(_kern, KERN_CLOCKRATE, clockrate, CTLTYPE_STRUCT|CTLFLAG_RD, + 0, 0, sysctl_kern_clockrate, "S,clockinfo",""); + +#ifdef PPS_SYNC +/* + * hardpps() - discipline CPU clock oscillator to external PPS signal + * + * This routine is called at each PPS interrupt in order to discipline + * the CPU clock oscillator to the PPS signal. It measures the PPS phase + * and leaves it in a handy spot for the hardclock() routine. It + * integrates successive PPS phase differences and calculates the + * frequency offset. This is used in hardclock() to discipline the CPU + * clock oscillator so that intrinsic frequency error is cancelled out. 
+ * The code requires the caller to capture the time and hardware counter + * value at the on-time PPS signal transition. + * + * Note that, on some Unix systems, this routine runs at an interrupt + * priority level higher than the timer interrupt routine hardclock(). + * Therefore, the variables used are distinct from the hardclock() + * variables, except for certain exceptions: The PPS frequency pps_freq + * and phase pps_offset variables are determined by this routine and + * updated atomically. The time_tolerance variable can be considered a + * constant, since it is infrequently changed, and then only when the + * PPS signal is disabled. The watchdog counter pps_valid is updated + * once per second by hardclock() and is atomically cleared in this + * routine. + */ +void +hardpps(tvp, usec) + struct timeval *tvp; /* time at PPS */ + long usec; /* hardware counter at PPS */ +{ + long u_usec, v_usec, bigtick; + long cal_sec, cal_usec; + + /* + * An occasional glitch can be produced when the PPS interrupt + * occurs in the hardclock() routine before the time variable is + * updated. Here the offset is discarded when the difference + * between it and the last one is greater than tick/2, but not + * if the interval since the first discard exceeds 30 s. + */ + time_status |= STA_PPSSIGNAL; + time_status &= ~(STA_PPSJITTER | STA_PPSWANDER | STA_PPSERROR); + pps_valid = 0; + u_usec = -tvp->tv_usec; + if (u_usec < -500000) + u_usec += 1000000; + v_usec = pps_offset - u_usec; + if (v_usec < 0) + v_usec = -v_usec; + if (v_usec > (tick >> 1)) { + if (pps_glitch > MAXGLITCH) { + pps_glitch = 0; + pps_tf[2] = u_usec; + pps_tf[1] = u_usec; + } else { + pps_glitch++; + u_usec = pps_offset; + } + } else + pps_glitch = 0; + + /* + * A three-stage median filter is used to help deglitch the pps + * time. The median sample becomes the time offset estimate; the + * difference between the other two samples becomes the time + * dispersion (jitter) estimate. 
+ */ + pps_tf[2] = pps_tf[1]; + pps_tf[1] = pps_tf[0]; + pps_tf[0] = u_usec; + if (pps_tf[0] > pps_tf[1]) { + if (pps_tf[1] > pps_tf[2]) { + pps_offset = pps_tf[1]; /* 0 1 2 */ + v_usec = pps_tf[0] - pps_tf[2]; + } else if (pps_tf[2] > pps_tf[0]) { + pps_offset = pps_tf[0]; /* 2 0 1 */ + v_usec = pps_tf[2] - pps_tf[1]; + } else { + pps_offset = pps_tf[2]; /* 0 2 1 */ + v_usec = pps_tf[0] - pps_tf[1]; + } + } else { + if (pps_tf[1] < pps_tf[2]) { + pps_offset = pps_tf[1]; /* 2 1 0 */ + v_usec = pps_tf[2] - pps_tf[0]; + } else if (pps_tf[2] < pps_tf[0]) { + pps_offset = pps_tf[0]; /* 1 0 2 */ + v_usec = pps_tf[1] - pps_tf[2]; + } else { + pps_offset = pps_tf[2]; /* 1 2 0 */ + v_usec = pps_tf[1] - pps_tf[0]; + } + } + if (v_usec > MAXTIME) + pps_jitcnt++; + v_usec = (v_usec << PPS_AVG) - pps_jitter; + if (v_usec < 0) + pps_jitter -= -v_usec >> PPS_AVG; + else + pps_jitter += v_usec >> PPS_AVG; + if (pps_jitter > (MAXTIME >> 1)) + time_status |= STA_PPSJITTER; + + /* + * During the calibration interval adjust the starting time when + * the tick overflows. At the end of the interval compute the + * duration of the interval and the difference of the hardware + * counters at the beginning and end of the interval. This code + * is deliciously complicated by the fact valid differences may + * exceed the value of tick when using long calibration + * intervals and small ticks. Note that the counter can be + * greater than tick if caught at just the wrong instant, but + * the values returned and used here are correct. 
+ */ + bigtick = (long)tick << SHIFT_USEC; + pps_usec -= pps_freq; + if (pps_usec >= bigtick) + pps_usec -= bigtick; + if (pps_usec < 0) + pps_usec += bigtick; + pps_time.tv_sec++; + pps_count++; + if (pps_count < (1 << pps_shift)) + return; + pps_count = 0; + pps_calcnt++; + u_usec = usec << SHIFT_USEC; + v_usec = pps_usec - u_usec; + if (v_usec >= bigtick >> 1) + v_usec -= bigtick; + if (v_usec < -(bigtick >> 1)) + v_usec += bigtick; + if (v_usec < 0) + v_usec = -(-v_usec >> pps_shift); + else + v_usec = v_usec >> pps_shift; + pps_usec = u_usec; + cal_sec = tvp->tv_sec; + cal_usec = tvp->tv_usec; + cal_sec -= pps_time.tv_sec; + cal_usec -= pps_time.tv_usec; + if (cal_usec < 0) { + cal_usec += 1000000; + cal_sec--; + } + pps_time = *tvp; + + /* + * Check for lost interrupts, noise, excessive jitter and + * excessive frequency error. The number of timer ticks during + * the interval may vary +-1 tick. Add to this a margin of one + * tick for the PPS signal jitter and maximum frequency + * deviation. If the limits are exceeded, the calibration + * interval is reset to the minimum and we start over. + */ + u_usec = (long)tick << 1; + if (!((cal_sec == -1 && cal_usec > (1000000 - u_usec)) + || (cal_sec == 0 && cal_usec < u_usec)) + || v_usec > time_tolerance || v_usec < -time_tolerance) { + pps_errcnt++; + pps_shift = PPS_SHIFT; + pps_intcnt = 0; + time_status |= STA_PPSERROR; + return; + } + + /* + * A three-stage median filter is used to help deglitch the pps + * frequency. The median sample becomes the frequency offset + * estimate; the difference between the other two samples + * becomes the frequency dispersion (stability) estimate. 
+ */ + pps_ff[2] = pps_ff[1]; + pps_ff[1] = pps_ff[0]; + pps_ff[0] = v_usec; + if (pps_ff[0] > pps_ff[1]) { + if (pps_ff[1] > pps_ff[2]) { + u_usec = pps_ff[1]; /* 0 1 2 */ + v_usec = pps_ff[0] - pps_ff[2]; + } else if (pps_ff[2] > pps_ff[0]) { + u_usec = pps_ff[0]; /* 2 0 1 */ + v_usec = pps_ff[2] - pps_ff[1]; + } else { + u_usec = pps_ff[2]; /* 0 2 1 */ + v_usec = pps_ff[0] - pps_ff[1]; + } + } else { + if (pps_ff[1] < pps_ff[2]) { + u_usec = pps_ff[1]; /* 2 1 0 */ + v_usec = pps_ff[2] - pps_ff[0]; + } else if (pps_ff[2] < pps_ff[0]) { + u_usec = pps_ff[0]; /* 1 0 2 */ + v_usec = pps_ff[1] - pps_ff[2]; + } else { + u_usec = pps_ff[2]; /* 1 2 0 */ + v_usec = pps_ff[1] - pps_ff[0]; + } + } + + /* + * Here the frequency dispersion (stability) is updated. If it + * is less than one-fourth the maximum (MAXFREQ), the frequency + * offset is updated as well, but clamped to the tolerance. It + * will be processed later by the hardclock() routine. + */ + v_usec = (v_usec >> 1) - pps_stabil; + if (v_usec < 0) + pps_stabil -= -v_usec >> PPS_AVG; + else + pps_stabil += v_usec >> PPS_AVG; + if (pps_stabil > MAXFREQ >> 2) { + pps_stbcnt++; + time_status |= STA_PPSWANDER; + return; + } + if (time_status & STA_PPSFREQ) { + if (u_usec < 0) { + pps_freq -= -u_usec >> PPS_AVG; + if (pps_freq < -time_tolerance) + pps_freq = -time_tolerance; + u_usec = -u_usec; + } else { + pps_freq += u_usec >> PPS_AVG; + if (pps_freq > time_tolerance) + pps_freq = time_tolerance; + } + } + + /* + * Here the calibration interval is adjusted. If the maximum + * time difference is greater than tick / 4, reduce the interval + * by half. If this is not the case for four consecutive + * intervals, double the interval. 
+ */ + if (u_usec << pps_shift > bigtick >> 2) { + pps_intcnt = 0; + if (pps_shift > PPS_SHIFT) + pps_shift--; + } else if (pps_intcnt >= 4) { + pps_intcnt = 0; + if (pps_shift < PPS_SHIFTMAX) + pps_shift++; + } else + pps_intcnt++; +} +#endif /* PPS_SYNC */ diff --git a/sys/kern/kern_xxx.c b/sys/kern/kern_xxx.c index caa1cdd..17550b6 100644 --- a/sys/kern/kern_xxx.c +++ b/sys/kern/kern_xxx.c @@ -30,114 +30,230 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)kern_xxx.c 8.3 (Berkeley) 2/14/95 + * @(#)kern_xxx.c 8.2 (Berkeley) 11/14/93 + * $Id$ */ #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/kernel.h> #include <sys/proc.h> -#include <sys/reboot.h> -#include <vm/vm.h> #include <sys/sysctl.h> +#include <sys/utsname.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> - -/* ARGSUSED */ -int -reboot(p, uap, retval) - struct proc *p; - struct reboot_args /* { - syscallarg(int) opt; - } */ *uap; - register_t *retval; -{ - int error; - - if (error = suser(p->p_ucred, &p->p_acflag)) - return (error); - boot(SCARG(uap, opt)); - return (0); -} #if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#ifndef _SYS_SYSPROTO_H_ +struct gethostname_args { + char *hostname; + u_int len; +}; +#endif /* ARGSUSED */ int -compat_43_gethostname(p, uap, retval) +ogethostname(p, uap, retval) struct proc *p; - struct compat_43_gethostname_args /* { - syscallarg(char *) hostname; - syscallarg(u_int) len; - } */ *uap; - register_t *retval; + struct gethostname_args *uap; + int *retval; { - int name; + int name[2]; - name = KERN_HOSTNAME; - return (kern_sysctl(&name, 1, SCARG(uap, hostname), &SCARG(uap, len), - 0, 0)); + name[0] = CTL_KERN; + name[1] = KERN_HOSTNAME; + return (userland_sysctl(p, name, 2, uap->hostname, &uap->len, + 1, 0, 0, 0)); } +#ifndef _SYS_SYSPROTO_H_ +struct sethostname_args { + char *hostname; + u_int len; +}; +#endif /* ARGSUSED */ int -compat_43_sethostname(p, uap, retval) 
+osethostname(p, uap, retval) struct proc *p; - register struct compat_43_sethostname_args /* { - syscallarg(char *) hostname; - syscallarg(u_int) len; - } */ *uap; - register_t *retval; + register struct sethostname_args *uap; + int *retval; { - int name; + int name[2]; int error; - if (error = suser(p->p_ucred, &p->p_acflag)) + name[0] = CTL_KERN; + name[1] = KERN_HOSTNAME; + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); - name = KERN_HOSTNAME; - return (kern_sysctl(&name, 1, 0, 0, SCARG(uap, hostname), - SCARG(uap, len))); + return (userland_sysctl(p, name, 2, 0, 0, 0, + uap->hostname, uap->len, 0)); } +#ifndef _SYS_SYSPROTO_H_ +struct ogethostid_args { + int dummy; +}; +#endif /* ARGSUSED */ int -compat_43_gethostid(p, uap, retval) +ogethostid(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct ogethostid_args *uap; + int *retval; { - *(int32_t *)retval = hostid; + *(long *)retval = hostid; return (0); } #endif /* COMPAT_43 || COMPAT_SUNOS */ #ifdef COMPAT_43 +#ifndef _SYS_SYSPROTO_H_ +struct osethostid_args { + long hostid; +}; +#endif /* ARGSUSED */ int -compat_43_sethostid(p, uap, retval) +osethostid(p, uap, retval) struct proc *p; - struct compat_43_sethostid_args /* { - syscallarg(int32_t) hostid; - } */ *uap; - register_t *retval; + struct osethostid_args *uap; + int *retval; { int error; - if (error = suser(p->p_ucred, &p->p_acflag)) + if ((error = suser(p->p_ucred, &p->p_acflag))) return (error); - hostid = SCARG(uap, hostid); + hostid = uap->hostid; return (0); } int -compat_43_quota(p, uap, retval) +oquota(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct oquota_args *uap; + int *retval; { return (ENOSYS); } #endif /* COMPAT_43 */ + +#ifndef _SYS_SYSPROTO_H_ +struct uname_args { + struct utsname *name; +}; +#endif + +/* ARGSUSED */ +int +uname(p, uap, retval) + struct proc *p; + struct uname_args *uap; + int *retval; +{ + int name[2], len, rtval; + char *s, *us; + + name[0] = CTL_KERN; 
+ name[1] = KERN_OSTYPE; + len = sizeof uap->name->sysname; + rtval = userland_sysctl(p, name, 2, uap->name->sysname, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->sysname + sizeof(uap->name->sysname) - 1, 0); + + name[1] = KERN_HOSTNAME; + len = sizeof uap->name->nodename; + rtval = userland_sysctl(p, name, 2, uap->name->nodename, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->nodename + sizeof(uap->name->nodename) - 1, 0); + + name[1] = KERN_OSRELEASE; + len = sizeof uap->name->release; + rtval = userland_sysctl(p, name, 2, uap->name->release, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->release + sizeof(uap->name->release) - 1, 0); + +/* + name = KERN_VERSION; + len = sizeof uap->name->version; + rtval = userland_sysctl(p, name, 2, uap->name->version, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->version + sizeof(uap->name->version) - 1, 0); +*/ + +/* + * this stupid hackery to make the version field look like FreeBSD 1.1 + */ + for(s = version; *s && *s != '#'; s++); + + for(us = uap->name->version; *s && *s != ':'; s++) { + rtval = subyte( us++, *s); + if( rtval) + return rtval; + } + rtval = subyte( us++, 0); + if( rtval) + return rtval; + + name[1] = HW_MACHINE; + len = sizeof uap->name->machine; + rtval = userland_sysctl(p, name, 2, uap->name->machine, &len, + 1, 0, 0, 0); + if( rtval) return rtval; + subyte( uap->name->machine + sizeof(uap->name->machine) - 1, 0); + + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct getdomainname_args { + char *domainname; + int len; +}; +#endif + +/* ARGSUSED */ +int +getdomainname(p, uap, retval) + struct proc *p; + struct getdomainname_args *uap; + int *retval; +{ + int domainnamelen = strlen(domainname) + 1; + if ((u_int)uap->len > domainnamelen + 1) + uap->len = domainnamelen + 1; + return (copyout((caddr_t)domainname, (caddr_t)uap->domainname, uap->len)); +} + +#ifndef _SYS_SYSPROTO_H_ +struct setdomainname_args { + char 
*domainname; + int len; +}; +#endif + +/* ARGSUSED */ +int +setdomainname(p, uap, retval) + struct proc *p; + struct setdomainname_args *uap; + int *retval; +{ + int error, domainnamelen; + + if ((error = suser(p->p_ucred, &p->p_acflag))) + return (error); + if ((u_int)uap->len > sizeof (domainname) - 1) + return EINVAL; + domainnamelen = uap->len; + error = copyin((caddr_t)uap->domainname, domainname, uap->len); + domainname[domainnamelen] = 0; + return (error); +} + diff --git a/sys/kern/makesyscalls.sh b/sys/kern/makesyscalls.sh index 4e2c28c..dc78413 100644 --- a/sys/kern/makesyscalls.sh +++ b/sys/kern/makesyscalls.sh @@ -1,72 +1,43 @@ #! /bin/sh - -# -# @(#)makesyscalls.sh 8.2 (Berkeley) 2/14/95 +# @(#)makesyscalls.sh 8.1 (Berkeley) 6/10/93 +# $Id$ set -e -case $# in - 2) ;; - *) echo "Usage: $0 config-file input-file" 1>&2 - exit 1 - ;; -esac - -# source the config file. -. $1 +# name of compat option: +compat=COMPAT_43 -# the config file sets the following variables: -# sysnames the syscall names file -# sysnumhdr the syscall numbers file -# syssw the syscall switch file -# sysarghdr the syscall argument struct definitions -# compatopts those syscall types that are for 'compat' syscalls -# switchname the name for the 'struct sysent' we define -# namesname the name for the 'char *[]' we define -# constprefix the prefix for the system call constants -# -# NOTE THAT THIS makesyscalls.sh DOES NOT SUPPORT 'LIBCOMPAT'. +# output files: +sysnames="syscalls.c" +sysproto="../sys/sysproto.h" +sysproto_h=_SYS_SYSPROTO_H_ +syshdr="../sys/syscall.h" +syssw="init_sysent.c" +syshide="../sys/syscall-hide.h" +syscallprefix="SYS_" +switchname="sysent" +namesname="syscallnames" # tmp files: sysdcl="sysent.dcl" -syscompat_pref="sysent." 
+syscompat="sysent.compat" +syscompatdcl="sysent.compatdcl" sysent="sysent.switch" +sysinc="sysinc.switch" +sysarg="sysarg.switch" -syscompat_files="" -for file in $compatopts; do - syscompat_files="$syscompat_files $syscompat_pref$file" -done +trap "rm $sysdcl $syscompat $syscompatdcl $sysent $sysinc $sysarg" 0 -trap "rm $sysdcl $syscompat_files $sysent" 0 - -# Awk program (must support nawk extensions) -# Use "awk" at Berkeley, "nawk" or "gawk" elsewhere. -awk=${AWK:-awk} - -# Does this awk have a "toupper" function? (i.e. is it GNU awk) -isgawk=`$awk 'BEGIN { print toupper("true"); exit; }' 2>/dev/null` +case $# in + 0) echo "Usage: $0 input-file <config-file>" 1>&2 + exit 1 + ;; +esac -# If this awk does not define "toupper" then define our own. -if [ "$isgawk" = TRUE ] ; then - # GNU awk provides it. - toupper= -else - # Provide our own toupper() - toupper=' -function toupper(str) { - _toupper_cmd = "echo "str" |tr a-z A-Z" - _toupper_cmd | getline _toupper_str; - close(_toupper_cmd); - return _toupper_str; -}' +if [ -f $2 ]; then + . $2 fi -# before handing it off to awk, make a few adjustments: -# (1) insert spaces around {, }, (, ), *, and commas. -# (2) get rid of any and all dollar signs (so that rcs id use safe) -# -# The awk script will deal with blank lines and lines that -# start with the comment character (';'). 
- sed -e ' s/\$//g :join @@ -79,287 +50,311 @@ s/\$//g 2,${ /^#/!s/\([{}()*,]\)/ \1 /g } -' < $2 | $awk " -$toupper -BEGIN { - sysnames = \"$sysnames\" - sysnumhdr = \"$sysnumhdr\" - sysarghdr = \"$sysarghdr\" - switchname = \"$switchname\" - namesname = \"$namesname\" - constprefix = \"$constprefix\" - - sysdcl = \"$sysdcl\" - syscompat_pref = \"$syscompat_pref\" - sysent = \"$sysent\" - infile = \"$2\" - - compatopts = \"$compatopts\" - "' - - printf "/*\n * System call switch table.\n *\n" > sysdcl - printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysdcl - - ncompat = split(compatopts,compat) - for (i = 1; i <= ncompat; i++) { - compat_upper[i] = toupper(compat[i]) - compat_file[i] = sprintf("%s%s", syscompat_pref, compat[i]) - - printf "\n#ifdef %s\n", compat_upper[i] > compat_file[i] - printf "#define %s(func) __CONCAT(%s_,func)\n\n", \ - compat[i], compat[i] > compat_file[i] +' < $1 | awk " + BEGIN { + sysdcl = \"$sysdcl\" + sysproto = \"$sysproto\" + sysproto_h = \"$sysproto_h\" + syscompat = \"$syscompat\" + syscompatdcl = \"$syscompatdcl\" + sysent = \"$sysent\" + sysinc = \"$sysinc\" + sysarg = \"$sysarg\" + sysnames = \"$sysnames\" + syshdr = \"$syshdr\" + compat = \"$compat\" + syshide = \"$syshide\" + syscallprefix = \"$syscallprefix\" + switchname = \"$switchname\" + namesname = \"$namesname\" + infile = \"$1\" + "' + + printf "/*\n * System call switch table.\n *\n" > sysinc + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysinc + + printf "/*\n * System call prototypes.\n *\n" > sysarg + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarg + + printf "\n#ifdef %s\n\n", compat > syscompat + + printf "/*\n * System call names.\n *\n" > sysnames + printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames + + printf "/*\n * System call numbers.\n *\n" > syshdr + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshdr + printf "/*\n * System call 
hiders.\n *\n" > syshide + printf " * DO NOT EDIT-- this file is automatically generated.\n" > syshide } + NR == 1 { + gsub("[$]Id: ", "", $0) + gsub(" [$]", "", $0) - printf "/*\n * System call names.\n *\n" > sysnames - printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnames + printf " * created from%s\n */\n\n", $0 > sysinc - printf "/*\n * System call numbers.\n *\n" > sysnumhdr - printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysnumhdr + printf "\n#ifdef %s\n", compat > sysent + printf "#define compat(n, name) n, (sy_call_t *)__CONCAT(o,name)\n" > sysent + printf("#else\n") > sysent + printf("#define compat(n, name) 0, (sy_call_t *)nosys\n") > sysent + printf("#endif\n\n") > sysent + printf("/* The casts are bogus but will do for now. */\n") > sysent + printf "struct sysent %s[] = {\n",switchname > sysent - printf "/*\n * System call argument lists.\n *\n" > sysarghdr - printf " * DO NOT EDIT-- this file is automatically generated.\n" > sysarghdr -} -NR == 1 { - printf " * created from%s\n */\n\n", $0 > sysdcl - - printf "#define\ts(type)\tsizeof(type)\n\n" > sysent - printf "struct sysent %s[] = {\n",switchname > sysent + printf " * created from%s\n */\n\n", $0 > sysarg + printf("#ifndef %s\n", sysproto_h) > sysarg + printf("#define\t%s\n\n", sysproto_h) > sysarg + printf "#include <sys/signal.h>\n\n", $0 > sysarg - printf " * created from%s\n */\n\n", $0 > sysnames - printf "char *%s[] = {\n",namesname > sysnames + printf " * created from%s\n */\n\n", $0 > sysnames + printf "char *%s[] = {\n", namesname > sysnames - printf " * created from%s\n */\n\n", $0 > sysnumhdr + printf " * created from%s\n */\n\n", $0 > syshdr - printf " * created from%s\n */\n\n", $0 > sysarghdr - printf "#define\tsyscallarg(x)\tunion { x datum; register_t pad; }\n" \ - > sysarghdr - next -} -NF == 0 || $1 ~ /^;/ { - next -} -$1 ~ /^#[ ]*include/ { - print > sysdcl - next -} -$1 ~ /^#[ ]*if/ { - print > sysent - print > sysdcl - for (i = 1; i 
<= ncompat; i++) - print > compat_file[i] - print > sysnames - savesyscall = syscall - next -} -$1 ~ /^#[ ]*else/ { - print > sysent - print > sysdcl - for (i = 1; i <= ncompat; i++) - print > compat_file[i] - print > sysnames - syscall = savesyscall - next -} -$1 ~ /^#/ { - print > sysent - print > sysdcl - for (i = 1; i <= ncompat; i++) - print > compat_file[i] - print > sysnames - next -} -syscall != $1 { - printf "%s: line %d: syscall number out of sync at %d\n", \ - infile, NR, syscall - printf "line is:\n" - print - exit 1 -} -function parserr(was, wanted) { - printf "%s: line %d: unexpected %s (expected %s)\n", \ - infile, NR, was, wanted - exit 1 -} -function parseline() { - f=3 # toss number and type - if ($NF != "}") { - funcalias=$NF - end=NF-1 - } else { - funcalias="" - end=NF + printf " * created from%s\n */\n\n", $0 > syshide + next } - if ($f != "{") - parserr($f, "{") - f++ - if ($end != "}") - parserr($end, "}") - end-- - if ($end != ";") - parserr($end, ";") - end-- - if ($end != ")") - parserr($end, ")") - end-- - - f++ # toss return type - - funcname=$f - if (funcalias == "") - funcalias=funcname - f++ + NF == 0 || $1 ~ /^;/ { + next + } + $1 ~ /^#[ ]*include/ { + print > sysinc + next + } + $1 ~ /^#[ ]*if/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + print > syshide + savesyscall = syscall + next + } + $1 ~ /^#[ ]*else/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + print > syshide + syscall = savesyscall + next + } + $1 ~ /^#/ { + print > sysent + print > sysdcl + print > sysarg + print > syscompat + print > sysnames + print > syshide + next + } + syscall != $1 { + printf "%s: line %d: syscall number out of sync at %d\n", \ + infile, NR, syscall + printf "line is:\n" + print + exit 1 + } + function parserr(was, wanted) { + printf "%s: line %d: unexpected %s (expected %s)\n", \ + infile, NR, was, wanted + exit 1 + } + function parseline() { + f=4 # 
toss number and type + argc= 0; + bigargc = 0; + if ($NF != "}") { + funcalias=$(NF-2) + argalias=$(NF-1) + rettype=$NF + end=NF-3 + } else { + funcalias="" + argalias="" + rettype="int" + end=NF + } + if ($2 == "NODEF") { + funcname=$4 + return + } + if ($f != "{") + parserr($f, "{") + f++ + if ($end != "}") + parserr($end, "}") + end-- + if ($end != ";") + parserr($end, ";") + end-- + if ($end != ")") + parserr($end, ")") + end-- + + f++ #function return type + + funcname=$f + if (funcalias == "") + funcalias = funcname + if (argalias == "") { + argalias = funcname "_args" + if ($2 == "COMPAT") + argalias = "o" argalias + } + f++ - if ($f != "(") - parserr($f, ")") - f++ + if ($f != "(") + parserr($f, ")") + f++ - argc= 0; - if (f == end) { - if ($f != "void") - parserr($f, "argument definition") - return - } + if (f == end) { + if ($f != "void") + parserr($f, "argument definition") + return + } - while (f <= end) { - argc++ - argtype[argc]="" - oldf="" - while (f < end && $(f+1) != ",") { - if (argtype[argc] != "" && oldf != "*") - argtype[argc] = argtype[argc]" "; - argtype[argc] = argtype[argc]$f; - oldf = $f; - f++ + while (f <= end) { + argc++ + argtype[argc]="" + oldf="" + while (f < end && $(f+1) != ",") { + if (argtype[argc] != "" && oldf != "*") + argtype[argc] = argtype[argc]" "; + argtype[argc] = argtype[argc]$f; + oldf = $f; + f++ + } + if (argtype[argc] == "") + parserr($f, "argument definition") + if (argtype[argc] == "off_t") + bigargc++ + argname[argc]=$f; + f += 2; # skip name, and any comma } - if (argtype[argc] == "") - parserr($f, "argument definition") - argname[argc]=$f; - f += 2; # skip name, and any comma } -} -function putent(nodefs, declfile, compatwrap) { - # output syscall declaration for switch table - if (compatwrap == "") - printf("int\t%s();\n", funcname) > declfile - else - printf("int\t%s(%s)();\n", compatwrap, funcname) > declfile - - # output syscall switch entry -# printf("\t{ { %d", argc) > sysent -# for (i = 1; i <= argc; 
i++) { -# if (i == 5) # wrap the line -# printf(",\n\t ") > sysent -# else -# printf(", ") > sysent -# printf("s(%s)", argtypenospc[i]) > sysent -# } - printf("\t{ %d, ", argc) > sysent - if (argc == 0) - printf("0") > sysent - else if (compatwrap == "") - printf("s(struct %s_args)", funcname) > sysent - else - printf("s(struct %s_%s_args)", compatwrap, funcname) > sysent - if (compatwrap == "") - wfn = sprintf("%s", funcname); - else - wfn = sprintf("%s(%s)", compatwrap, funcname); - printf(",\n\t %s },", wfn) > sysent - for (i = 0; i < (33 - length(wfn)) / 8; i++) - printf("\t") > sysent - if (compatwrap == "") + { comment = $4 + if (NF < 7) + for (i = 5; i <= NF; i++) + comment = comment " " $i + } + $2 == "STD" || $2 == "NODEF" || $2 == "NOARGS" || $2 == "NOPROTO" { + parseline() + if ((!nosys || funcname != "nosys") && \ + (funcname != "lkmnosys")) { + if (argc != 0 && $2 != "NOARGS" && $2 != "NOPROTO") { + printf("struct\t%s {\n", argalias) > sysarg + for (i = 1; i <= argc; i++) + printf("\t%s %s;\n", argtype[i], + argname[i]) > sysarg + printf("};\n") > sysarg + } + else if($2 != "NOARGS" && $2 != "NOPROTO") + printf("struct\t%s {\n\tint dummy;\n};\n", \ + argalias) > sysarg + } + if ($2 != "NOPROTO" && (!nosys || funcname != "nosys") && \ + (!lkmnosys || funcname != "lkmnosys")) { + printf("%s\t%s __P((struct proc *, struct %s *, int []))", \ + rettype, funcname, argalias) > sysdcl + if (funcname == "exit") + printf(" __dead2") > sysdcl + printf(";\n") > sysdcl + } + if (funcname == "nosys") + nosys = 1 + if (funcname == "lkmnosys") + lkmnosys = 1 + printf("\t{ %d, (sy_call_t *)%s },\t\t", \ + argc+bigargc, funcname) > sysent + if(length(funcname) < 11) + printf("\t") > sysent printf("/* %d = %s */\n", syscall, funcalias) > sysent - else - printf("/* %d = %s %s */\n", syscall, compatwrap, - funcalias) > sysent - - # output syscall name for names table - if (compatwrap == "") - printf("\t\"%s\",\t\t\t/* %d = %s */\n", funcalias, syscall, - funcalias) > 
sysnames - else - printf("\t\"%s_%s\",\t/* %d = %s %s */\n", compatwrap, - funcalias, syscall, compatwrap, funcalias) > sysnames - - # output syscall number of header, if appropriate - if (nodefs == "" || nodefs == "NOARGS") - printf("#define\t%s%s\t%d\n", constprefix, funcalias, - syscall) > sysnumhdr - else if (nodefs != "NODEF") - printf("\t\t\t\t/* %d is %s %s */\n", syscall, - compatwrap, funcalias) > sysnumhdr - - # output syscall argument structure, if it has arguments - if (argc != 0 && nodefs != "NOARGS") { - if (compatwrap == "") - printf("\nstruct %s_args {\n", funcname) > sysarghdr - else - printf("\nstruct %s_%s_args {\n", compatwrap, - funcname) > sysarghdr - for (i = 1; i <= argc; i++) - printf("\tsyscallarg(%s) %s;\n", argtype[i], - argname[i]) > sysarghdr - printf("};\n") > sysarghdr + printf("\t\"%s\",\t\t\t/* %d = %s */\n", \ + funcalias, syscall, funcalias) > sysnames + if ($2 != "NODEF") + printf("#define\t%s%s\t%d\n", syscallprefix, \ + funcalias, syscall) > syshdr + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, funcname) > syshide + syscall++ + next } -} -$2 == "STD" { - parseline() - putent("", sysdcl, "") - syscall++ - next -} -$2 == "NODEF" || $2 == "NOARGS" { - parseline() - putent($2, sysdcl, "") - syscall++ - next -} -$2 == "OBSOL" || $2 == "UNIMPL" { - if ($2 == "OBSOL") - comment="obsolete" - else - comment="unimplemented" - for (i = 3; i <= NF; i++) - comment=comment " " $i - - printf("\t{ 0, 0,\n\t nosys },\t\t\t\t/* %d = %s */\n", \ - syscall, comment) > sysent - printf("\t\"#%d (%s)\",\t\t/* %d = %s */\n", \ - syscall, comment, syscall, comment) > sysnames - if ($2 != "UNIMPL") - printf("\t\t\t\t/* %d is %s */\n", syscall, comment) > sysnumhdr - syscall++ - next -} -{ - for (i = 1; i <= ncompat; i++) { - if ($2 == compat_upper[i]) { - parseline(); - putent("COMMENT", compat_file[i], compat[i]) - syscall++ - next + $2 == "COMPAT" || $2 == "CPT_NOA" { + parseline() + if (argc != 0 && $2 != "CPT_NOA") { + printf("struct\t%s {\n", 
argalias) > syscompat + for (i = 1; i <= argc; i++) + printf("\t%s %s;\n", argtype[i], + argname[i]) > syscompat + printf("};\n") > syscompat } + else if($2 != "CPT_NOA") + printf("struct\t%s {\n\tint dummy;\n};\n", \ + argalias) > sysarg + printf("%s\to%s __P((struct proc *, struct %s *, int []));\n", \ + rettype, funcname, argalias) > syscompatdcl + printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n", \ + argc+bigargc, funcname, syscall, funcalias) > sysent + printf("\t\"old.%s\",\t\t/* %d = old %s */\n", \ + funcalias, syscall, funcalias) > sysnames + printf("\t\t\t\t/* %d is old %s */\n", \ + syscall, funcalias) > syshdr + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, funcname) > syshide + syscall++ + next } - printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2 - exit 1 -} -END { - printf "\n#undef\tsyscallarg\n" > sysarghdr - - for (i = 1; i <= ncompat; i++) { - printf("\n#else /* %s */\n", compat_upper[i]) > compat_file[i] - printf("#define %s(func) nosys\n", compat[i]) > \ - compat_file[i] - printf("#endif /* %s */\n\n", compat_upper[i]) > compat_file[i] - } - - printf("};\n\n") > sysent - printf("int\tn%s= sizeof(%s) / sizeof(%s[0]);\n", switchname, - switchname, switchname) > sysent - - printf("};\n") > sysnames -} ' - -cat $sysdcl $syscompat_files $sysent > $syssw - -#chmod 444 $sysnames $syshdr $syssw + $2 == "LIBCOMPAT" { + parseline() + printf("%s\to%s();\n", rettype, funcname) > syscompatdcl + printf("\t{ compat(%d,%s) },\t\t/* %d = old %s */\n", \ + argc+bigargc, funcname, syscall, funcalias) > sysent + printf("\t\"old.%s\",\t\t/* %d = old %s */\n", \ + funcalias, syscall, funcalias) > sysnames + printf("#define\t%s%s\t%d\t/* compatibility; still used by libc */\n", \ + syscallprefix, funcalias, syscall) > syshdr + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, funcname) > syshide + syscall++ + next + } + $2 == "OBSOL" { + printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = obsolete %s */\n", \ + syscall, comment) > sysent + 
printf("\t\"obs_%s\",\t\t\t/* %d = obsolete %s */\n", \ + $4, syscall, comment) > sysnames + printf("\t\t\t\t/* %d is obsolete %s */\n", \ + syscall, comment) > syshdr + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, $4) > syshide + syscall++ + next + } + $2 == "UNIMPL" { + printf("\t{ 0, (sy_call_t *)nosys },\t\t\t/* %d = %s */\n", \ + syscall, comment) > sysent + printf("\t\"#%d\",\t\t\t/* %d = %s */\n", \ + syscall, syscall, comment) > sysnames + if ($3 != "NOHIDE") + printf("HIDE_%s(%s)\n", $3, $4) > syshide + syscall++ + next + } + { + printf "%s: line %d: unrecognized keyword %s\n", infile, NR, $2 + exit 1 + } + END { + printf("\n#endif /* %s */\n", compat) > syscompatdcl + printf("\n#endif /* !%s */\n", sysproto_h) > syscompatdcl + + printf("};\n") > sysent + printf("};\n") > sysnames + printf("#define\t%sMAXSYSCALL\t%d\n", syscallprefix, syscall) \ + > syshdr + } ' + +cat $sysinc $sysent >$syssw +cat $sysarg $sysdcl $syscompat $syscompatdcl > $sysproto diff --git a/sys/kern/md5c.c b/sys/kern/md5c.c new file mode 100644 index 0000000..583d009 --- /dev/null +++ b/sys/kern/md5c.c @@ -0,0 +1,331 @@ +/* + * MD5C.C - RSA Data Security, Inc., MD5 message-digest algorithm + * + * Copyright (C) 1991-2, RSA Data Security, Inc. Created 1991. All + * rights reserved. + * + * License to copy and use this software is granted provided that it + * is identified as the "RSA Data Security, Inc. MD5 Message-Digest + * Algorithm" in all material mentioning or referencing this software + * or this function. + * + * License is also granted to make and use derivative works provided + * that such works are identified as "derived from the RSA Data + * Security, Inc. MD5 Message-Digest Algorithm" in all material + * mentioning or referencing the derived work. + * + * RSA Data Security, Inc. makes no representations concerning either + * the merchantability of this software or the suitability of this + * software for any particular purpose. 
It is provided "as is" + * without express or implied warranty of any kind. + * + * These notices must be retained in any copies of any part of this + * documentation and/or software. + * + * $Id$ + * + * This code is the same as the code published by RSA Inc. It has been + * edited for clarity and style only. + */ + +#include <sys/types.h> + +#ifdef KERNEL +#include <sys/param.h> +#include <sys/systm.h> +#else +#include <string.h> +#endif + +#include <sys/md5.h> + +static void MD5Transform __P((u_int32_t [4], const unsigned char [64])); + +#ifdef KERNEL +#define memset(x,y,z) bzero(x,z); +#define memcpy(x,y,z) bcopy(y, x, z) +#endif + +#ifdef i386 +#define Encode memcpy +#define Decode memcpy +#else /* i386 */ + +/* + * Encodes input (u_int32_t) into output (unsigned char). Assumes len is + * a multiple of 4. + */ + +static void +Encode (output, input, len) + unsigned char *output; + u_int32_t *input; + unsigned int len; +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) { + output[j] = (unsigned char)(input[i] & 0xff); + output[j+1] = (unsigned char)((input[i] >> 8) & 0xff); + output[j+2] = (unsigned char)((input[i] >> 16) & 0xff); + output[j+3] = (unsigned char)((input[i] >> 24) & 0xff); + } +} + +/* + * Decodes input (unsigned char) into output (u_int32_t). Assumes len is + * a multiple of 4. + */ + +static void +Decode (output, input, len) + u_int32_t *output; + const unsigned char *input; + unsigned int len; +{ + unsigned int i, j; + + for (i = 0, j = 0; j < len; i++, j += 4) + output[i] = ((u_int32_t)input[j]) | (((u_int32_t)input[j+1]) << 8) | + (((u_int32_t)input[j+2]) << 16) | (((u_int32_t)input[j+3]) << 24); +} +#endif /* i386 */ + +static unsigned char PADDING[64] = { + 0x80, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 +}; + +/* F, G, H and I are basic MD5 functions. 
*/ +#define F(x, y, z) (((x) & (y)) | ((~x) & (z))) +#define G(x, y, z) (((x) & (z)) | ((y) & (~z))) +#define H(x, y, z) ((x) ^ (y) ^ (z)) +#define I(x, y, z) ((y) ^ ((x) | (~z))) + +/* ROTATE_LEFT rotates x left n bits. */ +#define ROTATE_LEFT(x, n) (((x) << (n)) | ((x) >> (32-(n)))) + +/* + * FF, GG, HH, and II transformations for rounds 1, 2, 3, and 4. + * Rotation is separate from addition to prevent recomputation. + */ +#define FF(a, b, c, d, x, s, ac) { \ + (a) += F ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define GG(a, b, c, d, x, s, ac) { \ + (a) += G ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define HH(a, b, c, d, x, s, ac) { \ + (a) += H ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } +#define II(a, b, c, d, x, s, ac) { \ + (a) += I ((b), (c), (d)) + (x) + (u_int32_t)(ac); \ + (a) = ROTATE_LEFT ((a), (s)); \ + (a) += (b); \ + } + +/* MD5 initialization. Begins an MD5 operation, writing a new context. */ + +void +MD5Init (context) + MD5_CTX *context; +{ + + context->count[0] = context->count[1] = 0; + + /* Load magic initialization constants. */ + context->state[0] = 0x67452301; + context->state[1] = 0xefcdab89; + context->state[2] = 0x98badcfe; + context->state[3] = 0x10325476; +} + +/* + * MD5 block update operation. Continues an MD5 message-digest + * operation, processing another message block, and updating the + * context. 
+ */ + +void +MD5Update (context, input, inputLen) + MD5_CTX *context; + const unsigned char *input; + unsigned int inputLen; +{ + unsigned int i, index, partLen; + + /* Compute number of bytes mod 64 */ + index = (unsigned int)((context->count[0] >> 3) & 0x3F); + + /* Update number of bits */ + if ((context->count[0] += ((u_int32_t)inputLen << 3)) + < ((u_int32_t)inputLen << 3)) + context->count[1]++; + context->count[1] += ((u_int32_t)inputLen >> 29); + + partLen = 64 - index; + + /* Transform as many times as possible. */ + if (inputLen >= partLen) { + memcpy((void *)&context->buffer[index], (void *)input, + partLen); + MD5Transform (context->state, context->buffer); + + for (i = partLen; i + 63 < inputLen; i += 64) + MD5Transform (context->state, &input[i]); + + index = 0; + } + else + i = 0; + + /* Buffer remaining input */ + memcpy ((void *)&context->buffer[index], (void *)&input[i], + inputLen-i); +} + +/* + * MD5 finalization. Ends an MD5 message-digest operation, writing the + * the message digest and zeroizing the context. + */ + +void +MD5Final (digest, context) + unsigned char digest[16]; + MD5_CTX *context; +{ + unsigned char bits[8]; + unsigned int index, padLen; + + /* Save number of bits */ + Encode (bits, context->count, 8); + + /* Pad out to 56 mod 64. */ + index = (unsigned int)((context->count[0] >> 3) & 0x3f); + padLen = (index < 56) ? (56 - index) : (120 - index); + MD5Update (context, PADDING, padLen); + + /* Append length (before padding) */ + MD5Update (context, bits, 8); + + /* Store state in digest */ + Encode (digest, context->state, 16); + + /* Zeroize sensitive information. */ + memset ((void *)context, 0, sizeof (*context)); +} + +/* MD5 basic transformation. Transforms state based on block. 
*/ + +static void +MD5Transform (state, block) + u_int32_t state[4]; + const unsigned char block[64]; +{ + u_int32_t a = state[0], b = state[1], c = state[2], d = state[3], x[16]; + + Decode (x, block, 64); + + /* Round 1 */ +#define S11 7 +#define S12 12 +#define S13 17 +#define S14 22 + FF (a, b, c, d, x[ 0], S11, 0xd76aa478); /* 1 */ + FF (d, a, b, c, x[ 1], S12, 0xe8c7b756); /* 2 */ + FF (c, d, a, b, x[ 2], S13, 0x242070db); /* 3 */ + FF (b, c, d, a, x[ 3], S14, 0xc1bdceee); /* 4 */ + FF (a, b, c, d, x[ 4], S11, 0xf57c0faf); /* 5 */ + FF (d, a, b, c, x[ 5], S12, 0x4787c62a); /* 6 */ + FF (c, d, a, b, x[ 6], S13, 0xa8304613); /* 7 */ + FF (b, c, d, a, x[ 7], S14, 0xfd469501); /* 8 */ + FF (a, b, c, d, x[ 8], S11, 0x698098d8); /* 9 */ + FF (d, a, b, c, x[ 9], S12, 0x8b44f7af); /* 10 */ + FF (c, d, a, b, x[10], S13, 0xffff5bb1); /* 11 */ + FF (b, c, d, a, x[11], S14, 0x895cd7be); /* 12 */ + FF (a, b, c, d, x[12], S11, 0x6b901122); /* 13 */ + FF (d, a, b, c, x[13], S12, 0xfd987193); /* 14 */ + FF (c, d, a, b, x[14], S13, 0xa679438e); /* 15 */ + FF (b, c, d, a, x[15], S14, 0x49b40821); /* 16 */ + + /* Round 2 */ +#define S21 5 +#define S22 9 +#define S23 14 +#define S24 20 + GG (a, b, c, d, x[ 1], S21, 0xf61e2562); /* 17 */ + GG (d, a, b, c, x[ 6], S22, 0xc040b340); /* 18 */ + GG (c, d, a, b, x[11], S23, 0x265e5a51); /* 19 */ + GG (b, c, d, a, x[ 0], S24, 0xe9b6c7aa); /* 20 */ + GG (a, b, c, d, x[ 5], S21, 0xd62f105d); /* 21 */ + GG (d, a, b, c, x[10], S22, 0x2441453); /* 22 */ + GG (c, d, a, b, x[15], S23, 0xd8a1e681); /* 23 */ + GG (b, c, d, a, x[ 4], S24, 0xe7d3fbc8); /* 24 */ + GG (a, b, c, d, x[ 9], S21, 0x21e1cde6); /* 25 */ + GG (d, a, b, c, x[14], S22, 0xc33707d6); /* 26 */ + GG (c, d, a, b, x[ 3], S23, 0xf4d50d87); /* 27 */ + GG (b, c, d, a, x[ 8], S24, 0x455a14ed); /* 28 */ + GG (a, b, c, d, x[13], S21, 0xa9e3e905); /* 29 */ + GG (d, a, b, c, x[ 2], S22, 0xfcefa3f8); /* 30 */ + GG (c, d, a, b, x[ 7], S23, 0x676f02d9); /* 31 */ + GG (b, c, d, a, x[12], S24, 
0x8d2a4c8a); /* 32 */ + + /* Round 3 */ +#define S31 4 +#define S32 11 +#define S33 16 +#define S34 23 + HH (a, b, c, d, x[ 5], S31, 0xfffa3942); /* 33 */ + HH (d, a, b, c, x[ 8], S32, 0x8771f681); /* 34 */ + HH (c, d, a, b, x[11], S33, 0x6d9d6122); /* 35 */ + HH (b, c, d, a, x[14], S34, 0xfde5380c); /* 36 */ + HH (a, b, c, d, x[ 1], S31, 0xa4beea44); /* 37 */ + HH (d, a, b, c, x[ 4], S32, 0x4bdecfa9); /* 38 */ + HH (c, d, a, b, x[ 7], S33, 0xf6bb4b60); /* 39 */ + HH (b, c, d, a, x[10], S34, 0xbebfbc70); /* 40 */ + HH (a, b, c, d, x[13], S31, 0x289b7ec6); /* 41 */ + HH (d, a, b, c, x[ 0], S32, 0xeaa127fa); /* 42 */ + HH (c, d, a, b, x[ 3], S33, 0xd4ef3085); /* 43 */ + HH (b, c, d, a, x[ 6], S34, 0x4881d05); /* 44 */ + HH (a, b, c, d, x[ 9], S31, 0xd9d4d039); /* 45 */ + HH (d, a, b, c, x[12], S32, 0xe6db99e5); /* 46 */ + HH (c, d, a, b, x[15], S33, 0x1fa27cf8); /* 47 */ + HH (b, c, d, a, x[ 2], S34, 0xc4ac5665); /* 48 */ + + /* Round 4 */ +#define S41 6 +#define S42 10 +#define S43 15 +#define S44 21 + II (a, b, c, d, x[ 0], S41, 0xf4292244); /* 49 */ + II (d, a, b, c, x[ 7], S42, 0x432aff97); /* 50 */ + II (c, d, a, b, x[14], S43, 0xab9423a7); /* 51 */ + II (b, c, d, a, x[ 5], S44, 0xfc93a039); /* 52 */ + II (a, b, c, d, x[12], S41, 0x655b59c3); /* 53 */ + II (d, a, b, c, x[ 3], S42, 0x8f0ccc92); /* 54 */ + II (c, d, a, b, x[10], S43, 0xffeff47d); /* 55 */ + II (b, c, d, a, x[ 1], S44, 0x85845dd1); /* 56 */ + II (a, b, c, d, x[ 8], S41, 0x6fa87e4f); /* 57 */ + II (d, a, b, c, x[15], S42, 0xfe2ce6e0); /* 58 */ + II (c, d, a, b, x[ 6], S43, 0xa3014314); /* 59 */ + II (b, c, d, a, x[13], S44, 0x4e0811a1); /* 60 */ + II (a, b, c, d, x[ 4], S41, 0xf7537e82); /* 61 */ + II (d, a, b, c, x[11], S42, 0xbd3af235); /* 62 */ + II (c, d, a, b, x[ 2], S43, 0x2ad7d2bb); /* 63 */ + II (b, c, d, a, x[ 9], S44, 0xeb86d391); /* 64 */ + + state[0] += a; + state[1] += b; + state[2] += c; + state[3] += d; + + /* Zeroize sensitive information. 
*/ + memset ((void *)x, 0, sizeof (x)); +} diff --git a/sys/kern/subr_autoconf.c b/sys/kern/subr_autoconf.c index 7281339..f48ce99 100644 --- a/sys/kern/subr_autoconf.c +++ b/sys/kern/subr_autoconf.c @@ -39,15 +39,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)subr_autoconf.c 8.3 (Berkeley) 5/17/94 + * @(#)subr_autoconf.c 8.1 (Berkeley) 6/10/93 * - * from: $Header: subr_autoconf.c,v 1.12 93/02/01 19:31:48 torek Exp $ (LBL) + * $Id$ */ #include <sys/param.h> #include <sys/device.h> #include <sys/malloc.h> -#include <libkern/libkern.h> /* * Autoconfiguration subroutines. @@ -284,16 +283,15 @@ config_attach(parent, cf, aux, print) void **nsp; if (old == 0) { - new = max(MINALLOCSIZE / sizeof(void *), - dev->dv_unit + 1); - newbytes = new * sizeof(void *); - nsp = malloc(newbytes, M_DEVBUF, M_WAITOK); /*XXX*/ - bzero(nsp, newbytes); + nsp = malloc(MINALLOCSIZE, M_DEVBUF, M_WAITOK); /*XXX*/ + bzero(nsp, MINALLOCSIZE); + cd->cd_ndevs = MINALLOCSIZE / sizeof(void *); } else { new = cd->cd_ndevs; do { new *= 2; } while (new <= dev->dv_unit); + cd->cd_ndevs = new; oldbytes = old * sizeof(void *); newbytes = new * sizeof(void *); nsp = malloc(newbytes, M_DEVBUF, M_WAITOK); /*XXX*/ @@ -301,7 +299,6 @@ config_attach(parent, cf, aux, print) bzero(&nsp[old], newbytes - oldbytes); free(cd->cd_devs, M_DEVBUF); } - cd->cd_ndevs = new; cd->cd_devs = nsp; } if (cd->cd_devs[dev->dv_unit]) diff --git a/sys/kern/subr_clist.c b/sys/kern/subr_clist.c new file mode 100644 index 0000000..d907b47 --- /dev/null +++ b/sys/kern/subr_clist.c @@ -0,0 +1,694 @@ +/* + * Copyright (c) 1994, David Greenman + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice unmodified, this list of conditions, and the following + * disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $Id: tty_subr.c,v 1.26 1997/03/05 16:45:01 bde Exp $ + */ + +/* + * clist support routines + */ + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/systm.h> +#include <sys/tty.h> +#include <sys/clist.h> +#include <sys/malloc.h> + +static void clist_init __P((void *)); +SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL) + +static struct cblock *cfreelist = 0; +int cfreecount = 0; +static int cslushcount; +static int ctotcount; + +#ifndef INITIAL_CBLOCKS +#define INITIAL_CBLOCKS 50 +#endif + +static struct cblock *cblock_alloc __P((void)); +static void cblock_alloc_cblocks __P((int number)); +static void cblock_free __P((struct cblock *cblockp)); +static void cblock_free_cblocks __P((int number)); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(cbstat, cbstat) +{ + printf( + "tot = %d (active = %d, free = %d (reserved = %d, slush = %d))\n", + ctotcount * CBSIZE, ctotcount * CBSIZE - cfreecount, cfreecount, + cfreecount - 
cslushcount * CBSIZE, cslushcount * CBSIZE); +} +#endif /* DDB */ + +/* + * Called from init_main.c + */ +/* ARGSUSED*/ +static void +clist_init(dummy) + void *dummy; +{ + /* + * Allocate an initial base set of cblocks as a 'slush'. + * We allocate non-slush cblocks with each initial ttyopen() and + * deallocate them with each ttyclose(). + * We should adjust the slush allocation. This can't be done in + * the i/o routines because they are sometimes called from + * interrupt handlers when it may be unsafe to call malloc(). + */ + cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS); +} + +/* + * Remove a cblock from the cfreelist queue and return a pointer + * to it. + */ +static inline struct cblock * +cblock_alloc() +{ + struct cblock *cblockp; + + cblockp = cfreelist; + if (cblockp == NULL) + panic("clist reservation botch"); + cfreelist = cblockp->c_next; + cblockp->c_next = NULL; + cfreecount -= CBSIZE; + return (cblockp); +} + +/* + * Add a cblock to the cfreelist queue. + */ +static inline void +cblock_free(cblockp) + struct cblock *cblockp; +{ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) + bzero(cblockp->c_quote, sizeof cblockp->c_quote); + cblockp->c_next = cfreelist; + cfreelist = cblockp; + cfreecount += CBSIZE; +} + +/* + * Allocate some cblocks for the cfreelist queue. + */ +static void +cblock_alloc_cblocks(number) + int number; +{ + int i; + struct cblock *cbp; + + for (i = 0; i < number; ++i) { + cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT); + if (cbp == NULL) { + printf( +"clist_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n"); + cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK); + } + /* + * Freed cblocks have zero quotes and garbage elsewhere. + * Set the may-have-quote bit to force zeroing the quotes. + */ + setbit(cbp->c_quote, CBQSIZE * NBBY - 1); + cblock_free(cbp); + } + ctotcount += number; +} + +/* + * Set the cblock allocation policy for a a clist. + * Must be called in process context at spltty(). 
+ */ +void +clist_alloc_cblocks(clistp, ccmax, ccreserved) + struct clist *clistp; + int ccmax; + int ccreserved; +{ + int dcbr; + + /* + * Allow for wasted space at the head. + */ + if (ccmax != 0) + ccmax += CBSIZE - 1; + if (ccreserved != 0) + ccreserved += CBSIZE - 1; + + clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE; + dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved; + if (dcbr >= 0) + cblock_alloc_cblocks(dcbr); + else { + if (clistp->c_cbreserved + dcbr < clistp->c_cbcount) + dcbr = clistp->c_cbcount - clistp->c_cbreserved; + cblock_free_cblocks(-dcbr); + } + clistp->c_cbreserved += dcbr; +} + +/* + * Free some cblocks from the cfreelist queue back to the + * system malloc pool. + */ +static void +cblock_free_cblocks(number) + int number; +{ + int i; + + for (i = 0; i < number; ++i) + free(cblock_alloc(), M_TTYS); + ctotcount -= number; +} + +/* + * Free the cblocks reserved for a clist. + * Must be called at spltty(). + */ +void +clist_free_cblocks(clistp) + struct clist *clistp; +{ + if (clistp->c_cbcount != 0) + panic("freeing active clist cblocks"); + cblock_free_cblocks(clistp->c_cbreserved); + clistp->c_cbmax = 0; + clistp->c_cbreserved = 0; +} + +/* + * Get a character from the head of a clist. + */ +int +getc(clistp) + struct clist *clistp; +{ + int chr = -1; + int s; + struct cblock *cblockp; + + s = spltty(); + + /* If there are characters in the list, get one */ + if (clistp->c_cc) { + cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + chr = (u_char)*clistp->c_cf; + + /* + * If this char is quoted, set the flag. + */ + if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * Advance to next character. + */ + clistp->c_cf++; + clistp->c_cc--; + /* + * If we have advanced the 'first' character pointer + * past the end of this cblock, advance to the next one. + * If there are no more characters, set the first and + * last pointers to NULL. 
In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (chr); +} + +/* + * Copy 'amount' of chars, beginning at head of clist 'clistp' to + * destination linear buffer 'dest'. Return number of characters + * actually copied. + */ +int +q_to_b(clistp, dest, amount) + struct clist *clistp; + char *dest; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + char *dest_orig = dest; + int numc; + int s; + + s = spltty(); + + while (clistp && amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + bcopy(clistp->c_cf, dest, numc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + dest += numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (dest - dest_orig); +} + +/* + * Flush 'amount' of chars, beginning at head of clist 'clistp'. 
+ */ +void +ndflush(clistp, amount) + struct clist *clistp; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + int numc; + int s; + + s = spltty(); + + while (amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); +} + +/* + * Add a character to the end of a clist. Return -1 is no + * more clists, or 0 for success. 
+ */ +int +putc(chr, clistp) + int chr; + struct clist *clistp; +{ + struct cblock *cblockp; + int s; + + s = spltty(); + + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("putc to a clist with no reserved cblocks\n"); + return (-1); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + if (((long)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = (cblockp - 1); + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (-1); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + } + + /* + * If this character is quoted, set the quote bit, if not, clear it. + */ + if (chr & TTY_QUOTE) { + setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + /* + * Use one of the spare quote bits to record that something + * may be quoted. + */ + setbit(cblockp->c_quote, CBQSIZE * NBBY - 1); + } else + clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + + *clistp->c_cl++ = chr; + clistp->c_cc++; + + splx(s); + return (0); +} + +/* + * Copy data from linear buffer to clist chain. Return the + * number of characters not copied. + */ +int +b_to_q(src, amount, clistp) + char *src; + int amount; + struct clist *clistp; +{ + struct cblock *cblockp; + char *firstbyte, *lastbyte; + u_char startmask, endmask; + int startbit, endbit, num_between, numc; + int s; + + /* + * Avoid allocating an initial cblock and then not using it. + * c_cc == 0 must imply c_cbount == 0. + */ + if (amount <= 0) + return (amount); + + s = spltty(); + + /* + * If there are no cblocks assigned to this clist yet, + * then get one. 
+ */ + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("b_to_q to a clist with no reserved cblocks.\n"); + return (amount); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + } + + while (amount) { + /* + * Get another cblock if needed. + */ + if (((long)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = cblockp - 1; + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (amount); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + + /* + * Copy a chunk of the linear buffer up to the end + * of this cblock. + */ + numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl); + bcopy(src, clistp->c_cl, numc); + + /* + * Clear quote bits if they aren't known to be clear. + * The following could probably be made into a seperate + * "bitzero()" routine, but why bother? + */ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) { + startbit = clistp->c_cl - (char *)cblockp->c_info; + endbit = startbit + numc - 1; + + firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY); + lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY); + + /* + * Calculate mask of bits to preserve in first and + * last bytes. + */ + startmask = NBBY - (startbit % NBBY); + startmask = 0xff >> startmask; + endmask = (endbit % NBBY); + endmask = 0xff << (endmask + 1); + + if (firstbyte != lastbyte) { + *firstbyte &= startmask; + *lastbyte &= endmask; + + num_between = lastbyte - firstbyte - 1; + if (num_between) + bzero(firstbyte + 1, num_between); + } else { + *firstbyte &= (startmask | endmask); + } + } + + /* + * ...and update pointer for the next chunk. 
+ */ + src += numc; + clistp->c_cl += numc; + clistp->c_cc += numc; + amount -= numc; + /* + * If we go through the loop again, it's always + * for data in the next cblock, so by adding one (cblock), + * (which makes the pointer 1 beyond the end of this + * cblock) we prepare for the assignment of 'prev' + * above. + */ + cblockp += 1; + + } + + splx(s); + return (amount); +} + +/* + * Get the next character in the clist. Store it at dst. Don't + * advance any clist pointers, but return a pointer to the next + * character position. + */ +char * +nextc(clistp, cp, dst) + struct clist *clistp; + char *cp; + int *dst; +{ + struct cblock *cblockp; + + ++cp; + /* + * See if the next character is beyond the end of + * the clist. + */ + if (clistp->c_cc && (cp != clistp->c_cl)) { + /* + * If the next character is beyond the end of this + * cblock, advance to the next cblock. + */ + if (((long)cp & CROUND) == 0) + cp = ((struct cblock *)cp - 1)->c_next->c_info; + cblockp = (struct cblock *)((long)cp & ~CROUND); + + /* + * Get the character. Set the quote flag if this character + * is quoted. + */ + *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0); + + return (cp); + } + + return (NULL); +} + +/* + * "Unput" a character from a clist. + */ +int +unputc(clistp) + struct clist *clistp; +{ + struct cblock *cblockp = 0, *cbp = 0; + int s; + int chr = -1; + + + s = spltty(); + + if (clistp->c_cc) { + --clistp->c_cc; + --clistp->c_cl; + + chr = (u_char)*clistp->c_cl; + + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + + /* + * Set quote flag if this character was quoted. + */ + if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * If all of the characters have been unput in this + * cblock, then find the previous one and free this + * one. 
+ */ + if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) { + cbp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + + while (cbp->c_next != cblockp) + cbp = cbp->c_next; + + /* + * When the previous cblock is at the end, the 'last' + * pointer always points (invalidly) one past. + */ + clistp->c_cl = (char *)(cbp+1); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + cbp->c_next = NULL; + } + } + + /* + * If there are no more characters on the list, then + * free the last cblock. + */ + if ((clistp->c_cc == 0) && clistp->c_cl) { + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + clistp->c_cf = clistp->c_cl = NULL; + } + + splx(s); + return (chr); +} + +/* + * Move characters in source clist to destination clist, + * preserving quote bits. + */ +void +catq(src_clistp, dest_clistp) + struct clist *src_clistp, *dest_clistp; +{ + int chr, s; + + s = spltty(); + /* + * If the destination clist is empty (has no cblocks atttached), + * and there are no possible complications with the resource counters, + * then we simply assign the current clist to the destination. + */ + if (!dest_clistp->c_cf + && src_clistp->c_cbcount <= src_clistp->c_cbmax + && src_clistp->c_cbcount <= dest_clistp->c_cbmax) { + dest_clistp->c_cf = src_clistp->c_cf; + dest_clistp->c_cl = src_clistp->c_cl; + src_clistp->c_cf = src_clistp->c_cl = NULL; + + dest_clistp->c_cc = src_clistp->c_cc; + src_clistp->c_cc = 0; + dest_clistp->c_cbcount = src_clistp->c_cbcount; + src_clistp->c_cbcount = 0; + + splx(s); + return; + } + + splx(s); + + /* + * XXX This should probably be optimized to more than one + * character at a time. 
+ */ + while ((chr = getc(src_clistp)) != -1) + putc(chr, dest_clistp); +} diff --git a/sys/kern/subr_disklabel.c b/sys/kern/subr_disklabel.c new file mode 100644 index 0000000..94315de --- /dev/null +++ b/sys/kern/subr_disklabel.c @@ -0,0 +1,406 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)ufs_disksubr.c 8.5 (Berkeley) 1/21/94 + * $Id$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/disklabel.h> +#include <sys/diskslice.h> +#include <sys/syslog.h> + +/* + * Seek sort for disks. + * + * The argument ap structure holds a b_actf activity chain pointer on which we + * keep two queues, sorted in ascending block order. The first queue holds + * those requests which are positioned after the current block (in the first + * request); the second holds requests which came in after their block number + * was passed. Thus we implement a one way scan, retracting after reaching the + * end of the drive to the first request on the second queue, at which time it + * becomes the first queue. + * + * A one-way scan is natural because of the way UNIX read-ahead blocks are + * allocated. + */ + +void +tqdisksort(ap, bp) + struct buf_queue_head *ap; + register struct buf *bp; +{ + register struct buf *bq; + struct buf *bn; + + /* If the queue is empty, then it's easy. 
*/ + if ((bq = ap->tqh_first) == NULL) { + TAILQ_INSERT_HEAD(ap, bp, b_act); + return; + } + +#if 1 + /* Put new writes after all reads */ + if ((bp->b_flags & B_READ) == 0) { + while (bn = bq->b_act.tqe_next) { + if ((bq->b_flags & B_READ) == 0) + break; + bq = bn; + } + } else { + while (bn = bq->b_act.tqe_next) { + if ((bq->b_flags & B_READ) == 0) { + if (ap->tqh_first != bq) { + bq = *bq->b_act.tqe_prev; + } + break; + } + bq = bn; + } + goto insert; + } +#endif + + /* + * If we lie after the first (currently active) request, then we + * must locate the second request list and add ourselves to it. + */ + if (bp->b_pblkno < bq->b_pblkno) { + while (bn = bq->b_act.tqe_next) { + /* + * Check for an ``inversion'' in the normally ascending + * cylinder numbers, indicating the start of the second + * request list. + */ + if (bn->b_pblkno < bq->b_pblkno) { + /* + * Search the second request list for the first + * request at a larger cylinder number. We go + * before that; if there is no such request, we + * go at end. + */ + do { + if (bp->b_pblkno < bn->b_pblkno) + goto insert; + bq = bn; + } while (bn = bq->b_act.tqe_next); + goto insert; /* after last */ + } + bq = bn; + } + /* + * No inversions... we will go after the last, and + * be the first request in the second request list. + */ + goto insert; + } + /* + * Request is at/after the current request... + * sort in the first request list. + */ + while (bn = bq->b_act.tqe_next) { + /* + * We want to go after the current request if there is an + * inversion after it (i.e. it is the end of the first + * request list), or if the next request is a larger cylinder + * than our request. + */ + if (bn->b_pblkno < bq->b_pblkno || + bp->b_pblkno < bn->b_pblkno) + goto insert; + bq = bn; + } + /* + * Neither a second list nor a larger request... we go at the end of + * the first list, which is the same as the end of the whole schebang. 
+ */ +insert: + TAILQ_INSERT_AFTER(ap, bq, bp, b_act); +} + + +/* + * Attempt to read a disk label from a device using the indicated strategy + * routine. The label must be partly set up before this: secpercyl, secsize + * and anything required in the strategy routine (e.g., dummy bounds for the + * partition containing the label) must be * filled in before calling us. + * Returns NULL on success and an error string on failure. + */ +char * +readdisklabel(dev, strat, lp) + dev_t dev; + d_strategy_t *strat; + register struct disklabel *lp; +{ + register struct buf *bp; + struct disklabel *dlp; + char *msg = NULL; + + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE); + bp->b_bcount = lp->d_secsize; + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + if (biowait(bp)) + msg = "I/O error"; + else for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *)((char *)bp->b_data + + DEV_BSIZE - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic != DISKMAGIC || dlp->d_magic2 != DISKMAGIC) { + if (msg == NULL) + msg = "no disk label"; + } else if (dlp->d_npartitions > MAXPARTITIONS || + dkcksum(dlp) != 0) + msg = "disk label corrupted"; + else { + *lp = *dlp; + msg = NULL; + break; + } + } + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (msg); +} + +/* + * Check new disk label for sensibility before setting it. + */ +int +setdisklabel(olp, nlp, openmask) + register struct disklabel *olp, *nlp; + u_long openmask; +{ + register i; + register struct partition *opp, *npp; + + /* + * Check it is actually a disklabel we are looking at. + */ + if (nlp->d_magic != DISKMAGIC || nlp->d_magic2 != DISKMAGIC || + dkcksum(nlp) != 0) + return (EINVAL); + /* + * For each partition that we think is open, + */ + while ((i = ffs((long)openmask)) != 0) { + i--; + /* + * Check it is not changing.... 
+ */ + openmask &= ~(1 << i); + if (nlp->d_npartitions <= i) + return (EBUSY); + opp = &olp->d_partitions[i]; + npp = &nlp->d_partitions[i]; + if (npp->p_offset != opp->p_offset || npp->p_size < opp->p_size) + return (EBUSY); + /* + * Copy internally-set partition information + * if new label doesn't include it. XXX + * (If we are using it then we had better stay the same type) + * This is possibly dubious, as someone else noted (XXX) + */ + if (npp->p_fstype == FS_UNUSED && opp->p_fstype != FS_UNUSED) { + npp->p_fstype = opp->p_fstype; + npp->p_fsize = opp->p_fsize; + npp->p_frag = opp->p_frag; + npp->p_cpg = opp->p_cpg; + } + } + nlp->d_checksum = 0; + nlp->d_checksum = dkcksum(nlp); + *olp = *nlp; + return (0); +} + +/* + * Write disk label back to device after modification. + */ +int +writedisklabel(dev, strat, lp) + dev_t dev; + d_strategy_t *strat; + register struct disklabel *lp; +{ + struct buf *bp; + struct disklabel *dlp; + int labelpart; + int error = 0; + + labelpart = dkpart(dev); + if (lp->d_partitions[labelpart].p_offset != 0) { + if (lp->d_partitions[0].p_offset != 0) + return (EXDEV); /* not quite right */ + labelpart = 0; + } + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dkmodpart(dev, labelpart); + bp->b_blkno = LABELSECTOR * ((int)lp->d_secsize/DEV_BSIZE); + bp->b_bcount = lp->d_secsize; +#if 1 + /* + * We read the label first to see if it's there, + * in which case we will put ours at the same offset into the block.. + * (I think this is stupid [Julian]) + * Note that you can't write a label out over a corrupted label! + * (also stupid.. how do you write the first one? by raw writes?) 
+ */ + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + error = biowait(bp); + if (error) + goto done; + for (dlp = (struct disklabel *)bp->b_data; + dlp <= (struct disklabel *) + ((char *)bp->b_data + lp->d_secsize - sizeof(*dlp)); + dlp = (struct disklabel *)((char *)dlp + sizeof(long))) { + if (dlp->d_magic == DISKMAGIC && dlp->d_magic2 == DISKMAGIC && + dkcksum(dlp) == 0) { + *dlp = *lp; + bp->b_flags &= ~(B_DONE | B_READ); + bp->b_flags |= B_BUSY | B_WRITE; + (*strat)(bp); + error = biowait(bp); + goto done; + } + } + error = ESRCH; +done: +#else + bzero(bp->b_data, lp->d_secsize); + dlp = (struct disklabel *)bp->b_data; + *dlp = *lp; + bp->b_flags &= ~B_INVAL; + bp->b_flags |= B_BUSY | B_WRITE; + (*strat)(bp); + error = biowait(bp); +#endif + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + return (error); +} + +/* + * Compute checksum for disk label. + */ +u_int +dkcksum(lp) + register struct disklabel *lp; +{ + register u_short *start, *end; + register u_short sum = 0; + + start = (u_short *)lp; + end = (u_short *)&lp->d_partitions[lp->d_npartitions]; + while (start < end) + sum ^= *start++; + return (sum); +} + +/* + * Disk error is the preface to plaintive error messages + * about failing disk transfers. It prints messages of the form + +hp0g: hard error reading fsbn 12345 of 12344-12347 (hp0 bn %d cn %d tn %d sn %d) + + * if the offset of the error in the transfer and a disk label + * are both available. blkdone should be -1 if the position of the error + * is unknown; the disklabel pointer may be null from drivers that have not + * been converted to use them. The message is printed with printf + * if pri is LOG_PRINTF, otherwise it uses log at the specified priority. + * The message should be completed (with at least a newline) with printf + * or addlog, respectively. There is no trailing space. 
+ */ +void +diskerr(bp, dname, what, pri, blkdone, lp) + register struct buf *bp; + char *dname, *what; + int pri, blkdone; + register struct disklabel *lp; +{ + int unit = dkunit(bp->b_dev); + int slice = dkslice(bp->b_dev); + int part = dkpart(bp->b_dev); + register int (*pr) __P((const char *, ...)); + char partname[2]; + char *sname; + int sn; + + if (pri != LOG_PRINTF) { + log(pri, ""); + pr = addlog; + } else + pr = printf; + sname = dsname(dname, unit, slice, part, partname); + (*pr)("%s%s: %s %sing fsbn ", sname, partname, what, + bp->b_flags & B_READ ? "read" : "writ"); + sn = bp->b_blkno; + if (bp->b_bcount <= DEV_BSIZE) + (*pr)("%d", sn); + else { + if (blkdone >= 0) { + sn += blkdone; + (*pr)("%d of ", sn); + } + (*pr)("%d-%d", bp->b_blkno, + bp->b_blkno + (bp->b_bcount - 1) / DEV_BSIZE); + } + if (lp && (blkdone >= 0 || bp->b_bcount <= lp->d_secsize)) { +#ifdef tahoe + sn *= DEV_BSIZE / lp->d_secsize; /* XXX */ +#endif + sn += lp->d_partitions[part].p_offset; + /* + * XXX should add slice offset and not print the slice, + * but we don't know the slice pointer. + * XXX should print bp->b_pblkno so that this will work + * independent of slices, labels and bad sector remapping, + * but some drivers don't set bp->b_pblkno. + */ + (*pr)(" (%s bn %d; cn %d", sname, sn, sn / lp->d_secpercyl); + sn %= lp->d_secpercyl; + (*pr)(" tn %d sn %d)", sn / lp->d_nsectors, sn % lp->d_nsectors); + } +} diff --git a/sys/kern/subr_diskmbr.c b/sys/kern/subr_diskmbr.c new file mode 100644 index 0000000..8983e950c --- /dev/null +++ b/sys/kern/subr_diskmbr.c @@ -0,0 +1,456 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. 
Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $Id$ + */ + +#include <stddef.h> +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/conf.h> +#include <sys/disklabel.h> +#define DOSPTYP_EXTENDED 5 +#define DOSPTYP_ONTRACK 84 +#include <sys/diskslice.h> +#include <sys/malloc.h> +#include <sys/syslog.h> +#include <sys/systm.h> + +#define TRACE(str) do { if (dsi_debug) printf str; } while (0) + +static volatile u_char dsi_debug; + +static struct dos_partition historical_bogus_partition_table[NDOSPART] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, }, + { 0x80, 0, 1, 0, DOSPTYP_386BSD, 255, 255, 255, 0, 50000, }, +}; + +static int check_part __P((char *sname, struct dos_partition *dp, + u_long offset, int nsectors, int ntracks, + u_long mbr_offset)); +static void extended __P((char *dname, dev_t dev, d_strategy_t *strat, + struct disklabel *lp, struct diskslices *ssp, + u_long ext_offset, u_long ext_size, + u_long base_ext_offset, int nsectors, int ntracks, + u_long mbr_offset)); + +static int +check_part(sname, dp, offset, nsectors, ntracks, mbr_offset ) + char *sname; + struct dos_partition *dp; + u_long offset; + int nsectors; + int ntracks; + u_long mbr_offset; +{ + int chs_ecyl; + int chs_esect; + int chs_scyl; + int chs_ssect; + int error; + u_long esector; + u_long esector1; + u_long secpercyl; + u_long ssector; + u_long ssector1; + + secpercyl = (u_long)nsectors * ntracks; + chs_scyl = DPCYL(dp->dp_scyl, dp->dp_ssect); + chs_ssect = DPSECT(dp->dp_ssect); + ssector = chs_ssect - 1 + dp->dp_shd * nsectors + chs_scyl * secpercyl + + mbr_offset; + ssector1 = offset + dp->dp_start; + + /* + * If ssector1 is on a cylinder >= 1024, then ssector can't be right. + * Allow the C/H/S for it to be 1023/ntracks-1/nsectors, or correct + * apart from the cylinder being reduced modulo 1024. 
+ */ + if (ssector < ssector1 + && ((chs_ssect == nsectors && dp->dp_shd == ntracks - 1 + && chs_scyl == 1023) + || (ssector1 - ssector) % (1024 * secpercyl) == 0) + || (dp->dp_scyl == 255 && dp->dp_shd == 255 + && dp->dp_ssect == 255)) { + TRACE(("%s: C/H/S start %d/%d/%d, start %lu: allow\n", + sname, chs_scyl, dp->dp_shd, chs_ssect, ssector1)); + ssector = ssector1; + } + + chs_ecyl = DPCYL(dp->dp_ecyl, dp->dp_esect); + chs_esect = DPSECT(dp->dp_esect); + esector = chs_esect - 1 + dp->dp_ehd * nsectors + chs_ecyl * secpercyl + + mbr_offset; + esector1 = ssector1 + dp->dp_size - 1; + + /* Allow certain bogus C/H/S values for esector, as above. */ + if (esector < esector1 + && ((chs_esect == nsectors && dp->dp_ehd == ntracks - 1 + && chs_ecyl == 1023) + || (esector1 - esector) % (1024 * secpercyl) == 0) + || (dp->dp_ecyl == 255 && dp->dp_ehd == 255 + && dp->dp_esect == 255)) { + TRACE(("%s: C/H/S end %d/%d/%d, end %lu: allow\n", + sname, chs_ecyl, dp->dp_ehd, chs_esect, esector1)); + esector = esector1; + } + + error = (ssector == ssector1 && esector == esector1) ? 0 : EINVAL; + if (bootverbose) + printf("%s: type 0x%x, start %lu, end = %lu, size %lu %s\n", + sname, dp->dp_typ, ssector1, esector1, dp->dp_size, + error ? 
"" : ": OK"); + if (ssector != ssector1 && bootverbose) + printf("%s: C/H/S start %d/%d/%d (%lu) != start %lu: invalid\n", + sname, chs_scyl, dp->dp_shd, chs_ssect, + ssector, ssector1); + if (esector != esector1 && bootverbose) + printf("%s: C/H/S end %d/%d/%d (%lu) != end %lu: invalid\n", + sname, chs_ecyl, dp->dp_ehd, chs_esect, + esector, esector1); + return (error); +} + +int +dsinit(dname, dev, strat, lp, sspp) + char *dname; + dev_t dev; + d_strategy_t *strat; + struct disklabel *lp; + struct diskslices **sspp; +{ + struct buf *bp; + u_char *cp; + int dospart; + struct dos_partition *dp; + struct dos_partition *dp0; + int error; + int max_ncyls; + int max_nsectors; + int max_ntracks; + u_long mbr_offset; + char partname[2]; + u_long secpercyl; + char *sname; + struct diskslice *sp; + struct diskslices *ssp; + + /* + * Allocate a dummy slices "struct" and initialize it to contain + * only an empty compatibility slice (pointing to itself) and a + * whole disk slice (covering the disk as described by the label). + * If there is an error, then the dummy struct becomes final. + */ + ssp = malloc(offsetof(struct diskslices, dss_slices) + + BASE_SLICE * sizeof *sp, M_DEVBUF, M_WAITOK); + *sspp = ssp; + ssp->dss_first_bsd_slice = COMPATIBILITY_SLICE; + ssp->dss_nslices = BASE_SLICE; + sp = &ssp->dss_slices[0]; + bzero(sp, BASE_SLICE * sizeof *sp); + sp[WHOLE_DISK_SLICE].ds_size = lp->d_secperunit; + + mbr_offset = DOSBBSECTOR; +reread_mbr: + /* Read master boot record. */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dkmodpart(dkmodslice(dev, WHOLE_DISK_SLICE), RAW_PART); + bp->b_blkno = mbr_offset; + bp->b_bcount = lp->d_secsize; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + if (biowait(bp) != 0) { + diskerr(bp, dname, "error reading primary partition table", + LOG_PRINTF, 0, lp); + printf("\n"); + error = EIO; + goto done; + } + + /* Weakly verify it. 
*/ + cp = bp->b_un.b_addr; + sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART, + partname); + if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) { + printf("%s: invalid primary partition table: no magic\n", + sname); + error = EINVAL; + goto done; + } + dp0 = (struct dos_partition *)(cp + DOSPARTOFF); + + /* Check for "Ontrack Diskmanager". */ + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + if (dp->dp_typ == DOSPTYP_ONTRACK) { + if (bootverbose) + printf( + "%s: Found \"Ontrack Disk Manager\" on this disk.\n", sname); + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + mbr_offset = 63; + goto reread_mbr; + } + } + + if (bcmp(dp0, historical_bogus_partition_table, + sizeof historical_bogus_partition_table) == 0) { + TRACE(("%s: invalid primary partition table: historical\n", + sname)); + error = EINVAL; + goto done; + } + + /* Guess the geometry. */ + /* + * TODO: + * Perhaps skip entries with 0 size. + * Perhaps only look at entries of type DOSPTYP_386BSD. + */ + max_ncyls = 0; + max_nsectors = 0; + max_ntracks = 0; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + int ncyls; + int nsectors; + int ntracks; + + ncyls = DPCYL(dp->dp_ecyl, dp->dp_esect) + 1; + if (max_ncyls < ncyls) + max_ncyls = ncyls; + nsectors = DPSECT(dp->dp_esect); + if (max_nsectors < nsectors) + max_nsectors = nsectors; + ntracks = dp->dp_ehd + 1; + if (max_ntracks < ntracks) + max_ntracks = ntracks; + } + + /* + * Check that we have guessed the geometry right by checking the + * partition entries. + */ + /* + * TODO: + * As above. + * Check for overlaps. + * Check against d_secperunit if the latter is reliable. 
+ */ + error = 0; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++) { + if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0 + && dp->dp_start == 0 && dp->dp_size == 0) + continue; + sname = dsname(dname, dkunit(dev), BASE_SLICE + dospart, + RAW_PART, partname); + + /* + * Temporarily ignore errors from this check. We could + * simplify things by accepting the table eariler if we + * always ignore errors here. Perhaps we should always + * accept the table if the magic is right but not let + * bad entries affect the geometry. + */ + check_part(sname, dp, mbr_offset, max_nsectors, max_ntracks, + mbr_offset); + } + if (error != 0) + goto done; + + /* + * Accept the DOS partition table. + * First adjust the label (we have been careful not to change it + * before we can guarantee success). + */ + secpercyl = (u_long)max_nsectors * max_ntracks; + if (secpercyl != 0) { + u_long secperunit; + + lp->d_nsectors = max_nsectors; + lp->d_ntracks = max_ntracks; + lp->d_secpercyl = secpercyl; + secperunit = secpercyl * max_ncyls; + if (lp->d_secperunit < secperunit) + lp->d_secperunit = secperunit; + lp->d_ncylinders = lp->d_secperunit / secpercyl; + } + + /* + * Free the dummy slices "struct" and allocate a real new one. + * Initialize special slices as above. + */ + free(ssp, M_DEVBUF); + ssp = malloc(offsetof(struct diskslices, dss_slices) +#define MAX_SLICES_SUPPORTED MAX_SLICES /* was (BASE_SLICE + NDOSPART) */ + + MAX_SLICES_SUPPORTED * sizeof *sp, M_DEVBUF, M_WAITOK); + *sspp = ssp; + ssp->dss_first_bsd_slice = COMPATIBILITY_SLICE; + sp = &ssp->dss_slices[0]; + bzero(sp, MAX_SLICES_SUPPORTED * sizeof *sp); + sp[WHOLE_DISK_SLICE].ds_size = lp->d_secperunit; + + /* Initialize normal slices. 
*/ + sp += BASE_SLICE; + for (dospart = 0, dp = dp0; dospart < NDOSPART; dospart++, dp++, sp++) { + sp->ds_offset = mbr_offset + dp->dp_start; + sp->ds_size = dp->dp_size; + sp->ds_type = dp->dp_typ; +#if 0 + lp->d_subtype |= (lp->d_subtype & 3) | dospart + | DSTYPE_INDOSPART; +#endif + } + ssp->dss_nslices = BASE_SLICE + NDOSPART; + + /* Handle extended partitions. */ + sp -= NDOSPART; + for (dospart = 0; dospart < NDOSPART; dospart++, sp++) + if (sp->ds_type == DOSPTYP_EXTENDED) + extended(dname, bp->b_dev, strat, lp, ssp, + sp->ds_offset, sp->ds_size, sp->ds_offset, + max_nsectors, max_ntracks, mbr_offset); + +done: + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); + if (error == EINVAL) + error = 0; + return (error); +} + +void +extended(dname, dev, strat, lp, ssp, ext_offset, ext_size, base_ext_offset, + nsectors, ntracks, mbr_offset) + char *dname; + dev_t dev; + struct disklabel *lp; + d_strategy_t *strat; + struct diskslices *ssp; + u_long ext_offset; + u_long ext_size; + u_long base_ext_offset; + int nsectors; + int ntracks; + u_long mbr_offset; +{ + struct buf *bp; + u_char *cp; + int dospart; + struct dos_partition *dp; + u_long ext_offsets[NDOSPART]; + u_long ext_sizes[NDOSPART]; + char partname[2]; + int slice; + char *sname; + struct diskslice *sp; + + /* Read extended boot record. */ + bp = geteblk((int)lp->d_secsize); + bp->b_dev = dev; + bp->b_blkno = ext_offset; + bp->b_bcount = lp->d_secsize; + bp->b_flags |= B_BUSY | B_READ; + (*strat)(bp); + if (biowait(bp) != 0) { + diskerr(bp, dname, "error reading extended partition table", + LOG_PRINTF, 0, lp); + printf("\n"); + goto done; + } + + /* Weakly verify it. 
*/ + cp = bp->b_un.b_addr; + if (cp[0x1FE] != 0x55 || cp[0x1FF] != 0xAA) { + sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, RAW_PART, + partname); + printf("%s: invalid extended partition table: no magic\n", + sname); + goto done; + } + + for (dospart = 0, + dp = (struct dos_partition *)(bp->b_un.b_addr + DOSPARTOFF), + slice = ssp->dss_nslices, sp = &ssp->dss_slices[slice]; + dospart < NDOSPART; dospart++, dp++) { + ext_sizes[dospart] = 0; + if (dp->dp_scyl == 0 && dp->dp_shd == 0 && dp->dp_ssect == 0 + && dp->dp_start == 0 && dp->dp_size == 0) + continue; + if (dp->dp_typ == DOSPTYP_EXTENDED) { + char buf[32]; + + sname = dsname(dname, dkunit(dev), WHOLE_DISK_SLICE, + RAW_PART, partname); + strcpy(buf, sname); + if (strlen(buf) < sizeof buf - 11) + strcat(buf, "<extended>"); + check_part(buf, dp, base_ext_offset, nsectors, + ntracks, mbr_offset); + ext_offsets[dospart] = base_ext_offset + dp->dp_start; + ext_sizes[dospart] = dp->dp_size; + } else { + sname = dsname(dname, dkunit(dev), slice, RAW_PART, + partname); + check_part(sname, dp, ext_offset, nsectors, ntracks, + mbr_offset); + if (slice >= MAX_SLICES) { + printf("%s: too many slices\n", sname); + slice++; + continue; + } + sp->ds_offset = ext_offset + dp->dp_start; + sp->ds_size = dp->dp_size; + sp->ds_type = dp->dp_typ; + ssp->dss_nslices++; + slice++; + sp++; + } + } + + /* If we found any more slices, recursively find all the subslices. */ + for (dospart = 0; dospart < NDOSPART; dospart++) + if (ext_sizes[dospart] != 0) + extended(dname, dev, strat, lp, ssp, + ext_offsets[dospart], ext_sizes[dospart], + base_ext_offset, nsectors, ntracks, + mbr_offset); + +done: + bp->b_flags |= B_INVAL | B_AGE; + brelse(bp); +} diff --git a/sys/kern/subr_diskslice.c b/sys/kern/subr_diskslice.c new file mode 100644 index 0000000..44e01b0 --- /dev/null +++ b/sys/kern/subr_diskslice.c @@ -0,0 +1,1066 @@ +/*- + * Copyright (c) 1994 Bruce D. Evans. + * All rights reserved. 
+ * + * Copyright (c) 1990 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)wd.c 7.2 (Berkeley) 5/9/91 + * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $ + * from: @(#)ufs_disksubr.c 7.16 (Berkeley) 5/4/91 + * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $ + * $Id: subr_diskslice.c,v 1.35 1997/02/22 09:39:15 peter Exp $ + */ + +#include <sys/param.h> +#include <sys/buf.h> +#include <sys/conf.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif +#include <sys/disklabel.h> +#include <sys/diskslice.h> +#include <sys/dkbad.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> +#include <sys/stat.h> +#include <sys/syslog.h> +#include <sys/systm.h> +#include <sys/vnode.h> + +#include <ufs/ufs/dinode.h> +#include <ufs/ffs/fs.h> + +#define TRACE(str) do { if (ds_debug) printf str; } while (0) + +typedef u_char bool_t; + +static volatile bool_t ds_debug; + +static void dsiodone __P((struct buf *bp)); +static char *fixlabel __P((char *sname, struct diskslice *sp, + struct disklabel *lp, int writeflag)); +static void free_ds_label __P((struct diskslices *ssp, int slice)); +#ifdef DEVFS +static void free_ds_labeldevs __P((struct diskslices *ssp, int slice)); +#endif +static void partition_info __P((char *sname, int part, struct partition *pp)); +static void slice_info __P((char *sname, struct diskslice *sp)); +static void set_ds_bad __P((struct diskslices *ssp, int slice, + struct dkbad_intern *btp)); +static void set_ds_label __P((struct diskslices *ssp, int slice, + struct disklabel *lp)); +#ifdef 
DEVFS
static void set_ds_labeldevs __P((char *dname, dev_t dev,
				  struct diskslices *ssp));
static void set_ds_labeldevs_unaliased __P((char *dname, dev_t dev,
					    struct diskslices *ssp));
#endif
static void set_ds_wlabel __P((struct diskslices *ssp, int slice,
			       int wlabel));

/*
 * Determine the size of the transfer, and make sure it is
 * within the boundaries of the partition. Adjust transfer
 * if needed, and signal errors or early completion.
 *
 * Returns 1 when the (possibly adjusted) transfer should proceed,
 * 0 for early completion (EOF at end of partition), and -1 on error
 * (B_ERROR set and b_error filled in).
 *
 * XXX TODO:
 *	o Do bad sector remapping.  May need to split buffer.
 *	o Split buffers that are too big for the device.
 *	o Check for overflow.
 *	o Finish cleaning this up.
 */
int
dscheck(bp, ssp)
	struct buf *bp;
	struct diskslices *ssp;
{
	daddr_t	blkno;
	daddr_t	labelsect;
	struct disklabel *lp;
	u_long	maxsz;
	char	*msg;
	struct partition *pp;
	struct diskslice *sp;
	long	sz;

	if (bp->b_blkno < 0) {
		Debugger("Slice code got negative blocknumber");
		bp->b_error = EINVAL;
		goto bad;
	}

	sp = &ssp->dss_slices[dkslice(bp->b_dev)];
	lp = sp->ds_label;
	/* Transfer size in DEV_BSIZE blocks, rounded up. */
	sz = (bp->b_bcount + DEV_BSIZE - 1) >> DEV_BSHIFT;
	if (lp == NULL) {
		/*
		 * No label: address raw within the slice; pick an
		 * impossible label sector so the label-protection and
		 * label-snooping tests below can never match.
		 */
		blkno = bp->b_blkno;
		labelsect = -LABELSECTOR - 1;
		maxsz = sp->ds_size;
	} else {
		labelsect = lp->d_partitions[LABEL_PART].p_offset;
if (labelsect != 0) Debugger("labelsect != 0 in dscheck()");
		pp = &lp->d_partitions[dkpart(bp->b_dev)];
		/* Slice-relative block number of the start of the i/o. */
		blkno = pp->p_offset + bp->b_blkno;
		maxsz = pp->p_size;
		if (sp->ds_bad != NULL && ds_debug) {
			daddr_t	newblkno;

			/*
			 * NOTE(review): remapping is only reported here,
			 * never applied (see the XXX TODO above).
			 */
			newblkno = transbad144(sp->ds_bad, blkno);
			if (newblkno != blkno)
				printf("should map bad block %lu -> %lu\n",
				       blkno, newblkno);
		}
	}

	/* overwriting disk label ?
 */
	/* XXX should also protect bootstrap in first 8K */
	if (blkno <= LABELSECTOR + labelsect &&
#if LABELSECTOR != 0
	    bp->b_blkno + sz > LABELSECTOR + labelsect &&
#endif
	    (bp->b_flags & B_READ) == 0 && sp->ds_wlabel == 0) {
		bp->b_error = EROFS;
		goto bad;
	}

#if defined(DOSBBSECTOR) && defined(notyet)
	/* overwriting master boot record? */
	if (blkno <= DOSBBSECTOR && (bp->b_flags & B_READ) == 0 &&
	    sp->ds_wlabel == 0) {
		bp->b_error = EROFS;
		goto bad;
	}
#endif

	/* beyond partition? */
	if (bp->b_blkno < 0 || bp->b_blkno + sz > maxsz) {
		/* if exactly at end of disk, return an EOF */
		if (bp->b_blkno == maxsz) {
			bp->b_resid = bp->b_bcount;
			return (0);
		}
		/* or truncate if part of it fits */
		sz = maxsz - bp->b_blkno;
		if (sz <= 0) {
			bp->b_error = EINVAL;
			goto bad;
		}
		bp->b_bcount = sz << DEV_BSHIFT;
	}

	/* Physical block number: bias by the slice's absolute offset. */
	bp->b_pblkno = blkno + sp->ds_offset;

	/*
	 * Snoop on label accesses if the slice offset is nonzero.  Fudge
	 * offsets in the label to keep the in-core label coherent with
	 * the on-disk one.
	 */
	if (blkno <= LABELSECTOR + labelsect
#if LABELSECTOR != 0
	    && bp->b_blkno + sz > LABELSECTOR + labelsect
#endif
	    && sp->ds_offset != 0) {
		struct iodone_chain *ic;

		/* Save the buf's iodone state; dsiodone() restores it. */
		ic = malloc(sizeof *ic, M_DEVBUF, M_WAITOK);
		ic->ic_prev_flags = bp->b_flags;
		ic->ic_prev_iodone = bp->b_iodone;
		ic->ic_prev_iodone_chain = bp->b_iodone_chain;
		/* Byte offset of the label within this transfer. */
		ic->ic_args[0].ia_long = (LABELSECTOR + labelsect - blkno)
					 << DEV_BSHIFT;
		if (lp)
			ic->ic_args[0].ia_long *= lp->d_secsize / DEV_BSIZE;
		ic->ic_args[1].ia_ptr = sp;
		bp->b_flags |= B_CALL;
		bp->b_iodone = dsiodone;
		bp->b_iodone_chain = ic;
		if (!(bp->b_flags & B_READ)) {
			/*
			 * XXX even disklabel(8) writes directly so we need
			 * to adjust writes.  Perhaps we should drop support
			 * for DIOCWLABEL (always write protect labels) and
			 * require the use of DIOCWDINFO.
			 *
			 * XXX probably need to copy the data to avoid even
			 * temporarily corrupting the in-core copy.
			 */
			if (bp->b_vp != NULL)
				bp->b_vp->v_numoutput++;
			msg = fixlabel((char *)NULL, sp,
				       (struct disklabel *)
				       (bp->b_data + ic->ic_args[0].ia_long),
				       TRUE);
			if (msg != NULL) {
				printf("%s\n", msg);
				bp->b_error = EROFS;
				goto bad;
			}
		}
	}
	return (1);

bad:
	bp->b_flags |= B_ERROR;
	return (-1);
}

/*
 * Close one partition of a slice: clear its bit in the per-mode
 * (block or character) open mask and recompute the combined mask.
 */
void
dsclose(dev, mode, ssp)
	dev_t	dev;
	int	mode;
	struct diskslices *ssp;
{
	u_char	mask;
	struct diskslice *sp;

	sp = &ssp->dss_slices[dkslice(dev)];
	mask = 1 << dkpart(dev);
	switch (mode) {
	case S_IFBLK:
		sp->ds_bopenmask &= ~mask;
		break;
	case S_IFCHR:
		sp->ds_copenmask &= ~mask;
		break;
	}
	/* ds_openmask is always the union of the two per-mode masks. */
	sp->ds_openmask = sp->ds_bopenmask | sp->ds_copenmask;
}

/*
 * Tear down a slices structure: free each slice's bad-sector table,
 * remove its devfs entries, free its in-core label, then free the
 * structure itself and clear the caller's pointer.
 */
void
dsgone(sspp)
	struct diskslices **sspp;
{
	int	slice;
	struct diskslice *sp;
	struct diskslices *ssp;

	for (slice = 0, ssp = *sspp; slice < ssp->dss_nslices; slice++) {
		sp = &ssp->dss_slices[slice];
		if (sp->ds_bad != NULL) {
			free(sp->ds_bad, M_DEVBUF);
			/*
			 * set_ds_bad() also clears the aliased pointer in
			 * the compatibility / first-BSD slice, so the
			 * shared table cannot be freed twice.
			 */
			set_ds_bad(ssp, slice, (struct dkbad_intern *)NULL);
		}
#ifdef DEVFS
		if (sp->ds_bdev != NULL)
			devfs_remove_dev(sp->ds_bdev);
		if (sp->ds_cdev != NULL)
			devfs_remove_dev(sp->ds_cdev);
#endif
		free_ds_label(ssp, slice);
	}
	free(ssp, M_DEVBUF);
	*sspp = NULL;
}

/*
 * For the "write" commands (DIOCSBAD, DIOCSDINFO and DIOCWDINFO), this
 * is subject to the same restriction as dsopen().
 */
/*
 * Slice/label ioctl handler.  Returns 0 on success, an errno on
 * failure, or -1 for commands not handled here (note: -1 is a
 * sentinel, not an errno; presumably the calling driver then tries
 * the command itself — confirm against driver usage).
 */
int
dsioctl(dname, dev, cmd, data, flags, sspp, strat, setgeom)
	char	*dname;
	dev_t	dev;
	int	cmd;
	caddr_t	data;
	int	flags;
	struct diskslices **sspp;
	d_strategy_t *strat;
	ds_setgeom_t *setgeom;
{
	int	error;
	struct disklabel *lp;
	int	old_wlabel;
	int	slice;
	struct diskslice *sp;
	struct diskslices *ssp;

	slice = dkslice(dev);
	ssp = *sspp;
	sp = &ssp->dss_slices[slice];
	lp = sp->ds_label;
	switch (cmd) {

	case DIOCGDINFO:
		/* Copy out the in-core label, if any. */
		if (lp == NULL)
			return (EINVAL);
		*(struct disklabel *)data = *lp;
		return (0);

#ifdef notyet
	case DIOCGDINFOP:
		if (lp == NULL)
			return (EINVAL);
		*(struct disklabel **)data = lp;
		return (0);
#endif

	case DIOCGPART:
		/* Hand back pointers into the in-core label. */
		if (lp == NULL)
			return (EINVAL);
		((struct partinfo *)data)->disklab = lp;
		((struct partinfo *)data)->part
			= &lp->d_partitions[dkpart(dev)];
		return (0);

	case DIOCGSLICEINFO:
		*(struct diskslices *)data = *ssp;
		return (0);

	case DIOCSBAD:
		/* Replace the slice's internalized bad-sector table. */
		if (slice == WHOLE_DISK_SLICE)
			return (ENODEV);
		if (!(flags & FWRITE))
			return (EBADF);
		if (lp == NULL)
			return (EINVAL);
		if (sp->ds_bad != NULL)
			free(sp->ds_bad, M_DEVBUF);
		set_ds_bad(ssp, slice, internbad144((struct dkbad *)data, lp));
		return (0);

	case DIOCSDINFO:
		/* Set the in-core label (no write to disk). */
		if (slice == WHOLE_DISK_SLICE)
			return (ENODEV);
		if (!(flags & FWRITE))
			return (EBADF);
		lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK);
		if (sp->ds_label == NULL)
			bzero(lp, sizeof *lp);
		else
			bcopy(sp->ds_label, lp, sizeof *lp);
		error = setdisklabel(lp, (struct disklabel *)data,
				     sp->ds_label != NULL
				     ? sp->ds_openmask : (u_long)0);
		/* XXX why doesn't setdisklabel() check this? */
		if (error == 0 && lp->d_partitions[RAW_PART].p_offset != 0)
			error = EINVAL;
#if 0 /* XXX */
		if (error != 0 && setgeom != NULL)
			error = setgeom(lp);
#endif
		if (error != 0) {
			free(lp, M_DEVBUF);
			return (error);
		}
		free_ds_label(ssp, slice);
		set_ds_label(ssp, slice, lp);
#ifdef DEVFS
		set_ds_labeldevs(dname, dev, ssp);
#endif
		return (0);

	case DIOCSYNCSLICEINFO:
		/*
		 * Rebuild the slice table from the media.  Only allowed
		 * on the raw partition of the whole-disk slice; unless
		 * forced (*data nonzero), refuse while anything else on
		 * the unit is open.
		 */
		if (slice != WHOLE_DISK_SLICE || dkpart(dev) != RAW_PART)
			return (EINVAL);
		if (!*(int *)data)
			for (slice = 0; slice < ssp->dss_nslices; slice++) {
				u_char openmask;

				openmask = ssp->dss_slices[slice].ds_openmask;
				if (openmask
				    && (slice != WHOLE_DISK_SLICE
					|| openmask & ~(1 << RAW_PART)))
					return (EBUSY);
			}

		/*
		 * Temporarily forget the current slices struct and read
		 * the current one.
		 * XXX should wait for current accesses on this disk to
		 * complete, then lock out future accesses and opens.
		 */
		*sspp = NULL;
		lp = malloc(sizeof *lp, M_DEVBUF, M_WAITOK);
		*lp = *ssp->dss_slices[WHOLE_DISK_SLICE].ds_label;
		error = dsopen(dname, dev,
			       ssp->dss_slices[WHOLE_DISK_SLICE].ds_copenmask
			       & (1 << RAW_PART) ? S_IFCHR : S_IFBLK,
			       sspp, lp, strat, setgeom, ssp->dss_bdevsw,
			       ssp->dss_cdevsw);
		if (error != 0) {
			free(lp, M_DEVBUF);
			*sspp = ssp;
			return (error);
		}

		/*
		 * Reopen everything.  This is a no-op except in the "force"
		 * case and when the raw bdev and cdev are both open.  Abort
		 * if anything fails.
		 */
		for (slice = 0; slice < ssp->dss_nslices; slice++) {
			u_char openmask;
			int part;

			for (openmask = ssp->dss_slices[slice].ds_bopenmask,
			     part = 0; openmask; openmask >>= 1, part++) {
				if (!(openmask & 1))
					continue;
				error = dsopen(dname,
					       dkmodslice(dkmodpart(dev, part),
							  slice),
					       S_IFBLK, sspp, lp, strat,
					       setgeom, ssp->dss_bdevsw,
					       ssp->dss_cdevsw);
				if (error != 0) {
					/* XXX should free devfs toks. */
					free(lp, M_DEVBUF);
					/* XXX should restore devfs toks. */
					*sspp = ssp;
					return (EBUSY);
				}
			}
			for (openmask = ssp->dss_slices[slice].ds_copenmask,
			     part = 0; openmask; openmask >>= 1, part++) {
				if (!(openmask & 1))
					continue;
				error = dsopen(dname,
					       dkmodslice(dkmodpart(dev, part),
							  slice),
					       S_IFCHR, sspp, lp, strat,
					       setgeom, ssp->dss_bdevsw,
					       ssp->dss_cdevsw);
				if (error != 0) {
					/* XXX should free devfs toks. */
					free(lp, M_DEVBUF);
					/* XXX should restore devfs toks. */
					*sspp = ssp;
					return (EBUSY);
				}
			}
		}

		/* XXX devfs tokens? */
		free(lp, M_DEVBUF);
		/* Success: discard the old slices struct. */
		dsgone(&ssp);
		return (0);

	case DIOCWDINFO:
		/* Set the in-core label, then write it to disk. */
		error = dsioctl(dname, dev, DIOCSDINFO, data, flags, &ssp,
				strat, setgeom);
		if (error != 0)
			return (error);
		/*
		 * XXX this used to hack on dk_openpart to fake opening
		 * partition 0 in case that is used instead of dkpart(dev).
		 */
		old_wlabel = sp->ds_wlabel;
		set_ds_wlabel(ssp, slice, TRUE);
		error = writedisklabel(dev, strat, sp->ds_label);
		/* XXX should invalidate in-core label if write failed. */
		set_ds_wlabel(ssp, slice, old_wlabel);
		return (error);

	case DIOCWLABEL:
		/* Allow or forbid writes to the label sector. */
		if (slice == WHOLE_DISK_SLICE)
			return (ENODEV);
		if (!(flags & FWRITE))
			return (EBADF);
		set_ds_wlabel(ssp, slice, *(int *)data != 0);
		return (0);

	default:
		return (-1);
	}
}

/*
 * Iodone hook installed by dscheck() for transfers that overlap the
 * label sector of an offset slice: restore the buf's saved iodone
 * state, un-fudge the label copy in the buffer (after any write, or
 * after a successful read), free the chain record and finish the i/o.
 */
static void
dsiodone(bp)
	struct buf *bp;
{
	struct iodone_chain *ic;
	char	*msg;

	ic = bp->b_iodone_chain;
	bp->b_flags = (ic->ic_prev_flags & B_CALL)
		      | (bp->b_flags & ~(B_CALL | B_DONE));
	bp->b_iodone = ic->ic_prev_iodone;
	bp->b_iodone_chain = ic->ic_prev_iodone_chain;
	if (!(bp->b_flags & B_READ)
	    || (!(bp->b_flags & B_ERROR) && bp->b_error == 0)) {
		msg = fixlabel((char *)NULL, ic->ic_args[1].ia_ptr,
			       (struct disklabel *)
			       (bp->b_data + ic->ic_args[0].ia_long),
			       FALSE);
		if (msg != NULL)
			printf("%s\n", msg);
	}
	free(ic, M_DEVBUF);
	biodone(bp);
}

/*
 * Return 1 if any partition of any slice is open, else 0.
 */
int
dsisopen(ssp)
	struct diskslices *ssp;
{
	int	slice;

	if (ssp == NULL)
		return (0);
	for (slice = 0; slice < ssp->dss_nslices; slice++)
		if (ssp->dss_slices[slice].ds_openmask)
			return (1);
	return (0);
}

/*
 * Build the name for a unit/slice/partition ("wd0s1a" style) and store
 * the partition suffix ('a' + part, or empty) in *partname, which must
 * have room for 2 bytes.
 *
 * NOTE(review): returns a pointer to a static buffer, so this is not
 * reentrant — callers must consume or copy the result immediately.
 * NOTE(review): name[32] with a 16-char dname plus "%d" and "s%d" could
 * in principle overflow for absurd unit/slice numbers — confirm the
 * callers' ranges.
 */
char *
dsname(dname, unit, slice, part, partname)
	char	*dname;
	int	unit;
	int	slice;
	int	part;
	char	*partname;
{
	static char name[32];

	if (strlen(dname) > 16)
		dname = "nametoolong";
	sprintf(name, "%s%d", dname, unit);
	partname[0] = '\0';
	if (slice != WHOLE_DISK_SLICE || part != RAW_PART) {
		partname[0] = 'a' + part;
		partname[1] = '\0';
		/* Slice names are numbered from the slice index minus 1. */
		if (slice != COMPATIBILITY_SLICE)
			sprintf(name + strlen(name), "s%d", slice - 1);
	}
	return (name);
}

/*
 * This should only be called when the unit is inactive and the strategy
 * routine should not allow it to become active unless we call it.  Our
 * strategy routine must be special to allow activity.
 */
/*
 * Open one partition of a slice.  When nothing on the unit is open,
 * first (re)build the slice table from the media, read and fix up the
 * disklabels for all slices, and create devfs entries; then record the
 * partition as open in the slice's block or character open mask.
 */
int
dsopen(dname, dev, mode, sspp, lp, strat, setgeom, bdevsw, cdevsw)
	char	*dname;
	dev_t	dev;
	int	mode;
	struct diskslices **sspp;
	struct disklabel *lp;
	d_strategy_t *strat;
	ds_setgeom_t *setgeom;
	struct bdevsw *bdevsw;
	struct cdevsw *cdevsw;
{
	struct dkbad *btp;
	dev_t	dev1;
	int	error;
	struct disklabel *lp1;
	char	*msg;
	u_char	mask;
#ifdef DEVFS
	int	mynor;
#endif
	bool_t	need_init;
	int	part;
	char	partname[2];
	int	slice;
	char	*sname;
	struct diskslice *sp;
	struct diskslices *ssp;
	int	unit;

	/*
	 * XXX reinitialize the slice table unless there is an open device
	 * on the unit.  This should only be done if the media has changed.
	 */
	ssp = *sspp;
	need_init = !dsisopen(ssp);
	if (ssp != NULL && need_init)
		dsgone(sspp);
	if (need_init) {
		TRACE(("dsinit\n"));
		error = dsinit(dname, dev, strat, lp, sspp);
		if (error != 0) {
			dsgone(sspp);
			return (error);
		}
		lp->d_npartitions = RAW_PART + 1;
		lp->d_partitions[RAW_PART].p_size = lp->d_secperunit;
		ssp = *sspp;
#ifdef DEVFS
		ssp->dss_bdevsw = bdevsw;
		ssp->dss_cdevsw = cdevsw;
#endif

		/*
		 * If there are no real slices, then make the compatibility
		 * slice cover the whole disk.
		 */
		if (ssp->dss_nslices == BASE_SLICE)
			ssp->dss_slices[COMPATIBILITY_SLICE].ds_size
				= lp->d_secperunit;

		/* Point the compatibility slice at the BSD slice, if any. */
		for (slice = BASE_SLICE; slice < ssp->dss_nslices; slice++) {
			sp = &ssp->dss_slices[slice];
			if (sp->ds_type == DOSPTYP_386BSD /* XXX */) {
				ssp->dss_first_bsd_slice = slice;
				ssp->dss_slices[COMPATIBILITY_SLICE].ds_offset
					= sp->ds_offset;
				ssp->dss_slices[COMPATIBILITY_SLICE].ds_size
					= sp->ds_size;
				ssp->dss_slices[COMPATIBILITY_SLICE].ds_type
					= sp->ds_type;
				break;
			}
		}

		lp1 = malloc(sizeof *lp1, M_DEVBUF, M_WAITOK);
		*lp1 = *lp;

		/*
		 * Initialize defaults for the label for the whole disk so
		 * that it can be used as a template for disklabel(8).
		 * d_rpm = 3600 is unlikely to be correct for a modern
		 * disk, but d_rpm is normally irrelevant.
		 */
		if (lp1->d_rpm == 0)
			lp1->d_rpm = 3600;
		if (lp1->d_interleave == 0)
			lp1->d_interleave = 1;
		if (lp1->d_npartitions == 0)
			lp1->d_npartitions = MAXPARTITIONS;
		if (lp1->d_bbsize == 0)
			lp1->d_bbsize = BBSIZE;
		if (lp1->d_sbsize == 0)
			lp1->d_sbsize = SBSIZE;

		ssp->dss_slices[WHOLE_DISK_SLICE].ds_label = lp1;
		ssp->dss_slices[WHOLE_DISK_SLICE].ds_wlabel = TRUE;
		if (setgeom != NULL) {
			error = setgeom(lp);
			if (error != 0) {
				dsgone(sspp);
				return (error);
			}
		}
	}

	unit = dkunit(dev);

	/*
	 * Initialize secondary info for all slices.  It is needed for more
	 * than the current slice in the DEVFS case.
	 */
	for (slice = 0; slice < ssp->dss_nslices; slice++) {
		sp = &ssp->dss_slices[slice];
		/* Already initialized (label read) on a previous pass. */
		if (sp->ds_label != NULL)
			continue;
		dev1 = dkmodslice(dkmodpart(dev, RAW_PART), slice);
		sname = dsname(dname, unit, slice, RAW_PART, partname);
#ifdef DEVFS
		if (slice != COMPATIBILITY_SLICE && sp->ds_bdev == NULL
		    && sp->ds_size != 0) {
			mynor = minor(dev1);
			sp->ds_bdev =
				devfs_add_devswf(bdevsw, mynor, DV_BLK,
						 UID_ROOT, GID_OPERATOR, 0640,
						 "%s", sname);
			sp->ds_cdev =
				devfs_add_devswf(cdevsw, mynor, DV_CHR,
						 UID_ROOT, GID_OPERATOR, 0640,
						 "r%s", sname);
		}
#endif
		/*
		 * XXX this should probably only be done for the need_init
		 * case, but there may be a problem with DIOCSYNCSLICEINFO.
		 */
		set_ds_wlabel(ssp, slice, TRUE);	/* XXX invert */
		lp1 = malloc(sizeof *lp1, M_DEVBUF, M_WAITOK);
		*lp1 = *lp;
		TRACE(("readdisklabel\n"));
		msg = readdisklabel(dev1, strat, lp1);
#if 0 /* XXX */
		if (msg == NULL && setgeom != NULL && setgeom(lp1) != 0)
			msg = "setgeom failed";
#endif
		/* Convert the on-disk label to in-core (slice-biased) form. */
		if (msg == NULL)
			msg = fixlabel(sname, sp, lp1, FALSE);
		if (msg != NULL) {
			/* No usable label; only warn for BSD slices. */
			free(lp1, M_DEVBUF);
			if (sp->ds_type == DOSPTYP_386BSD /* XXX */)
				log(LOG_WARNING, "%s: cannot find label (%s)\n",
				    sname, msg);
			continue;
		}
		if (lp1->d_flags & D_BADSECT) {
			/* Label says there is a bad144 bad-sector table. */
			btp = malloc(sizeof *btp, M_DEVBUF, M_WAITOK);
			TRACE(("readbad144\n"));
			msg = readbad144(dev1, strat, lp1, btp);
			if (msg != NULL) {
				log(LOG_WARNING,
				    "%s: cannot find bad sector table (%s)\n",
				    sname, msg);
				free(btp, M_DEVBUF);
				free(lp1, M_DEVBUF);
				continue;
			}
			set_ds_bad(ssp, slice, internbad144(btp, lp1));
			free(btp, M_DEVBUF);
			if (sp->ds_bad == NULL) {
				free(lp1, M_DEVBUF);
				continue;
			}
		}
		set_ds_label(ssp, slice, lp1);
#ifdef DEVFS
		set_ds_labeldevs(dname, dev1, ssp);
#endif
		set_ds_wlabel(ssp, slice, FALSE);
	}

	/* Now validate and record the actual open request. */
	slice = dkslice(dev);
	if (slice >= ssp->dss_nslices)
		return (ENXIO);
	sp = &ssp->dss_slices[slice];
	part = dkpart(dev);
	if (part != RAW_PART
	    && (sp->ds_label == NULL || part >= sp->ds_label->d_npartitions))
		return (EINVAL);	/* XXX needs translation */
	mask = 1 << part;
	switch (mode) {
	case S_IFBLK:
		sp->ds_bopenmask |= mask;
		break;
	case S_IFCHR:
		sp->ds_copenmask |= mask;
		break;
	}
	sp->ds_openmask = sp->ds_bopenmask | sp->ds_copenmask;
	return (0);
}

/*
 * Return the size of a partition in sectors, or -1 if it cannot be
 * determined.  If the partition is not already open, it is briefly
 * opened (and closed again) via the supplied dopen/dclose so its
 * label gets read.
 */
int
dssize(dev, sspp, dopen, dclose)
	dev_t	dev;
	struct diskslices **sspp;
	d_open_t dopen;
	d_close_t dclose;
{
	struct disklabel *lp;
	int	part;
	int	slice;
	struct diskslices *ssp;

	slice = dkslice(dev);
	part = dkpart(dev);
	ssp = *sspp;
	if (ssp == NULL || slice >= ssp->dss_nslices
	    || !(ssp->dss_slices[slice].ds_bopenmask & (1 << part))) {
		if
(dopen(dev, FREAD, S_IFBLK, (struct proc *)NULL) != 0)
			return (-1);
		dclose(dev, FREAD, S_IFBLK, (struct proc *)NULL);
		/* The open may have replaced the slices struct. */
		ssp = *sspp;
	}
	lp = ssp->dss_slices[slice].ds_label;
	if (lp == NULL)
		return (-1);
	return ((int)lp->d_partitions[part].p_size);
}

/*
 * Free the in-core label of a slice and clear every reference to it,
 * including the compatibility / first-BSD-slice alias maintained by
 * set_ds_label().  Devfs entries for the label's partitions are
 * removed first.
 */
static void
free_ds_label(ssp, slice)
	struct diskslices *ssp;
	int	slice;
{
	struct disklabel *lp;
	struct diskslice *sp;

	sp = &ssp->dss_slices[slice];
	lp = sp->ds_label;
	if (lp == NULL)
		return;
#ifdef DEVFS
	free_ds_labeldevs(ssp, slice);
	if (slice == COMPATIBILITY_SLICE)
		free_ds_labeldevs(ssp, ssp->dss_first_bsd_slice);
	else if (slice == ssp->dss_first_bsd_slice)
		free_ds_labeldevs(ssp, COMPATIBILITY_SLICE);
#endif
	free(lp, M_DEVBUF);
	set_ds_label(ssp, slice, (struct disklabel *)NULL);
}

#ifdef DEVFS
/*
 * Remove the devfs entries for every partition in the slice's label.
 * No-op when the slice has no label.
 */
static void
free_ds_labeldevs(ssp, slice)
	struct diskslices *ssp;
	int	slice;
{
	struct disklabel *lp;
	int	part;
	struct diskslice *sp;

	sp = &ssp->dss_slices[slice];
	lp = sp->ds_label;
	if (lp == NULL)
		return;
	for (part = 0; part < lp->d_npartitions; part++) {
		if (sp->ds_bdevs[part] != NULL) {
			devfs_remove_dev(sp->ds_bdevs[part]);
			sp->ds_bdevs[part] = NULL;
		}
		if (sp->ds_cdevs[part] != NULL) {
			devfs_remove_dev(sp->ds_cdevs[part]);
			sp->ds_cdevs[part] = NULL;
		}
	}
}
#endif

/*
 * Validate a BSD label and translate its partition offsets between
 * on-disk (absolute) and in-core (slice-relative) form.  writeflag
 * selects the direction: TRUE adds the slice offset (preparing an
 * in-core label for writing), FALSE subtracts it (after reading).
 * Returns NULL on success or a static error string; when sname is
 * non-NULL, problems are also logged with details.
 */
static char *
fixlabel(sname, sp, lp, writeflag)
	char	*sname;
	struct diskslice *sp;
	struct disklabel *lp;
	int	writeflag;
{
	u_long	end;
	u_long	offset;
	int	part;
	struct partition *pp;
	u_long	start;
	bool_t	warned;

	/* These errors "can't happen" so don't bother reporting details.
	 */
	if (lp->d_magic != DISKMAGIC || lp->d_magic2 != DISKMAGIC)
		return ("fixlabel: invalid magic");
	if (dkcksum(lp) != 0)
		return ("fixlabel: invalid checksum");

	pp = &lp->d_partitions[RAW_PART];
	if (writeflag) {
		/* In-core offsets start at 0; bias by the slice offset. */
		start = 0;
		offset = sp->ds_offset;
	} else {
		/* On-disk offsets start at the slice; remove the bias. */
		start = sp->ds_offset;
		offset = -sp->ds_offset;
	}
	if (pp->p_offset != start) {
		if (sname != NULL) {
			printf(
"%s: rejecting BSD label: raw partition offset != slice offset\n",
			       sname);
			slice_info(sname, sp);
			partition_info(sname, RAW_PART, pp);
		}
		return ("fixlabel: raw partition offset != slice offset");
	}
	if (pp->p_size != sp->ds_size) {
		if (sname != NULL) {
			printf("%s: raw partition size != slice size\n", sname);
			slice_info(sname, sp);
			partition_info(sname, RAW_PART, pp);
		}
		if (pp->p_size > sp->ds_size) {
			/* Too-large raw partitions are clamped, not fatal. */
			if (sname == NULL)
				return ("fixlabel: raw partition size > slice size");
			printf("%s: truncating raw partition\n", sname);
			pp->p_size = sp->ds_size;
		}
	}
	end = start + sp->ds_size;
	/* end < start here means start + size wrapped around. */
	if (start > end)
		return ("fixlabel: slice wraps");
	if (lp->d_secpercyl <= 0)
		return ("fixlabel: d_secpercyl <= 0");
	pp -= RAW_PART;
	warned = FALSE;
	for (part = 0; part < lp->d_npartitions; part++, pp++) {
		if (pp->p_offset != 0 || pp->p_size != 0) {
			if (pp->p_offset < start
			    || pp->p_offset + pp->p_size > end
			    || pp->p_offset + pp->p_size < pp->p_offset) {
				if (sname != NULL) {
					printf(
"%s: rejecting partition in BSD label: it isn't entirely within the slice\n",
					       sname);
					if (!warned) {
						slice_info(sname, sp);
						warned = TRUE;
					}
					partition_info(sname, part, pp);
				}
				/* XXX else silently discard junk. */
				bzero(pp, sizeof *pp);
			} else
				pp->p_offset += offset;
		}
	}
	/* Make the geometry summary fields match the slice. */
	lp->d_ncylinders = sp->ds_size / lp->d_secpercyl;
	lp->d_secperunit = sp->ds_size;
	lp->d_checksum = 0;
	lp->d_checksum = dkcksum(lp);
	return (NULL);
}

/* Log one partition's start/end/size (values are in sectors). */
static void
partition_info(sname, part, pp)
	char	*sname;
	int	part;
	struct partition *pp;
{
	printf("%s%c: start %lu, end %lu, size %lu\n", sname, 'a' + part,
	       pp->p_offset, pp->p_offset + pp->p_size - 1, pp->p_size);
}

/* Log a slice's start/end/size (values are in sectors). */
static void
slice_info(sname, sp)
	char	*sname;
	struct diskslice *sp;
{
	printf("%s: start %lu, end %lu, size %lu\n", sname,
	       sp->ds_offset, sp->ds_offset + sp->ds_size - 1, sp->ds_size);
}

/*
 * Most changes to ds_bad, ds_label and ds_wlabel are made using the
 * following functions to ensure coherency of the compatibility slice
 * with the first BSD slice.  The openmask fields are _not_ shared and
 * the other fields (ds_offset and ds_size) aren't changed after they
 * are initialized.
 */
static void
set_ds_bad(ssp, slice, btp)
	struct diskslices *ssp;
	int	slice;
	struct dkbad_intern *btp;
{
	ssp->dss_slices[slice].ds_bad = btp;
	if (slice == COMPATIBILITY_SLICE)
		ssp->dss_slices[ssp->dss_first_bsd_slice].ds_bad = btp;
	else if (slice == ssp->dss_first_bsd_slice)
		ssp->dss_slices[COMPATIBILITY_SLICE].ds_bad = btp;
}

static void
set_ds_label(ssp, slice, lp)
	struct diskslices *ssp;
	int	slice;
	struct disklabel *lp;
{
	ssp->dss_slices[slice].ds_label = lp;
	if (slice == COMPATIBILITY_SLICE)
		ssp->dss_slices[ssp->dss_first_bsd_slice].ds_label = lp;
	else if (slice == ssp->dss_first_bsd_slice)
		ssp->dss_slices[COMPATIBILITY_SLICE].ds_label = lp;
}

#ifdef DEVFS
/*
 * Create devfs entries for the partitions of the device's label, for
 * both the named slice and its compatibility / first-BSD alias.
 */
static void
set_ds_labeldevs(dname, dev, ssp)
	char	*dname;
	dev_t	dev;
	struct diskslices *ssp;
{
	int	slice;

	set_ds_labeldevs_unaliased(dname, dev, ssp);
	if (ssp->dss_first_bsd_slice == COMPATIBILITY_SLICE)
		return;
	slice = dkslice(dev);
	if (slice == COMPATIBILITY_SLICE)
		set_ds_labeldevs_unaliased(dname,
		    dkmodslice(dev, ssp->dss_first_bsd_slice), ssp);
	else if (slice == ssp->dss_first_bsd_slice)
		set_ds_labeldevs_unaliased(dname,
		    dkmodslice(dev, COMPATIBILITY_SLICE), ssp);
}

/*
 * Create devfs block and character entries for every nonempty
 * partition in one slice's label.  For the raw partition of a slice
 * that already has whole-slice devfs nodes, link to those instead of
 * creating new device entries.
 */
static void
set_ds_labeldevs_unaliased(dname, dev, ssp)
	char	*dname;
	dev_t	dev;
	struct diskslices *ssp;
{
	struct disklabel *lp;
	int	mynor;
	int	part;
	char	partname[2];
	struct partition *pp;
	int	slice;
	char	*sname;
	struct diskslice *sp;

	slice = dkslice(dev);
	sp = &ssp->dss_slices[slice];
	if (sp->ds_size == 0)
		return;
	lp = sp->ds_label;
	for (part = 0; part < lp->d_npartitions; part++) {
		pp = &lp->d_partitions[part];
		if (pp->p_size == 0)
			continue;
		sname = dsname(dname, dkunit(dev), slice, part, partname);
		if (part == RAW_PART && sp->ds_bdev != NULL) {
			sp->ds_bdevs[part] =
				devfs_link(sp->ds_bdev,
					   "%s%s", sname, partname);
			sp->ds_cdevs[part] =
				devfs_link(sp->ds_cdev,
					   "r%s%s", sname, partname);
		} else {
			mynor = minor(dkmodpart(dev, part));
			sp->ds_bdevs[part] =
				devfs_add_devswf(ssp->dss_bdevsw, mynor, DV_BLK,
						 UID_ROOT, GID_OPERATOR, 0640,
						 "%s%s", sname, partname);
			sp->ds_cdevs[part] =
				devfs_add_devswf(ssp->dss_cdevsw, mynor, DV_CHR,
						 UID_ROOT, GID_OPERATOR, 0640,
						 "r%s%s", sname, partname);
		}
	}
}
#endif /* DEVFS */

/*
 * Set the write-label flag for a slice, keeping the compatibility
 * slice and the first BSD slice coherent.
 */
static void
set_ds_wlabel(ssp, slice, wlabel)
	struct diskslices *ssp;
	int	slice;
	int	wlabel;
{
	ssp->dss_slices[slice].ds_wlabel = wlabel;
	if (slice == COMPATIBILITY_SLICE)
		ssp->dss_slices[ssp->dss_first_bsd_slice].ds_wlabel = wlabel;
	else if (slice == ssp->dss_first_bsd_slice)
		ssp->dss_slices[COMPATIBILITY_SLICE].ds_wlabel = wlabel;
}
diff --git a/sys/kern/subr_dkbad.c b/sys/kern/subr_dkbad.c
new file mode 100644
index 0000000..8fef863
--- /dev/null
+++ b/sys/kern/subr_dkbad.c
@@ -0,0 +1,159 @@
/*-
 * Copyright (c) 1994 Bruce D. Evans.
 * All rights reserved.
 *
 * Copyright (c) 1990 The Regents of the University of California.
+ * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * William Jolitz. + * + * Copyright (c) 1982, 1986, 1988 Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
 *
 * from: @(#)wd.c	7.2 (Berkeley) 5/9/91
 * from: wd.c,v 1.55 1994/10/22 01:57:12 phk Exp $
 * from: @(#)ufs_disksubr.c	7.16 (Berkeley) 5/4/91
 * from: ufs_disksubr.c,v 1.8 1994/06/07 01:21:39 phk Exp $
 * $Id$
 */

#include <sys/param.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/disklabel.h>
#include <sys/dkbad.h>
#include <sys/malloc.h>

/*
 * Internalize the bad sector table: convert each (cyl, trk, sec)
 * entry to an absolute sector number and terminate the list with a
 * -1 sentinel.
 * TODO:
 *	o Fix types.
 *	  Type long should be daddr_t since we compare with blkno's.
 *	  Sentinel -1 should be ((daddr_t)-1).
 *	o Can remove explicit test for sentinel if it is a positive
 *	  (unsigned or not) value larger than all possible blkno's.
 *	o Check that the table is sorted.
 *	o Use faster searches.
 *	o Use the internal table in wddump().
 *	o Don't duplicate so much code.
 *	o Do all bad block handing in a driver-independent file.
 *	o Remove limit of 126 spare sectors.
 */
struct dkbad_intern *
internbad144(btp, lp)
	struct dkbad *btp;
	struct disklabel *lp;
{
	struct dkbad_intern *bip;
	int	i;

	bip = malloc(sizeof *bip, M_DEVBUF, M_WAITOK);
	/*
	 * Spare sectors are allocated beginning with the last sector of
	 * the second last track of the disk (the last track is used for
	 * the bad sector list).
	 */
	bip->bi_maxspare = lp->d_secperunit - lp->d_nsectors - 1;
	/*
	 * NOTE(review): bi_nbad is left at the maximum rather than the
	 * actual entry count; lookups rely on the -1 sentinel stored
	 * below — confirm nothing reads bi_nbad as a real count.
	 */
	bip->bi_nbad = DKBAD_MAXBAD;
	i = 0;
	for (; i < DKBAD_MAXBAD && btp->bt_bad[i].bt_cyl != DKBAD_NOCYL; i++)
		/* bt_trksec packs track in the high byte, sector in the low. */
		bip->bi_bad[i] = btp->bt_bad[i].bt_cyl * lp->d_secpercyl
				 + (btp->bt_bad[i].bt_trksec >> 8)
				   * lp->d_nsectors
				 + (btp->bt_bad[i].bt_trksec & 0x00ff);
	bip->bi_bad[i] = -1;
	return (bip);
}

/*
 * Read and validate the on-disk bad144 bad sector table into *bdp.
 * Candidate copies live at even sector offsets at the start of the
 * disk's last track; on an i/o error up to four alternates are tried.
 * Returns NULL on success, else a static error string.
 *
 * NOTE(review): the loop condition requires B_ERROR, so a copy that
 * reads successfully but fails validation ("corrupted") is not
 * retried at the next offset — confirm whether that is intended.
 */
char *
readbad144(dev, strat, lp, bdp)
	dev_t	dev;
	d_strategy_t *strat;
	struct disklabel *lp;
	struct dkbad *bdp;
{
	struct buf *bp;
	struct dkbad *db;
	int	i;
	char	*msg;

	bp = geteblk((int)lp->d_secsize);
	i = 0;
	do {
		/* Read a bad sector table. */
		bp->b_dev = dev;
		bp->b_blkno = lp->d_secperunit - lp->d_nsectors + i;
		/* Scale the sector number to DEV_BSIZE units. */
		if (lp->d_secsize > DEV_BSIZE)
			bp->b_blkno *= lp->d_secsize / DEV_BSIZE;
		else
			bp->b_blkno /= DEV_BSIZE / lp->d_secsize;
		bp->b_bcount = lp->d_secsize;
		bp->b_flags = B_BUSY | B_READ;
		(*strat)(bp);

		/* If successful, validate, otherwise try another. */
		if (biowait(bp) == 0) {
			db = (struct dkbad *)(bp->b_un.b_addr);
			if (db->bt_mbz == 0 && db->bt_flag == DKBAD_MAGIC) {
				msg = NULL;
				*bdp = *db;
				break;
			}
			msg = "bad sector table corrupted";
		} else
			msg = "bad sector table I/O error";
	} while ((bp->b_flags & B_ERROR) && (i += 2) < 10 &&
		 i < lp->d_nsectors);
	bp->b_flags |= B_INVAL | B_AGE;
	brelse(bp);
	return (msg);
}

/*
 * Map a block number through the internalized bad sector table:
 * return the replacement (spare) sector if blkno is listed as bad,
 * otherwise return blkno unchanged.
 */
daddr_t
transbad144(bip, blkno)
	struct dkbad_intern *bip;
	daddr_t	blkno;
{
	int	i;

	/*
	 * List is sorted, so the search can terminate when it is past our
	 * sector.
	 */
	for (i = 0; bip->bi_bad[i] != -1 && bip->bi_bad[i] <= blkno; i++)
		if (bip->bi_bad[i] == blkno)
			/*
			 * Spare sectors are allocated in decreasing order.
			 */
			return (bip->bi_maxspare - i);
	return (blkno);
}
diff --git a/sys/kern/subr_log.c b/sys/kern/subr_log.c
index 792a1ce..1418709 100644
--- a/sys/kern/subr_log.c
+++ b/sys/kern/subr_log.c
@@ -30,7 +30,8 @@
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
* - * @(#)subr_log.c 8.3 (Berkeley) 2/14/95 + * @(#)subr_log.c 8.1 (Berkeley) 6/10/93 + * $Id: subr_log.c,v 1.21 1997/03/23 03:36:22 bde Exp $ */ /* @@ -39,18 +40,37 @@ #include <sys/param.h> #include <sys/systm.h> +#include <sys/conf.h> #include <sys/proc.h> #include <sys/vnode.h> -#include <sys/ioctl.h> +#include <sys/filio.h> +#include <sys/ttycom.h> #include <sys/msgbuf.h> -#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/signalvar.h> +#include <sys/kernel.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ #define LOG_RDPRI (PZERO + 1) #define LOG_ASYNC 0x04 #define LOG_RDWAIT 0x08 -struct logsoftc { +static d_open_t logopen; +static d_close_t logclose; +static d_read_t logread; +static d_ioctl_t logioctl; +static d_select_t logselect; + +#define CDEV_MAJOR 7 +static struct cdevsw log_cdevsw = + { logopen, logclose, logread, nowrite, /*7*/ + logioctl, nostop, nullreset, nodevtotty,/* klog */ + logselect, nommap, NULL, "log", NULL, -1 }; + +static struct logsoftc { int sc_state; /* see above for possibilities */ struct selinfo sc_selp; /* process waiting on select call */ int sc_pgid; /* process/group for async I/O */ @@ -59,36 +79,21 @@ struct logsoftc { int log_open; /* also used in log() */ /*ARGSUSED*/ -int +static int logopen(dev, flags, mode, p) dev_t dev; int flags, mode; struct proc *p; { - register struct msgbuf *mbp = msgbufp; - if (log_open) return (EBUSY); log_open = 1; logsoftc.sc_pgid = p->p_pid; /* signal process only */ - /* - * Potential race here with putchar() but since putchar should be - * called by autoconf, msg_magic should be initialized by the time - * we get here. 
- */ - if (mbp->msg_magic != MSG_MAGIC) { - register int i; - - mbp->msg_magic = MSG_MAGIC; - mbp->msg_bufx = mbp->msg_bufr = 0; - for (i=0; i < MSG_BSIZE; i++) - mbp->msg_bufc[i] = 0; - } return (0); } /*ARGSUSED*/ -int +static int logclose(dev, flag, mode, p) dev_t dev; int flag, mode; @@ -101,7 +106,7 @@ logclose(dev, flag, mode, p) } /*ARGSUSED*/ -int +static int logread(dev, uio, flag) dev_t dev; struct uio *uio; @@ -119,8 +124,8 @@ logread(dev, uio, flag) return (EWOULDBLOCK); } logsoftc.sc_state |= LOG_RDWAIT; - if (error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH, - "klog", 0)) { + if ((error = tsleep((caddr_t)mbp, LOG_RDPRI | PCATCH, + "klog", 0))) { splx(s); return (error); } @@ -140,14 +145,14 @@ logread(dev, uio, flag) if (error) break; mbp->msg_bufr += l; - if (mbp->msg_bufr < 0 || mbp->msg_bufr >= MSG_BSIZE) + if (mbp->msg_bufr >= MSG_BSIZE) mbp->msg_bufr = 0; } return (error); } /*ARGSUSED*/ -int +static int logselect(dev, rw, p) dev_t dev; int rw; @@ -179,8 +184,8 @@ logwakeup() selwakeup(&logsoftc.sc_selp); if (logsoftc.sc_state & LOG_ASYNC) { if (logsoftc.sc_pgid < 0) - gsignal(-logsoftc.sc_pgid, SIGIO); - else if (p = pfind(logsoftc.sc_pgid)) + gsignal(-logsoftc.sc_pgid, SIGIO); + else if ((p = pfind(logsoftc.sc_pgid))) psignal(p, SIGIO); } if (logsoftc.sc_state & LOG_RDWAIT) { @@ -190,10 +195,10 @@ logwakeup() } /*ARGSUSED*/ -int +static int logioctl(dev, com, data, flag, p) dev_t dev; - u_long com; + int com; caddr_t data; int flag; struct proc *p; @@ -232,7 +237,33 @@ logioctl(dev, com, data, flag, p) break; default: - return (-1); + return (ENOTTY); } return (0); } + +static log_devsw_installed = 0; +#ifdef DEVFS +static void *log_devfs_token; +#endif + +static void +log_drvinit(void *unused) +{ + dev_t dev; + + if( ! 
log_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&log_cdevsw,NULL); + log_devsw_installed = 1; +#ifdef DEVFS + log_devfs_token = devfs_add_devswf(&log_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0600, + "klog"); +#endif + } +} + +SYSINIT(logdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,log_drvinit,NULL) + + diff --git a/sys/kern/subr_param.c b/sys/kern/subr_param.c new file mode 100644 index 0000000..f7d41bf --- /dev/null +++ b/sys/kern/subr_param.c @@ -0,0 +1,186 @@ +/* + * Copyright (c) 1980, 1986, 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)param.c 8.3 (Berkeley) 8/20/94 + * $Id$ + */ + +#include "opt_sysvipc.h" +#include "opt_param.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/socket.h> +#include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/file.h> +#include <sys/callout.h> +#include <sys/clist.h> +#include <sys/mbuf.h> +#include <sys/kernel.h> + +#include <ufs/ufs/quota.h> + +#ifdef SYSVSHM +#include <machine/vmparam.h> +#include <sys/shm.h> +#endif +#ifdef SYSVSEM +#include <sys/sem.h> +#endif +#ifdef SYSVMSG +#include <sys/msg.h> +#endif + +/* + * System parameter formulae. + * + * This file is copied into each directory where we compile + * the kernel; it should be modified there to suit local taste + * if necessary. 
+ * + * Compiled with -DMAXUSERS=xx + */ + +#ifndef HZ +#define HZ 100 +#endif +int hz = HZ; +int tick = 1000000 / HZ; +int tickadj = 30000 / (60 * HZ); /* can adjust 30ms in 60s */ +#define NPROC (20 + 16 * MAXUSERS) +int maxproc = NPROC; /* maximum # of processes */ +int maxprocperuid = NPROC-1; /* maximum # of processes per user */ +int maxfiles = NPROC*2; /* system wide open files limit */ +int maxfilesperproc = NPROC*2; /* per-process open files limit */ +int ncallout = 16 + NPROC; /* maximum # of timer events */ + +/* maximum # of mbuf clusters */ +#ifndef NMBCLUSTERS +#define NMBCLUSTERS (512 + MAXUSERS * 16) +#endif +int nmbclusters = NMBCLUSTERS; + +/* allocate 1/4th amount of virtual address space for mbufs XXX */ +int nmbufs = NMBCLUSTERS * 4; + +int fscale = FSCALE; /* kernel uses `FSCALE', user uses `fscale' */ + +/* + * Values in support of System V compatible shared memory. XXX + */ +#ifdef SYSVSHM +#ifndef SHMMAX +#define SHMMAX (SHMMAXPGS*PAGE_SIZE) +#endif +#ifndef SHMMIN +#define SHMMIN 1 +#endif +#ifndef SHMMNI +#define SHMMNI 32 /* <= SHMMMNI in shm.h */ +#endif +#ifndef SHMSEG +#define SHMSEG 8 +#endif +#ifndef SHMALL +#define SHMALL (SHMMAXPGS) +#endif + +struct shminfo shminfo = { + SHMMAX, + SHMMIN, + SHMMNI, + SHMSEG, + SHMALL +}; +#endif + +/* + * Values in support of System V compatible semaphores. + */ + +#ifdef SYSVSEM + +struct seminfo seminfo = { + SEMMAP, /* # of entries in semaphore map */ + SEMMNI, /* # of semaphore identifiers */ + SEMMNS, /* # of semaphores in system */ + SEMMNU, /* # of undo structures in system */ + SEMMSL, /* max # of semaphores per id */ + SEMOPM, /* max # of operations per semop call */ + SEMUME, /* max # of undo entries per process */ + SEMUSZ, /* size in bytes of undo structure */ + SEMVMX, /* semaphore maximum value */ + SEMAEM /* adjust on exit max value */ +}; +#endif + +/* + * Values in support of System V compatible messages. 
+ */ + +#ifdef SYSVMSG + +struct msginfo msginfo = { + MSGMAX, /* max chars in a message */ + MSGMNI, /* # of message queue identifiers */ + MSGMNB, /* max chars in a queue */ + MSGTQL, /* max messages in system */ + MSGSSZ, /* size of a message segment */ + /* (must be small power of 2 greater than 4) */ + MSGSEG /* number of message segments */ +}; +#endif + +/* + * These may be set to nonzero here or by patching. + * If they are nonzero at bootstrap time then they are + * initialized to values dependent on the memory size. + */ +#ifdef NBUF +int nbuf = NBUF; +#else +int nbuf = 0; +#endif +int nswbuf = 0; + +/* + * These have to be allocated somewhere; allocating + * them here forces loader errors if this file is omitted + * (if they've been externed everywhere else; hah!). + */ +struct buf *swbuf; diff --git a/sys/kern/subr_prf.c b/sys/kern/subr_prf.c index 8a9a44e..4b3ed36 100644 --- a/sys/kern/subr_prf.c +++ b/sys/kern/subr_prf.c @@ -35,23 +35,21 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)subr_prf.c 8.4 (Berkeley) 5/4/95 + * @(#)subr_prf.c 8.3 (Berkeley) 1/21/94 + * $Id$ */ +#include "opt_ddb.h" + #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> -#include <sys/conf.h> -#include <sys/reboot.h> #include <sys/msgbuf.h> #include <sys/proc.h> -#include <sys/ioctl.h> -#include <sys/vnode.h> -#include <sys/file.h> #include <sys/tty.h> #include <sys/tprintf.h> #include <sys/syslog.h> #include <sys/malloc.h> +#include <machine/cons.h> /* * Note that stdarg.h and the ANSI style va_start macro is used for both @@ -59,71 +57,20 @@ */ #include <machine/stdarg.h> -#ifdef KADB -#include <machine/kdbparam.h> -#endif - #define TOCONS 0x01 #define TOTTY 0x02 #define TOLOG 0x04 struct tty *constty; /* pointer to console "window" tty */ -extern cnputc(); /* standard console putc */ -int (*v_putc)() = cnputc; /* routine to putc on virtual console */ - -void logpri __P((int level)); -static void putchar __P((int ch, int flags, struct tty *tp)); +static void (*v_putc)(int) = cnputc; /* routine to putc on virtual console */ +static void logpri __P((int level)); +static void msglogchar(int c, void *dummyarg); +struct putchar_arg {int flags; struct tty *tty; }; +static void putchar __P((int ch, void *arg)); static char *ksprintn __P((u_long num, int base, int *len)); -void kprintf __P((const char *fmt, int flags, struct tty *tp, va_list ap)); - -int consintr = 1; /* Ok to handle console interrupts? */ -/* - * Variable panicstr contains argument to first call to panic; used as flag - * to indicate that the kernel has already called panic. - */ -const char *panicstr; - -/* - * Panic is called on unresolvable fatal errors. It prints "panic: mesg", - * and then reboots. If we are called twice, then we avoid trying to sync - * the disks as this often leads to recursive panics. 
- */ -#ifdef __GNUC__ -volatile void boot(int flags); /* boot() does not return */ -volatile /* panic() does not return */ -#endif -void -#ifdef __STDC__ -panic(const char *fmt, ...) -#else -panic(fmt, va_alist) - char *fmt; -#endif -{ - int bootopt; - va_list ap; - - bootopt = RB_AUTOBOOT | RB_DUMP; - if (panicstr) - bootopt |= RB_NOSYNC; - else - panicstr = fmt; - - va_start(ap, fmt); - printf("panic: %r\n", fmt, ap); - va_end(ap); - -#ifdef KGDB - kgdb_panic(); -#endif -#ifdef KADB - if (boothowto & RB_KDB) - kdbpanic(); -#endif - boot(bootopt); -} +static int consintr = 1; /* Ok to handle console interrupts? */ /* * Warn that a system table is full. @@ -142,19 +89,17 @@ tablefull(tab) * the queue does not clear in a reasonable time. */ void -#ifdef __STDC__ uprintf(const char *fmt, ...) -#else -uprintf(fmt, va_alist) - char *fmt; -#endif { - register struct proc *p = curproc; + struct proc *p = curproc; va_list ap; + struct putchar_arg pca; if (p->p_flag & P_CONTROLT && p->p_session->s_ttyvp) { va_start(ap, fmt); - kprintf(fmt, TOTTY, p->p_session->s_ttyp, ap); + pca.tty = p->p_session->s_ttyp; + pca.flags = TOTTY; + kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); } } @@ -185,18 +130,13 @@ tprintf_close(sess) * with the given session. */ void -#ifdef __STDC__ tprintf(tpr_t tpr, const char *fmt, ...) -#else -tprintf(tpr, fmt, va_alist) - tpr_t tpr; - char *fmt; -#endif { register struct session *sess = (struct session *)tpr; struct tty *tp = NULL; int flags = TOLOG; va_list ap; + struct putchar_arg pca; logpri(LOG_INFO); if (sess && sess->s_ttyvp && ttycheckoutq(sess->s_ttyp, 0)) { @@ -204,7 +144,9 @@ tprintf(tpr, fmt, va_alist) tp = sess->s_ttyp; } va_start(ap, fmt); - kprintf(fmt, flags, tp, ap); + pca.tty = tp; + pca.flags = flags; + kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); logwakeup(); } @@ -215,18 +157,14 @@ tprintf(tpr, fmt, va_alist) * be revoke(2)'d away. Other callers should use tprintf. 
*/ void -#ifdef __STDC__ ttyprintf(struct tty *tp, const char *fmt, ...) -#else -ttyprintf(tp, fmt, va_alist) - struct tty *tp; - char *fmt; -#endif { va_list ap; - + struct putchar_arg pca; va_start(ap, fmt); - kprintf(fmt, TOTTY, tp, ap); + pca.tty = tp; + pca.flags = TOTTY; + kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); } @@ -238,13 +176,7 @@ extern int log_open; * log yet, it writes to the console also. */ void -#ifdef __STDC__ log(int level, const char *fmt, ...) -#else -log(level, fmt, va_alist) - int level; - char *fmt; -#endif { register int s; va_list ap; @@ -252,73 +184,157 @@ log(level, fmt, va_alist) s = splhigh(); logpri(level); va_start(ap, fmt); - kprintf(fmt, TOLOG, NULL, ap); - splx(s); + + kvprintf(fmt, msglogchar, NULL, 10, ap); va_end(ap); + + splx(s); if (!log_open) { + struct putchar_arg pca; va_start(ap, fmt); - kprintf(fmt, TOCONS, NULL, ap); + pca.tty = NULL; + pca.flags = TOCONS; + kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); } logwakeup(); } -void +static void logpri(level) int level; { - register int ch; register char *p; - putchar('<', TOLOG, NULL); - for (p = ksprintn((u_long)level, 10, NULL); ch = *p--;) - putchar(ch, TOLOG, NULL); - putchar('>', TOLOG, NULL); + msglogchar('<', NULL); + for (p = ksprintn((u_long)level, 10, NULL); *p;) + msglogchar(*p--, NULL); + msglogchar('>', NULL); } -void -#ifdef __STDC__ +int addlog(const char *fmt, ...) -#else -addlog(fmt, va_alist) - char *fmt; -#endif { register int s; va_list ap; + int retval; s = splhigh(); va_start(ap, fmt); - kprintf(fmt, TOLOG, NULL, ap); + retval = kvprintf(fmt, msglogchar, NULL, 10, ap); splx(s); va_end(ap); if (!log_open) { + struct putchar_arg pca; va_start(ap, fmt); - kprintf(fmt, TOCONS, NULL, ap); + pca.tty = NULL; + pca.flags = TOCONS; + kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); } logwakeup(); + return (retval); } -void -#ifdef __STDC__ +int printf(const char *fmt, ...) 
-#else -printf(fmt, va_alist) - char *fmt; -#endif { va_list ap; register int savintr; + struct putchar_arg pca; + int retval; savintr = consintr; /* disable interrupts */ consintr = 0; va_start(ap, fmt); - kprintf(fmt, TOCONS | TOLOG, NULL, ap); + pca.tty = NULL; + pca.flags = TOCONS | TOLOG; + retval = kvprintf(fmt, putchar, &pca, 10, ap); va_end(ap); if (!panicstr) logwakeup(); consintr = savintr; /* reenable interrupts */ + return retval; +} + +void +vprintf(const char *fmt, va_list ap) +{ + register int savintr; + struct putchar_arg pca; + + savintr = consintr; /* disable interrupts */ + consintr = 0; + pca.tty = NULL; + pca.flags = TOCONS | TOLOG; + kvprintf(fmt, putchar, &pca, 10, ap); + if (!panicstr) + logwakeup(); + consintr = savintr; /* reenable interrupts */ +} + +/* + * Print a character on console or users terminal. If destination is + * the console then the last MSGBUFS characters are saved in msgbuf for + * inspection later. + */ +static void +putchar(int c, void *arg) +{ + struct putchar_arg *ap = (struct putchar_arg*) arg; + int flags = ap->flags; + struct tty *tp = ap->tty; + if (panicstr) + constty = NULL; + if ((flags & TOCONS) && tp == NULL && constty) { + tp = constty; + flags |= TOTTY; + } + if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 && + (flags & TOCONS) && tp == constty) + constty = NULL; + if ((flags & TOLOG)) + msglogchar(c, NULL); + if ((flags & TOCONS) && constty == NULL && c != '\0') + (*v_putc)(c); +} + +/* + * Scaled down version of sprintf(3). + */ +int +sprintf(char *buf, const char *cfmt, ...) +{ + int retval; + va_list ap; + + va_start(ap, cfmt); + retval = kvprintf(cfmt, NULL, (void *)buf, 10, ap); + buf[retval] = '\0'; + va_end(ap); + return retval; +} + +/* + * Put a number (base <= 16) in a buffer in reverse order; return an + * optional length and a pointer to the NULL terminated (preceded?) + * buffer. 
+ */ +static char * +ksprintn(ul, base, lenp) + register u_long ul; + register int base, *lenp; +{ /* A long in base 8, plus NULL. */ + static char buf[sizeof(long) * NBBY / 3 + 2]; + register char *p; + + p = buf; + do { + *++p = hex2ascii(ul % base); + } while (ul /= base); + if (lenp) + *lenp = p - buf; + return (p); } /* @@ -337,110 +353,178 @@ printf(fmt, va_alist) * the next characters (up to a control character, i.e. a character <= 32), * give the name of the register. Thus: * - * kprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n"); + * kvprintf("reg=%b\n", 3, "\10\2BITTWO\1BITONE\n"); * * would produce output: * * reg=3<BITTWO,BITONE> * - * The format %r passes an additional format string and argument list - * recursively. Its usage is: - * - * fn(char *fmt, ...) - * { - * va_list ap; - * va_start(ap, fmt); - * printf("prefix: %r: suffix\n", fmt, ap); - * va_end(ap); - * } - * - * Space or zero padding and a field width are supported for the numeric - * formats only. + * XXX: %D -- Hexdump, takes pointer and separator string: + * ("%6D", ptr, ":") -> XX:XX:XX:XX:XX:XX + * ("%*D", len, ptr, " " -> XX XX XX XX ... 
*/ -void -kprintf(fmt, flags, tp, ap) - register const char *fmt; - int flags; - struct tty *tp; - va_list ap; +int +kvprintf(char const *fmt, void (*func)(int, void*), void *arg, int radix, va_list ap) { - register char *p, *q; - register int ch, n; +#define PCHAR(c) {int cc=(c); if (func) (*func)(cc,arg); else *d++ = cc; retval++; } + char *p, *q, *d; + u_char *up; + int ch, n; u_long ul; - int base, lflag, tmp, width; + int base, lflag, tmp, width, ladjust, sharpflag, neg, sign, dot; + int dwidth; char padc; + int retval = 0; + + if (!func) + d = (char *) arg; + else + d = NULL; + + if (fmt == NULL) + fmt = "(fmt null)\n"; + + if (radix < 2 || radix > 36) + radix = 10; for (;;) { padc = ' '; width = 0; - while ((ch = *(u_char *)fmt++) != '%') { - if (ch == '\0') - return; - putchar(ch, flags, tp); + while ((ch = (u_char)*fmt++) != '%') { + if (ch == '\0') + return retval; + PCHAR(ch); } - lflag = 0; -reswitch: switch (ch = *(u_char *)fmt++) { - case '0': - padc = '0'; + lflag = 0; ladjust = 0; sharpflag = 0; neg = 0; + sign = 0; dot = 0; dwidth = 0; +reswitch: switch (ch = (u_char)*fmt++) { + case '.': + dot = 1; goto reswitch; - case '1': case '2': case '3': case '4': - case '5': case '6': case '7': case '8': case '9': - for (width = 0;; ++fmt) { - width = width * 10 + ch - '0'; - ch = *fmt; - if (ch < '0' || ch > '9') - break; + case '#': + sharpflag = 1; + goto reswitch; + case '+': + sign = 1; + goto reswitch; + case '-': + ladjust = 1; + goto reswitch; + case '%': + PCHAR(ch); + break; + case '*': + if (!dot) { + width = va_arg(ap, int); + if (width < 0) { + ladjust = !ladjust; + width = -width; + } + } else { + dwidth = va_arg(ap, int); } goto reswitch; - case 'l': - lflag = 1; + case '0': + if (!dot) { + padc = '0'; + goto reswitch; + } + case '1': case '2': case '3': case '4': + case '5': case '6': case '7': case '8': case '9': + for (n = 0;; ++fmt) { + n = n * 10 + ch - '0'; + ch = *fmt; + if (ch < '0' || ch > '9') + break; + } + if (dot) + dwidth = n; 
+ else + width = n; goto reswitch; case 'b': ul = va_arg(ap, int); p = va_arg(ap, char *); - for (q = ksprintn(ul, *p++, NULL); ch = *q--;) - putchar(ch, flags, tp); + for (q = ksprintn(ul, *p++, NULL); *q;) + PCHAR(*q--); if (!ul) break; - for (tmp = 0; n = *p++;) { + for (tmp = 0; *p;) { + n = *p++; if (ul & (1 << (n - 1))) { - putchar(tmp ? ',' : '<', flags, tp); + PCHAR(tmp ? ',' : '<'); for (; (n = *p) > ' '; ++p) - putchar(n, flags, tp); + PCHAR(n); tmp = 1; } else for (; *p > ' '; ++p) continue; } if (tmp) - putchar('>', flags, tp); + PCHAR('>'); break; case 'c': - putchar(va_arg(ap, int), flags, tp); - break; - case 'r': - p = va_arg(ap, char *); - kprintf(p, flags, tp, va_arg(ap, va_list)); + PCHAR(va_arg(ap, int)); break; - case 's': + case 'D': + up = va_arg(ap, u_char *); p = va_arg(ap, char *); - while (ch = *p++) - putchar(ch, flags, tp); + if (!width) + width = 16; + while(width--) { + PCHAR(hex2ascii(*up >> 4)); + PCHAR(hex2ascii(*up & 0x0f)); + up++; + if (width) + for (q=p;*q;q++) + PCHAR(*q); + } break; case 'd': ul = lflag ? va_arg(ap, long) : va_arg(ap, int); - if ((long)ul < 0) { - putchar('-', flags, tp); - ul = -(long)ul; - } + sign = 1; base = 10; goto number; + case 'l': + lflag = 1; + goto reswitch; + case 'n': + ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); + base = radix; + goto number; case 'o': ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); base = 8; goto number; + case 'p': + ul = (u_long)va_arg(ap, void *); + base = 16; + PCHAR('0'); + PCHAR('x'); + goto number; + case 's': + p = va_arg(ap, char *); + if (p == NULL) + p = "(null)"; + if (!dot) + n = strlen (p); + else + for (n = 0; n < dwidth && p[n]; n++) + continue; + + width -= n; + + if (!ladjust && width > 0) + while (width--) + PCHAR(padc); + while (n--) + PCHAR(*p++); + if (ladjust && width > 0) + while (width--) + PCHAR(padc); + break; case 'u': ul = lflag ? 
va_arg(ap, u_long) : va_arg(ap, u_int); base = 10; @@ -448,56 +532,71 @@ reswitch: switch (ch = *(u_char *)fmt++) { case 'x': ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); base = 16; -number: p = ksprintn(ul, base, &tmp); - if (width && (width -= tmp) > 0) +number: if (sign && (long)ul < 0L) { + neg = 1; + ul = -(long)ul; + } + p = ksprintn(ul, base, &tmp); + if (sharpflag && ul != 0) { + if (base == 8) + tmp++; + else if (base == 16) + tmp += 2; + } + if (neg) + tmp++; + + if (!ladjust && width && (width -= tmp) > 0) + while (width--) + PCHAR(padc); + if (neg) + PCHAR('-'); + if (sharpflag && ul != 0) { + if (base == 8) { + PCHAR('0'); + } else if (base == 16) { + PCHAR('0'); + PCHAR('x'); + } + } + + while (*p) + PCHAR(*p--); + + if (ladjust && width && (width -= tmp) > 0) while (width--) - putchar(padc, flags, tp); - while (ch = *p--) - putchar(ch, flags, tp); + PCHAR(padc); + break; default: - putchar('%', flags, tp); + PCHAR('%'); if (lflag) - putchar('l', flags, tp); - /* FALLTHROUGH */ - case '%': - putchar(ch, flags, tp); + PCHAR('l'); + PCHAR(ch); + break; } } +#undef PCHAR } /* - * Print a character on console or users terminal. If destination is - * the console then the last MSGBUFS characters are saved in msgbuf for - * inspection later. + * Put character in log buffer. 
*/ static void -putchar(c, flags, tp) - register int c; - int flags; - struct tty *tp; +msglogchar(int c, void *dummyarg) { - extern int msgbufmapped; - register struct msgbuf *mbp; + struct msgbuf *mbp; - if (panicstr) - constty = NULL; - if ((flags & TOCONS) && tp == NULL && constty) { - tp = constty; - flags |= TOTTY; - } - if ((flags & TOTTY) && tp && tputchar(c, tp) < 0 && - (flags & TOCONS) && tp == constty) - constty = NULL; - if ((flags & TOLOG) && - c != '\0' && c != '\r' && c != 0177 && msgbufmapped) { + if (c != '\0' && c != '\r' && c != 0177 && msgbufmapped) { mbp = msgbufp; - if (mbp->msg_magic != MSG_MAGIC) { - bzero((caddr_t)mbp, sizeof(*mbp)); + if (mbp->msg_magic != MSG_MAGIC || + mbp->msg_bufx >= MSG_BSIZE || + mbp->msg_bufr >= MSG_BSIZE) { + bzero(mbp, sizeof(struct msgbuf)); mbp->msg_magic = MSG_MAGIC; } mbp->msg_bufc[mbp->msg_bufx++] = c; - if (mbp->msg_bufx < 0 || mbp->msg_bufx >= MSG_BSIZE) + if (mbp->msg_bufx >= MSG_BSIZE) mbp->msg_bufx = 0; /* If the buffer is full, keep the most recent data. */ if (mbp->msg_bufr == mbp->msg_bufx) { @@ -505,102 +604,4 @@ putchar(c, flags, tp) mbp->msg_bufr = 0; } } - if ((flags & TOCONS) && constty == NULL && c != '\0') - (*v_putc)(c); -} - -/* - * Scaled down version of sprintf(3). - */ -#ifdef __STDC__ -sprintf(char *buf, const char *cfmt, ...) -#else -sprintf(buf, cfmt, va_alist) - char *buf, *cfmt; -#endif -{ - register const char *fmt = cfmt; - register char *p, *bp; - register int ch, base; - u_long ul; - int lflag; - va_list ap; - - va_start(ap, cfmt); - for (bp = buf; ; ) { - while ((ch = *(u_char *)fmt++) != '%') - if ((*bp++ = ch) == '\0') - return ((bp - buf) - 1); - - lflag = 0; -reswitch: switch (ch = *(u_char *)fmt++) { - case 'l': - lflag = 1; - goto reswitch; - case 'c': - *bp++ = va_arg(ap, int); - break; - case 's': - p = va_arg(ap, char *); - while (*bp++ = *p++) - continue; - --bp; - break; - case 'd': - ul = lflag ? 
va_arg(ap, long) : va_arg(ap, int); - if ((long)ul < 0) { - *bp++ = '-'; - ul = -(long)ul; - } - base = 10; - goto number; - break; - case 'o': - ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); - base = 8; - goto number; - break; - case 'u': - ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); - base = 10; - goto number; - break; - case 'x': - ul = lflag ? va_arg(ap, u_long) : va_arg(ap, u_int); - base = 16; -number: for (p = ksprintn(ul, base, NULL); ch = *p--;) - *bp++ = ch; - break; - default: - *bp++ = '%'; - if (lflag) - *bp++ = 'l'; - /* FALLTHROUGH */ - case '%': - *bp++ = ch; - } - } - va_end(ap); -} - -/* - * Put a number (base <= 16) in a buffer in reverse order; return an - * optional length and a pointer to the NULL terminated (preceded?) - * buffer. - */ -static char * -ksprintn(ul, base, lenp) - register u_long ul; - register int base, *lenp; -{ /* A long in base 8, plus NULL. */ - static char buf[sizeof(long) * NBBY / 3 + 2]; - register char *p; - - p = buf; - do { - *++p = "0123456789abcdef"[ul % base]; - } while (ul /= base); - if (lenp) - *lenp = p - buf; - return (p); } diff --git a/sys/kern/subr_prof.c b/sys/kern/subr_prof.c index 237553d..08ba35f 100644 --- a/sys/kern/subr_prof.c +++ b/sys/kern/subr_prof.c @@ -30,17 +30,17 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)subr_prof.c 8.4 (Berkeley) 2/14/95 + * @(#)subr_prof.c 8.3 (Berkeley) 9/23/93 + * $Id$ */ #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/kernel.h> #include <sys/proc.h> -#include <sys/user.h> - -#include <sys/mount.h> -#include <sys/syscallargs.h> +#include <sys/resourcevar.h> +#include <sys/sysctl.h> #include <machine/cpu.h> @@ -48,26 +48,57 @@ #include <sys/malloc.h> #include <sys/gmon.h> -/* - * Froms is actually a bunch of unsigned shorts indexing tos - */ +static void kmstartup __P((void *)); +SYSINIT(kmem, SI_SUB_KPROF, SI_ORDER_FIRST, kmstartup, NULL) + struct gmonparam _gmonparam = { GMON_PROF_OFF }; +extern char btext[]; extern char etext[]; +#ifdef GUPROF +void +nullfunc_loop_profiled() +{ + int i; + + for (i = 0; i < CALIB_SCALE; i++) + nullfunc_profiled(); +} + +#define nullfunc_loop_profiled_end nullfunc_profiled /* XXX */ + void -kmstartup() +nullfunc_profiled() +{ +} +#endif /* GUPROF */ + +static void +kmstartup(dummy) + void *dummy; { char *cp; struct gmonparam *p = &_gmonparam; +#ifdef GUPROF + int cputime_overhead; + int empty_loop_time; + int i; + int mcount_overhead; + int mexitcount_overhead; + int nullfunc_loop_overhead; + int nullfunc_loop_profiled_time; + fptrint_t tmp_addr; +#endif + /* * Round lowpc and highpc to multiples of the density we're using * so the rest of the scaling (here and in gprof) stays in ints. 
*/ - p->lowpc = ROUNDDOWN(KERNBASE, HISTFRACTION * sizeof(HISTCOUNTER)); + p->lowpc = ROUNDDOWN((u_long)btext, HISTFRACTION * sizeof(HISTCOUNTER)); p->highpc = ROUNDUP((u_long)etext, HISTFRACTION * sizeof(HISTCOUNTER)); p->textsize = p->highpc - p->lowpc; - printf("Profiling kernel, textsize=%d [%x..%x]\n", + printf("Profiling kernel, textsize=%lu [%x..%x]\n", p->textsize, p->lowpc, p->highpc); p->kcountsize = p->textsize / HISTFRACTION; p->hashfraction = HASHFRACTION; @@ -87,25 +118,168 @@ kmstartup() bzero(cp, p->kcountsize + p->tossize + p->fromssize); p->tos = (struct tostruct *)cp; cp += p->tossize; - p->kcount = (u_short *)cp; + p->kcount = (HISTCOUNTER *)cp; cp += p->kcountsize; p->froms = (u_short *)cp; + +#ifdef GUPROF + /* Initialize pointers to overhead counters. */ + p->cputime_count = &KCOUNT(p, PC_TO_I(p, cputime)); + p->mcount_count = &KCOUNT(p, PC_TO_I(p, mcount)); + p->mexitcount_count = &KCOUNT(p, PC_TO_I(p, mexitcount)); + + /* + * Disable interrupts to avoid interference while we calibrate + * things. + */ + disable_intr(); + + /* + * Determine overheads. + * XXX this needs to be repeated for each useful timer/counter. + */ + cputime_overhead = 0; + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) + cputime_overhead += cputime(); + + empty_loop(); + startguprof(p); + empty_loop(); + empty_loop_time = cputime(); + + nullfunc_loop_profiled(); + + /* + * Start profiling. There won't be any normal function calls since + * interrupts are disabled, but we will call the profiling routines + * directly to determine their overheads. 
+ */ + p->state = GMON_PROF_HIRES; + + startguprof(p); + nullfunc_loop_profiled(); + + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) +#if defined(i386) && __GNUC__ >= 2 + asm("pushl %0; call __mcount; popl %%ecx" + : + : "i" (profil) + : "ax", "bx", "cx", "dx", "memory"); +#else +#error +#endif + mcount_overhead = KCOUNT(p, PC_TO_I(p, profil)); + + startguprof(p); + for (i = 0; i < CALIB_SCALE; i++) +#if defined(i386) && __GNUC__ >= 2 + asm("call mexitcount; 1:" + : : : "ax", "bx", "cx", "dx", "memory"); + asm("movl $1b,%0" : "=rm" (tmp_addr)); +#else +#error +#endif + mexitcount_overhead = KCOUNT(p, PC_TO_I(p, tmp_addr)); + + p->state = GMON_PROF_OFF; + stopguprof(p); + + enable_intr(); + + nullfunc_loop_profiled_time = 0; + for (tmp_addr = (fptrint_t)nullfunc_loop_profiled; + tmp_addr < (fptrint_t)nullfunc_loop_profiled_end; + tmp_addr += HISTFRACTION * sizeof(HISTCOUNTER)) + nullfunc_loop_profiled_time += KCOUNT(p, PC_TO_I(p, tmp_addr)); +#define CALIB_DOSCALE(count) (((count) + CALIB_SCALE / 3) / CALIB_SCALE) +#define c2n(count, freq) ((int)((count) * 1000000000LL / freq)) + printf("cputime %d, empty_loop %d, nullfunc_loop_profiled %d, mcount %d, mexitcount %d\n", + CALIB_DOSCALE(c2n(cputime_overhead, p->profrate)), + CALIB_DOSCALE(c2n(empty_loop_time, p->profrate)), + CALIB_DOSCALE(c2n(nullfunc_loop_profiled_time, p->profrate)), + CALIB_DOSCALE(c2n(mcount_overhead, p->profrate)), + CALIB_DOSCALE(c2n(mexitcount_overhead, p->profrate))); + cputime_overhead -= empty_loop_time; + mcount_overhead -= empty_loop_time; + mexitcount_overhead -= empty_loop_time; + + /*- + * Profiling overheads are determined by the times between the + * following events: + * MC1: mcount() is called + * MC2: cputime() (called from mcount()) latches the timer + * MC3: mcount() completes + * ME1: mexitcount() is called + * ME2: cputime() (called from mexitcount()) latches the timer + * ME3: mexitcount() completes. 
+ * The times between the events vary slightly depending on instruction + * combination and cache misses, etc. Attempt to determine the + * minimum times. These can be subtracted from the profiling times + * without much risk of reducing the profiling times below what they + * would be when profiling is not configured. Abbreviate: + * ab = minimum time between MC1 and MC3 + * a = minumum time between MC1 and MC2 + * b = minimum time between MC2 and MC3 + * cd = minimum time between ME1 and ME3 + * c = minimum time between ME1 and ME2 + * d = minimum time between ME2 and ME3. + * These satisfy the relations: + * ab <= mcount_overhead (just measured) + * a + b <= ab + * cd <= mexitcount_overhead (just measured) + * c + d <= cd + * a + d <= nullfunc_loop_profiled_time (just measured) + * a >= 0, b >= 0, c >= 0, d >= 0. + * Assume that ab and cd are equal to the minimums. + */ + p->cputime_overhead = CALIB_DOSCALE(cputime_overhead); + p->mcount_overhead = CALIB_DOSCALE(mcount_overhead - cputime_overhead); + p->mexitcount_overhead = CALIB_DOSCALE(mexitcount_overhead + - cputime_overhead); + nullfunc_loop_overhead = nullfunc_loop_profiled_time - empty_loop_time; + p->mexitcount_post_overhead = CALIB_DOSCALE((mcount_overhead + - nullfunc_loop_overhead) + / 4); + p->mexitcount_pre_overhead = p->mexitcount_overhead + + p->cputime_overhead + - p->mexitcount_post_overhead; + p->mcount_pre_overhead = CALIB_DOSCALE(nullfunc_loop_overhead) + - p->mexitcount_post_overhead; + p->mcount_post_overhead = p->mcount_overhead + + p->cputime_overhead + - p->mcount_pre_overhead; + printf( +"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d nsec\n", + c2n(p->cputime_overhead, p->profrate), + c2n(p->mcount_overhead, p->profrate), + c2n(p->mcount_pre_overhead, p->profrate), + c2n(p->mcount_post_overhead, p->profrate), + c2n(p->cputime_overhead, p->profrate), + c2n(p->mexitcount_overhead, p->profrate), + c2n(p->mexitcount_pre_overhead, p->profrate), + 
c2n(p->mexitcount_post_overhead, p->profrate)); + printf( +"Profiling overheads: mcount: %d+%d, %d+%d; mexitcount: %d+%d, %d+%d cycles\n", + p->cputime_overhead, p->mcount_overhead, + p->mcount_pre_overhead, p->mcount_post_overhead, + p->cputime_overhead, p->mexitcount_overhead, + p->mexitcount_pre_overhead, p->mexitcount_post_overhead); +#endif /* GUPROF */ } /* * Return kernel profiling information. */ -int -sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; +static int +sysctl_kern_prof SYSCTL_HANDLER_ARGS { + int *name = (int *) arg1; + u_int namelen = arg2; struct gmonparam *gp = &_gmonparam; int error; + int state; /* all sysctl names at this level are terminal */ if (namelen != 1) @@ -113,30 +287,50 @@ sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen, p) switch (name[0]) { case GPROF_STATE: - error = sysctl_int(oldp, oldlenp, newp, newlen, &gp->state); + state = gp->state; + error = sysctl_handle_int(oidp, &state, 0, req); if (error) return (error); - if (gp->state == GMON_PROF_OFF) + if (!req->newptr) + return (0); + if (state == GMON_PROF_OFF) { + gp->state = state; stopprofclock(&proc0); - else + stopguprof(gp); + } else if (state == GMON_PROF_ON) { + gp->state = GMON_PROF_OFF; + stopguprof(gp); + gp->profrate = profhz; startprofclock(&proc0); + gp->state = state; +#ifdef GUPROF + } else if (state == GMON_PROF_HIRES) { + gp->state = GMON_PROF_OFF; + stopprofclock(&proc0); + startguprof(gp); + gp->state = state; +#endif + } else if (state != gp->state) + return (EINVAL); return (0); case GPROF_COUNT: - return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->kcount, gp->kcountsize)); + return (sysctl_handle_opaque(oidp, + gp->kcount, gp->kcountsize, req)); case GPROF_FROMS: - return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->froms, gp->fromssize)); + return (sysctl_handle_opaque(oidp, + gp->froms, gp->fromssize, req)); case GPROF_TOS: - 
return (sysctl_struct(oldp, oldlenp, newp, newlen, - gp->tos, gp->tossize)); + return (sysctl_handle_opaque(oidp, + gp->tos, gp->tossize, req)); case GPROF_GMONPARAM: - return (sysctl_rdstruct(oldp, oldlenp, newp, gp, sizeof *gp)); + return (sysctl_handle_opaque(oidp, gp, sizeof *gp, req)); default: return (EOPNOTSUPP); } /* NOTREACHED */ } + +SYSCTL_NODE(_kern, KERN_PROF, prof, CTLFLAG_RW, sysctl_kern_prof, ""); #endif /* GPROF */ /* @@ -145,24 +339,27 @@ sysctl_doprof(name, namelen, oldp, oldlenp, newp, newlen, p) * The scale factor is a fixed point number with 16 bits of fraction, so that * 1.0 is represented as 0x10000. A scale factor of 0 turns off profiling. */ +#ifndef _SYS_SYSPROTO_H_ +struct profil_args { + caddr_t samples; + u_int size; + u_int offset; + u_int scale; +}; +#endif /* ARGSUSED */ int profil(p, uap, retval) struct proc *p; - register struct profil_args /* { - syscallarg(caddr_t) samples; - syscallarg(u_int) size; - syscallarg(u_int) offset; - syscallarg(u_int) scale; - } */ *uap; - register_t *retval; + register struct profil_args *uap; + int *retval; { register struct uprof *upp; int s; - if (SCARG(uap, scale) > (1 << 16)) + if (uap->scale > (1 << 16)) return (EINVAL); - if (SCARG(uap, scale) == 0) { + if (uap->scale == 0) { stopprofclock(p); return (0); } @@ -170,10 +367,10 @@ profil(p, uap, retval) /* Block profile interrupts while changing state. */ s = splstatclock(); - upp->pr_off = SCARG(uap, offset); - upp->pr_scale = SCARG(uap, scale); - upp->pr_base = SCARG(uap, samples); - upp->pr_size = SCARG(uap, size); + upp->pr_off = uap->offset; + upp->pr_scale = uap->scale; + upp->pr_base = uap->samples; + upp->pr_size = uap->size; startprofclock(p); splx(s); diff --git a/sys/kern/subr_rlist.c b/sys/kern/subr_rlist.c new file mode 100644 index 0000000..ef29ce3 --- /dev/null +++ b/sys/kern/subr_rlist.c @@ -0,0 +1,311 @@ +/* + * Copyright (c) 1992 William F. Jolitz, TeleMuse + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This software is a component of "386BSD" developed by + William F. Jolitz, TeleMuse. + * 4. Neither the name of the developer nor the name "386BSD" + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS A COMPONENT OF 386BSD DEVELOPED BY WILLIAM F. JOLITZ + * AND IS INTENDED FOR RESEARCH AND EDUCATIONAL PURPOSES ONLY. THIS + * SOFTWARE SHOULD NOT BE CONSIDERED TO BE A COMMERCIAL PRODUCT. + * THE DEVELOPER URGES THAT USERS WHO REQUIRE A COMMERCIAL PRODUCT + * NOT MAKE USE THIS WORK. + * + * FOR USERS WHO WISH TO UNDERSTAND THE 386BSD SYSTEM DEVELOPED + * BY WILLIAM F. JOLITZ, WE RECOMMEND THE USER STUDY WRITTEN + * REFERENCES SUCH AS THE "PORTING UNIX TO THE 386" SERIES + * (BEGINNING JANUARY 1991 "DR. DOBBS JOURNAL", USA AND BEGINNING + * JUNE 1991 "UNIX MAGAZIN", GERMANY) BY WILLIAM F. JOLITZ AND + * LYNNE GREER JOLITZ, AS WELL AS OTHER BOOKS ON UNIX AND THE + * ON-LINE 386BSD USER MANUAL BEFORE USE. A BOOK DISCUSSING THE INTERNALS + * OF 386BSD ENTITLED "386BSD FROM THE INSIDE OUT" WILL BE AVAILABLE LATE 1992. + * + * THIS SOFTWARE IS PROVIDED BY THE DEVELOPER ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE DEVELOPER BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * Changes Copyright (C) 1995, David Greenman & John Dyson; This software may + * be used, modified, copied, distributed, and sold, in both source and + * binary form provided that the above copyright and these terms are + * retained. Under no circumstances is the author responsible for the proper + * functioning of this software, nor does the author assume any responsibility + * for damages incurred with its use. + * + * $Id$ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/malloc.h> +#include <sys/rlist.h> +#include <sys/proc.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +/* + * Resource lists. 
+ */ + +#define RLIST_MIN 128 +static int rlist_count=0; +static struct rlist *rlfree; + +static struct rlist *rlist_malloc __P((void)); + +static struct rlist * +rlist_malloc() +{ + struct rlist *rl; + int i; + while( rlist_count < RLIST_MIN) { + int s = splhigh(); + rl = (struct rlist *)kmem_alloc(kernel_map, PAGE_SIZE); + splx(s); + if( !rl) + break; + + for(i=0;i<(PAGE_SIZE/(sizeof *rl));i++) { + rl->rl_next = rlfree; + rlfree = rl; + rlist_count++; + rl++; + } + } + + if( (rl = rlfree) == 0 ) + panic("Cannot get an rlist entry"); + + --rlist_count; + rlfree = rl->rl_next; + return rl; +} + +inline static void +rlist_mfree( struct rlist *rl) +{ + rl->rl_next = rlfree; + rlfree = rl; + ++rlist_count; +} + +void +rlist_free(rlh, start, end) + struct rlisthdr *rlh; + u_int start, end; +{ + struct rlist **rlp = &rlh->rlh_list; + struct rlist *prev_rlp = NULL, *cur_rlp = *rlp, *next_rlp = NULL; + int s; + + s = splhigh(); + while (rlh->rlh_lock & RLH_LOCKED) { + rlh->rlh_lock |= RLH_DESIRED; + tsleep(rlh, PSWP, "rlistf", 0); + } + rlh->rlh_lock |= RLH_LOCKED; + splx(s); + + /* + * Traverse the list looking for an entry after the one we want + * to insert. 
+ */ + while (cur_rlp != NULL) { + if (start < cur_rlp->rl_start) + break; +#ifdef DIAGNOSTIC + if (prev_rlp) { + if (prev_rlp->rl_end + 1 == cur_rlp->rl_start) + panic("rlist_free: missed coalesce opportunity"); + if (prev_rlp->rl_end == cur_rlp->rl_start) + panic("rlist_free: entries overlap"); + if (prev_rlp->rl_end > cur_rlp->rl_start) + panic("entries out of order"); + } +#endif + prev_rlp = cur_rlp; + cur_rlp = cur_rlp->rl_next; + } + + if (cur_rlp != NULL) { + + if (end >= cur_rlp->rl_start) + panic("rlist_free: free end overlaps already freed area"); + + if (prev_rlp) { + if (start <= prev_rlp->rl_end) + panic("rlist_free: free start overlaps already freed area"); + /* + * Attempt to append + */ + if (prev_rlp->rl_end + 1 == start) { + prev_rlp->rl_end = end; + /* + * Attempt to prepend and coalesce + */ + if (end + 1 == cur_rlp->rl_start) { + prev_rlp->rl_end = cur_rlp->rl_end; + prev_rlp->rl_next = cur_rlp->rl_next; + rlist_mfree(cur_rlp); + } + goto done; + } + } + /* + * Attempt to prepend + */ + if (end + 1 == cur_rlp->rl_start) { + cur_rlp->rl_start = start; + goto done; + } + } + /* + * Reached the end of the list without finding a larger entry. + * Append to last entry if there is one and it's adjacent. + */ + if (prev_rlp) { + if (start <= prev_rlp->rl_end) + panic("rlist_free: free start overlaps already freed area at list tail"); + /* + * Attempt to append + */ + if (prev_rlp->rl_end + 1 == start) { + prev_rlp->rl_end = end; + goto done; + } + } + + /* + * Could neither append nor prepend; allocate a new entry. + */ + next_rlp = cur_rlp; + cur_rlp = rlist_malloc(); + cur_rlp->rl_start = start; + cur_rlp->rl_end = end; + cur_rlp->rl_next = next_rlp; + if (prev_rlp) { + prev_rlp->rl_next = cur_rlp; + } else { + /* + * No previous - this entry is the new list head. 
+ */ + *rlp = cur_rlp; + } + +done: + rlh->rlh_lock &= ~RLH_LOCKED; + if (rlh->rlh_lock & RLH_DESIRED) { + wakeup(rlh); + rlh->rlh_lock &= ~RLH_DESIRED; + } + return; +} + +/* + * Obtain a region of desired size from a resource list. + * If nothing available of that size, return 0. Otherwise, + * return a value of 1 and set resource start location with + * "*loc". (Note: loc can be zero if we don't wish the value) + */ +int +rlist_alloc (rlh, size, loc) + struct rlisthdr *rlh; + unsigned size, *loc; +{ + struct rlist **rlp = &rlh->rlh_list; + register struct rlist *lp; + int s; + register struct rlist *olp = 0; + + s = splhigh(); + while (rlh->rlh_lock & RLH_LOCKED) { + rlh->rlh_lock |= RLH_DESIRED; + tsleep(rlh, PSWP, "rlistf", 0); + } + rlh->rlh_lock |= RLH_LOCKED; + splx(s); + + /* walk list, allocating first thing that's big enough (first fit) */ + for (; *rlp; rlp = &((*rlp)->rl_next)) + if(size <= (*rlp)->rl_end - (*rlp)->rl_start + 1) { + + /* hand it to the caller */ + if (loc) *loc = (*rlp)->rl_start; + (*rlp)->rl_start += size; + + /* did we eat this element entirely? */ + if ((*rlp)->rl_start > (*rlp)->rl_end) { + lp = (*rlp)->rl_next; + rlist_mfree(*rlp); + /* + * if the deleted element was in fromt + * of the list, adjust *rlp, else don't. + */ + if (olp) { + olp->rl_next = lp; + } else { + *rlp = lp; + } + } + + rlh->rlh_lock &= ~RLH_LOCKED; + if (rlh->rlh_lock & RLH_DESIRED) { + wakeup(rlh); + rlh->rlh_lock &= ~RLH_DESIRED; + } + return (1); + } else { + olp = *rlp; + } + + rlh->rlh_lock &= ~RLH_LOCKED; + if (rlh->rlh_lock & RLH_DESIRED) { + wakeup(rlh); + rlh->rlh_lock &= ~RLH_DESIRED; + } + /* nothing in list that's big enough */ + return (0); +} + +/* + * Finished with this resource list, reclaim all space and + * mark it as being empty. 
+ */ +void +rlist_destroy (rlh) + struct rlisthdr *rlh; +{ + struct rlist **rlp = &rlh->rlh_list; + struct rlist *lp, *nlp; + + lp = *rlp; + *rlp = 0; + for (; lp; lp = nlp) { + nlp = lp->rl_next; + rlist_mfree(lp); + } +} diff --git a/sys/kern/subr_rmap.c b/sys/kern/subr_rmap.c deleted file mode 100644 index 2f31173..0000000 --- a/sys/kern/subr_rmap.c +++ /dev/null @@ -1,81 +0,0 @@ -/*- - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. - * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. - * 2. Redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. - * - * from: @(#)subr_rmap.c 8.1 (Berkeley) 6/10/93 - */ - -#include <sys/param.h> -#include <sys/map.h> -#include <sys/proc.h> - -void -rminit(a1, a2, a3, a4, a5) - struct map *a1; - long a2, a3; - char *a4; - int a5; -{ - - /* - * Body deleted. - */ - return; -} - -long -rmalloc(a1, a2) - struct map *a1; - long a2; -{ - - /* - * Body deleted. - */ - return (0); -} - -void -rmfree(a1, a2, a3) - struct map *a1; - long a2, a3; -{ - - /* - * Body deleted. - */ - return; -} diff --git a/sys/kern/subr_trap.c b/sys/kern/subr_trap.c new file mode 100644 index 0000000..9dca842 --- /dev/null +++ b/sys/kern/subr_trap.c @@ -0,0 +1,940 @@ +/*- + * Copyright (C) 1994, David Greenman + * Copyright (c) 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the University of Utah, and William Jolitz. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * from: @(#)trap.c 7.4 (Berkeley) 5/13/91 + * $Id$ + */ + +/* + * 386 Trap and System call handling + */ + +#include "opt_ktrace.h" +#include "opt_ddb.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/acct.h> +#include <sys/kernel.h> +#include <sys/syscall.h> +#include <sys/sysent.h> +#include <sys/queue.h> +#include <sys/vmmeter.h> +#ifdef KTRACE +#include <sys/ktrace.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_kern.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> +#include <vm/vm_extern.h> + +#include <sys/user.h> + +#include <machine/cpu.h> +#include <machine/md_var.h> +#include <machine/psl.h> +#include <machine/reg.h> +#include <machine/trap.h> +#include <machine/../isa/isa_device.h> + +#ifdef POWERFAIL_NMI +#include <sys/syslog.h> +#include <machine/clock.h> +#endif + +#include "isa.h" +#include "npx.h" + +int (*pmath_emulate) __P((struct trapframe *)); + +extern void trap __P((struct trapframe frame)); +extern int trapwrite __P((unsigned addr)); +extern void syscall __P((struct trapframe frame)); + +static int trap_pfault __P((struct trapframe *, int)); +static void trap_fatal __P((struct trapframe *)); +void dblfault_handler __P((void)); + +extern inthand_t IDTVEC(syscall); + +#define MAX_TRAP_MSG 28 +static char *trap_msg[] = { + "", /* 0 unused */ + "privileged instruction fault", /* 1 T_PRIVINFLT */ + "", /* 2 unused */ + "breakpoint instruction fault", /* 3 T_BPTFLT */ + "", /* 4 unused */ + "", /* 5 unused */ + "arithmetic trap", /* 6 T_ARITHTRAP */ + "system forced exception", /* 7 T_ASTFLT */ + "", /* 8 unused */ + "general protection fault", /* 9 T_PROTFLT */ + "trace trap", /* 10 T_TRCTRAP */ + "", /* 11 unused */ + "page fault", /* 12 T_PAGEFLT */ + "", /* 13 unused */ + "alignment fault", /* 14 T_ALIGNFLT */ + "", /* 15 unused */ + "", /* 16 unused */ + "", /* 17 unused */ + "integer divide 
fault", /* 18 T_DIVIDE */ + "non-maskable interrupt trap", /* 19 T_NMI */ + "overflow trap", /* 20 T_OFLOW */ + "FPU bounds check fault", /* 21 T_BOUND */ + "FPU device not available", /* 22 T_DNA */ + "double fault", /* 23 T_DOUBLEFLT */ + "FPU operand fetch fault", /* 24 T_FPOPFLT */ + "invalid TSS fault", /* 25 T_TSSFLT */ + "segment not present fault", /* 26 T_SEGNPFLT */ + "stack fault", /* 27 T_STKFLT */ + "machine check trap", /* 28 T_MCHK */ +}; + +static void userret __P((struct proc *p, struct trapframe *frame, + u_quad_t oticks)); + +static inline void +userret(p, frame, oticks) + struct proc *p; + struct trapframe *frame; + u_quad_t oticks; +{ + int sig, s; + + while ((sig = CURSIG(p)) != 0) + postsig(sig); + p->p_priority = p->p_usrpri; + if (want_resched) { + /* + * Since we are curproc, clock will normally just change + * our priority without moving us from one queue to another + * (since the running process is not on a queue.) + * If that happened after we setrunqueue ourselves but before we + * mi_switch()'ed, we might not be on the queue indicated by + * our priority. + */ + s = splhigh(); + setrunqueue(p); + p->p_stats->p_ru.ru_nivcsw++; + mi_switch(); + splx(s); + while ((sig = CURSIG(p)) != 0) + postsig(sig); + } + /* + * Charge system time if profiling. + */ + if (p->p_flag & P_PROFIL) + addupc_task(p, frame->tf_eip, + (u_int)(p->p_sticks - oticks) * psratio); + + curpriority = p->p_priority; +} + +/* + * Exception, fault, and trap interface to the FreeBSD kernel. + * This common code is called from assembly language IDT gate entry + * routines that prepare a suitable stack frame, and restore this + * frame after the exception has been processed. 
+ */ + +void +trap(frame) + struct trapframe frame; +{ + struct proc *p = curproc; + u_quad_t sticks = 0; + int i = 0, ucode = 0, type, code; +#ifdef DEBUG + u_long eva; +#endif + + type = frame.tf_trapno; + code = frame.tf_err; + + if (ISPL(frame.tf_cs) == SEL_UPL) { + /* user trap */ + + sticks = p->p_sticks; + p->p_md.md_regs = (int *)&frame; + + switch (type) { + case T_PRIVINFLT: /* privileged instruction fault */ + ucode = type; + i = SIGILL; + break; + + case T_BPTFLT: /* bpt instruction fault */ + case T_TRCTRAP: /* trace trap */ + frame.tf_eflags &= ~PSL_T; + i = SIGTRAP; + break; + + case T_ARITHTRAP: /* arithmetic trap */ + ucode = code; + i = SIGFPE; + break; + + case T_ASTFLT: /* Allow process switch */ + astoff(); + cnt.v_soft++; + if (p->p_flag & P_OWEUPC) { + p->p_flag &= ~P_OWEUPC; + addupc_task(p, p->p_stats->p_prof.pr_addr, + p->p_stats->p_prof.pr_ticks); + } + goto out; + + case T_PROTFLT: /* general protection fault */ + case T_SEGNPFLT: /* segment not present fault */ + case T_STKFLT: /* stack fault */ + case T_TSSFLT: /* invalid TSS fault */ + case T_DOUBLEFLT: /* double fault */ + default: + ucode = code + BUS_SEGM_FAULT ; + i = SIGBUS; + break; + + case T_PAGEFLT: /* page fault */ + i = trap_pfault(&frame, TRUE); + if (i == -1) + return; + if (i == 0) + goto out; + + ucode = T_PAGEFLT; + break; + + case T_DIVIDE: /* integer divide fault */ + ucode = FPE_INTDIV_TRAP; + i = SIGFPE; + break; + +#if NISA > 0 + case T_NMI: +#ifdef POWERFAIL_NMI + goto handle_powerfail; +#else /* !POWERFAIL_NMI */ +#ifdef DDB + /* NMI can be hooked up to a pushbutton for debugging */ + printf ("NMI ... 
going to debugger\n"); + if (kdb_trap (type, 0, &frame)) + return; +#endif /* DDB */ + /* machine/parity/power fail/"kitchen sink" faults */ + if (isa_nmi(code) == 0) return; + panic("NMI indicates hardware failure"); +#endif /* POWERFAIL_NMI */ +#endif /* NISA > 0 */ + + case T_OFLOW: /* integer overflow fault */ + ucode = FPE_INTOVF_TRAP; + i = SIGFPE; + break; + + case T_BOUND: /* bounds check fault */ + ucode = FPE_SUBRNG_TRAP; + i = SIGFPE; + break; + + case T_DNA: +#if NNPX > 0 + /* if a transparent fault (due to context switch "late") */ + if (npxdna()) + return; +#endif + if (!pmath_emulate) { + i = SIGFPE; + ucode = FPE_FPU_NP_TRAP; + break; + } + i = (*pmath_emulate)(&frame); + if (i == 0) { + if (!(frame.tf_eflags & PSL_T)) + return; + frame.tf_eflags &= ~PSL_T; + i = SIGTRAP; + } + /* else ucode = emulator_only_knows() XXX */ + break; + + case T_FPOPFLT: /* FPU operand fetch fault */ + ucode = T_FPOPFLT; + i = SIGILL; + break; + } + } else { + /* kernel trap */ + + switch (type) { + case T_PAGEFLT: /* page fault */ + (void) trap_pfault(&frame, FALSE); + return; + + case T_DNA: +#if NNPX > 0 + /* + * The kernel is apparently using npx for copying. + * XXX this should be fatal unless the kernel has + * registered such use. + */ + if (npxdna()) + return; +#endif + break; + + case T_PROTFLT: /* general protection fault */ + case T_SEGNPFLT: /* segment not present fault */ + /* + * Invalid segment selectors and out of bounds + * %eip's and %esp's can be set up in user mode. + * This causes a fault in kernel mode when the + * kernel tries to return to user mode. We want + * to get this fault so that we can fix the + * problem here and not have to check all the + * selectors and pointers when the user changes + * them. 
+ */ +#define MAYBE_DORETI_FAULT(where, whereto) \ + do { \ + if (frame.tf_eip == (int)where) { \ + frame.tf_eip = (int)whereto; \ + return; \ + } \ + } while (0) + + if (intr_nesting_level == 0) { + MAYBE_DORETI_FAULT(doreti_iret, + doreti_iret_fault); + MAYBE_DORETI_FAULT(doreti_popl_ds, + doreti_popl_ds_fault); + MAYBE_DORETI_FAULT(doreti_popl_es, + doreti_popl_es_fault); + if (curpcb && curpcb->pcb_onfault) { + frame.tf_eip = (int)curpcb->pcb_onfault; + return; + } + } + break; + + case T_TSSFLT: + /* + * PSL_NT can be set in user mode and isn't cleared + * automatically when the kernel is entered. This + * causes a TSS fault when the kernel attempts to + * `iret' because the TSS link is uninitialized. We + * want to get this fault so that we can fix the + * problem here and not every time the kernel is + * entered. + */ + if (frame.tf_eflags & PSL_NT) { + frame.tf_eflags &= ~PSL_NT; + return; + } + break; + + case T_TRCTRAP: /* trace trap */ + if (frame.tf_eip == (int)IDTVEC(syscall)) { + /* + * We've just entered system mode via the + * syscall lcall. Continue single stepping + * silently until the syscall handler has + * saved the flags. + */ + return; + } + if (frame.tf_eip == (int)IDTVEC(syscall) + 1) { + /* + * The syscall handler has now saved the + * flags. Stop single stepping it. + */ + frame.tf_eflags &= ~PSL_T; + return; + } + /* + * Fall through. + */ + case T_BPTFLT: + /* + * If DDB is enabled, let it handle the debugger trap. + * Otherwise, debugger traps "can't happen". 
+ */ +#ifdef DDB + if (kdb_trap (type, 0, &frame)) + return; +#endif + break; + +#if NISA > 0 + case T_NMI: +#ifdef POWERFAIL_NMI +#ifndef TIMER_FREQ +# define TIMER_FREQ 1193182 +#endif + handle_powerfail: + { + static unsigned lastalert = 0; + + if(time.tv_sec - lastalert > 10) + { + log(LOG_WARNING, "NMI: power fail\n"); + sysbeep(TIMER_FREQ/880, hz); + lastalert = time.tv_sec; + } + return; + } +#else /* !POWERFAIL_NMI */ +#ifdef DDB + /* NMI can be hooked up to a pushbutton for debugging */ + printf ("NMI ... going to debugger\n"); + if (kdb_trap (type, 0, &frame)) + return; +#endif /* DDB */ + /* machine/parity/power fail/"kitchen sink" faults */ + if (isa_nmi(code) == 0) return; + /* FALL THROUGH */ +#endif /* POWERFAIL_NMI */ +#endif /* NISA > 0 */ + } + + trap_fatal(&frame); + return; + } + + trapsignal(p, i, ucode); + +#ifdef DEBUG + eva = rcr2(); + if (type <= MAX_TRAP_MSG) { + uprintf("fatal process exception: %s", + trap_msg[type]); + if ((type == T_PAGEFLT) || (type == T_PROTFLT)) + uprintf(", fault VA = 0x%x", eva); + uprintf("\n"); + } +#endif + +out: + userret(p, &frame, sticks); +} + +#ifdef notyet +/* + * This version doesn't allow a page fault to user space while + * in the kernel. The rest of the kernel needs to be made "safe" + * before this can be used. I think the only things remaining + * to be made safe are the iBCS2 code and the process tracing/ + * debugging code. 
+ */ +static int +trap_pfault(frame, usermode) + struct trapframe *frame; + int usermode; +{ + vm_offset_t va; + struct vmspace *vm = NULL; + vm_map_t map = 0; + int rv = 0; + vm_prot_t ftype; + int eva; + struct proc *p = curproc; + + if (frame->tf_err & PGEX_W) + ftype = VM_PROT_READ | VM_PROT_WRITE; + else + ftype = VM_PROT_READ; + + eva = rcr2(); + va = trunc_page((vm_offset_t)eva); + + if (va < VM_MIN_KERNEL_ADDRESS) { + vm_offset_t v; + vm_page_t mpte; + + if (p == NULL || + (!usermode && va < VM_MAXUSER_ADDRESS && + (intr_nesting_level != 0 || curpcb == NULL || + curpcb->pcb_onfault == NULL))) { + trap_fatal(frame); + return (-1); + } + + /* + * This is a fault on non-kernel virtual memory. + * vm is initialized above to NULL. If curproc is NULL + * or curproc->p_vmspace is NULL the fault is fatal. + */ + vm = p->p_vmspace; + if (vm == NULL) + goto nogo; + + map = &vm->vm_map; + + /* + * Keep swapout from messing with us during this + * critical time. + */ + ++p->p_lock; + + /* + * Grow the stack if necessary + */ + if ((caddr_t)va > vm->vm_maxsaddr + && (caddr_t)va < (caddr_t)USRSTACK) { + if (!grow(p, va)) { + rv = KERN_FAILURE; + --p->p_lock; + goto nogo; + } + } + + /* Fault in the user page: */ + rv = vm_fault(map, va, ftype, FALSE); + + --p->p_lock; + } else { + /* + * Don't allow user-mode faults in kernel address space. + */ + if (usermode) + goto nogo; + + /* + * Since we know that kernel virtual address addresses + * always have pte pages mapped, we just have to fault + * the page. + */ + rv = vm_fault(kernel_map, va, ftype, FALSE); + } + + if (rv == KERN_SUCCESS) + return (0); +nogo: + if (!usermode) { + if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) { + frame->tf_eip = (int)curpcb->pcb_onfault; + return (0); + } + trap_fatal(frame); + return (-1); + } + + /* kludge to pass faulting virtual address to sendsig */ + frame->tf_err = eva; + + return((rv == KERN_PROTECTION_FAILURE) ? 
SIGBUS : SIGSEGV); +} +#endif + +int +trap_pfault(frame, usermode) + struct trapframe *frame; + int usermode; +{ + vm_offset_t va; + struct vmspace *vm = NULL; + vm_map_t map = 0; + int rv = 0; + vm_prot_t ftype; + int eva; + struct proc *p = curproc; + + eva = rcr2(); + va = trunc_page((vm_offset_t)eva); + + if (va >= KERNBASE) { + /* + * Don't allow user-mode faults in kernel address space. + */ + if (usermode) + goto nogo; + + map = kernel_map; + } else { + /* + * This is a fault on non-kernel virtual memory. + * vm is initialized above to NULL. If curproc is NULL + * or curproc->p_vmspace is NULL the fault is fatal. + */ + if (p != NULL) + vm = p->p_vmspace; + + if (vm == NULL) + goto nogo; + + map = &vm->vm_map; + } + + if (frame->tf_err & PGEX_W) + ftype = VM_PROT_READ | VM_PROT_WRITE; + else + ftype = VM_PROT_READ; + + if (map != kernel_map) { + /* + * Keep swapout from messing with us during this + * critical time. + */ + ++p->p_lock; + + /* + * Grow the stack if necessary + */ + if ((caddr_t)va > vm->vm_maxsaddr + && (caddr_t)va < (caddr_t)USRSTACK) { + if (!grow(p, va)) { + rv = KERN_FAILURE; + --p->p_lock; + goto nogo; + } + } + + /* Fault in the user page: */ + rv = vm_fault(map, va, ftype, FALSE); + + --p->p_lock; + } else { + /* + * Since we know that kernel virtual address addresses + * always have pte pages mapped, we just have to fault + * the page. + */ + rv = vm_fault(map, va, ftype, FALSE); + } + + if (rv == KERN_SUCCESS) + return (0); +nogo: + if (!usermode) { + if (intr_nesting_level == 0 && curpcb && curpcb->pcb_onfault) { + frame->tf_eip = (int)curpcb->pcb_onfault; + return (0); + } + trap_fatal(frame); + return (-1); + } + + /* kludge to pass faulting virtual address to sendsig */ + frame->tf_err = eva; + + return((rv == KERN_PROTECTION_FAILURE) ? 
SIGBUS : SIGSEGV); +} + +static void +trap_fatal(frame) + struct trapframe *frame; +{ + int code, type, eva, ss, esp; + struct soft_segment_descriptor softseg; + + code = frame->tf_err; + type = frame->tf_trapno; + eva = rcr2(); + sdtossd(&gdt[IDXSEL(frame->tf_cs & 0xffff)].sd, &softseg); + + if (type <= MAX_TRAP_MSG) + printf("\n\nFatal trap %d: %s while in %s mode\n", + type, trap_msg[type], + ISPL(frame->tf_cs) == SEL_UPL ? "user" : "kernel"); + if (type == T_PAGEFLT) { + printf("fault virtual address = 0x%x\n", eva); + printf("fault code = %s %s, %s\n", + code & PGEX_U ? "user" : "supervisor", + code & PGEX_W ? "write" : "read", + code & PGEX_P ? "protection violation" : "page not present"); + } + printf("instruction pointer = 0x%x:0x%x\n", + frame->tf_cs & 0xffff, frame->tf_eip); + if (ISPL(frame->tf_cs) == SEL_UPL) { + ss = frame->tf_ss & 0xffff; + esp = frame->tf_esp; + } else { + ss = GSEL(GDATA_SEL, SEL_KPL); + esp = (int)&frame->tf_esp; + } + printf("stack pointer = 0x%x:0x%x\n", ss, esp); + printf("frame pointer = 0x%x:0x%x\n", ss, frame->tf_ebp); + printf("code segment = base 0x%x, limit 0x%x, type 0x%x\n", + softseg.ssd_base, softseg.ssd_limit, softseg.ssd_type); + printf(" = DPL %d, pres %d, def32 %d, gran %d\n", + softseg.ssd_dpl, softseg.ssd_p, softseg.ssd_def32, + softseg.ssd_gran); + printf("processor eflags = "); + if (frame->tf_eflags & PSL_T) + printf("trace trap, "); + if (frame->tf_eflags & PSL_I) + printf("interrupt enabled, "); + if (frame->tf_eflags & PSL_NT) + printf("nested task, "); + if (frame->tf_eflags & PSL_RF) + printf("resume, "); + if (frame->tf_eflags & PSL_VM) + printf("vm86, "); + printf("IOPL = %d\n", (frame->tf_eflags & PSL_IOPL) >> 12); + printf("current process = "); + if (curproc) { + printf("%lu (%s)\n", + (u_long)curproc->p_pid, curproc->p_comm ? 
+ curproc->p_comm : ""); + } else { + printf("Idle\n"); + } + printf("interrupt mask = "); + if ((cpl & net_imask) == net_imask) + printf("net "); + if ((cpl & tty_imask) == tty_imask) + printf("tty "); + if ((cpl & bio_imask) == bio_imask) + printf("bio "); + if (cpl == 0) + printf("none"); + printf("\n"); + +#ifdef KDB + if (kdb_trap(&psl)) + return; +#endif +#ifdef DDB + if (kdb_trap (type, 0, frame)) + return; +#endif + if (type <= MAX_TRAP_MSG) + panic(trap_msg[type]); + else + panic("unknown/reserved trap"); +} + +/* + * Double fault handler. Called when a fault occurs while writing + * a frame for a trap/exception onto the stack. This usually occurs + * when the stack overflows (such is the case with infinite recursion, + * for example). + * + * XXX Note that the current PTD gets replaced by IdlePTD when the + * task switch occurs. This means that the stack that was active at + * the time of the double fault is not available at <kstack> unless + * the machine was idle when the double fault occurred. The downside + * of this is that "trace <ebp>" in ddb won't work. + */ +void +dblfault_handler() +{ + struct pcb *pcb = curpcb; + + if (pcb != NULL) { + printf("\nFatal double fault:\n"); + printf("eip = 0x%x\n", pcb->pcb_tss.tss_eip); + printf("esp = 0x%x\n", pcb->pcb_tss.tss_esp); + printf("ebp = 0x%x\n", pcb->pcb_tss.tss_ebp); + } + + panic("double fault"); +} + +/* + * Compensate for 386 brain damage (missing URKR). + * This is a little simpler than the pagefault handler in trap() because + * it the page tables have already been faulted in and high addresses + * are thrown out early for other reasons. + */ +int trapwrite(addr) + unsigned addr; +{ + struct proc *p; + vm_offset_t va; + struct vmspace *vm; + int rv; + + va = trunc_page((vm_offset_t)addr); + /* + * XXX - MAX is END. Changed > to >= for temp. fix. 
+ */ + if (va >= VM_MAXUSER_ADDRESS) + return (1); + + p = curproc; + vm = p->p_vmspace; + + ++p->p_lock; + + if ((caddr_t)va >= vm->vm_maxsaddr + && (caddr_t)va < (caddr_t)USRSTACK) { + if (!grow(p, va)) { + --p->p_lock; + return (1); + } + } + + /* + * fault the data page + */ + rv = vm_fault(&vm->vm_map, va, VM_PROT_READ|VM_PROT_WRITE, FALSE); + + --p->p_lock; + + if (rv != KERN_SUCCESS) + return 1; + + return (0); +} + +/* + * System call request from POSIX system call gate interface to kernel. + * Like trap(), argument is call by reference. + */ +void +syscall(frame) + struct trapframe frame; +{ + caddr_t params; + int i; + struct sysent *callp; + struct proc *p = curproc; + u_quad_t sticks; + int error; + int args[8], rval[2]; + u_int code; + + sticks = p->p_sticks; + if (ISPL(frame.tf_cs) != SEL_UPL) + panic("syscall"); + + p->p_md.md_regs = (int *)&frame; + params = (caddr_t)frame.tf_esp + sizeof(int); + code = frame.tf_eax; + if (p->p_sysent->sv_prepsyscall) { + (*p->p_sysent->sv_prepsyscall)(&frame, args, &code, ¶ms); + } else { + /* + * Need to check if this is a 32 bit or 64 bit syscall. + */ + if (code == SYS_syscall) { + /* + * Code is first argument, followed by actual args. + */ + code = fuword(params); + params += sizeof(int); + } else if (code == SYS___syscall) { + /* + * Like syscall, but code is a quad, so as to maintain + * quad alignment for the rest of the arguments. 
+ */ + code = fuword(params); + params += sizeof(quad_t); + } + } + + if (p->p_sysent->sv_mask) + code &= p->p_sysent->sv_mask; + + if (code >= p->p_sysent->sv_size) + callp = &p->p_sysent->sv_table[0]; + else + callp = &p->p_sysent->sv_table[code]; + + if (params && (i = callp->sy_narg * sizeof(int)) && + (error = copyin(params, (caddr_t)args, (u_int)i))) { +#ifdef KTRACE + if (KTRPOINT(p, KTR_SYSCALL)) + ktrsyscall(p->p_tracep, code, callp->sy_narg, args); +#endif + goto bad; + } +#ifdef KTRACE + if (KTRPOINT(p, KTR_SYSCALL)) + ktrsyscall(p->p_tracep, code, callp->sy_narg, args); +#endif + rval[0] = 0; + rval[1] = frame.tf_edx; + + error = (*callp->sy_call)(p, args, rval); + + switch (error) { + + case 0: + /* + * Reinitialize proc pointer `p' as it may be different + * if this is a child returning from fork syscall. + */ + p = curproc; + frame.tf_eax = rval[0]; + frame.tf_edx = rval[1]; + frame.tf_eflags &= ~PSL_C; + break; + + case ERESTART: + /* + * Reconstruct pc, assuming lcall $X,y is 7 bytes, + * int 0x80 is 2 bytes. We saved this in tf_err. + */ + frame.tf_eip -= frame.tf_err; + break; + + case EJUSTRETURN: + break; + + default: +bad: + if (p->p_sysent->sv_errsize) + if (error >= p->p_sysent->sv_errsize) + error = -1; /* XXX */ + else + error = p->p_sysent->sv_errtbl[error]; + frame.tf_eax = error; + frame.tf_eflags |= PSL_C; + break; + } + + if (frame.tf_eflags & PSL_T) { + /* Traced syscall. */ + frame.tf_eflags &= ~PSL_T; + trapsignal(p, SIGTRAP, 0); + } + + userret(p, &frame, sticks); + +#ifdef KTRACE + if (KTRPOINT(p, KTR_SYSRET)) + ktrsysret(p->p_tracep, code, error, rval[0]); +#endif +} diff --git a/sys/kern/subr_xxx.c b/sys/kern/subr_xxx.c index 45b2d64..5ff7dcc 100644 --- a/sys/kern/subr_xxx.c +++ b/sys/kern/subr_xxx.c @@ -30,88 +30,282 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)subr_xxx.c 8.3 (Berkeley) 3/29/95 + * @(#)subr_xxx.c 8.1 (Berkeley) 6/10/93 + * $Id$ */ /* - * Miscellaneous trivial functions, including many - * that are often inline-expanded or done in assembler. + * Miscellaneous trivial functions. */ #include <sys/param.h> #include <sys/systm.h> -#include <machine/cpu.h> - /* - * Unsupported device function (e.g. writing to read-only device). + * Return error for operation not supported + * on a specific object or file type. */ int -enodev() +eopnotsupp() { - return (ENODEV); + return (EOPNOTSUPP); } /* - * Unconfigured device function; driver not configured. + * Return error for an inval operation + * on a specific object or file type. */ int -enxio() +einval() { - return (ENXIO); + return (EINVAL); } /* - * Unsupported ioctl function. + * Generic null operation, always returns success. */ int -enoioctl() +nullop() { - return (ENOTTY); + return (0); } +#include <sys/conf.h> + /* - * Unsupported system function. - * This is used for an otherwise-reasonable operation - * that is not supported by the current system binary. + * Unsupported devswitch functions (e.g. for writing to read-only device). + * XXX may belong elsewhere. 
*/ + +int +noopen(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (ENODEV); +} + +int +noclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (ENODEV); +} + +int +noread(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + return (ENODEV); +} + +int +nowrite(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + return (ENODEV); +} + +int +noioctl(dev, cmd, data, flags, p) + dev_t dev; + int cmd; + caddr_t data; + int flags; + struct proc *p; +{ + + return (ENODEV); +} + +void +nostop(tp, rw) + struct tty *tp; + int rw; +{ + +} + +int +noreset(dev) + dev_t dev; +{ + + printf("noreset(0x%x) called\n", dev); + return (ENODEV); +} + +struct tty * +nodevtotty(dev) + dev_t dev; +{ + + return (NULL); +} + +int +noselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + + /* XXX is this distinguished from 1 for data available? */ + return (ENODEV); +} + +int +nommap(dev, offset, nprot) + dev_t dev; + int offset; + int nprot; +{ + + /* Don't return ENODEV. That would allow mapping address ENODEV! */ + return (-1); +} + int -enosys() +nodump(dev) + dev_t dev; { - return (ENOSYS); + return (ENODEV); } /* - * Return error for operation not supported - * on a specific object or file type. + * Null devswitch functions (for when the operation always succeeds). + * XXX may belong elsewhere. + * XXX not all are here (e.g., seltrue() isn't). + */ + +/* + * XXX this is probably bogus. Any device that uses it isn't checking the + * minor number. */ int -eopnotsupp() +nullopen(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; { - return (EOPNOTSUPP); + return (0); +} + +int +nullclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + + return (0); } /* - * Return error for an inval operation - * on a specific object or file type. + * Unconfigured devswitch functions (for unconfigured drivers). 
+ * XXX may belong elsewhere. */ + int -einval() +nxopen(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; { - return (EINVAL); + return (ENXIO); } /* - * Generic null operation, always returns success. + * XXX all nx functions except nxopen() should probably go away. They + * probably can't be called for non-open devices. */ + int -nullop() +nxclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; { - return (0); + printf("nxclose(0x%x) called\n", dev); + return (ENXIO); +} + +int +nxread(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + printf("nxread(0x%x) called\n", dev); + return (ENXIO); +} + +int +nxwrite(dev, uio, ioflag) + dev_t dev; + struct uio *uio; + int ioflag; +{ + + printf("nxwrite(0x%x) called\n", dev); + return (ENXIO); +} + +int +nxioctl(dev, cmd, data, flags, p) + dev_t dev; + int cmd; + caddr_t data; + int flags; + struct proc *p; +{ + + printf("nxioctl(0x%x) called\n", dev); + return (ENXIO); +} + +int +nxselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + + printf("nxselect(0x%x) called\n", dev); + + /* XXX is this distinguished from 1 for data available? */ + return (ENXIO); +} + +int +nxdump(dev) + dev_t dev; +{ + + printf("nxdump(0x%x) called\n", dev); + return (ENXIO); } diff --git a/sys/kern/sys_generic.c b/sys/kern/sys_generic.c index 08385b3..2bcfd68 100644 --- a/sys/kern/sys_generic.c +++ b/sys/kern/sys_generic.c @@ -35,15 +35,24 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)sys_generic.c 8.9 (Berkeley) 2/14/95 + * @(#)sys_generic.c 8.5 (Berkeley) 1/21/94 + * $Id: sys_generic.c,v 1.25 1997/03/23 03:36:23 bde Exp $ */ +#include "opt_ktrace.h" + #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/filedesc.h> -#include <sys/ioctl.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/fcntl.h> #include <sys/file.h> #include <sys/proc.h> +#include <sys/stat.h> +#include <sys/signalvar.h> +#include <sys/socket.h> #include <sys/socketvar.h> #include <sys/uio.h> #include <sys/kernel.h> @@ -52,23 +61,26 @@ #ifdef KTRACE #include <sys/ktrace.h> #endif +#include <vm/vm.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> +static int selscan __P((struct proc *, fd_mask **, fd_mask **, int, int *)); /* * Read system call. */ +#ifndef _SYS_SYSPROTO_H_ +struct read_args { + int fd; + char *buf; + u_int nbyte; +}; +#endif /* ARGSUSED */ int read(p, uap, retval) struct proc *p; - register struct read_args /* { - syscallarg(int) fd; - syscallarg(char *) buf; - syscallarg(u_int) nbyte; - } */ *uap; - register_t *retval; + register struct read_args *uap; + int *retval; { register struct file *fp; register struct filedesc *fdp = p->p_fd; @@ -79,15 +91,19 @@ read(p, uap, retval) struct iovec ktriov; #endif - if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL || + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || (fp->f_flag & FREAD) == 0) return (EBADF); - aiov.iov_base = (caddr_t)SCARG(uap, buf); - aiov.iov_len = SCARG(uap, nbyte); + aiov.iov_base = (caddr_t)uap->buf; + aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; - auio.uio_resid = SCARG(uap, nbyte); + + auio.uio_resid = uap->nbyte; + if (auio.uio_resid < 0) + return (EINVAL); + auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_procp = p; @@ -98,16 +114,15 @@ read(p, uap, retval) if (KTRPOINT(p, KTR_GENIO)) ktriov = aiov; 
#endif - cnt = SCARG(uap, nbyte); - if (error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)) + cnt = uap->nbyte; + if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred))) if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; cnt -= auio.uio_resid; #ifdef KTRACE if (KTRPOINT(p, KTR_GENIO) && error == 0) - ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, &ktriov, - cnt, error); + ktrgenio(p->p_tracep, uap->fd, UIO_READ, &ktriov, cnt, error); #endif *retval = cnt; return (error); @@ -116,15 +131,18 @@ read(p, uap, retval) /* * Scatter read system call. */ +#ifndef _SYS_SYSPROTO_H_ +struct readv_args { + int fd; + struct iovec *iovp; + u_int iovcnt; +}; +#endif int readv(p, uap, retval) struct proc *p; - register struct readv_args /* { - syscallarg(int) fd; - syscallarg(struct iovec *) iovp; - syscallarg(u_int) iovcnt; - } */ *uap; - register_t *retval; + register struct readv_args *uap; + int *retval; { register struct file *fp; register struct filedesc *fdp = p->p_fd; @@ -138,14 +156,14 @@ readv(p, uap, retval) struct iovec *ktriov = NULL; #endif - if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL || + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || (fp->f_flag & FREAD) == 0) return (EBADF); /* note: can't use iovlen until iovcnt is validated */ - iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec); - if (SCARG(uap, iovcnt) > UIO_SMALLIOV) { - if (SCARG(uap, iovcnt) > UIO_MAXIOV) + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) return (EINVAL); MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; @@ -154,19 +172,19 @@ readv(p, uap, retval) needfree = NULL; } auio.uio_iov = iov; - auio.uio_iovcnt = SCARG(uap, iovcnt); + auio.uio_iovcnt = uap->iovcnt; auio.uio_rw = UIO_READ; auio.uio_segflg = UIO_USERSPACE; auio.uio_procp = p; - if (error = 
copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen)) + if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) goto done; auio.uio_resid = 0; - for (i = 0; i < SCARG(uap, iovcnt); i++) { - if (auio.uio_resid + iov->iov_len < auio.uio_resid) { + for (i = 0; i < uap->iovcnt; i++) { + auio.uio_resid += iov->iov_len; + if (auio.uio_resid < 0) { error = EINVAL; goto done; } - auio.uio_resid += iov->iov_len; iov++; } #ifdef KTRACE @@ -179,7 +197,7 @@ readv(p, uap, retval) } #endif cnt = auio.uio_resid; - if (error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred)) + if ((error = (*fp->f_ops->fo_read)(fp, &auio, fp->f_cred))) if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; @@ -187,7 +205,7 @@ readv(p, uap, retval) #ifdef KTRACE if (ktriov != NULL) { if (error == 0) - ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_READ, ktriov, + ktrgenio(p->p_tracep, uap->fd, UIO_READ, ktriov, cnt, error); FREE(ktriov, M_TEMP); } @@ -202,15 +220,18 @@ done: /* * Write system call */ +#ifndef _SYS_SYSPROTO_H_ +struct write_args { + int fd; + char *buf; + u_int nbyte; +}; +#endif int write(p, uap, retval) struct proc *p; - register struct write_args /* { - syscallarg(int) fd; - syscallarg(char *) buf; - syscallarg(u_int) nbyte; - } */ *uap; - register_t *retval; + register struct write_args *uap; + int *retval; { register struct file *fp; register struct filedesc *fdp = p->p_fd; @@ -221,15 +242,15 @@ write(p, uap, retval) struct iovec ktriov; #endif - if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL || + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || (fp->f_flag & FWRITE) == 0) return (EBADF); - aiov.iov_base = (caddr_t)SCARG(uap, buf); - aiov.iov_len = SCARG(uap, nbyte); + aiov.iov_base = (caddr_t)uap->buf; + aiov.iov_len = uap->nbyte; auio.uio_iov = &aiov; auio.uio_iovcnt = 1; - auio.uio_resid = SCARG(uap, nbyte); + auio.uio_resid = uap->nbyte; 
auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_procp = p; @@ -240,8 +261,8 @@ write(p, uap, retval) if (KTRPOINT(p, KTR_GENIO)) ktriov = aiov; #endif - cnt = SCARG(uap, nbyte); - if (error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred)) { + cnt = uap->nbyte; + if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; @@ -251,7 +272,7 @@ write(p, uap, retval) cnt -= auio.uio_resid; #ifdef KTRACE if (KTRPOINT(p, KTR_GENIO) && error == 0) - ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE, + ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, &ktriov, cnt, error); #endif *retval = cnt; @@ -261,15 +282,18 @@ write(p, uap, retval) /* * Gather write system call */ +#ifndef _SYS_SYSPROTO_H_ +struct writev_args { + int fd; + struct iovec *iovp; + u_int iovcnt; +}; +#endif int writev(p, uap, retval) struct proc *p; - register struct writev_args /* { - syscallarg(int) fd; - syscallarg(struct iovec *) iovp; - syscallarg(u_int) iovcnt; - } */ *uap; - register_t *retval; + register struct writev_args *uap; + int *retval; { register struct file *fp; register struct filedesc *fdp = p->p_fd; @@ -283,14 +307,14 @@ writev(p, uap, retval) struct iovec *ktriov = NULL; #endif - if (((u_int)SCARG(uap, fd)) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL || + if (((u_int)uap->fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL || (fp->f_flag & FWRITE) == 0) return (EBADF); /* note: can't use iovlen until iovcnt is validated */ - iovlen = SCARG(uap, iovcnt) * sizeof (struct iovec); - if (SCARG(uap, iovcnt) > UIO_SMALLIOV) { - if (SCARG(uap, iovcnt) > UIO_MAXIOV) + iovlen = uap->iovcnt * sizeof (struct iovec); + if (uap->iovcnt > UIO_SMALLIOV) { + if (uap->iovcnt > UIO_MAXIOV) return (EINVAL); MALLOC(iov, struct iovec *, iovlen, M_IOV, M_WAITOK); needfree = iov; @@ -299,19 +323,19 @@ writev(p, uap, retval) needfree = NULL; } auio.uio_iov = 
iov; - auio.uio_iovcnt = SCARG(uap, iovcnt); + auio.uio_iovcnt = uap->iovcnt; auio.uio_rw = UIO_WRITE; auio.uio_segflg = UIO_USERSPACE; auio.uio_procp = p; - if (error = copyin((caddr_t)SCARG(uap, iovp), (caddr_t)iov, iovlen)) + if ((error = copyin((caddr_t)uap->iovp, (caddr_t)iov, iovlen))) goto done; auio.uio_resid = 0; - for (i = 0; i < SCARG(uap, iovcnt); i++) { - if (auio.uio_resid + iov->iov_len < auio.uio_resid) { + for (i = 0; i < uap->iovcnt; i++) { + auio.uio_resid += iov->iov_len; + if (auio.uio_resid < 0) { error = EINVAL; goto done; } - auio.uio_resid += iov->iov_len; iov++; } #ifdef KTRACE @@ -324,7 +348,7 @@ writev(p, uap, retval) } #endif cnt = auio.uio_resid; - if (error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred)) { + if ((error = (*fp->f_ops->fo_write)(fp, &auio, fp->f_cred))) { if (auio.uio_resid != cnt && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; @@ -335,7 +359,7 @@ writev(p, uap, retval) #ifdef KTRACE if (ktriov != NULL) { if (error == 0) - ktrgenio(p->p_tracep, SCARG(uap, fd), UIO_WRITE, + ktrgenio(p->p_tracep, uap->fd, UIO_WRITE, ktriov, cnt, error); FREE(ktriov, M_TEMP); } @@ -350,21 +374,23 @@ done: /* * Ioctl system call */ +#ifndef _SYS_SYSPROTO_H_ +struct ioctl_args { + int fd; + int com; + caddr_t data; +}; +#endif /* ARGSUSED */ int ioctl(p, uap, retval) struct proc *p; - register struct ioctl_args /* { - syscallarg(int) fd; - syscallarg(u_long) com; - syscallarg(caddr_t) data; - } */ *uap; - register_t *retval; + register struct ioctl_args *uap; + int *retval; { register struct file *fp; register struct filedesc *fdp; - register u_long com; - register int error; + register int com, error; register u_int size; caddr_t data, memp; int tmp; @@ -372,19 +398,19 @@ ioctl(p, uap, retval) char stkbuf[STK_PARAMS]; fdp = p->p_fd; - if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || - (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + if ((u_int)uap->fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[uap->fd]) == NULL) 
return (EBADF); if ((fp->f_flag & (FREAD | FWRITE)) == 0) return (EBADF); - switch (com = SCARG(uap, com)) { + switch (com = uap->com) { case FIONCLEX: - fdp->fd_ofileflags[SCARG(uap, fd)] &= ~UF_EXCLOSE; + fdp->fd_ofileflags[uap->fd] &= ~UF_EXCLOSE; return (0); case FIOCLEX: - fdp->fd_ofileflags[SCARG(uap, fd)] |= UF_EXCLOSE; + fdp->fd_ofileflags[uap->fd] |= UF_EXCLOSE; return (0); } @@ -403,14 +429,14 @@ ioctl(p, uap, retval) data = stkbuf; if (com&IOC_IN) { if (size) { - error = copyin(SCARG(uap, data), data, (u_int)size); + error = copyin(uap->data, data, (u_int)size); if (error) { if (memp) free(memp, M_IOCTLOPS); return (error); } } else - *(caddr_t *)data = SCARG(uap, data); + *(caddr_t *)data = uap->data; } else if ((com&IOC_OUT) && size) /* * Zero the buffer so the user always @@ -418,12 +444,12 @@ ioctl(p, uap, retval) */ bzero(data, size); else if (com&IOC_VOID) - *(caddr_t *)data = SCARG(uap, data); + *(caddr_t *)data = uap->data; switch (com) { case FIONBIO: - if (tmp = *(int *)data) + if ((tmp = *(int *)data)) fp->f_flag |= FNONBLOCK; else fp->f_flag &= ~FNONBLOCK; @@ -431,7 +457,7 @@ ioctl(p, uap, retval) break; case FIOASYNC: - if (tmp = *(int *)data) + if ((tmp = *(int *)data)) fp->f_flag |= FASYNC; else fp->f_flag &= ~FASYNC; @@ -456,7 +482,7 @@ ioctl(p, uap, retval) tmp = p1->p_pgrp->pg_id; } error = (*fp->f_ops->fo_ioctl) - (fp, TIOCSPGRP, (caddr_t)&tmp, p); + (fp, (int)TIOCSPGRP, (caddr_t)&tmp, p); break; case FIOGETOWN: @@ -465,7 +491,7 @@ ioctl(p, uap, retval) *(int *)data = ((struct socket *)fp->f_data)->so_pgid; break; } - error = (*fp->f_ops->fo_ioctl)(fp, TIOCGPGRP, data, p); + error = (*fp->f_ops->fo_ioctl)(fp, (int)TIOCGPGRP, data, p); *(int *)data = -*(int *)data; break; @@ -476,7 +502,7 @@ ioctl(p, uap, retval) * already set and checked above. 
*/ if (error == 0 && (com&IOC_OUT) && size) - error = copyout(data, SCARG(uap, data), (u_int)size); + error = copyout(data, uap->data, (u_int)size); break; } if (memp) @@ -484,49 +510,88 @@ ioctl(p, uap, retval) return (error); } -int selwait, nselcoll; +static int nselcoll; +int selwait; /* * Select system call. */ +#ifndef _SYS_SYSPROTO_H_ +struct select_args { + int nd; + fd_set *in, *ou, *ex; + struct timeval *tv; +}; +#endif int select(p, uap, retval) register struct proc *p; - register struct select_args /* { - syscallarg(u_int) nd; - syscallarg(fd_set *) in; - syscallarg(fd_set *) ou; - syscallarg(fd_set *) ex; - syscallarg(struct timeval *) tv; - } */ *uap; - register_t *retval; + register struct select_args *uap; + int *retval; { - fd_set ibits[3], obits[3]; + /* + * The magic 2048 here is chosen to be just enough for FD_SETSIZE + * infds with the new FD_SETSIZE of 1024, and more than enough for + * FD_SETSIZE infds, outfds and exceptfds with the old FD_SETSIZE + * of 256. + */ + fd_mask s_selbits[howmany(2048, NFDBITS)]; + fd_mask *ibits[3], *obits[3], *selbits, *sbp; struct timeval atv; - int s, ncoll, error, timo = 0; - u_int ni; + int s, ncoll, error, timo; + u_int nbufbytes, ncpbytes, nfdbits; - bzero((caddr_t)ibits, sizeof(ibits)); - bzero((caddr_t)obits, sizeof(obits)); - if (SCARG(uap, nd) > FD_SETSIZE) + if (uap->nd < 0) return (EINVAL); - if (SCARG(uap, nd) > p->p_fd->fd_nfiles) { - /* forgiving; slightly wrong */ - SCARG(uap, nd) = p->p_fd->fd_nfiles; - } - ni = howmany(SCARG(uap, nd), NFDBITS) * sizeof(fd_mask); + if (uap->nd > p->p_fd->fd_nfiles) + uap->nd = p->p_fd->fd_nfiles; /* forgiving; slightly wrong */ + + /* + * Allocate just enough bits for the non-null fd_sets. Use the + * preallocated auto buffer if possible. 
+ */ + nfdbits = roundup(uap->nd, NFDBITS); + ncpbytes = nfdbits / NBBY; + nbufbytes = 0; + if (uap->in != NULL) + nbufbytes += 2 * ncpbytes; + if (uap->ou != NULL) + nbufbytes += 2 * ncpbytes; + if (uap->ex != NULL) + nbufbytes += 2 * ncpbytes; + if (nbufbytes <= sizeof s_selbits) + selbits = &s_selbits[0]; + else + selbits = malloc(nbufbytes, M_SELECT, M_WAITOK); + /* + * Assign pointers into the bit buffers and fetch the input bits. + * Put the output buffers together so that they can be bzeroed + * together. + */ + sbp = selbits; #define getbits(name, x) \ - if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, name), \ - (caddr_t)&ibits[x], ni))) \ - goto done; + do { \ + if (uap->name == NULL) \ + ibits[x] = NULL; \ + else { \ + ibits[x] = sbp + nbufbytes / 2 / sizeof *sbp; \ + obits[x] = sbp; \ + sbp += ncpbytes / sizeof *sbp; \ + error = copyin(uap->name, ibits[x], ncpbytes); \ + if (error != 0) \ + goto done; \ + } \ + } while (0) getbits(in, 0); getbits(ou, 1); getbits(ex, 2); #undef getbits + if (nbufbytes != 0) + bzero(selbits, nbufbytes / 2); - if (SCARG(uap, tv)) { - error = copyin((caddr_t)SCARG(uap, tv), (caddr_t)&atv, + if (uap->tv) { + error = copyin((caddr_t)uap->tv, (caddr_t)&atv, sizeof (atv)); if (error) goto done; @@ -535,31 +600,28 @@ select(p, uap, retval) goto done; } s = splclock(); - timevaladd(&atv, (struct timeval *)&time); + timevaladd(&atv, &time); + timo = hzto(&atv); + /* + * Avoid inadvertently sleeping forever. + */ + if (timo == 0) + timo = 1; splx(s); - } + } else + timo = 0; retry: ncoll = nselcoll; p->p_flag |= P_SELECT; - error = selscan(p, ibits, obits, SCARG(uap, nd), retval); + error = selscan(p, ibits, obits, uap->nd, retval); if (error || *retval) goto done; s = splhigh(); - if (SCARG(uap, tv)) { - if (timercmp(&time, &atv, >=)) { - splx(s); - goto done; - } - /* - * If poll wait was tiny, this could be zero; we will - * have to round it up to avoid sleeping forever. 
If - * we retry below, the timercmp above will get us out. - * Note that if wait was 0, the timercmp will prevent - * us from getting here the first time. - */ - timo = hzto(&atv); - if (timo == 0) - timo = 1; + /* this should be timercmp(&time, &atv, >=) */ + if (uap->tv && (time.tv_sec > atv.tv_sec || + (time.tv_sec == atv.tv_sec && time.tv_usec >= atv.tv_usec))) { + splx(s); + goto done; } if ((p->p_flag & P_SELECT) == 0 || nselcoll != ncoll) { splx(s); @@ -578,8 +640,7 @@ done: if (error == EWOULDBLOCK) error = 0; #define putbits(name, x) \ - if (SCARG(uap, name) && (error2 = copyout((caddr_t)&obits[x], \ - (caddr_t)SCARG(uap, name), ni))) \ + if (uap->name && (error2 = copyout(obits[x], uap->name, ncpbytes))) \ error = error2; if (error == 0) { int error2; @@ -589,15 +650,16 @@ done: putbits(ex, 2); #undef putbits } + if (selbits != &s_selbits[0]) + free(selbits, M_SELECT); return (error); } -int +static int selscan(p, ibits, obits, nfd, retval) struct proc *p; - fd_set *ibits, *obits; - int nfd; - register_t *retval; + fd_mask **ibits, **obits; + int nfd, *retval; { register struct filedesc *fdp = p->p_fd; register int msk, i, j, fd; @@ -607,15 +669,18 @@ selscan(p, ibits, obits, nfd, retval) static int flag[3] = { FREAD, FWRITE, 0 }; for (msk = 0; msk < 3; msk++) { + if (ibits[msk] == NULL) + continue; for (i = 0; i < nfd; i += NFDBITS) { - bits = ibits[msk].fds_bits[i/NFDBITS]; + bits = ibits[msk][i/NFDBITS]; while ((j = ffs(bits)) && (fd = i + --j) < nfd) { bits &= ~(1 << j); fp = fdp->fd_ofiles[fd]; if (fp == NULL) return (EBADF); if ((*fp->f_ops->fo_select)(fp, flag[msk], p)) { - FD_SET(fd, &obits[msk]); + obits[msk][(fd)/NFDBITS] |= + (1 << ((fd) % NFDBITS)); n++; } } diff --git a/sys/kern/sys_pipe.c b/sys/kern/sys_pipe.c new file mode 100644 index 0000000..5beac60 --- /dev/null +++ b/sys/kern/sys_pipe.c @@ -0,0 +1,1107 @@ +/* + * Copyright (c) 1996 John S. Dyson + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. Modifications may be freely made to this file if the above conditions + * are met. + * + * $Id: sys_pipe.c,v 1.26 1997/03/23 03:36:24 bde Exp $ + */ + +#ifndef OLD_PIPE + +/* + * This file contains a high-performance replacement for the socket-based + * pipes scheme originally used in FreeBSD/4.4Lite. It does not support + * all features of sockets, but does do everything that pipes normally + * do. + */ + +/* + * This code has two modes of operation, a small write mode and a large + * write mode. The small write mode acts like conventional pipes with + * a kernel buffer. If the buffer is less than PIPE_MINDIRECT, then the + * "normal" pipe buffering is done. If the buffer is between PIPE_MINDIRECT + * and PIPE_SIZE in size, it is fully mapped and wired into the kernel, and + * the receiving process can copy it directly from the pages in the sending + * process. + * + * If the sending process receives a signal, it is possible that it will + * go away, and certainly its address space can change, because control + * is returned back to the user-mode side. In that case, the pipe code + * arranges to copy the buffer supplied by the user process, to a pageable + * kernel buffer, and the receiving process will grab the data from the + * pageable kernel buffer. 
Since signals don't happen all that often, + * the copy operation is normally eliminated. + * + * The constant PIPE_MINDIRECT is chosen to make sure that buffering will + * happen for small transfers so that the system will not spend all of + * its time context switching. PIPE_SIZE is constrained by the + * amount of kernel virtual memory. + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/proc.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/protosw.h> +#include <sys/stat.h> +#include <sys/filedesc.h> +#include <sys/malloc.h> +#include <sys/filio.h> +#include <sys/ttycom.h> +#include <sys/stat.h> +#include <sys/select.h> +#include <sys/signalvar.h> +#include <sys/errno.h> +#include <sys/queue.h> +#include <sys/vmmeter.h> +#include <sys/kernel.h> +#include <sys/sysproto.h> +#include <sys/pipe.h> + +#include <vm/vm.h> +#include <vm/vm_prot.h> +#include <vm/vm_param.h> +#include <sys/lock.h> +#include <vm/vm_object.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_page.h> + +/* + * Use this define if you want to disable *fancy* VM things. Expect an + * approx 30% decrease in transfer rate. This could be useful for + * NetBSD or OpenBSD. + */ +/* #define PIPE_NODIRECT */ + +/* + * interfaces to the outside world + */ +static int pipe_read __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int pipe_write __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int pipe_close __P((struct file *fp, struct proc *p)); +static int pipe_select __P((struct file *fp, int which, struct proc *p)); +static int pipe_ioctl __P((struct file *fp, int cmd, caddr_t data, struct proc *p)); + +static struct fileops pipeops = + { pipe_read, pipe_write, pipe_ioctl, pipe_select, pipe_close }; + +/* + * Default pipe buffer size(s), this can be kind-of large now because pipe + * space is pageable. 
The pipe code will try to maintain locality of + * reference for performance reasons, so small amounts of outstanding I/O + * will not wipe the cache. + */ +#define MINPIPESIZE (PIPE_SIZE/3) +#define MAXPIPESIZE (2*PIPE_SIZE/3) + +/* + * Maximum amount of kva for pipes -- this is kind-of a soft limit, but + * is there so that on large systems, we don't exhaust it. + */ +#define MAXPIPEKVA (8*1024*1024) + +/* + * Limit for direct transfers, we cannot, of course limit + * the amount of kva for pipes in general though. + */ +#define LIMITPIPEKVA (16*1024*1024) + +/* + * Limit the number of "big" pipes + */ +#define LIMITBIGPIPES 32 +int nbigpipe; + +static int amountpipekva; + +static void pipeclose __P((struct pipe *cpipe)); +static void pipeinit __P((struct pipe *cpipe)); +static __inline int pipelock __P((struct pipe *cpipe, int catch)); +static __inline void pipeunlock __P((struct pipe *cpipe)); +static __inline void pipeselwakeup __P((struct pipe *cpipe)); +#ifndef PIPE_NODIRECT +static int pipe_build_write_buffer __P((struct pipe *wpipe, struct uio *uio)); +static void pipe_destroy_write_buffer __P((struct pipe *wpipe)); +static int pipe_direct_write __P((struct pipe *wpipe, struct uio *uio)); +static void pipe_clone_write_buffer __P((struct pipe *wpipe)); +#endif +static void pipespace __P((struct pipe *cpipe)); + +/* + * The pipe system call for the DTYPE_PIPE type of pipes + */ + +/* ARGSUSED */ +int +pipe(p, uap, retval) + struct proc *p; + struct pipe_args /* { + int dummy; + } */ *uap; + int retval[]; +{ + register struct filedesc *fdp = p->p_fd; + struct file *rf, *wf; + struct pipe *rpipe, *wpipe; + int fd, error; + + rpipe = malloc( sizeof (*rpipe), M_TEMP, M_WAITOK); + pipeinit(rpipe); + rpipe->pipe_state |= PIPE_DIRECTOK; + wpipe = malloc( sizeof (*wpipe), M_TEMP, M_WAITOK); + pipeinit(wpipe); + wpipe->pipe_state |= PIPE_DIRECTOK; + + error = falloc(p, &rf, &fd); + if (error) + goto free2; + retval[0] = fd; + rf->f_flag = FREAD | FWRITE; + rf->f_type 
= DTYPE_PIPE; + rf->f_ops = &pipeops; + rf->f_data = (caddr_t)rpipe; + error = falloc(p, &wf, &fd); + if (error) + goto free3; + wf->f_flag = FREAD | FWRITE; + wf->f_type = DTYPE_PIPE; + wf->f_ops = &pipeops; + wf->f_data = (caddr_t)wpipe; + retval[1] = fd; + + rpipe->pipe_peer = wpipe; + wpipe->pipe_peer = rpipe; + + return (0); +free3: + ffree(rf); + fdp->fd_ofiles[retval[0]] = 0; +free2: + (void)pipeclose(wpipe); + (void)pipeclose(rpipe); + return (error); +} + +/* + * Allocate kva for pipe circular buffer, the space is pageable + */ +static void +pipespace(cpipe) + struct pipe *cpipe; +{ + int npages, error; + + npages = round_page(cpipe->pipe_buffer.size)/PAGE_SIZE; + /* + * Create an object, I don't like the idea of paging to/from + * kernel_object. + * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. + */ + cpipe->pipe_buffer.object = vm_object_allocate(OBJT_DEFAULT, npages); + cpipe->pipe_buffer.buffer = (caddr_t) vm_map_min(kernel_map); + + /* + * Insert the object into the kernel map, and allocate kva for it. + * The map entry is, by default, pageable. + * XXX -- minor change needed here for NetBSD/OpenBSD VM systems. 
+ */ + error = vm_map_find(kernel_map, cpipe->pipe_buffer.object, 0, + (vm_offset_t *) &cpipe->pipe_buffer.buffer, + cpipe->pipe_buffer.size, 1, + VM_PROT_ALL, VM_PROT_ALL, 0); + + if (error != KERN_SUCCESS) + panic("pipeinit: cannot allocate pipe -- out of kvm -- code = %d", error); + amountpipekva += cpipe->pipe_buffer.size; +} + +/* + * initialize and allocate VM and memory for pipe + */ +static void +pipeinit(cpipe) + struct pipe *cpipe; +{ + int s; + + cpipe->pipe_buffer.in = 0; + cpipe->pipe_buffer.out = 0; + cpipe->pipe_buffer.cnt = 0; + cpipe->pipe_buffer.size = PIPE_SIZE; + + /* Buffer kva gets dynamically allocated */ + cpipe->pipe_buffer.buffer = NULL; + /* cpipe->pipe_buffer.object = invalid */ + + cpipe->pipe_state = 0; + cpipe->pipe_peer = NULL; + cpipe->pipe_busy = 0; + gettime(&cpipe->pipe_ctime); + cpipe->pipe_atime = cpipe->pipe_ctime; + cpipe->pipe_mtime = cpipe->pipe_ctime; + bzero(&cpipe->pipe_sel, sizeof cpipe->pipe_sel); + cpipe->pipe_pgid = NO_PID; + +#ifndef PIPE_NODIRECT + /* + * pipe data structure initializations to support direct pipe I/O + */ + cpipe->pipe_map.cnt = 0; + cpipe->pipe_map.kva = 0; + cpipe->pipe_map.pos = 0; + cpipe->pipe_map.npages = 0; + /* cpipe->pipe_map.ms[] = invalid */ +#endif +} + + +/* + * lock a pipe for I/O, blocking other access + */ +static __inline int +pipelock(cpipe, catch) + struct pipe *cpipe; + int catch; +{ + int error; + while (cpipe->pipe_state & PIPE_LOCK) { + cpipe->pipe_state |= PIPE_LWANT; + if (error = tsleep( cpipe, + catch?(PRIBIO|PCATCH):PRIBIO, "pipelk", 0)) { + return error; + } + } + cpipe->pipe_state |= PIPE_LOCK; + return 0; +} + +/* + * unlock a pipe I/O lock + */ +static __inline void +pipeunlock(cpipe) + struct pipe *cpipe; +{ + cpipe->pipe_state &= ~PIPE_LOCK; + if (cpipe->pipe_state & PIPE_LWANT) { + cpipe->pipe_state &= ~PIPE_LWANT; + wakeup(cpipe); + } +} + +static __inline void +pipeselwakeup(cpipe) + struct pipe *cpipe; +{ + struct proc *p; + + if (cpipe->pipe_state & PIPE_SEL) 
{ + cpipe->pipe_state &= ~PIPE_SEL; + selwakeup(&cpipe->pipe_sel); + } + if (cpipe->pipe_state & PIPE_ASYNC) { + if (cpipe->pipe_pgid < 0) + gsignal(-cpipe->pipe_pgid, SIGIO); + else if ((p = pfind(cpipe->pipe_pgid)) != NULL) + psignal(p, SIGIO); + } +} + +/* ARGSUSED */ +static int +pipe_read(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + + struct pipe *rpipe = (struct pipe *) fp->f_data; + int error = 0; + int nread = 0; + u_int size; + + ++rpipe->pipe_busy; + while (uio->uio_resid) { + /* + * normal pipe buffer receive + */ + if (rpipe->pipe_buffer.cnt > 0) { + size = rpipe->pipe_buffer.size - rpipe->pipe_buffer.out; + if (size > rpipe->pipe_buffer.cnt) + size = rpipe->pipe_buffer.cnt; + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + if ((error = pipelock(rpipe,1)) == 0) { + error = uiomove( &rpipe->pipe_buffer.buffer[rpipe->pipe_buffer.out], + size, uio); + pipeunlock(rpipe); + } + if (error) { + break; + } + rpipe->pipe_buffer.out += size; + if (rpipe->pipe_buffer.out >= rpipe->pipe_buffer.size) + rpipe->pipe_buffer.out = 0; + + rpipe->pipe_buffer.cnt -= size; + nread += size; +#ifndef PIPE_NODIRECT + /* + * Direct copy, bypassing a kernel buffer. + */ + } else if ((size = rpipe->pipe_map.cnt) && + (rpipe->pipe_state & PIPE_DIRECTW)) { + caddr_t va; + if (size > (u_int) uio->uio_resid) + size = (u_int) uio->uio_resid; + if ((error = pipelock(rpipe,1)) == 0) { + va = (caddr_t) rpipe->pipe_map.kva + rpipe->pipe_map.pos; + error = uiomove(va, size, uio); + pipeunlock(rpipe); + } + if (error) + break; + nread += size; + rpipe->pipe_map.pos += size; + rpipe->pipe_map.cnt -= size; + if (rpipe->pipe_map.cnt == 0) { + rpipe->pipe_state &= ~PIPE_DIRECTW; + wakeup(rpipe); + } +#endif + } else { + /* + * detect EOF condition + */ + if (rpipe->pipe_state & PIPE_EOF) { + /* XXX error = ? */ + break; + } + /* + * If the "write-side" has been blocked, wake it up now. 
+ */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + if (nread > 0) + break; + + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + break; + } + + /* + * If there is no more to read in the pipe, reset + * its pointers to the beginning. This improves + * cache hit stats. + */ + + if ((error = pipelock(rpipe,1)) == 0) { + if (rpipe->pipe_buffer.cnt == 0) { + rpipe->pipe_buffer.in = 0; + rpipe->pipe_buffer.out = 0; + } + pipeunlock(rpipe); + } else { + break; + } + + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + + rpipe->pipe_state |= PIPE_WANTR; + if (error = tsleep(rpipe, PRIBIO|PCATCH, "piperd", 0)) { + break; + } + } + } + + if (error == 0) + gettime(&rpipe->pipe_atime); + + --rpipe->pipe_busy; + if ((rpipe->pipe_busy == 0) && (rpipe->pipe_state & PIPE_WANT)) { + rpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTW); + wakeup(rpipe); + } else if (rpipe->pipe_buffer.cnt < MINPIPESIZE) { + /* + * If there is no more to read in the pipe, reset + * its pointers to the beginning. This improves + * cache hit stats. + */ + if (rpipe->pipe_buffer.cnt == 0) { + if ((error == 0) && (error = pipelock(rpipe,1)) == 0) { + rpipe->pipe_buffer.in = 0; + rpipe->pipe_buffer.out = 0; + pipeunlock(rpipe); + } + } + + /* + * If the "write-side" has been blocked, wake it up now. + */ + if (rpipe->pipe_state & PIPE_WANTW) { + rpipe->pipe_state &= ~PIPE_WANTW; + wakeup(rpipe); + } + } + + if ((rpipe->pipe_buffer.size - rpipe->pipe_buffer.cnt) >= PIPE_BUF) + pipeselwakeup(rpipe); + + return error; +} + +#ifndef PIPE_NODIRECT +/* + * Map the sending processes' buffer into kernel space and wire it. + * This is similar to a physical write operation. 
+ */ +static int +pipe_build_write_buffer(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + u_int size; + int i; + vm_offset_t addr, endaddr, paddr; + + size = (u_int) uio->uio_iov->iov_len; + if (size > wpipe->pipe_buffer.size) + size = wpipe->pipe_buffer.size; + + endaddr = round_page(uio->uio_iov->iov_base + size); + for(i = 0, addr = trunc_page(uio->uio_iov->iov_base); + addr < endaddr; + addr += PAGE_SIZE, i+=1) { + + vm_page_t m; + + vm_fault_quick( (caddr_t) addr, VM_PROT_READ); + paddr = pmap_kextract(addr); + if (!paddr) { + int j; + for(j=0;j<i;j++) + vm_page_unwire(wpipe->pipe_map.ms[j]); + return EFAULT; + } + + m = PHYS_TO_VM_PAGE(paddr); + vm_page_wire(m); + wpipe->pipe_map.ms[i] = m; + } + +/* + * set up the control block + */ + wpipe->pipe_map.npages = i; + wpipe->pipe_map.pos = ((vm_offset_t) uio->uio_iov->iov_base) & PAGE_MASK; + wpipe->pipe_map.cnt = size; + +/* + * and map the buffer + */ + if (wpipe->pipe_map.kva == 0) { + /* + * We need to allocate space for an extra page because the + * address range might (will) span pages at times. 
+ */ + wpipe->pipe_map.kva = kmem_alloc_pageable(kernel_map, + wpipe->pipe_buffer.size + PAGE_SIZE); + amountpipekva += wpipe->pipe_buffer.size + PAGE_SIZE; + } + pmap_qenter(wpipe->pipe_map.kva, wpipe->pipe_map.ms, + wpipe->pipe_map.npages); + +/* + * and update the uio data + */ + + uio->uio_iov->iov_len -= size; + uio->uio_iov->iov_base += size; + if (uio->uio_iov->iov_len == 0) + uio->uio_iov++; + uio->uio_resid -= size; + uio->uio_offset += size; + return 0; +} + +/* + * unmap and unwire the process buffer + */ +static void +pipe_destroy_write_buffer(wpipe) +struct pipe *wpipe; +{ + int i; + if (wpipe->pipe_map.kva) { + pmap_qremove(wpipe->pipe_map.kva, wpipe->pipe_map.npages); + + if (amountpipekva > MAXPIPEKVA) { + vm_offset_t kva = wpipe->pipe_map.kva; + wpipe->pipe_map.kva = 0; + kmem_free(kernel_map, kva, + wpipe->pipe_buffer.size + PAGE_SIZE); + amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; + } + } + for (i=0;i<wpipe->pipe_map.npages;i++) + vm_page_unwire(wpipe->pipe_map.ms[i]); +} + +/* + * In the case of a signal, the writing process might go away. This + * code copies the data into the circular buffer so that the source + * pages can be freed without loss of data. + */ +static void +pipe_clone_write_buffer(wpipe) +struct pipe *wpipe; +{ + int size; + int pos; + + size = wpipe->pipe_map.cnt; + pos = wpipe->pipe_map.pos; + bcopy((caddr_t) wpipe->pipe_map.kva+pos, + (caddr_t) wpipe->pipe_buffer.buffer, + size); + + wpipe->pipe_buffer.in = size; + wpipe->pipe_buffer.out = 0; + wpipe->pipe_buffer.cnt = size; + wpipe->pipe_state &= ~PIPE_DIRECTW; + + pipe_destroy_write_buffer(wpipe); +} + +/* + * This implements the pipe buffer write mechanism. Note that only + * a direct write OR a normal pipe write can be pending at any given time. + * If there are any characters in the pipe buffer, the direct write will + * be deferred until the receiving process grabs all of the bytes from + * the pipe buffer. Then the direct mapping write is set-up. 
+ */ +static int +pipe_direct_write(wpipe, uio) + struct pipe *wpipe; + struct uio *uio; +{ + int error; +retry: + while (wpipe->pipe_state & PIPE_DIRECTW) { + if ( wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + wpipe->pipe_state |= PIPE_WANTW; + error = tsleep(wpipe, + PRIBIO|PCATCH, "pipdww", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + } + wpipe->pipe_map.cnt = 0; /* transfer not ready yet */ + if (wpipe->pipe_buffer.cnt > 0) { + if ( wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + wpipe->pipe_state |= PIPE_WANTW; + error = tsleep(wpipe, + PRIBIO|PCATCH, "pipdwc", 0); + if (error) + goto error1; + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + goto error1; + } + goto retry; + } + + wpipe->pipe_state |= PIPE_DIRECTW; + + error = pipe_build_write_buffer(wpipe, uio); + if (error) { + wpipe->pipe_state &= ~PIPE_DIRECTW; + goto error1; + } + + error = 0; + while (!error && (wpipe->pipe_state & PIPE_DIRECTW)) { + if (wpipe->pipe_state & PIPE_EOF) { + pipelock(wpipe, 0); + pipe_destroy_write_buffer(wpipe); + pipeunlock(wpipe); + pipeselwakeup(wpipe); + error = EPIPE; + goto error1; + } + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + pipeselwakeup(wpipe); + error = tsleep(wpipe, PRIBIO|PCATCH, "pipdwt", 0); + } + + pipelock(wpipe,0); + if (wpipe->pipe_state & PIPE_DIRECTW) { + /* + * this bit of trickery substitutes a kernel buffer for + * the process that might be going away. 
+ */ + pipe_clone_write_buffer(wpipe); + } else { + pipe_destroy_write_buffer(wpipe); + } + pipeunlock(wpipe); + return error; + +error1: + wakeup(wpipe); + return error; +} +#endif + +static int +pipe_write(fp, uio, cred) + struct file *fp; + struct uio *uio; + struct ucred *cred; +{ + int error = 0; + int orig_resid; + + struct pipe *wpipe, *rpipe; + + rpipe = (struct pipe *) fp->f_data; + wpipe = rpipe->pipe_peer; + + /* + * detect loss of pipe read side, issue SIGPIPE if lost. + */ + if ((wpipe == NULL) || (wpipe->pipe_state & PIPE_EOF)) { + return EPIPE; + } + + /* + * If it is advantageous to resize the pipe buffer, do + * so. + */ + if ((uio->uio_resid > PIPE_SIZE) && + (nbigpipe < LIMITBIGPIPES) && + (wpipe->pipe_state & PIPE_DIRECTW) == 0 && + (wpipe->pipe_buffer.size <= PIPE_SIZE) && + (wpipe->pipe_buffer.cnt == 0)) { + + if (wpipe->pipe_buffer.buffer) { + amountpipekva -= wpipe->pipe_buffer.size; + kmem_free(kernel_map, + (vm_offset_t)wpipe->pipe_buffer.buffer, + wpipe->pipe_buffer.size); + } + +#ifndef PIPE_NODIRECT + if (wpipe->pipe_map.kva) { + amountpipekva -= wpipe->pipe_buffer.size + PAGE_SIZE; + kmem_free(kernel_map, + wpipe->pipe_map.kva, + wpipe->pipe_buffer.size + PAGE_SIZE); + } +#endif + + wpipe->pipe_buffer.in = 0; + wpipe->pipe_buffer.out = 0; + wpipe->pipe_buffer.cnt = 0; + wpipe->pipe_buffer.size = BIG_PIPE_SIZE; + wpipe->pipe_buffer.buffer = NULL; + ++nbigpipe; + +#ifndef PIPE_NODIRECT + wpipe->pipe_map.cnt = 0; + wpipe->pipe_map.kva = 0; + wpipe->pipe_map.pos = 0; + wpipe->pipe_map.npages = 0; +#endif + + } + + + if( wpipe->pipe_buffer.buffer == NULL) { + if ((error = pipelock(wpipe,1)) == 0) { + pipespace(wpipe); + pipeunlock(wpipe); + } else { + return error; + } + } + + ++wpipe->pipe_busy; + orig_resid = uio->uio_resid; + while (uio->uio_resid) { + int space; +#ifndef PIPE_NODIRECT + /* + * If the transfer is large, we can gain performance if + * we do process-to-process copies directly. 
+ * If the write is non-blocking, we don't use the + * direct write mechanism. + */ + if ((uio->uio_iov->iov_len >= PIPE_MINDIRECT) && + (fp->f_flag & FNONBLOCK) == 0 && + (wpipe->pipe_map.kva || (amountpipekva < LIMITPIPEKVA)) && + (uio->uio_iov->iov_len >= PIPE_MINDIRECT)) { + error = pipe_direct_write( wpipe, uio); + if (error) { + break; + } + continue; + } +#endif + + /* + * Pipe buffered writes cannot be coincidental with + * direct writes. We wait until the currently executing + * direct write is completed before we start filling the + * pipe buffer. + */ + retrywrite: + while (wpipe->pipe_state & PIPE_DIRECTW) { + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + error = tsleep(wpipe, + PRIBIO|PCATCH, "pipbww", 0); + if (error) + break; + } + + space = wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt; + + /* Writes of size <= PIPE_BUF must be atomic. */ + /* XXX perhaps they need to be contiguous to be atomic? */ + if ((space < uio->uio_resid) && (orig_resid <= PIPE_BUF)) + space = 0; + + if (space > 0 && (wpipe->pipe_buffer.cnt < PIPE_SIZE)) { + /* + * This set the maximum transfer as a segment of + * the buffer. + */ + int size = wpipe->pipe_buffer.size - wpipe->pipe_buffer.in; + /* + * space is the size left in the buffer + */ + if (size > space) + size = space; + /* + * now limit it to the size of the uio transfer + */ + if (size > uio->uio_resid) + size = uio->uio_resid; + if ((error = pipelock(wpipe,1)) == 0) { + /* + * It is possible for a direct write to + * slip in on us... handle it here... 
+ */ + if (wpipe->pipe_state & PIPE_DIRECTW) { + pipeunlock(wpipe); + goto retrywrite; + } + error = uiomove( &wpipe->pipe_buffer.buffer[wpipe->pipe_buffer.in], + size, uio); + pipeunlock(wpipe); + } + if (error) + break; + + wpipe->pipe_buffer.in += size; + if (wpipe->pipe_buffer.in >= wpipe->pipe_buffer.size) + wpipe->pipe_buffer.in = 0; + + wpipe->pipe_buffer.cnt += size; + } else { + /* + * If the "read-side" has been blocked, wake it up now. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + + /* + * don't block on non-blocking I/O + */ + if (fp->f_flag & FNONBLOCK) { + error = EAGAIN; + break; + } + + /* + * We have no more space and have something to offer, + * wake up selects. + */ + pipeselwakeup(wpipe); + + wpipe->pipe_state |= PIPE_WANTW; + if (error = tsleep(wpipe, (PRIBIO+1)|PCATCH, "pipewr", 0)) { + break; + } + /* + * If read side wants to go away, we just issue a signal + * to ourselves. + */ + if (wpipe->pipe_state & PIPE_EOF) { + error = EPIPE; + break; + } + } + } + + --wpipe->pipe_busy; + if ((wpipe->pipe_busy == 0) && + (wpipe->pipe_state & PIPE_WANT)) { + wpipe->pipe_state &= ~(PIPE_WANT|PIPE_WANTR); + wakeup(wpipe); + } else if (wpipe->pipe_buffer.cnt > 0) { + /* + * If we have put any characters in the buffer, we wake up + * the reader. + */ + if (wpipe->pipe_state & PIPE_WANTR) { + wpipe->pipe_state &= ~PIPE_WANTR; + wakeup(wpipe); + } + } + + /* + * Don't return EPIPE if I/O was successful + */ + if ((wpipe->pipe_buffer.cnt == 0) && + (uio->uio_resid == 0) && + (error == EPIPE)) + error = 0; + + if (error == 0) + gettime(&wpipe->pipe_mtime); + + /* + * We have something to offer, + * wake up select. + */ + if (wpipe->pipe_buffer.cnt) + pipeselwakeup(wpipe); + + return error; +} + +/* + * we implement a very minimal set of ioctls for compatibility with sockets. 
+ */ +int +pipe_ioctl(fp, cmd, data, p) + struct file *fp; + int cmd; + register caddr_t data; + struct proc *p; +{ + register struct pipe *mpipe = (struct pipe *)fp->f_data; + + switch (cmd) { + + case FIONBIO: + return (0); + + case FIOASYNC: + if (*(int *)data) { + mpipe->pipe_state |= PIPE_ASYNC; + } else { + mpipe->pipe_state &= ~PIPE_ASYNC; + } + return (0); + + case FIONREAD: + if (mpipe->pipe_state & PIPE_DIRECTW) + *(int *)data = mpipe->pipe_map.cnt; + else + *(int *)data = mpipe->pipe_buffer.cnt; + return (0); + + case TIOCSPGRP: + mpipe->pipe_pgid = *(int *)data; + return (0); + + case TIOCGPGRP: + *(int *)data = mpipe->pipe_pgid; + return (0); + + } + return (ENOTTY); +} + +int +pipe_select(fp, which, p) + struct file *fp; + int which; + struct proc *p; +{ + register struct pipe *rpipe = (struct pipe *)fp->f_data; + struct pipe *wpipe; + + wpipe = rpipe->pipe_peer; + switch (which) { + + case FREAD: + if ( (rpipe->pipe_state & PIPE_DIRECTW) || + (rpipe->pipe_buffer.cnt > 0) || + (rpipe->pipe_state & PIPE_EOF)) { + return (1); + } + selrecord(p, &rpipe->pipe_sel); + rpipe->pipe_state |= PIPE_SEL; + break; + + case FWRITE: + if ((wpipe == NULL) || + (wpipe->pipe_state & PIPE_EOF) || + (((wpipe->pipe_state & PIPE_DIRECTW) == 0) && + (wpipe->pipe_buffer.size - wpipe->pipe_buffer.cnt) >= PIPE_BUF)) { + return (1); + } + selrecord(p, &wpipe->pipe_sel); + wpipe->pipe_state |= PIPE_SEL; + break; + + case 0: + if ((rpipe->pipe_state & PIPE_EOF) || + (wpipe == NULL) || + (wpipe->pipe_state & PIPE_EOF)) { + return (1); + } + + selrecord(p, &rpipe->pipe_sel); + rpipe->pipe_state |= PIPE_SEL; + break; + } + return (0); +} + +int +pipe_stat(pipe, ub) + register struct pipe *pipe; + register struct stat *ub; +{ + bzero((caddr_t)ub, sizeof (*ub)); + ub->st_mode = S_IFIFO; + ub->st_blksize = pipe->pipe_buffer.size; + ub->st_size = pipe->pipe_buffer.cnt; + ub->st_blocks = (ub->st_size + ub->st_blksize - 1) / ub->st_blksize; + TIMEVAL_TO_TIMESPEC(&pipe->pipe_atime, 
&ub->st_atimespec); + TIMEVAL_TO_TIMESPEC(&pipe->pipe_mtime, &ub->st_mtimespec); + TIMEVAL_TO_TIMESPEC(&pipe->pipe_ctime, &ub->st_ctimespec); + /* + * Left as 0: st_dev, st_ino, st_nlink, st_uid, st_gid, st_rdev, + * st_flags, st_gen. + * XXX (st_dev, st_ino) should be unique. + */ + return 0; +} + +/* ARGSUSED */ +static int +pipe_close(fp, p) + struct file *fp; + struct proc *p; +{ + struct pipe *cpipe = (struct pipe *)fp->f_data; + + pipeclose(cpipe); + fp->f_data = NULL; + return 0; +} + +/* + * shutdown the pipe + */ +static void +pipeclose(cpipe) + struct pipe *cpipe; +{ + struct pipe *ppipe; + if (cpipe) { + + pipeselwakeup(cpipe); + + /* + * If the other side is blocked, wake it up saying that + * we want to close it down. + */ + while (cpipe->pipe_busy) { + wakeup(cpipe); + cpipe->pipe_state |= PIPE_WANT|PIPE_EOF; + tsleep(cpipe, PRIBIO, "pipecl", 0); + } + + /* + * Disconnect from peer + */ + if (ppipe = cpipe->pipe_peer) { + pipeselwakeup(ppipe); + + ppipe->pipe_state |= PIPE_EOF; + wakeup(ppipe); + ppipe->pipe_peer = NULL; + } + + /* + * free resources + */ + if (cpipe->pipe_buffer.buffer) { + if (cpipe->pipe_buffer.size > PIPE_SIZE) + --nbigpipe; + amountpipekva -= cpipe->pipe_buffer.size; + kmem_free(kernel_map, + (vm_offset_t)cpipe->pipe_buffer.buffer, + cpipe->pipe_buffer.size); + } +#ifndef PIPE_NODIRECT + if (cpipe->pipe_map.kva) { + amountpipekva -= cpipe->pipe_buffer.size + PAGE_SIZE; + kmem_free(kernel_map, + cpipe->pipe_map.kva, + cpipe->pipe_buffer.size + PAGE_SIZE); + } +#endif + free(cpipe, M_TEMP); + } +} +#endif diff --git a/sys/kern/sys_process.c b/sys/kern/sys_process.c index 4cc40ba..7a538b6 100644 --- a/sys/kern/sys_process.c +++ b/sys/kern/sys_process.c @@ -1,11 +1,6 @@ -/*- - * Copyright (c) 1982, 1986, 1989, 1993 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. 
- * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. +/* + * Copyright (c) 1994, Sean Eric Fagan + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -17,16 +12,14 @@ * documentation and/or other materials provided with the distribution. * 3. All advertising materials mentioning features or use of this software * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. + * This product includes software developed by Sean Eric Fagan. + * 4. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -35,40 +28,481 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * from: @(#)sys_process.c 8.1 (Berkeley) 6/10/93 + * $Id$ */ #include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/proc.h> +#include <sys/vnode.h> +#include <sys/ptrace.h> #include <sys/errno.h> +#include <sys/queue.h> + +#include <machine/reg.h> +#include <machine/psl.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> + +#include <sys/user.h> +#include <miscfs/procfs/procfs.h> + +/* use the equivalent procfs code */ +#if 0 +static int +pread (struct proc *procp, unsigned int addr, unsigned int *retval) { + int rv; + vm_map_t map, tmap; + vm_object_t object; + vm_offset_t kva = 0; + int page_offset; /* offset into page */ + vm_offset_t pageno; /* page number */ + vm_map_entry_t out_entry; + vm_prot_t out_prot; + boolean_t wired, single_use; + vm_pindex_t pindex; + + /* Map page into kernel space */ + + map = &procp->p_vmspace->vm_map; + + page_offset = addr - trunc_page(addr); + pageno = trunc_page(addr); + + tmap = map; + rv = vm_map_lookup (&tmap, pageno, VM_PROT_READ, &out_entry, + &object, &pindex, &out_prot, &wired, &single_use); + + if (rv != KERN_SUCCESS) + return EINVAL; + + vm_map_lookup_done (tmap, out_entry); + + /* Find space in kernel_map for the page we're interested in */ + rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), + &kva, PAGE_SIZE, 0, VM_PROT_ALL, VM_PROT_ALL, 0); + + if (!rv) { + vm_object_reference (object); + + rv = 
vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0); + if (!rv) { + *retval = 0; + bcopy ((caddr_t)kva + page_offset, + retval, sizeof *retval); + } + vm_map_remove (kernel_map, kva, kva + PAGE_SIZE); + } + + return rv; +} + +static int +pwrite (struct proc *procp, unsigned int addr, unsigned int datum) { + int rv; + vm_map_t map, tmap; + vm_object_t object; + vm_offset_t kva = 0; + int page_offset; /* offset into page */ + vm_offset_t pageno; /* page number */ + vm_map_entry_t out_entry; + vm_prot_t out_prot; + boolean_t wired, single_use; + vm_pindex_t pindex; + boolean_t fix_prot = 0; + + /* Map page into kernel space */ + + map = &procp->p_vmspace->vm_map; + + page_offset = addr - trunc_page(addr); + pageno = trunc_page(addr); + + /* + * Check the permissions for the area we're interested in. + */ + + if (vm_map_check_protection (map, pageno, pageno + PAGE_SIZE, + VM_PROT_WRITE) == FALSE) { + /* + * If the page was not writable, we make it so. + * XXX It is possible a page may *not* be read/executable, + * if a process changes that! + */ + fix_prot = 1; + /* The page isn't writable, so let's try making it so... */ + if ((rv = vm_map_protect (map, pageno, pageno + PAGE_SIZE, + VM_PROT_ALL, 0)) != KERN_SUCCESS) + return EFAULT; /* I guess... */ + } + + /* + * Now we need to get the page. out_entry, out_prot, wired, and + * single_use aren't used. One would think the vm code would be + * a *bit* nicer... We use tmap because vm_map_lookup() can + * change the map argument. + */ + + tmap = map; + rv = vm_map_lookup (&tmap, pageno, VM_PROT_WRITE, &out_entry, + &object, &pindex, &out_prot, &wired, &single_use); + if (rv != KERN_SUCCESS) { + return EINVAL; + } + + /* + * Okay, we've got the page. Let's release tmap. + */ + + vm_map_lookup_done (tmap, out_entry); + + /* + * Fault the page in... 
+ */ + + rv = vm_fault(map, pageno, VM_PROT_WRITE|VM_PROT_READ, FALSE); + if (rv != KERN_SUCCESS) + return EFAULT; + + /* Find space in kernel_map for the page we're interested in */ + rv = vm_map_find (kernel_map, object, IDX_TO_OFF(pindex), + &kva, PAGE_SIZE, 0, + VM_PROT_ALL, VM_PROT_ALL, 0); + if (!rv) { + vm_object_reference (object); + + rv = vm_map_pageable (kernel_map, kva, kva + PAGE_SIZE, 0); + if (!rv) { + bcopy (&datum, (caddr_t)kva + page_offset, sizeof datum); + } + vm_map_remove (kernel_map, kva, kva + PAGE_SIZE); + } + + if (fix_prot) + vm_map_protect (map, pageno, pageno + PAGE_SIZE, + VM_PROT_READ|VM_PROT_EXECUTE, 0); + return rv; +} +#endif /* * Process debugging system call. */ +#ifndef _SYS_SYSPROTO_H_ struct ptrace_args { int req; pid_t pid; caddr_t addr; int data; }; -ptrace(a1, a2, a3) - struct proc *a1; - struct ptrace_args *a2; - int *a3; +#endif + +int +ptrace(curp, uap, retval) + struct proc *curp; + struct ptrace_args *uap; + int *retval; { + struct proc *p; + struct iovec iov; + struct uio uio; + int error = 0; + int write; + int s; + + if (uap->req == PT_TRACE_ME) + p = curp; + else { + if ((p = pfind(uap->pid)) == NULL) + return ESRCH; + } /* - * Body deleted. + * Permissions check */ - return (ENOSYS); -} + switch (uap->req) { + case PT_TRACE_ME: + /* Always legal. 
*/ + break; -trace_req(a1) - struct proc *a1; -{ + case PT_ATTACH: + /* Self */ + if (p->p_pid == curp->p_pid) + return EINVAL; + + /* Already traced */ + if (p->p_flag & P_TRACED) + return EBUSY; + + /* not owned by you, has done setuid (unless you're root) */ + if ((p->p_cred->p_ruid != curp->p_cred->p_ruid) || + (p->p_flag & P_SUGID)) { + if (error = suser(curp->p_ucred, &curp->p_acflag)) + return error; + } + + /* OK */ + break; + + case PT_READ_I: + case PT_READ_D: + case PT_READ_U: + case PT_WRITE_I: + case PT_WRITE_D: + case PT_WRITE_U: + case PT_CONTINUE: + case PT_KILL: + case PT_STEP: + case PT_DETACH: +#ifdef PT_GETREGS + case PT_GETREGS: +#endif +#ifdef PT_SETREGS + case PT_SETREGS: +#endif +#ifdef PT_GETFPREGS + case PT_GETFPREGS: +#endif +#ifdef PT_SETFPREGS + case PT_SETFPREGS: +#endif + /* not being traced... */ + if ((p->p_flag & P_TRACED) == 0) + return EPERM; + + /* not being traced by YOU */ + if (p->p_pptr != curp) + return EBUSY; + + /* not currently stopped */ + if (p->p_stat != SSTOP || (p->p_flag & P_WAITED) == 0) + return EBUSY; + + /* OK */ + break; + + default: + return EINVAL; + } + +#ifdef FIX_SSTEP + /* + * Single step fixup ala procfs + */ + FIX_SSTEP(p); +#endif /* - * Body deleted. 
+ * Actually do the requests */ - return (0); + + write = 0; + *retval = 0; + + switch (uap->req) { + case PT_TRACE_ME: + /* set my trace flag and "owner" so it can read/write me */ + p->p_flag |= P_TRACED; + p->p_oppid = p->p_pptr->p_pid; + return 0; + + case PT_ATTACH: + /* security check done above */ + p->p_flag |= P_TRACED; + p->p_oppid = p->p_pptr->p_pid; + if (p->p_pptr != curp) + proc_reparent(p, curp); + uap->data = SIGSTOP; + goto sendsig; /* in PT_CONTINUE below */ + + case PT_STEP: + case PT_CONTINUE: + case PT_DETACH: + if ((unsigned)uap->data >= NSIG) + return EINVAL; + + PHOLD(p); + + if (uap->req == PT_STEP) { + if ((error = ptrace_single_step (p))) { + PRELE(p); + return error; + } + } + + if (uap->addr != (caddr_t)1) { + fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); + if ((error = ptrace_set_pc (p, (u_int)uap->addr))) { + PRELE(p); + return error; + } + } + PRELE(p); + + if (uap->req == PT_DETACH) { + /* reset process parent */ + if (p->p_oppid != p->p_pptr->p_pid) { + struct proc *pp; + + pp = pfind(p->p_oppid); + proc_reparent(p, pp ? pp : initproc); + } + + p->p_flag &= ~(P_TRACED | P_WAITED); + p->p_oppid = 0; + + /* should we send SIGCHLD? */ + + } + + sendsig: + /* deliver or queue signal */ + s = splhigh(); + if (p->p_stat == SSTOP) { + p->p_xstat = uap->data; + setrunnable(p); + } else if (uap->data) { + psignal(p, uap->data); + } + splx(s); + return 0; + + case PT_WRITE_I: + case PT_WRITE_D: + write = 1; + /* fallthrough */ + case PT_READ_I: + case PT_READ_D: + /* write = 0 set above */ + iov.iov_base = write ? (caddr_t)&uap->data : (caddr_t)retval; + iov.iov_len = sizeof(int); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = (off_t)(u_long)uap->addr; + uio.uio_resid = sizeof(int); + uio.uio_segflg = UIO_SYSSPACE; /* ie: the uap */ + uio.uio_rw = write ? 
UIO_WRITE : UIO_READ; + uio.uio_procp = p; + error = procfs_domem(curp, p, NULL, &uio); + if (uio.uio_resid != 0) { + /* + * XXX procfs_domem() doesn't currently return ENOSPC, + * so I think write() can bogusly return 0. + * XXX what happens for short writes? We don't want + * to write partial data. + * XXX procfs_domem() returns EPERM for other invalid + * addresses. Convert this to EINVAL. Does this + * clobber returns of EPERM for other reasons? + */ + if (error == 0 || error == ENOSPC || error == EPERM) + error = EINVAL; /* EOF */ + } + return (error); + + case PT_READ_U: + if ((u_int)uap->addr > (UPAGES * PAGE_SIZE - sizeof(int))) { + return EFAULT; + } + error = 0; + PHOLD(p); /* user had damn well better be incore! */ + if (p->p_flag & P_INMEM) { + p->p_addr->u_kproc.kp_proc = *p; + fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); + *retval = *(int*)((u_int)p->p_addr + (u_int)uap->addr); + } else { + *retval = 0; + error = EFAULT; + } + PRELE(p); + return error; + + case PT_WRITE_U: + PHOLD(p); /* user had damn well better be incore! */ + if (p->p_flag & P_INMEM) { + p->p_addr->u_kproc.kp_proc = *p; + fill_eproc (p, &p->p_addr->u_kproc.kp_eproc); + error = ptrace_write_u(p, (vm_offset_t)uap->addr, uap->data); + } else { + error = EFAULT; + } + PRELE(p); + return error; + + case PT_KILL: + uap->data = SIGKILL; + goto sendsig; /* in PT_CONTINUE above */ + +#ifdef PT_SETREGS + case PT_SETREGS: + write = 1; + /* fallthrough */ +#endif /* PT_SETREGS */ +#ifdef PT_GETREGS + case PT_GETREGS: + /* write = 0 above */ +#endif /* PT_SETREGS */ +#if defined(PT_SETREGS) || defined(PT_GETREGS) + if (!procfs_validregs(p)) /* no P_SYSTEM procs please */ + return EINVAL; + else { + iov.iov_base = uap->addr; + iov.iov_len = sizeof(struct reg); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_resid = sizeof(struct reg); + uio.uio_segflg = UIO_USERSPACE; + uio.uio_rw = write ? 
UIO_WRITE : UIO_READ; + uio.uio_procp = curp; + return (procfs_doregs(curp, p, NULL, &uio)); + } +#endif /* defined(PT_SETREGS) || defined(PT_GETREGS) */ + +#ifdef PT_SETFPREGS + case PT_SETFPREGS: + write = 1; + /* fallthrough */ +#endif /* PT_SETFPREGS */ +#ifdef PT_GETFPREGS + case PT_GETFPREGS: + /* write = 0 above */ +#endif /* PT_SETFPREGS */ +#if defined(PT_SETFPREGS) || defined(PT_GETFPREGS) + if (!procfs_validfpregs(p)) /* no P_SYSTEM procs please */ + return EINVAL; + else { + iov.iov_base = uap->addr; + iov.iov_len = sizeof(struct fpreg); + uio.uio_iov = &iov; + uio.uio_iovcnt = 1; + uio.uio_offset = 0; + uio.uio_resid = sizeof(struct fpreg); + uio.uio_segflg = UIO_USERSPACE; + uio.uio_rw = write ? UIO_WRITE : UIO_READ; + uio.uio_procp = curp; + return (procfs_dofpregs(curp, p, NULL, &uio)); + } +#endif /* defined(PT_SETFPREGS) || defined(PT_GETFPREGS) */ + + default: + break; + } + + return 0; +} + +int +trace_req(p) + struct proc *p; +{ + return 1; } diff --git a/sys/kern/sys_socket.c b/sys/kern/sys_socket.c index abc2dc7..c3e6615 100644 --- a/sys/kern/sys_socket.c +++ b/sys/kern/sys_socket.c @@ -30,28 +30,39 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)sys_socket.c 8.3 (Berkeley) 2/14/95 + * @(#)sys_socket.c 8.1 (Berkeley) 6/10/93 + * $Id: sys_socket.c,v 1.11 1997/03/23 03:36:25 bde Exp $ */ #include <sys/param.h> +#include <sys/queue.h> #include <sys/systm.h> #include <sys/proc.h> +#include <sys/fcntl.h> #include <sys/file.h> #include <sys/mbuf.h> #include <sys/protosw.h> #include <sys/socket.h> +#include <sys/stat.h> #include <sys/socketvar.h> -#include <sys/ioctl.h> +#include <sys/filio.h> /* XXX */ +#include <sys/sockio.h> #include <sys/stat.h> #include <net/if.h> #include <net/route.h> +static int soo_read __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int soo_write __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int soo_close __P((struct file *fp, struct proc *p)); + struct fileops socketops = { soo_read, soo_write, soo_ioctl, soo_select, soo_close }; /* ARGSUSED */ -int +static int soo_read(fp, uio, cred) struct file *fp; struct uio *uio; @@ -63,7 +74,7 @@ soo_read(fp, uio, cred) } /* ARGSUSED */ -int +static int soo_write(fp, uio, cred) struct file *fp; struct uio *uio; @@ -77,7 +88,7 @@ soo_write(fp, uio, cred) int soo_ioctl(fp, cmd, data, p) struct file *fp; - u_long cmd; + int cmd; register caddr_t data; struct proc *p; { @@ -129,8 +140,7 @@ soo_ioctl(fp, cmd, data, p) return (ifioctl(so, cmd, data, p)); if (IOCGROUP(cmd) == 'r') return (rtioctl(cmd, data, p)); - return ((*so->so_proto->pr_usrreq)(so, PRU_CONTROL, - (struct mbuf *)cmd, (struct mbuf *)data, (struct mbuf *)0)); + return ((*so->so_proto->pr_usrreqs->pru_control)(so, cmd, data, 0)); } int @@ -183,13 +193,11 @@ soo_stat(so, ub) bzero((caddr_t)ub, sizeof (*ub)); ub->st_mode = S_IFSOCK; - return ((*so->so_proto->pr_usrreq)(so, PRU_SENSE, - (struct mbuf *)ub, (struct mbuf *)0, - (struct mbuf *)0)); + return ((*so->so_proto->pr_usrreqs->pru_sense)(so, ub)); } /* ARGSUSED */ -int +static int soo_close(fp, p) struct file *fp; struct proc *p; diff --git a/sys/kern/syscalls.c 
b/sys/kern/syscalls.c index 91cbdc9..e938376 100644 --- a/sys/kern/syscalls.c +++ b/sys/kern/syscalls.c @@ -2,7 +2,7 @@ * System call names. * * DO NOT EDIT-- this file is automatically generated. - * created from @(#)syscalls.master 8.6 (Berkeley) 3/30/95 + * created from Id: syscalls.master,v 1.33 1997/02/22 09:39:21 peter Exp */ char *syscallnames[] = { @@ -14,10 +14,10 @@ char *syscallnames[] = { "open", /* 5 = open */ "close", /* 6 = close */ "wait4", /* 7 = wait4 */ - "compat_43_creat", /* 8 = compat_43 creat */ + "old.creat", /* 8 = old creat */ "link", /* 9 = link */ "unlink", /* 10 = unlink */ - "#11 (obsolete execv)", /* 11 = obsolete execv */ + "obs_execv", /* 11 = obsolete execv */ "chdir", /* 12 = chdir */ "fchdir", /* 13 = fchdir */ "mknod", /* 14 = mknod */ @@ -25,7 +25,7 @@ char *syscallnames[] = { "chown", /* 16 = chown */ "break", /* 17 = break */ "getfsstat", /* 18 = getfsstat */ - "compat_43_lseek", /* 19 = compat_43 lseek */ + "old.lseek", /* 19 = old lseek */ "getpid", /* 20 = getpid */ "mount", /* 21 = mount */ "unmount", /* 22 = unmount */ @@ -44,18 +44,14 @@ char *syscallnames[] = { "fchflags", /* 35 = fchflags */ "sync", /* 36 = sync */ "kill", /* 37 = kill */ - "compat_43_stat", /* 38 = compat_43 stat */ + "old.stat", /* 38 = old stat */ "getppid", /* 39 = getppid */ - "compat_43_lstat", /* 40 = compat_43 lstat */ + "old.lstat", /* 40 = old lstat */ "dup", /* 41 = dup */ "pipe", /* 42 = pipe */ "getegid", /* 43 = getegid */ "profil", /* 44 = profil */ -#ifdef KTRACE "ktrace", /* 45 = ktrace */ -#else - "#45 (unimplemented ktrace)", /* 45 = unimplemented ktrace */ -#endif "sigaction", /* 46 = sigaction */ "getgid", /* 47 = getgid */ "sigprocmask", /* 48 = sigprocmask */ @@ -72,83 +68,75 @@ char *syscallnames[] = { "execve", /* 59 = execve */ "umask", /* 60 = umask */ "chroot", /* 61 = chroot */ - "compat_43_fstat", /* 62 = compat_43 fstat */ - "compat_43_getkerninfo", /* 63 = compat_43 getkerninfo */ - "compat_43_getpagesize", /* 64 = 
compat_43 getpagesize */ + "old.fstat", /* 62 = old fstat */ + "old.getkerninfo", /* 63 = old getkerninfo */ + "old.getpagesize", /* 64 = old getpagesize */ "msync", /* 65 = msync */ "vfork", /* 66 = vfork */ - "#67 (obsolete vread)", /* 67 = obsolete vread */ - "#68 (obsolete vwrite)", /* 68 = obsolete vwrite */ + "obs_vread", /* 67 = obsolete vread */ + "obs_vwrite", /* 68 = obsolete vwrite */ "sbrk", /* 69 = sbrk */ "sstk", /* 70 = sstk */ - "compat_43_mmap", /* 71 = compat_43 mmap */ + "old.mmap", /* 71 = old mmap */ "vadvise", /* 72 = vadvise */ "munmap", /* 73 = munmap */ "mprotect", /* 74 = mprotect */ "madvise", /* 75 = madvise */ - "#76 (obsolete vhangup)", /* 76 = obsolete vhangup */ - "#77 (obsolete vlimit)", /* 77 = obsolete vlimit */ + "obs_vhangup", /* 76 = obsolete vhangup */ + "obs_vlimit", /* 77 = obsolete vlimit */ "mincore", /* 78 = mincore */ "getgroups", /* 79 = getgroups */ "setgroups", /* 80 = setgroups */ "getpgrp", /* 81 = getpgrp */ "setpgid", /* 82 = setpgid */ "setitimer", /* 83 = setitimer */ - "compat_43_wait", /* 84 = compat_43 wait */ + "old.wait", /* 84 = old wait */ "swapon", /* 85 = swapon */ "getitimer", /* 86 = getitimer */ - "compat_43_gethostname", /* 87 = compat_43 gethostname */ - "compat_43_sethostname", /* 88 = compat_43 sethostname */ + "old.gethostname", /* 87 = old gethostname */ + "old.sethostname", /* 88 = old sethostname */ "getdtablesize", /* 89 = getdtablesize */ "dup2", /* 90 = dup2 */ - "#91 (unimplemented getdopt)", /* 91 = unimplemented getdopt */ + "#91", /* 91 = getdopt */ "fcntl", /* 92 = fcntl */ "select", /* 93 = select */ - "#94 (unimplemented setdopt)", /* 94 = unimplemented setdopt */ + "#94", /* 94 = setdopt */ "fsync", /* 95 = fsync */ "setpriority", /* 96 = setpriority */ "socket", /* 97 = socket */ "connect", /* 98 = connect */ - "compat_43_accept", /* 99 = compat_43 accept */ + "old.accept", /* 99 = old accept */ "getpriority", /* 100 = getpriority */ - "compat_43_send", /* 101 = compat_43 send */ 
- "compat_43_recv", /* 102 = compat_43 recv */ + "old.send", /* 101 = old send */ + "old.recv", /* 102 = old recv */ "sigreturn", /* 103 = sigreturn */ "bind", /* 104 = bind */ "setsockopt", /* 105 = setsockopt */ "listen", /* 106 = listen */ - "#107 (obsolete vtimes)", /* 107 = obsolete vtimes */ - "compat_43_sigvec", /* 108 = compat_43 sigvec */ - "compat_43_sigblock", /* 109 = compat_43 sigblock */ - "compat_43_sigsetmask", /* 110 = compat_43 sigsetmask */ + "obs_vtimes", /* 107 = obsolete vtimes */ + "old.sigvec", /* 108 = old sigvec */ + "old.sigblock", /* 109 = old sigblock */ + "old.sigsetmask", /* 110 = old sigsetmask */ "sigsuspend", /* 111 = sigsuspend */ - "compat_43_sigstack", /* 112 = compat_43 sigstack */ - "compat_43_recvmsg", /* 113 = compat_43 recvmsg */ - "compat_43_sendmsg", /* 114 = compat_43 sendmsg */ -#ifdef TRACE - "vtrace", /* 115 = vtrace */ -#else - "#115 (obsolete vtrace)", /* 115 = obsolete vtrace */ -#endif + "old.sigstack", /* 112 = old sigstack */ + "old.recvmsg", /* 113 = old recvmsg */ + "old.sendmsg", /* 114 = old sendmsg */ + "obs_vtrace", /* 115 = obsolete vtrace */ "gettimeofday", /* 116 = gettimeofday */ "getrusage", /* 117 = getrusage */ "getsockopt", /* 118 = getsockopt */ -#ifdef vax - "resuba", /* 119 = resuba */ -#else - "#119 (unimplemented resuba)", /* 119 = unimplemented resuba */ -#endif + "#119", /* 119 = resuba */ "readv", /* 120 = readv */ "writev", /* 121 = writev */ "settimeofday", /* 122 = settimeofday */ "fchown", /* 123 = fchown */ "fchmod", /* 124 = fchmod */ - "compat_43_recvfrom", /* 125 = compat_43 recvfrom */ - "compat_43_setreuid", /* 126 = compat_43 setreuid */ - "compat_43_setregid", /* 127 = compat_43 setregid */ + "old.recvfrom", /* 125 = old recvfrom */ + "setreuid", /* 126 = setreuid */ + "setregid", /* 127 = setregid */ "rename", /* 128 = rename */ - "compat_43_truncate", /* 129 = compat_43 truncate */ - "compat_43_ftruncate", /* 130 = compat_43 ftruncate */ + "old.truncate", /* 129 = old truncate 
*/ + "old.ftruncate", /* 130 = old ftruncate */ "flock", /* 131 = flock */ "mkfifo", /* 132 = mkfifo */ "sendto", /* 133 = sendto */ @@ -157,60 +145,56 @@ char *syscallnames[] = { "mkdir", /* 136 = mkdir */ "rmdir", /* 137 = rmdir */ "utimes", /* 138 = utimes */ - "#139 (obsolete 4.2 sigreturn)", /* 139 = obsolete 4.2 sigreturn */ + "obs_4.2", /* 139 = obsolete 4.2 sigreturn */ "adjtime", /* 140 = adjtime */ - "compat_43_getpeername", /* 141 = compat_43 getpeername */ - "compat_43_gethostid", /* 142 = compat_43 gethostid */ - "compat_43_sethostid", /* 143 = compat_43 sethostid */ - "compat_43_getrlimit", /* 144 = compat_43 getrlimit */ - "compat_43_setrlimit", /* 145 = compat_43 setrlimit */ - "compat_43_killpg", /* 146 = compat_43 killpg */ + "old.getpeername", /* 141 = old getpeername */ + "old.gethostid", /* 142 = old gethostid */ + "old.sethostid", /* 143 = old sethostid */ + "old.getrlimit", /* 144 = old getrlimit */ + "old.setrlimit", /* 145 = old setrlimit */ + "old.killpg", /* 146 = old killpg */ "setsid", /* 147 = setsid */ "quotactl", /* 148 = quotactl */ - "compat_43_quota", /* 149 = compat_43 quota */ - "compat_43_getsockname", /* 150 = compat_43 getsockname */ - "#151 (unimplemented)", /* 151 = unimplemented */ - "#152 (unimplemented)", /* 152 = unimplemented */ - "#153 (unimplemented)", /* 153 = unimplemented */ - "#154 (unimplemented)", /* 154 = unimplemented */ + "old.quota", /* 149 = old quota */ + "old.getsockname", /* 150 = old getsockname */ + "#151", /* 151 = sem_lock */ + "#152", /* 152 = sem_wakeup */ + "#153", /* 153 = asyncdaemon */ + "#154", /* 154 = nosys */ #ifdef NFS "nfssvc", /* 155 = nfssvc */ #else - "#155 (unimplemented nfssvc)", /* 155 = unimplemented nfssvc */ + "#155", /* 155 = nosys */ #endif - "compat_43_getdirentries", /* 156 = compat_43 getdirentries */ + "old.getdirentries", /* 156 = old getdirentries */ "statfs", /* 157 = statfs */ "fstatfs", /* 158 = fstatfs */ - "#159 (unimplemented)", /* 159 = unimplemented */ - "#160 
(unimplemented)", /* 160 = unimplemented */ -#ifdef NFS + "#159", /* 159 = nosys */ + "#160", /* 160 = nosys */ +#if defined(NFS) && !defined (NFS_NOSERVER) "getfh", /* 161 = getfh */ #else - "#161 (unimplemented getfh)", /* 161 = unimplemented getfh */ -#endif - "#162 (unimplemented getdomainname)", /* 162 = unimplemented getdomainname */ - "#163 (unimplemented setdomainname)", /* 163 = unimplemented setdomainname */ - "#164 (unimplemented)", /* 164 = unimplemented */ - "#165 (unimplemented)", /* 165 = unimplemented */ - "#166 (unimplemented)", /* 166 = unimplemented */ - "#167 (unimplemented)", /* 167 = unimplemented */ - "#168 (unimplemented)", /* 168 = unimplemented */ - "#169 (unimplemented semsys)", /* 169 = unimplemented semsys */ - "#170 (unimplemented msgsys)", /* 170 = unimplemented msgsys */ -#if defined(SYSVSHM) && !defined(alpha) - "compat_43_shmsys", /* 171 = compat_43 shmsys */ -#else - "#171 (unimplemented shmsys)", /* 171 = unimplemented shmsys */ + "#161", /* 161 = nosys */ #endif - "#172 (unimplemented)", /* 172 = unimplemented */ - "#173 (unimplemented)", /* 173 = unimplemented */ - "#174 (unimplemented)", /* 174 = unimplemented */ - "#175 (unimplemented)", /* 175 = unimplemented */ - "#176 (unimplemented)", /* 176 = unimplemented */ - "#177 (unimplemented)", /* 177 = unimplemented */ - "#178 (unimplemented)", /* 178 = unimplemented */ - "#179 (unimplemented)", /* 179 = unimplemented */ - "#180 (unimplemented)", /* 180 = unimplemented */ + "getdomainname", /* 162 = getdomainname */ + "setdomainname", /* 163 = setdomainname */ + "uname", /* 164 = uname */ + "sysarch", /* 165 = sysarch */ + "rtprio", /* 166 = rtprio */ + "#167", /* 167 = nosys */ + "#168", /* 168 = nosys */ + "semsys", /* 169 = semsys */ + "msgsys", /* 170 = msgsys */ + "shmsys", /* 171 = shmsys */ + "#172", /* 172 = nosys */ + "#173", /* 173 = nosys */ + "#174", /* 174 = nosys */ + "#175", /* 175 = nosys */ + "ntp_adjtime", /* 176 = ntp_adjtime */ + "#177", /* 177 = sfork */ + 
"#178", /* 178 = getdescriptor */ + "#179", /* 179 = setdescriptor */ + "#180", /* 180 = nosys */ "setgid", /* 181 = setgid */ "setegid", /* 182 = setegid */ "seteuid", /* 183 = seteuid */ @@ -220,17 +204,17 @@ char *syscallnames[] = { "lfs_segclean", /* 186 = lfs_segclean */ "lfs_segwait", /* 187 = lfs_segwait */ #else - "#184 (unimplemented lfs_bmapv)", /* 184 = unimplemented lfs_bmapv */ - "#185 (unimplemented lfs_markv)", /* 185 = unimplemented lfs_markv */ - "#186 (unimplemented lfs_segclean)", /* 186 = unimplemented lfs_segclean */ - "#187 (unimplemented lfs_segwait)", /* 187 = unimplemented lfs_segwait */ + "#184", /* 184 = nosys */ + "#185", /* 185 = nosys */ + "#186", /* 186 = nosys */ + "#187", /* 187 = nosys */ #endif "stat", /* 188 = stat */ "fstat", /* 189 = fstat */ "lstat", /* 190 = lstat */ "pathconf", /* 191 = pathconf */ "fpathconf", /* 192 = fpathconf */ - "#193 (unimplemented)", /* 193 = unimplemented */ + "#193", /* 193 = nosys */ "getrlimit", /* 194 = getrlimit */ "setrlimit", /* 195 = setrlimit */ "getdirentries", /* 196 = getdirentries */ @@ -242,38 +226,51 @@ char *syscallnames[] = { "__sysctl", /* 202 = __sysctl */ "mlock", /* 203 = mlock */ "munlock", /* 204 = munlock */ - "undelete", /* 205 = undelete */ - "#206 (unimplemented)", /* 206 = unimplemented */ - "#207 (unimplemented)", /* 207 = unimplemented */ - "#208 (unimplemented)", /* 208 = unimplemented */ - "#209 (unimplemented)", /* 209 = unimplemented */ - "#210 (unimplemented)", /* 210 = unimplemented */ - "#211 (unimplemented)", /* 211 = unimplemented */ - "#212 (unimplemented)", /* 212 = unimplemented */ - "#213 (unimplemented)", /* 213 = unimplemented */ - "#214 (unimplemented)", /* 214 = unimplemented */ - "#215 (unimplemented)", /* 215 = unimplemented */ - "#216 (unimplemented)", /* 216 = unimplemented */ - "#217 (unimplemented)", /* 217 = unimplemented */ - "#218 (unimplemented)", /* 218 = unimplemented */ - "#219 (unimplemented)", /* 219 = unimplemented */ - "#220 
(unimplemented semctl)", /* 220 = unimplemented semctl */ - "#221 (unimplemented semget)", /* 221 = unimplemented semget */ - "#222 (unimplemented semop)", /* 222 = unimplemented semop */ - "#223 (unimplemented semconfig)", /* 223 = unimplemented semconfig */ - "#224 (unimplemented msgctl)", /* 224 = unimplemented msgctl */ - "#225 (unimplemented msgget)", /* 225 = unimplemented msgget */ - "#226 (unimplemented msgsnd)", /* 226 = unimplemented msgsnd */ - "#227 (unimplemented msgrcv)", /* 227 = unimplemented msgrcv */ -#if defined(SYSVSHM) && 0 + "utrace", /* 205 = utrace */ + "undelete", /* 206 = undelete */ + "#207", /* 207 = nosys */ + "#208", /* 208 = nosys */ + "#209", /* 209 = nosys */ + "lkmnosys", /* 210 = lkmnosys */ + "lkmnosys", /* 211 = lkmnosys */ + "lkmnosys", /* 212 = lkmnosys */ + "lkmnosys", /* 213 = lkmnosys */ + "lkmnosys", /* 214 = lkmnosys */ + "lkmnosys", /* 215 = lkmnosys */ + "lkmnosys", /* 216 = lkmnosys */ + "lkmnosys", /* 217 = lkmnosys */ + "lkmnosys", /* 218 = lkmnosys */ + "lkmnosys", /* 219 = lkmnosys */ + "__semctl", /* 220 = __semctl */ + "semget", /* 221 = semget */ + "semop", /* 222 = semop */ + "semconfig", /* 223 = semconfig */ + "msgctl", /* 224 = msgctl */ + "msgget", /* 225 = msgget */ + "msgsnd", /* 226 = msgsnd */ + "msgrcv", /* 227 = msgrcv */ "shmat", /* 228 = shmat */ "shmctl", /* 229 = shmctl */ "shmdt", /* 230 = shmdt */ "shmget", /* 231 = shmget */ -#else - "#228 (unimplemented shmat)", /* 228 = unimplemented shmat */ - "#229 (unimplemented shmctl)", /* 229 = unimplemented shmctl */ - "#230 (unimplemented shmdt)", /* 230 = unimplemented shmdt */ - "#231 (unimplemented shmget)", /* 231 = unimplemented shmget */ -#endif + "#232", /* 232 = nosys */ + "#233", /* 233 = nosys */ + "#234", /* 234 = nosys */ + "#235", /* 235 = nosys */ + "#236", /* 236 = nosys */ + "#237", /* 237 = nosys */ + "#238", /* 238 = nosys */ + "#239", /* 239 = nosys */ + "#240", /* 240 = nosys */ + "#241", /* 241 = nosys */ + "#242", /* 242 = nosys 
*/ + "#243", /* 243 = nosys */ + "#244", /* 244 = nosys */ + "#245", /* 245 = nosys */ + "#246", /* 246 = nosys */ + "#247", /* 247 = nosys */ + "#248", /* 248 = nosys */ + "#249", /* 249 = nosys */ + "minherit", /* 250 = minherit */ + "rfork", /* 251 = rfork */ }; diff --git a/sys/kern/syscalls.conf b/sys/kern/syscalls.conf deleted file mode 100644 index 71b82ce..0000000 --- a/sys/kern/syscalls.conf +++ /dev/null @@ -1,12 +0,0 @@ -# @(#)syscalls.conf 8.1 (Berkeley) 2/14/95 - -sysnames="syscalls.c" -sysnumhdr="../sys/syscall.h" -syssw="init_sysent.c" -sysarghdr="../sys/syscallargs.h" -compatopts="compat_43" -libcompatopts="" - -switchname="sysent" -namesname="syscallnames" -constprefix="SYS_" diff --git a/sys/kern/syscalls.master b/sys/kern/syscalls.master index b57cd73..b0921d4 100644 --- a/sys/kern/syscalls.master +++ b/sys/kern/syscalls.master @@ -1,38 +1,32 @@ - @(#)syscalls.master 8.6 (Berkeley) 3/30/95 -; System call name/number "master" file. -; (See syscalls.conf to see what it is processed into.) + $Id$ +; from: @(#)syscalls.master 8.2 (Berkeley) 1/13/94 ; -; Fields: number type [type-dependent ...] +; System call name/number master file. +; Processed to created init_sysent.c, syscalls.c and syscall.h. + +; Columns: number type nargs namespc name alt{name,tag,rtyp}/comments ; number system call number, must be in order -; type one of STD, OBSOL, UNIMPL, NODEF, NOARGS, or one of -; the compatibility options defined in syscalls.conf. 
-; +; type one of STD, OBSOL, UNIMPL, COMPAT +; namespc one of POSIX, BSD, NOHIDE +; name psuedo-prototype of syscall routine +; If one of the following alts is different, then all appear: +; altname name of system call if different +; alttag name of args struct tag if different from [o]`name'"_args" +; altrtyp return type if not int (bogus - syscalls always return int) +; for UNIMPL/OBSOL, name continues with comments + ; types: ; STD always included -; OBSOL obsolete, not included in system -; UNIMPL unimplemented, not included in system -; NODEF included, but don't define the syscall number -; NOARGS included, but don't define the syscall args structure -; -; The compat options are defined in the syscalls.conf file, and the -; compat option name is prefixed to the syscall name. Other than -; that, they're like NODEF (for 'compat' options), or STD (for -; 'libcompat' options). -; -; The type-dependent arguments are as follows: -; For STD, NODEF, NOARGS, and compat syscalls: -; { pseudo-proto } [alias] -; For other syscalls: -; [comment] -; +; COMPAT included on COMPAT #ifdef +; LIBCOMPAT included on COMPAT #ifdef, and placed in syscall.h +; OBSOL obsolete, not included in system, only specifies name +; UNIMPL not implemented, placeholder only + ; #ifdef's, etc. may be included, and are copied to the output files. -; #include's are copied to the syscall switch definition file only. #include <sys/param.h> -#include <sys/systm.h> -#include <sys/signal.h> -#include <sys/mount.h> -#include <sys/syscallargs.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> ; Reserved/unimplemented system calls in the range 0-150 inclusive ; are reserved for use in future Berkeley releases. @@ -40,316 +34,359 @@ ; redistributions should be placed in the reserved range at the end ; of the current calls. 
-0 STD { int nosys(void); } syscall -1 STD { int exit(int rval); } -2 STD { int fork(void); } -3 STD { int read(int fd, char *buf, u_int nbyte); } -4 STD { int write(int fd, char *buf, u_int nbyte); } -5 STD { int open(char *path, int flags, int mode); } -6 STD { int close(int fd); } -7 STD { int wait4(int pid, int *status, int options, \ - struct rusage *rusage); } -8 COMPAT_43 { int creat(char *path, int mode); } -9 STD { int link(char *path, char *link); } -10 STD { int unlink(char *path); } -11 OBSOL execv -12 STD { int chdir(char *path); } -13 STD { int fchdir(int fd); } -14 STD { int mknod(char *path, int mode, int dev); } -15 STD { int chmod(char *path, int mode); } -16 STD { int chown(char *path, int uid, int gid); } -17 STD { int obreak(char *nsize); } break -18 STD { int getfsstat(struct statfs *buf, long bufsize, \ +0 STD NOHIDE { int nosys(void); } syscall nosys_args int +1 STD NOHIDE { void exit(int rval); } exit rexit_args void +2 STD POSIX { int fork(void); } +3 STD POSIX { int read(int fd, char *buf, u_int nbyte); } +4 STD POSIX { int write(int fd, char *buf, u_int nbyte); } +5 STD POSIX { int open(char *path, int flags, int mode); } +; XXX should be { int open(const char *path, int flags, ...); } +; but we're not ready for `const' or varargs. +; XXX man page says `mode_t mode'. 
+6 STD POSIX { int close(int fd); } +7 STD BSD { int wait4(int pid, int *status, int options, \ + struct rusage *rusage); } wait4 wait_args int +8 COMPAT BSD { int creat(char *path, int mode); } +9 STD POSIX { int link(char *path, char *link); } +10 STD POSIX { int unlink(char *path); } +11 OBSOL NOHIDE execv +12 STD POSIX { int chdir(char *path); } +13 STD BSD { int fchdir(int fd); } +14 STD POSIX { int mknod(char *path, int mode, int dev); } +15 STD POSIX { int chmod(char *path, int mode); } +16 STD POSIX { int chown(char *path, int uid, int gid); } +17 STD BSD { int obreak(char *nsize); } break obreak_args int +18 STD BSD { int getfsstat(struct statfs *buf, long bufsize, \ int flags); } -19 COMPAT_43 { long lseek(int fd, long offset, int whence); } -20 STD { pid_t getpid(void); } -21 STD { int mount(char *type, char *path, int flags, \ +19 COMPAT POSIX { long lseek(int fd, long offset, int whence); } +20 STD POSIX { pid_t getpid(void); } +21 STD BSD { int mount(char *type, char *path, int flags, \ caddr_t data); } -22 STD { int unmount(char *path, int flags); } -23 STD { int setuid(uid_t uid); } -24 STD { uid_t getuid(void); } -25 STD { uid_t geteuid(void); } -26 STD { int ptrace(int req, pid_t pid, caddr_t addr, \ +; XXX 4.4lite2 uses `char *type' but we're not ready for that. +; XXX `path' should have type `const char *' but we're not ready for that. 
+22 STD BSD { int unmount(char *path, int flags); } +23 STD POSIX { int setuid(uid_t uid); } +24 STD POSIX { uid_t getuid(void); } +25 STD POSIX { uid_t geteuid(void); } +26 STD BSD { int ptrace(int req, pid_t pid, caddr_t addr, \ int data); } -27 STD { int recvmsg(int s, struct msghdr *msg, int flags); } -28 STD { int sendmsg(int s, caddr_t msg, int flags); } -29 STD { int recvfrom(int s, caddr_t buf, size_t len, \ +27 STD BSD { int recvmsg(int s, struct msghdr *msg, int flags); } +28 STD BSD { int sendmsg(int s, caddr_t msg, int flags); } +29 STD BSD { int recvfrom(int s, caddr_t buf, size_t len, \ int flags, caddr_t from, int *fromlenaddr); } -30 STD { int accept(int s, caddr_t name, int *anamelen); } -31 STD { int getpeername(int fdes, caddr_t asa, int *alen); } -32 STD { int getsockname(int fdes, caddr_t asa, int *alen); } -33 STD { int access(char *path, int flags); } -34 STD { int chflags(char *path, int flags); } -35 STD { int fchflags(int fd, int flags); } -36 STD { int sync(void); } -37 STD { int kill(int pid, int signum); } -38 COMPAT_43 { int stat(char *path, struct ostat *ub); } -39 STD { pid_t getppid(void); } -40 COMPAT_43 { int lstat(char *path, struct ostat *ub); } -41 STD { int dup(u_int fd); } -42 STD { int pipe(void); } -43 STD { gid_t getegid(void); } -44 STD { int profil(caddr_t samples, u_int size, \ +30 STD BSD { int accept(int s, caddr_t name, int *anamelen); } +31 STD BSD { int getpeername(int fdes, caddr_t asa, int *alen); } +32 STD BSD { int getsockname(int fdes, caddr_t asa, int *alen); } +33 STD POSIX { int access(char *path, int flags); } +34 STD BSD { int chflags(char *path, int flags); } +35 STD BSD { int fchflags(int fd, int flags); } +36 STD BSD { int sync(void); } +37 STD POSIX { int kill(int pid, int signum); } +38 COMPAT POSIX { int stat(char *path, struct ostat *ub); } +39 STD POSIX { pid_t getppid(void); } +40 COMPAT POSIX { int lstat(char *path, struct ostat *ub); } +41 STD POSIX { int dup(u_int fd); } +42 STD POSIX { int 
pipe(void); } +43 STD POSIX { gid_t getegid(void); } +44 STD BSD { int profil(caddr_t samples, u_int size, \ u_int offset, u_int scale); } -#ifdef KTRACE -45 STD { int ktrace(char *fname, int ops, int facs, \ +45 STD BSD { int ktrace(char *fname, int ops, int facs, \ int pid); } -#else -45 UNIMPL ktrace -#endif -46 STD { int sigaction(int signum, struct sigaction *nsa, \ +46 STD POSIX { int sigaction(int signum, struct sigaction *nsa, \ struct sigaction *osa); } -47 STD { gid_t getgid(void); } -48 STD { int sigprocmask(int how, sigset_t mask); } -49 STD { int getlogin(char *namebuf, u_int namelen); } -50 STD { int setlogin(char *namebuf); } -51 STD { int acct(char *path); } -52 STD { int sigpending(void); } -53 STD { int sigaltstack(struct sigaltstack *nss, \ +47 STD POSIX { gid_t getgid(void); } +48 STD POSIX { int sigprocmask(int how, sigset_t mask); } +; XXX note nonstandard (bogus) calling convention - the libc stub passes +; us the mask, not a pointer to it, and we return the old mask as the +; (int) return value. 
+49 STD BSD { int getlogin(char *namebuf, u_int namelen); } +50 STD BSD { int setlogin(char *namebuf); } +51 STD BSD { int acct(char *path); } +52 STD POSIX { int sigpending(void); } +53 STD BSD { int sigaltstack(struct sigaltstack *nss, \ struct sigaltstack *oss); } -54 STD { int ioctl(int fd, u_long com, caddr_t data); } -55 STD { int reboot(int opt); } -56 STD { int revoke(char *path); } -57 STD { int symlink(char *path, char *link); } -58 STD { int readlink(char *path, char *buf, int count); } -59 STD { int execve(char *path, char **argp, char **envp); } -60 STD { int umask(int newmask); } -61 STD { int chroot(char *path); } -62 COMPAT_43 { int fstat(int fd, struct ostat *sb); } -63 COMPAT_43 { int getkerninfo(int op, char *where, int *size, \ - int arg); } -64 COMPAT_43 { int getpagesize(void); } -65 STD { int msync(caddr_t addr, int len); } -66 STD { int vfork(void); } -67 OBSOL vread -68 OBSOL vwrite -69 STD { int sbrk(int incr); } -70 STD { int sstk(int incr); } -71 COMPAT_43 { int mmap(caddr_t addr, int len, int prot, \ +54 STD POSIX { int ioctl(int fd, u_long com, caddr_t data); } +55 STD BSD { int reboot(int opt); } +56 STD POSIX { int revoke(char *path); } +57 STD POSIX { int symlink(char *path, char *link); } +58 STD POSIX { int readlink(char *path, char *buf, int count); } +59 STD POSIX { int execve(char *fname, char **argv, char **envv); } +60 STD POSIX { int umask(int newmask); } umask umask_args int +61 STD BSD { int chroot(char *path); } +62 COMPAT POSIX { int fstat(int fd, struct ostat *sb); } +63 COMPAT BSD { int getkerninfo(int op, char *where, int *size, \ + int arg); } getkerninfo getkerninfo_args int +64 COMPAT BSD { int getpagesize(void); } \ + getpagesize getpagesize_args int +65 STD BSD { int msync(caddr_t addr, size_t len, int flags); } +66 STD BSD { int vfork(void); } +67 OBSOL NOHIDE vread +68 OBSOL NOHIDE vwrite +69 STD BSD { int sbrk(int incr); } +70 STD BSD { int sstk(int incr); } +71 COMPAT BSD { int mmap(caddr_t addr, int len, int 
prot, \ int flags, int fd, long pos); } -72 STD { int ovadvise(int anom); } vadvise -73 STD { int munmap(caddr_t addr, int len); } -74 STD { int mprotect(caddr_t addr, int len, int prot); } -75 STD { int madvise(caddr_t addr, int len, int behav); } -76 OBSOL vhangup -77 OBSOL vlimit -78 STD { int mincore(caddr_t addr, int len, char *vec); } -79 STD { int getgroups(u_int gidsetsize, gid_t *gidset); } -80 STD { int setgroups(u_int gidsetsize, gid_t *gidset); } -81 STD { int getpgrp(void); } -82 STD { int setpgid(int pid, int pgid); } -83 STD { int setitimer(u_int which, struct itimerval *itv, \ +72 STD BSD { int ovadvise(int anom); } vadvise ovadvise_args int +73 STD BSD { int munmap(caddr_t addr, size_t len); } +74 STD BSD { int mprotect(caddr_t addr, size_t len, int prot); } +75 STD BSD { int madvise(caddr_t addr, size_t len, int behav); } +76 OBSOL NOHIDE vhangup +77 OBSOL NOHIDE vlimit +78 STD BSD { int mincore(caddr_t addr, size_t len, char *vec); } +79 STD POSIX { int getgroups(u_int gidsetsize, gid_t *gidset); } +80 STD POSIX { int setgroups(u_int gidsetsize, gid_t *gidset); } +81 STD POSIX { int getpgrp(void); } +82 STD POSIX { int setpgid(int pid, int pgid); } +83 STD BSD { int setitimer(u_int which, struct itimerval *itv, \ struct itimerval *oitv); } -84 COMPAT_43 { int wait(void); } -85 STD { int swapon(char *name); } -86 STD { int getitimer(u_int which, struct itimerval *itv); } -87 COMPAT_43 { int gethostname(char *hostname, u_int len); } -88 COMPAT_43 { int sethostname(char *hostname, u_int len); } -89 STD { int getdtablesize(void); } -90 STD { int dup2(u_int from, u_int to); } -91 UNIMPL getdopt -92 STD { int fcntl(int fd, int cmd, void *arg); } -93 STD { int select(u_int nd, fd_set *in, fd_set *ou, \ +84 COMPAT BSD { int wait(void); } +85 STD BSD { int swapon(char *name); } +86 STD BSD { int getitimer(u_int which, struct itimerval *itv); } +87 COMPAT BSD { int gethostname(char *hostname, u_int len); } \ + gethostname gethostname_args int +88 COMPAT 
BSD { int sethostname(char *hostname, u_int len); } \ + sethostname sethostname_args int +89 STD BSD { int getdtablesize(void); } +90 STD POSIX { int dup2(u_int from, u_int to); } +91 UNIMPL BSD getdopt +92 STD POSIX { int fcntl(int fd, int cmd, int arg); } +; XXX should be { int fcntl(int fd, int cmd, ...); } +; but we're not ready for varargs. +; XXX man page says `int arg' too. +93 STD BSD { int select(int nd, fd_set *in, fd_set *ou, \ fd_set *ex, struct timeval *tv); } -94 UNIMPL setdopt -95 STD { int fsync(int fd); } -96 STD { int setpriority(int which, int who, int prio); } -97 STD { int socket(int domain, int type, int protocol); } -98 STD { int connect(int s, caddr_t name, int namelen); } -99 COMPAT_43 { int accept(int s, caddr_t name, int *anamelen); } -100 STD { int getpriority(int which, int who); } -101 COMPAT_43 { int send(int s, caddr_t buf, int len, int flags); } -102 COMPAT_43 { int recv(int s, caddr_t buf, int len, int flags); } -103 STD { int sigreturn(struct sigcontext *sigcntxp); } -104 STD { int bind(int s, caddr_t name, int namelen); } -105 STD { int setsockopt(int s, int level, int name, \ +94 UNIMPL BSD setdopt +95 STD POSIX { int fsync(int fd); } +96 STD BSD { int setpriority(int which, int who, int prio); } +97 STD BSD { int socket(int domain, int type, int protocol); } +98 STD BSD { int connect(int s, caddr_t name, int namelen); } +99 CPT_NOA BSD { int accept(int s, caddr_t name, int *anamelen); } \ + accept accept_args int +100 STD BSD { int getpriority(int which, int who); } +101 COMPAT BSD { int send(int s, caddr_t buf, int len, int flags); } +102 COMPAT BSD { int recv(int s, caddr_t buf, int len, int flags); } +103 STD BSD { int sigreturn(struct sigcontext *sigcntxp); } +104 STD BSD { int bind(int s, caddr_t name, int namelen); } +105 STD BSD { int setsockopt(int s, int level, int name, \ caddr_t val, int valsize); } -106 STD { int listen(int s, int backlog); } -107 OBSOL vtimes -108 COMPAT_43 { int sigvec(int signum, struct sigvec 
*nsv, \ +106 STD BSD { int listen(int s, int backlog); } +107 OBSOL NOHIDE vtimes +108 COMPAT BSD { int sigvec(int signum, struct sigvec *nsv, \ struct sigvec *osv); } -109 COMPAT_43 { int sigblock(int mask); } -110 COMPAT_43 { int sigsetmask(int mask); } -111 STD { int sigsuspend(int mask); } -112 COMPAT_43 { int sigstack(struct sigstack *nss, \ +109 COMPAT BSD { int sigblock(int mask); } +110 COMPAT BSD { int sigsetmask(int mask); } +111 STD POSIX { int sigsuspend(sigset_t mask); } +; XXX note nonstandard (bogus) calling convention - the libc stub passes +; us the mask, not a pointer to it. +112 COMPAT BSD { int sigstack(struct sigstack *nss, \ struct sigstack *oss); } -113 COMPAT_43 { int recvmsg(int s, struct omsghdr *msg, int flags); } -114 COMPAT_43 { int sendmsg(int s, caddr_t msg, int flags); } -#ifdef TRACE -115 STD { int vtrace(int request, int value); } -#else -115 OBSOL vtrace -#endif -116 STD { int gettimeofday(struct timeval *tp, \ +113 COMPAT BSD { int recvmsg(int s, struct omsghdr *msg, int flags); } +114 COMPAT BSD { int sendmsg(int s, caddr_t msg, int flags); } +115 OBSOL NOHIDE vtrace +116 STD BSD { int gettimeofday(struct timeval *tp, \ struct timezone *tzp); } -117 STD { int getrusage(int who, struct rusage *rusage); } -118 STD { int getsockopt(int s, int level, int name, \ +117 STD BSD { int getrusage(int who, struct rusage *rusage); } +118 STD BSD { int getsockopt(int s, int level, int name, \ caddr_t val, int *avalsize); } -#ifdef vax -119 STD { int resuba(int value); } -#else -119 UNIMPL resuba -#endif -120 STD { int readv(int fd, struct iovec *iovp, u_int iovcnt); } -121 STD { int writev(int fd, struct iovec *iovp, \ +119 UNIMPL NOHIDE resuba (BSD/OS 2.x) +120 STD BSD { int readv(int fd, struct iovec *iovp, u_int iovcnt); } +121 STD BSD { int writev(int fd, struct iovec *iovp, \ u_int iovcnt); } -122 STD { int settimeofday(struct timeval *tv, \ +122 STD BSD { int settimeofday(struct timeval *tv, \ struct timezone *tzp); } -123 STD { int 
fchown(int fd, int uid, int gid); } -124 STD { int fchmod(int fd, int mode); } -125 COMPAT_43 { int recvfrom(int s, caddr_t buf, size_t len, \ - int flags, caddr_t from, int *fromlenaddr); } -126 COMPAT_43 { int setreuid(int ruid, int euid); } -127 COMPAT_43 { int setregid(int rgid, int egid); } -128 STD { int rename(char *from, char *to); } -129 COMPAT_43 { int truncate(char *path, long length); } -130 COMPAT_43 { int ftruncate(int fd, long length); } -131 STD { int flock(int fd, int how); } -132 STD { int mkfifo(char *path, int mode); } -133 STD { int sendto(int s, caddr_t buf, size_t len, \ +123 STD BSD { int fchown(int fd, int uid, int gid); } +124 STD BSD { int fchmod(int fd, int mode); } +125 CPT_NOA BSD { int recvfrom(int s, caddr_t buf, size_t len, \ + int flags, caddr_t from, int *fromlenaddr); } \ + recvfrom recvfrom_args int +126 STD BSD { int setreuid(int ruid, int euid); } +127 STD BSD { int setregid(int rgid, int egid); } +128 STD POSIX { int rename(char *from, char *to); } +129 COMPAT BSD { int truncate(char *path, long length); } +130 COMPAT BSD { int ftruncate(int fd, long length); } +131 STD BSD { int flock(int fd, int how); } +132 STD POSIX { int mkfifo(char *path, int mode); } +133 STD BSD { int sendto(int s, caddr_t buf, size_t len, \ int flags, caddr_t to, int tolen); } -134 STD { int shutdown(int s, int how); } -135 STD { int socketpair(int domain, int type, int protocol, \ +134 STD BSD { int shutdown(int s, int how); } +135 STD BSD { int socketpair(int domain, int type, int protocol, \ int *rsv); } -136 STD { int mkdir(char *path, int mode); } -137 STD { int rmdir(char *path); } -138 STD { int utimes(char *path, struct timeval *tptr); } -139 OBSOL 4.2 sigreturn -140 STD { int adjtime(struct timeval *delta, \ +136 STD POSIX { int mkdir(char *path, int mode); } +137 STD POSIX { int rmdir(char *path); } +138 STD BSD { int utimes(char *path, struct timeval *tptr); } +139 OBSOL NOHIDE 4.2 sigreturn +140 STD BSD { int adjtime(struct timeval 
*delta, \ struct timeval *olddelta); } -141 COMPAT_43 { int getpeername(int fdes, caddr_t asa, int *alen); } -142 COMPAT_43 { int32_t gethostid(void); } -143 COMPAT_43 { int sethostid(int32_t hostid); } -144 COMPAT_43 { int getrlimit(u_int which, struct ogetrlimit *rlp); } -145 COMPAT_43 { int setrlimit(u_int which, struct ogetrlimit *rlp); } -146 COMPAT_43 { int killpg(int pgid, int signum); } -147 STD { int setsid(void); } -148 STD { int quotactl(char *path, int cmd, int uid, \ +141 COMPAT BSD { int getpeername(int fdes, caddr_t asa, int *alen); } +142 COMPAT BSD { long gethostid(void); } +143 COMPAT BSD { int sethostid(long hostid); } +144 COMPAT BSD { int getrlimit(u_int which, struct ogetrlimit *rlp); } +145 COMPAT BSD { int setrlimit(u_int which, struct ogetrlimit *rlp); } +146 COMPAT BSD { int killpg(int pgid, int signum); } +147 STD POSIX { int setsid(void); } +148 STD BSD { int quotactl(char *path, int cmd, int uid, \ caddr_t arg); } -149 COMPAT_43 { int quota(void); } -150 COMPAT_43 { int getsockname(int fdec, caddr_t asa, int *alen); } +149 COMPAT BSD { int quota(void); } +150 CPT_NOA BSD { int getsockname(int fdec, caddr_t asa, int *alen); }\ + getsockname getsockname_args int ; Syscalls 151-180 inclusive are reserved for vendor-specific ; system calls. (This includes various calls added for compatibity ; with other Unix variants.) ; Some of these calls are now supported by BSD... 
-151 UNIMPL -152 UNIMPL -153 UNIMPL -154 UNIMPL +151 UNIMPL NOHIDE sem_lock (BSD/OS 2.x) +152 UNIMPL NOHIDE sem_wakeup (BSD/OS 2.x) +153 UNIMPL NOHIDE asyncdaemon (BSD/OS 2.x) +154 UNIMPL NOHIDE nosys #ifdef NFS -155 STD { int nfssvc(int flag, caddr_t argp); } +155 STD BSD { int nfssvc(int flag, caddr_t argp); } #else -155 UNIMPL nfssvc +155 UNIMPL BSD nosys #endif -156 COMPAT_43 { int getdirentries(int fd, char *buf, u_int count, \ +156 COMPAT BSD { int getdirentries(int fd, char *buf, u_int count, \ long *basep); } -157 STD { int statfs(char *path, struct statfs *buf); } -158 STD { int fstatfs(int fd, struct statfs *buf); } -159 UNIMPL -160 UNIMPL -#ifdef NFS -161 STD { int getfh(char *fname, fhandle_t *fhp); } -#else -161 UNIMPL getfh -#endif -162 UNIMPL getdomainname -163 UNIMPL setdomainname -164 UNIMPL -165 UNIMPL -166 UNIMPL -167 UNIMPL -168 UNIMPL -169 UNIMPL semsys -170 UNIMPL msgsys -; XXX more generally, never on machines where sizeof(void *) != sizeof(int) -#if defined(SYSVSHM) && !defined(alpha) -171 COMPAT_43 { int shmsys(int which, int a2, int a3, int a4); } +157 STD BSD { int statfs(char *path, struct statfs *buf); } +158 STD BSD { int fstatfs(int fd, struct statfs *buf); } +159 UNIMPL NOHIDE nosys +160 UNIMPL NOHIDE nosys +#if defined(NFS) && !defined (NFS_NOSERVER) +161 STD BSD { int getfh(char *fname, struct fhandle *fhp); } #else -171 UNIMPL shmsys +161 UNIMPL BSD nosys #endif -172 UNIMPL -173 UNIMPL -174 UNIMPL -175 UNIMPL -176 UNIMPL -177 UNIMPL -178 UNIMPL -179 UNIMPL -180 UNIMPL +162 STD BSD { int getdomainname(char *domainname, int len); } +163 STD BSD { int setdomainname(char *domainname, int len); } +164 STD BSD { int uname(struct utsname *name); } +165 STD BSD { int sysarch(int op, char *parms); } +166 STD BSD { int rtprio(int function, pid_t pid, \ + struct rtprio *rtp); } +167 UNIMPL NOHIDE nosys +168 UNIMPL NOHIDE nosys +169 STD BSD { int semsys(int which, int a2, int a3, int a4, \ + int a5); } +; XXX should be { int semsys(int which, 
...); } +170 STD BSD { int msgsys(int which, int a2, int a3, int a4, \ + int a5, int a6); } +; XXX should be { int msgsys(int which, ...); } +171 STD BSD { int shmsys(int which, int a2, int a3, int a4); } +; XXX should be { int shmsys(int which, ...); } +172 UNIMPL NOHIDE nosys +173 UNIMPL NOHIDE nosys +174 UNIMPL NOHIDE nosys +175 UNIMPL NOHIDE nosys +176 STD BSD { int ntp_adjtime(struct timex *tp); } +177 UNIMPL NOHIDE sfork (BSD/OS 2.x) +178 UNIMPL NOHIDE getdescriptor (BSD/OS 2.x) +179 UNIMPL NOHIDE setdescriptor (BSD/OS 2.x) +180 UNIMPL NOHIDE nosys -; Syscalls 180-209 are used by/reserved for BSD -181 STD { int setgid(gid_t gid); } -182 STD { int setegid(gid_t egid); } -183 STD { int seteuid(uid_t euid); } +; Syscalls 180-199 are used by/reserved for BSD +181 STD POSIX { int setgid(gid_t gid); } +182 STD BSD { int setegid(gid_t egid); } +183 STD BSD { int seteuid(uid_t euid); } #ifdef LFS -184 STD { int lfs_bmapv(fsid_t *fsidp, \ +184 STD BSD { int lfs_bmapv(struct fsid **fsidp, \ struct block_info *blkiov, int blkcnt); } -185 STD { int lfs_markv(fsid_t *fsidp, \ +185 STD BSD { int lfs_markv(struct fsid **fsidp, \ struct block_info *blkiov, int blkcnt); } -186 STD { int lfs_segclean(fsid_t *fsidp, u_long segment); } -187 STD { int lfs_segwait(fsid_t *fsidp, struct timeval *tv); } +186 STD BSD { int lfs_segclean(struct fsid **fsidp, \ + u_long segment); } +187 STD BSD { int lfs_segwait(struct fsid **fsidp, \ + struct timeval *tv); } #else -184 UNIMPL lfs_bmapv -185 UNIMPL lfs_markv -186 UNIMPL lfs_segclean -187 UNIMPL lfs_segwait +184 UNIMPL BSD nosys +185 UNIMPL BSD nosys +186 UNIMPL BSD nosys +187 UNIMPL BSD nosys #endif -188 STD { int stat(char *path, struct stat *ub); } -189 STD { int fstat(int fd, struct stat *sb); } -190 STD { int lstat(char *path, struct stat *ub); } -191 STD { int pathconf(char *path, int name); } -192 STD { int fpathconf(int fd, int name); } -193 UNIMPL -194 STD { int getrlimit(u_int which, struct rlimit *rlp); } -195 STD { int 
setrlimit(u_int which, struct rlimit *rlp); } -196 STD { int getdirentries(int fd, char *buf, u_int count, \ +188 STD POSIX { int stat(char *path, struct stat *ub); } +189 STD POSIX { int fstat(int fd, struct stat *sb); } +190 STD POSIX { int lstat(char *path, struct stat *ub); } +191 STD POSIX { int pathconf(char *path, int name); } +192 STD POSIX { int fpathconf(int fd, int name); } +193 UNIMPL NOHIDE nosys +194 STD BSD { int getrlimit(u_int which, \ + struct orlimit *rlp); } \ + getrlimit __getrlimit_args int +195 STD BSD { int setrlimit(u_int which, \ + struct orlimit *rlp); } \ + setrlimit __setrlimit_args int +196 STD BSD { int getdirentries(int fd, char *buf, u_int count, \ long *basep); } -197 STD { caddr_t mmap(caddr_t addr, size_t len, int prot, \ +197 STD BSD { caddr_t mmap(caddr_t addr, size_t len, int prot, \ int flags, int fd, long pad, off_t pos); } -198 STD { int nosys(void); } __syscall -199 STD { off_t lseek(int fd, int pad, off_t offset, \ +198 STD NOHIDE { int nosys(void); } __syscall __syscall_args int +199 STD POSIX { off_t lseek(int fd, int pad, off_t offset, \ int whence); } -200 STD { int truncate(char *path, int pad, off_t length); } -201 STD { int ftruncate(int fd, int pad, off_t length); } -202 STD { int __sysctl(int *name, u_int namelen, void *old, \ - size_t *oldlenp, void *new, size_t newlen); } -203 STD { int mlock(caddr_t addr, size_t len); } -204 STD { int munlock(caddr_t addr, size_t len); } -205 STD { int undelete(char *path); } -206 UNIMPL -207 UNIMPL -208 UNIMPL -209 UNIMPL -; Syscalls 210-219 are used by/reserved for vendor-specific system calls -210 UNIMPL -211 UNIMPL -212 UNIMPL -213 UNIMPL -214 UNIMPL -215 UNIMPL -216 UNIMPL -217 UNIMPL -218 UNIMPL -219 UNIMPL -; System calls 220-240 are reserved for use by BSD -220 UNIMPL semctl -221 UNIMPL semget -222 UNIMPL semop -223 UNIMPL semconfig -224 UNIMPL msgctl -225 UNIMPL msgget -226 UNIMPL msgsnd -227 UNIMPL msgrcv -#if defined(SYSVSHM) && 0 -228 STD { int shmat(int shmid, 
void *shmaddr, int shmflg); } -229 STD { int shmctl(int shmid, int cmd, \ +200 STD BSD { int truncate(char *path, int pad, off_t length); } +201 STD BSD { int ftruncate(int fd, int pad, off_t length); } +202 STD BSD { int __sysctl(int *name, u_int namelen, void *old, \ + size_t *oldlenp, void *new, size_t newlen); } \ + __sysctl sysctl_args int +; properly, __sysctl should be a NOHIDE, but making an exception +; here allows to avoid one in libc/sys/Makefile.inc. +203 STD BSD { int mlock(caddr_t addr, size_t len); } +204 STD BSD { int munlock(caddr_t addr, size_t len); } +205 STD BSD { int utrace(caddr_t addr, size_t len); } +206 STD BSD { int undelete(char *path); } +207 UNIMPL NOHIDE nosys +208 UNIMPL NOHIDE nosys +209 UNIMPL NOHIDE nosys + +; +; The following are reserved for loadable syscalls +; +210 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +211 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +212 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +213 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +214 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +215 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +216 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +217 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +218 NODEF NOHIDE lkmnosys lkmnosys nosys_args int +219 NODEF NOHIDE lkmnosys lkmnosys nosys_args int + +; +; The following were introduced with NetBSD/4.4Lite-2 +; +220 STD BSD { int __semctl(int semid, int semnum, int cmd, \ + union semun *arg); } +221 STD BSD { int semget(key_t key, int nsems, int semflg); } +222 STD BSD { int semop(int semid, struct sembuf *sops, \ + u_int nsops); } +223 STD BSD { int semconfig(int flag); } +224 STD BSD { int msgctl(int msqid, int cmd, \ + struct msqid_ds *buf); } +225 STD BSD { int msgget(key_t key, int msgflg); } +226 STD BSD { int msgsnd(int msqid, void *msgp, size_t msgsz, \ + int msgflg); } +227 STD BSD { int msgrcv(int msqid, void *msgp, size_t msgsz, \ + long msgtyp, int msgflg); } +228 STD BSD { int shmat(int shmid, void 
*shmaddr, int shmflg); } +229 STD BSD { int shmctl(int shmid, int cmd, \ struct shmid_ds *buf); } -230 STD { int shmdt(void *shmaddr); } -231 STD { int shmget(key_t key, int size, int shmflg); } -#else -228 UNIMPL shmat -229 UNIMPL shmctl -230 UNIMPL shmdt -231 UNIMPL shmget -#endif +230 STD BSD { int shmdt(void *shmaddr); } +231 STD BSD { int shmget(key_t key, int size, int shmflg); } +; +232 UNIMPL NOHIDE nosys +233 UNIMPL NOHIDE nosys +234 UNIMPL NOHIDE nosys +235 UNIMPL NOHIDE nosys +236 UNIMPL NOHIDE nosys +237 UNIMPL NOHIDE nosys +238 UNIMPL NOHIDE nosys +239 UNIMPL NOHIDE nosys +240 UNIMPL NOHIDE nosys +241 UNIMPL NOHIDE nosys +242 UNIMPL NOHIDE nosys +243 UNIMPL NOHIDE nosys +244 UNIMPL NOHIDE nosys +245 UNIMPL NOHIDE nosys +246 UNIMPL NOHIDE nosys +247 UNIMPL NOHIDE nosys +248 UNIMPL NOHIDE nosys +249 UNIMPL NOHIDE nosys +; syscall numbers initially used in OpenBSD +250 STD BSD { int minherit(caddr_t addr, size_t len, int inherit); } +251 STD BSD { int rfork(int flags); } diff --git a/sys/kern/sysv_ipc.c b/sys/kern/sysv_ipc.c new file mode 100644 index 0000000..a1a1965 --- /dev/null +++ b/sys/kern/sysv_ipc.c @@ -0,0 +1,297 @@ +/* $Id$ */ +/* $NetBSD: sysv_ipc.c,v 1.7 1994/06/29 06:33:11 cgd Exp $ */ + +/* + * Copyright (c) 1994 Herb Peyerl <hpeyerl@novatel.ca> + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. 
All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Herb Peyerl. + * 4. The name of Herb Peyerl may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. + * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/systm.h> +#include <sys/syslog.h> +#include <sys/sysproto.h> +#include <sys/ipc.h> +#include <sys/shm.h> +#include <sys/sem.h> + +#if defined(SYSVSEM) || defined(SYSVSHM) || defined(SYSVMSG) + +/* + * Check for ipc permission + */ + +int +ipcperm(cred, perm, mode) + struct ucred *cred; + struct ipc_perm *perm; + int mode; +{ + + if (cred->cr_uid == 0) + return (0); + + /* Check for user match. */ + if (cred->cr_uid != perm->cuid && cred->cr_uid != perm->uid) { + if (mode & IPC_M) + return (EPERM); + /* Check for group match. */ + mode >>= 3; + if (!groupmember(perm->gid, cred) && + !groupmember(perm->cgid, cred)) + /* Check for `other' match. */ + mode >>= 3; + } + + if (mode & IPC_M) + return (0); + return ((mode & perm->mode) == mode ? 
0 : EACCES); +} + +#endif /* defined(SYSVSEM) || defined(SYSVSHM) || defined(SYSVMSG) */ + + +#if !defined(SYSVSEM) || !defined(SYSVSHM) || !defined(SYSVMSG) + +static void sysv_nosys __P((struct proc *p, char *s)); + +static void +sysv_nosys(p, s) + struct proc *p; + char *s; +{ + log(LOG_ERR, "cmd %s pid %d tried to use non-present %s\n", + p->p_comm, p->p_pid, s); +} + +#if !defined(SYSVSEM) + +/* + * SYSVSEM stubs + */ + +int +semsys(p, uap, retval) + struct proc *p; + struct semsys_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +semconfig(p, uap, retval) + struct proc *p; + struct semconfig_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +__semctl(p, uap, retval) + struct proc *p; + register struct __semctl_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +semget(p, uap, retval) + struct proc *p; + register struct semget_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +semop(p, uap, retval) + struct proc *p; + register struct semop_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSEM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +/* called from kern_exit.c */ +void +semexit(p) + struct proc *p; +{ + return; +} + +#endif /* !defined(SYSVSEM) */ + + +#if !defined(SYSVMSG) + +/* + * SYSVMSG stubs + */ + +int +msgsys(p, uap, retval) + struct proc *p; + /* XXX actually varargs. 
*/ + struct msgsys_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +msgctl(p, uap, retval) + struct proc *p; + register struct msgctl_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +msgget(p, uap, retval) + struct proc *p; + register struct msgget_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +msgsnd(p, uap, retval) + struct proc *p; + register struct msgsnd_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +msgrcv(p, uap, retval) + struct proc *p; + register struct msgrcv_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVMSG"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +#endif /* !defined(SYSVMSG) */ + + +#if !defined(SYSVSHM) + +/* + * SYSVSHM stubs + */ + +int +shmdt(p, uap, retval) + struct proc *p; + struct shmdt_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +shmat(p, uap, retval) + struct proc *p; + struct shmat_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +shmctl(p, uap, retval) + struct proc *p; + struct shmctl_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +shmget(p, uap, retval) + struct proc *p; + struct shmget_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +int +shmsys(p, uap, retval) + struct proc *p; + /* XXX actually varargs. 
*/ + struct shmsys_args *uap; + int *retval; +{ + sysv_nosys(p, "SYSVSHM"); + return nosys(p, (struct nosys_args *)uap, retval); +}; + +/* called from kern_fork.c */ +void +shmfork(p1, p2) + struct proc *p1, *p2; +{ + return; +} + +/* called from kern_exit.c */ +void +shmexit(p) + struct proc *p; +{ + return; +} + +#endif /* !defined(SYSVSHM) */ + +#endif /* !defined(SYSVSEM) || !defined(SYSVSHM) || !defined(SYSVMSG) */ diff --git a/sys/kern/sysv_msg.c b/sys/kern/sysv_msg.c new file mode 100644 index 0000000..d6e695f --- /dev/null +++ b/sys/kern/sysv_msg.c @@ -0,0 +1,1034 @@ +/* $Id$ */ + +/* + * Implementation of SVID messages + * + * Author: Daniel Boulet + * + * Copyright 1993 Daniel Boulet and RTMX Inc. + * + * This system call was implemented by Daniel Boulet under contract from RTMX. + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/msg.h> +#include <sys/sysent.h> + +static void msginit __P((void *)); +SYSINIT(sysv_msg, SI_SUB_SYSV_MSG, SI_ORDER_FIRST, msginit, NULL) + +#define MSG_DEBUG +#undef MSG_DEBUG_OK + +#ifndef _SYS_SYSPROTO_H_ +struct msgctl_args; +int msgctl __P((struct proc *p, struct msgctl_args *uap, int *retval)); +struct msgget_args; +int msgget __P((struct proc *p, struct msgget_args *uap, int *retval)); +struct msgsnd_args; +int msgsnd __P((struct proc *p, struct msgsnd_args *uap, int *retval)); +struct msgrcv_args; +int msgrcv __P((struct proc *p, struct msgrcv_args *uap, int *retval)); +#endif +static void msg_freehdr __P((struct msg *msghdr)); + +/* XXX casting to (sy_call_t *) is bogus, as usual. */ +static sy_call_t *msgcalls[] = { + (sy_call_t *)msgctl, (sy_call_t *)msgget, + (sy_call_t *)msgsnd, (sy_call_t *)msgrcv +}; + +static int nfree_msgmaps; /* # of free map entries */ +static short free_msgmaps; /* head of linked list of free map entries */ +static struct msg *free_msghdrs; /* list of free msg headers */ +char *msgpool; /* MSGMAX byte long msg buffer pool */ +struct msgmap *msgmaps; /* MSGSEG msgmap structures */ +struct msg *msghdrs; /* MSGTQL msg headers */ +struct msqid_ds *msqids; /* MSGMNI msqid_ds struct's */ + +void +msginit(dummy) + void *dummy; +{ + register int i; + + /* + * msginfo.msgssz should be a power of two for efficiency reasons. + * It is also pretty silly if msginfo.msgssz is less than 8 + * or greater than about 256 so ... 
+ */ + + i = 8; + while (i < 1024 && i != msginfo.msgssz) + i <<= 1; + if (i != msginfo.msgssz) { + printf("msginfo.msgssz=%d (0x%x)\n", msginfo.msgssz, + msginfo.msgssz); + panic("msginfo.msgssz not a small power of 2"); + } + + if (msginfo.msgseg > 32767) { + printf("msginfo.msgseg=%d\n", msginfo.msgseg); + panic("msginfo.msgseg > 32767"); + } + + if (msgmaps == NULL) + panic("msgmaps is NULL"); + + for (i = 0; i < msginfo.msgseg; i++) { + if (i > 0) + msgmaps[i-1].next = i; + msgmaps[i].next = -1; /* implies entry is available */ + } + free_msgmaps = 0; + nfree_msgmaps = msginfo.msgseg; + + if (msghdrs == NULL) + panic("msghdrs is NULL"); + + for (i = 0; i < msginfo.msgtql; i++) { + msghdrs[i].msg_type = 0; + if (i > 0) + msghdrs[i-1].msg_next = &msghdrs[i]; + msghdrs[i].msg_next = NULL; + } + free_msghdrs = &msghdrs[0]; + + if (msqids == NULL) + panic("msqids is NULL"); + + for (i = 0; i < msginfo.msgmni; i++) { + msqids[i].msg_qbytes = 0; /* implies entry is available */ + msqids[i].msg_perm.seq = 0; /* reset to a known value */ + } +} + +/* + * Entry point for all MSG calls + */ +int +msgsys(p, uap, retval) + struct proc *p; + /* XXX actually varargs. 
*/ + struct msgsys_args /* { + u_int which; + int a2; + int a3; + int a4; + int a5; + int a6; + } */ *uap; + int *retval; +{ + + if (uap->which >= sizeof(msgcalls)/sizeof(msgcalls[0])) + return (EINVAL); + return ((*msgcalls[uap->which])(p, &uap->a2, retval)); +} + +static void +msg_freehdr(msghdr) + struct msg *msghdr; +{ + while (msghdr->msg_ts > 0) { + short next; + if (msghdr->msg_spot < 0 || msghdr->msg_spot >= msginfo.msgseg) + panic("msghdr->msg_spot out of range"); + next = msgmaps[msghdr->msg_spot].next; + msgmaps[msghdr->msg_spot].next = free_msgmaps; + free_msgmaps = msghdr->msg_spot; + nfree_msgmaps++; + msghdr->msg_spot = next; + if (msghdr->msg_ts >= msginfo.msgssz) + msghdr->msg_ts -= msginfo.msgssz; + else + msghdr->msg_ts = 0; + } + if (msghdr->msg_spot != -1) + panic("msghdr->msg_spot != -1"); + msghdr->msg_next = free_msghdrs; + free_msghdrs = msghdr; +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgctl_args { + int msqid; + int cmd; + struct msqid_ds *buf; +}; +#endif + +int +msgctl(p, uap, retval) + struct proc *p; + register struct msgctl_args *uap; + int *retval; +{ + int msqid = uap->msqid; + int cmd = uap->cmd; + struct msqid_ds *user_msqptr = uap->buf; + struct ucred *cred = p->p_ucred; + int rval, eval; + struct msqid_ds msqbuf; + register struct msqid_ds *msqptr; + +#ifdef MSG_DEBUG_OK + printf("call to msgctl(%d, %d, 0x%x)\n", msqid, cmd, user_msqptr); +#endif + + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + return(EINVAL); + } + + msqptr = &msqids[msqid]; + + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such msqid\n"); +#endif + return(EINVAL); + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + return(EINVAL); + } + + eval = 0; + rval = 0; + + switch (cmd) { + + case IPC_RMID: + { + struct msg *msghdr; + if 
((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M))) + return(eval); + /* Free the message headers */ + msghdr = msqptr->msg_first; + while (msghdr != NULL) { + struct msg *msghdr_tmp; + + /* Free the segments of each message */ + msqptr->msg_cbytes -= msghdr->msg_ts; + msqptr->msg_qnum--; + msghdr_tmp = msghdr; + msghdr = msghdr->msg_next; + msg_freehdr(msghdr_tmp); + } + + if (msqptr->msg_cbytes != 0) + panic("msg_cbytes is screwed up"); + if (msqptr->msg_qnum != 0) + panic("msg_qnum is screwed up"); + + msqptr->msg_qbytes = 0; /* Mark it as free */ + + wakeup((caddr_t)msqptr); + } + + break; + + case IPC_SET: + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_M))) + return(eval); + if ((eval = copyin(user_msqptr, &msqbuf, sizeof(msqbuf))) != 0) + return(eval); + if (msqbuf.msg_qbytes > msqptr->msg_qbytes && cred->cr_uid != 0) + return(EPERM); + if (msqbuf.msg_qbytes > msginfo.msgmnb) { +#ifdef MSG_DEBUG_OK + printf("can't increase msg_qbytes beyond %d (truncating)\n", + msginfo.msgmnb); +#endif + msqbuf.msg_qbytes = msginfo.msgmnb; /* silently restrict qbytes to system limit */ + } + if (msqbuf.msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("can't reduce msg_qbytes to 0\n"); +#endif + return(EINVAL); /* non-standard errno! 
*/ + } + msqptr->msg_perm.uid = msqbuf.msg_perm.uid; /* change the owner */ + msqptr->msg_perm.gid = msqbuf.msg_perm.gid; /* change the owner */ + msqptr->msg_perm.mode = (msqptr->msg_perm.mode & ~0777) | + (msqbuf.msg_perm.mode & 0777); + msqptr->msg_qbytes = msqbuf.msg_qbytes; + msqptr->msg_ctime = time.tv_sec; + break; + + case IPC_STAT: + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_R))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have read access\n"); +#endif + return(eval); + } + eval = copyout((caddr_t)msqptr, user_msqptr, + sizeof(struct msqid_ds)); + break; + + default: +#ifdef MSG_DEBUG_OK + printf("invalid command %d\n", cmd); +#endif + return(EINVAL); + } + + if (eval == 0) + *retval = rval; + return(eval); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgget_args { + key_t key; + int msgflg; +}; +#endif + +int +msgget(p, uap, retval) + struct proc *p; + register struct msgget_args *uap; + int *retval; +{ + int msqid, eval; + int key = uap->key; + int msgflg = uap->msgflg; + struct ucred *cred = p->p_ucred; + register struct msqid_ds *msqptr = NULL; + +#ifdef MSG_DEBUG_OK + printf("msgget(0x%x, 0%o)\n", key, msgflg); +#endif + + if (key != IPC_PRIVATE) { + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes != 0 && + msqptr->msg_perm.key == key) + break; + } + if (msqid < msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("found public key\n"); +#endif + if ((msgflg & IPC_CREAT) && (msgflg & IPC_EXCL)) { +#ifdef MSG_DEBUG_OK + printf("not exclusive\n"); +#endif + return(EEXIST); + } + if ((eval = ipcperm(cred, &msqptr->msg_perm, msgflg & 0700 ))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have 0%o access\n", + msgflg & 0700); +#endif + return(eval); + } + goto found; + } + } + +#ifdef MSG_DEBUG_OK + printf("need to allocate the msqid_ds\n"); +#endif + if (key == IPC_PRIVATE || (msgflg & IPC_CREAT)) { + for (msqid = 0; msqid < msginfo.msgmni; msqid++) { + /* + * Look for an unallocated and 
unlocked msqid_ds. + * msqid_ds's can be locked by msgsnd or msgrcv while + * they are copying the message in/out. We can't + * re-use the entry until they release it. + */ + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0 && + (msqptr->msg_perm.mode & MSG_LOCKED) == 0) + break; + } + if (msqid == msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("no more msqid_ds's available\n"); +#endif + return(ENOSPC); + } +#ifdef MSG_DEBUG_OK + printf("msqid %d is available\n", msqid); +#endif + msqptr->msg_perm.key = key; + msqptr->msg_perm.cuid = cred->cr_uid; + msqptr->msg_perm.uid = cred->cr_uid; + msqptr->msg_perm.cgid = cred->cr_gid; + msqptr->msg_perm.gid = cred->cr_gid; + msqptr->msg_perm.mode = (msgflg & 0777); + /* Make sure that the returned msqid is unique */ + msqptr->msg_perm.seq++; + msqptr->msg_first = NULL; + msqptr->msg_last = NULL; + msqptr->msg_cbytes = 0; + msqptr->msg_qnum = 0; + msqptr->msg_qbytes = msginfo.msgmnb; + msqptr->msg_lspid = 0; + msqptr->msg_lrpid = 0; + msqptr->msg_stime = 0; + msqptr->msg_rtime = 0; + msqptr->msg_ctime = time.tv_sec; + } else { +#ifdef MSG_DEBUG_OK + printf("didn't find it and wasn't asked to create it\n"); +#endif + return(ENOENT); + } + +found: + /* Construct the unique msqid */ + *retval = IXSEQ_TO_IPCID(msqid, msqptr->msg_perm); + return(0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgsnd_args { + int msqid; + void *msgp; + size_t msgsz; + int msgflg; +}; +#endif + +int +msgsnd(p, uap, retval) + struct proc *p; + register struct msgsnd_args *uap; + int *retval; +{ + int msqid = uap->msqid; + void *user_msgp = uap->msgp; + size_t msgsz = uap->msgsz; + int msgflg = uap->msgflg; + int segs_needed, eval; + struct ucred *cred = p->p_ucred; + register struct msqid_ds *msqptr; + register struct msg *msghdr; + short next; + +#ifdef MSG_DEBUG_OK + printf("call to msgsnd(%d, 0x%x, %d, %d)\n", msqid, user_msgp, msgsz, + msgflg); +#endif + + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef 
MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + return(EINVAL); + } + + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such message queue id\n"); +#endif + return(EINVAL); + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + return(EINVAL); + } + + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_W))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have write access\n"); +#endif + return(eval); + } + + segs_needed = (msgsz + msginfo.msgssz - 1) / msginfo.msgssz; +#ifdef MSG_DEBUG_OK + printf("msgsz=%d, msgssz=%d, segs_needed=%d\n", msgsz, msginfo.msgssz, + segs_needed); +#endif + for (;;) { + int need_more_resources = 0; + + /* + * check msgsz + * (inside this loop in case msg_qbytes changes while we sleep) + */ + + if (msgsz > msqptr->msg_qbytes) { +#ifdef MSG_DEBUG_OK + printf("msgsz > msqptr->msg_qbytes\n"); +#endif + return(EINVAL); + } + + if (msqptr->msg_perm.mode & MSG_LOCKED) { +#ifdef MSG_DEBUG_OK + printf("msqid is locked\n"); +#endif + need_more_resources = 1; + } + if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) { +#ifdef MSG_DEBUG_OK + printf("msgsz + msg_cbytes > msg_qbytes\n"); +#endif + need_more_resources = 1; + } + if (segs_needed > nfree_msgmaps) { +#ifdef MSG_DEBUG_OK + printf("segs_needed > nfree_msgmaps\n"); +#endif + need_more_resources = 1; + } + if (free_msghdrs == NULL) { +#ifdef MSG_DEBUG_OK + printf("no more msghdrs\n"); +#endif + need_more_resources = 1; + } + + if (need_more_resources) { + int we_own_it; + + if ((msgflg & IPC_NOWAIT) != 0) { +#ifdef MSG_DEBUG_OK + printf("need more resources but caller doesn't want to wait\n"); +#endif + return(EAGAIN); + } + + if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) { +#ifdef MSG_DEBUG_OK + printf("we don't own the msqid_ds\n"); +#endif + we_own_it = 0; + } else { + /* Force later arrivals to wait for our + request 
*/ +#ifdef MSG_DEBUG_OK + printf("we own the msqid_ds\n"); +#endif + msqptr->msg_perm.mode |= MSG_LOCKED; + we_own_it = 1; + } +#ifdef MSG_DEBUG_OK + printf("goodnight\n"); +#endif + eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, + "msgwait", 0); +#ifdef MSG_DEBUG_OK + printf("good morning, eval=%d\n", eval); +#endif + if (we_own_it) + msqptr->msg_perm.mode &= ~MSG_LOCKED; + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("msgsnd: interrupted system call\n"); +#endif + return(EINTR); + } + + /* + * Make sure that the msq queue still exists + */ + + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("msqid deleted\n"); +#endif + /* The SVID says to return EIDRM. */ +#ifdef EIDRM + return(EIDRM); +#else + /* Unfortunately, BSD doesn't define that code + yet! */ + return(EINVAL); +#endif + } + + } else { +#ifdef MSG_DEBUG_OK + printf("got all the resources that we need\n"); +#endif + break; + } + } + + /* + * We have the resources that we need. + * Make sure! + */ + + if (msqptr->msg_perm.mode & MSG_LOCKED) + panic("msg_perm.mode & MSG_LOCKED"); + if (segs_needed > nfree_msgmaps) + panic("segs_needed > nfree_msgmaps"); + if (msgsz + msqptr->msg_cbytes > msqptr->msg_qbytes) + panic("msgsz + msg_cbytes > msg_qbytes"); + if (free_msghdrs == NULL) + panic("no more msghdrs"); + + /* + * Re-lock the msqid_ds in case we page-fault when copying in the + * message + */ + + if ((msqptr->msg_perm.mode & MSG_LOCKED) != 0) + panic("msqid_ds is already locked"); + msqptr->msg_perm.mode |= MSG_LOCKED; + + /* + * Allocate a message header + */ + + msghdr = free_msghdrs; + free_msghdrs = msghdr->msg_next; + msghdr->msg_spot = -1; + msghdr->msg_ts = msgsz; + + /* + * Allocate space for the message + */ + + while (segs_needed > 0) { + if (nfree_msgmaps <= 0) + panic("not enough msgmaps"); + if (free_msgmaps == -1) + panic("nil free_msgmaps"); + next = free_msgmaps; + if (next <= -1) + panic("next too low #1"); + if (next >= msginfo.msgseg) + panic("next out of range #1"); 
+#ifdef MSG_DEBUG_OK + printf("allocating segment %d to message\n", next); +#endif + free_msgmaps = msgmaps[next].next; + nfree_msgmaps--; + msgmaps[next].next = msghdr->msg_spot; + msghdr->msg_spot = next; + segs_needed--; + } + + /* + * Copy in the message type + */ + + if ((eval = copyin(user_msgp, &msghdr->msg_type, + sizeof(msghdr->msg_type))) != 0) { +#ifdef MSG_DEBUG_OK + printf("error %d copying the message type\n", eval); +#endif + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); + return(eval); + } + user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + + /* + * Validate the message type + */ + + if (msghdr->msg_type < 1) { + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); +#ifdef MSG_DEBUG_OK + printf("mtype (%d) < 1\n", msghdr->msg_type); +#endif + return(EINVAL); + } + + /* + * Copy in the message body + */ + + next = msghdr->msg_spot; + while (msgsz > 0) { + size_t tlen; + if (msgsz > msginfo.msgssz) + tlen = msginfo.msgssz; + else + tlen = msgsz; + if (next <= -1) + panic("next too low #2"); + if (next >= msginfo.msgseg) + panic("next out of range #2"); + if ((eval = copyin(user_msgp, &msgpool[next * msginfo.msgssz], + tlen)) != 0) { +#ifdef MSG_DEBUG_OK + printf("error %d copying in message segment\n", eval); +#endif + msg_freehdr(msghdr); + msqptr->msg_perm.mode &= ~MSG_LOCKED; + wakeup((caddr_t)msqptr); + return(eval); + } + msgsz -= tlen; + user_msgp = (char *)user_msgp + tlen; + next = msgmaps[next].next; + } + if (next != -1) + panic("didn't use all the msg segments"); + + /* + * We've got the message. Unlock the msqid_ds. + */ + + msqptr->msg_perm.mode &= ~MSG_LOCKED; + + /* + * Make sure that the msqid_ds is still allocated. + */ + + if (msqptr->msg_qbytes == 0) { + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + /* The SVID says to return EIDRM. */ +#ifdef EIDRM + return(EIDRM); +#else + /* Unfortunately, BSD doesn't define that code yet! 
*/ + return(EINVAL); +#endif + } + + /* + * Put the message into the queue + */ + + if (msqptr->msg_first == NULL) { + msqptr->msg_first = msghdr; + msqptr->msg_last = msghdr; + } else { + msqptr->msg_last->msg_next = msghdr; + msqptr->msg_last = msghdr; + } + msqptr->msg_last->msg_next = NULL; + + msqptr->msg_cbytes += msghdr->msg_ts; + msqptr->msg_qnum++; + msqptr->msg_lspid = p->p_pid; + msqptr->msg_stime = time.tv_sec; + + wakeup((caddr_t)msqptr); + *retval = 0; + return(0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct msgrcv_args { + int msqid; + void *msgp; + size_t msgsz; + long msgtyp; + int msgflg; +}; +#endif + +int +msgrcv(p, uap, retval) + struct proc *p; + register struct msgrcv_args *uap; + int *retval; +{ + int msqid = uap->msqid; + void *user_msgp = uap->msgp; + size_t msgsz = uap->msgsz; + long msgtyp = uap->msgtyp; + int msgflg = uap->msgflg; + size_t len; + struct ucred *cred = p->p_ucred; + register struct msqid_ds *msqptr; + register struct msg *msghdr; + int eval; + short next; + +#ifdef MSG_DEBUG_OK + printf("call to msgrcv(%d, 0x%x, %d, %ld, %d)\n", msqid, user_msgp, + msgsz, msgtyp, msgflg); +#endif + + msqid = IPCID_TO_IX(msqid); + + if (msqid < 0 || msqid >= msginfo.msgmni) { +#ifdef MSG_DEBUG_OK + printf("msqid (%d) out of range (0<=msqid<%d)\n", msqid, + msginfo.msgmni); +#endif + return(EINVAL); + } + + msqptr = &msqids[msqid]; + if (msqptr->msg_qbytes == 0) { +#ifdef MSG_DEBUG_OK + printf("no such message queue id\n"); +#endif + return(EINVAL); + } + if (msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("wrong sequence number\n"); +#endif + return(EINVAL); + } + + if ((eval = ipcperm(cred, &msqptr->msg_perm, IPC_R))) { +#ifdef MSG_DEBUG_OK + printf("requester doesn't have read access\n"); +#endif + return(eval); + } + + msghdr = NULL; + while (msghdr == NULL) { + if (msgtyp == 0) { + msghdr = msqptr->msg_first; + if (msghdr != NULL) { + if (msgsz < msghdr->msg_ts && + (msgflg & MSG_NOERROR) == 0) { +#ifdef 
MSG_DEBUG_OK + printf("first message on the queue is too big (want %d, got %d)\n", + msgsz, msghdr->msg_ts); +#endif + return(E2BIG); + } + if (msqptr->msg_first == msqptr->msg_last) { + msqptr->msg_first = NULL; + msqptr->msg_last = NULL; + } else { + msqptr->msg_first = msghdr->msg_next; + if (msqptr->msg_first == NULL) + panic("msg_first/last screwed up #1"); + } + } + } else { + struct msg *previous; + struct msg **prev; + + previous = NULL; + prev = &(msqptr->msg_first); + while ((msghdr = *prev) != NULL) { + /* + * Is this message's type an exact match or is + * this message's type less than or equal to + * the absolute value of a negative msgtyp? + * Note that the second half of this test can + * NEVER be true if msgtyp is positive since + * msg_type is always positive! + */ + + if (msgtyp == msghdr->msg_type || + msghdr->msg_type <= -msgtyp) { +#ifdef MSG_DEBUG_OK + printf("found message type %d, requested %d\n", + msghdr->msg_type, msgtyp); +#endif + if (msgsz < msghdr->msg_ts && + (msgflg & MSG_NOERROR) == 0) { +#ifdef MSG_DEBUG_OK + printf("requested message on the queue is too big (want %d, got %d)\n", + msgsz, msghdr->msg_ts); +#endif + return(E2BIG); + } + *prev = msghdr->msg_next; + if (msghdr == msqptr->msg_last) { + if (previous == NULL) { + if (prev != + &msqptr->msg_first) + panic("msg_first/last screwed up #2"); + msqptr->msg_first = + NULL; + msqptr->msg_last = + NULL; + } else { + if (prev == + &msqptr->msg_first) + panic("msg_first/last screwed up #3"); + msqptr->msg_last = + previous; + } + } + break; + } + previous = msghdr; + prev = &(msghdr->msg_next); + } + } + + /* + * We've either extracted the msghdr for the appropriate + * message or there isn't one. + * If there is one then bail out of this loop. + */ + + if (msghdr != NULL) + break; + + /* + * Hmph! No message found. Does the user want to wait? 
+ */ + + if ((msgflg & IPC_NOWAIT) != 0) { +#ifdef MSG_DEBUG_OK + printf("no appropriate message found (msgtyp=%d)\n", + msgtyp); +#endif + /* The SVID says to return ENOMSG. */ +#ifdef ENOMSG + return(ENOMSG); +#else + /* Unfortunately, BSD doesn't define that code yet! */ + return(EAGAIN); +#endif + } + + /* + * Wait for something to happen + */ + +#ifdef MSG_DEBUG_OK + printf("msgrcv: goodnight\n"); +#endif + eval = tsleep((caddr_t)msqptr, (PZERO - 4) | PCATCH, "msgwait", + 0); +#ifdef MSG_DEBUG_OK + printf("msgrcv: good morning (eval=%d)\n", eval); +#endif + + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("msgsnd: interrupted system call\n"); +#endif + return(EINTR); + } + + /* + * Make sure that the msq queue still exists + */ + + if (msqptr->msg_qbytes == 0 || + msqptr->msg_perm.seq != IPCID_TO_SEQ(uap->msqid)) { +#ifdef MSG_DEBUG_OK + printf("msqid deleted\n"); +#endif + /* The SVID says to return EIDRM. */ +#ifdef EIDRM + return(EIDRM); +#else + /* Unfortunately, BSD doesn't define that code yet! */ + return(EINVAL); +#endif + } + } + + /* + * Return the message to the user. + * + * First, do the bookkeeping (before we risk being interrupted). + */ + + msqptr->msg_cbytes -= msghdr->msg_ts; + msqptr->msg_qnum--; + msqptr->msg_lrpid = p->p_pid; + msqptr->msg_rtime = time.tv_sec; + + /* + * Make msgsz the actual amount that we'll be returning. + * Note that this effectively truncates the message if it is too long + * (since msgsz is never increased). + */ + +#ifdef MSG_DEBUG_OK + printf("found a message, msgsz=%d, msg_ts=%d\n", msgsz, + msghdr->msg_ts); +#endif + if (msgsz > msghdr->msg_ts) + msgsz = msghdr->msg_ts; + + /* + * Return the type to the user. 
+ */ + + eval = copyout((caddr_t)&(msghdr->msg_type), user_msgp, + sizeof(msghdr->msg_type)); + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("error (%d) copying out message type\n", eval); +#endif + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + return(eval); + } + user_msgp = (char *)user_msgp + sizeof(msghdr->msg_type); + + /* + * Return the segments to the user + */ + + next = msghdr->msg_spot; + for (len = 0; len < msgsz; len += msginfo.msgssz) { + size_t tlen; + + if (msgsz > msginfo.msgssz) + tlen = msginfo.msgssz; + else + tlen = msgsz; + if (next <= -1) + panic("next too low #3"); + if (next >= msginfo.msgseg) + panic("next out of range #3"); + eval = copyout((caddr_t)&msgpool[next * msginfo.msgssz], + user_msgp, tlen); + if (eval != 0) { +#ifdef MSG_DEBUG_OK + printf("error (%d) copying out message segment\n", + eval); +#endif + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + return(eval); + } + user_msgp = (char *)user_msgp + tlen; + next = msgmaps[next].next; + } + + /* + * Done, return the actual number of bytes copied out. + */ + + msg_freehdr(msghdr); + wakeup((caddr_t)msqptr); + *retval = msgsz; + return(0); +} diff --git a/sys/kern/sysv_sem.c b/sys/kern/sysv_sem.c new file mode 100644 index 0000000..e66ddc6 --- /dev/null +++ b/sys/kern/sysv_sem.c @@ -0,0 +1,985 @@ +/* $Id$ */ + +/* + * Implementation of SVID semaphores + * + * Author: Daniel Boulet + * + * This software is provided ``AS IS'' without any warranties of any kind. 
+ */ + +#include "opt_sysvipc.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/sem.h> +#include <sys/sysent.h> + +static void seminit __P((void *)); +SYSINIT(sysv_sem, SI_SUB_SYSV_SEM, SI_ORDER_FIRST, seminit, NULL) + +#ifndef _SYS_SYSPROTO_H_ +struct __semctl_args; +int __semctl __P((struct proc *p, struct __semctl_args *uap, int *retval)); +struct semget_args; +int semget __P((struct proc *p, struct semget_args *uap, int *retval)); +struct semop_args; +int semop __P((struct proc *p, struct semop_args *uap, int *retval)); +struct semconfig_args; +int semconfig __P((struct proc *p, struct semconfig_args *uap, + int *retval)); +#endif + +static struct sem_undo *semu_alloc __P((struct proc *p)); +static int semundo_adjust __P((struct proc *p, struct sem_undo **supptr, + int semid, int semnum, int adjval)); +static void semundo_clear __P((int semid, int semnum)); + +/* XXX casting to (sy_call_t *) is bogus, as usual. */ +static sy_call_t *semcalls[] = { + (sy_call_t *)__semctl, (sy_call_t *)semget, + (sy_call_t *)semop, (sy_call_t *)semconfig +}; + +static int semtot = 0; +struct semid_ds *sema; /* semaphore id pool */ +struct sem *sem; /* semaphore pool */ +static struct sem_undo *semu_list; /* list of active undo structures */ +int *semu; /* undo structure pool */ + +static struct proc *semlock_holder = NULL; + +void +seminit(dummy) + void *dummy; +{ + register int i; + + if (sema == NULL) + panic("sema is NULL"); + if (semu == NULL) + panic("semu is NULL"); + + for (i = 0; i < seminfo.semmni; i++) { + sema[i].sem_base = 0; + sema[i].sem_perm.mode = 0; + } + for (i = 0; i < seminfo.semmnu; i++) { + register struct sem_undo *suptr = SEMU(i); + suptr->un_proc = NULL; + } + semu_list = NULL; +} + +/* + * Entry point for all SEM calls + */ +int +semsys(p, uap, retval) + struct proc *p; + /* XXX actually varargs. 
*/ + struct semsys_args /* { + u_int which; + int a2; + int a3; + int a4; + int a5; + } */ *uap; + int *retval; +{ + + while (semlock_holder != NULL && semlock_holder != p) + (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "semsys", 0); + + if (uap->which >= sizeof(semcalls)/sizeof(semcalls[0])) + return (EINVAL); + return ((*semcalls[uap->which])(p, &uap->a2, retval)); +} + +/* + * Lock or unlock the entire semaphore facility. + * + * This will probably eventually evolve into a general purpose semaphore + * facility status enquiry mechanism (I don't like the "read /dev/kmem" + * approach currently taken by ipcs and the amount of info that we want + * to be able to extract for ipcs is probably beyond what the capability + * of the getkerninfo facility. + * + * At the time that the current version of semconfig was written, ipcs is + * the only user of the semconfig facility. It uses it to ensure that the + * semaphore facility data structures remain static while it fishes around + * in /dev/kmem. + */ + +#ifndef _SYS_SYSPROTO_H_ +struct semconfig_args { + semconfig_ctl_t flag; +}; +#endif + +int +semconfig(p, uap, retval) + struct proc *p; + struct semconfig_args *uap; + int *retval; +{ + int eval = 0; + + switch (uap->flag) { + case SEM_CONFIG_FREEZE: + semlock_holder = p; + break; + + case SEM_CONFIG_THAW: + semlock_holder = NULL; + wakeup((caddr_t)&semlock_holder); + break; + + default: + printf("semconfig: unknown flag parameter value (%d) - ignored\n", + uap->flag); + eval = EINVAL; + break; + } + + *retval = 0; + return(eval); +} + +/* + * Allocate a new sem_undo structure for a process + * (returns ptr to structure or NULL if no more room) + */ + +static struct sem_undo * +semu_alloc(p) + struct proc *p; +{ + register int i; + register struct sem_undo *suptr; + register struct sem_undo **supptr; + int attempt; + + /* + * Try twice to allocate something. 
+ * (we'll purge any empty structures after the first pass so + * two passes are always enough) + */ + + for (attempt = 0; attempt < 2; attempt++) { + /* + * Look for a free structure. + * Fill it in and return it if we find one. + */ + + for (i = 0; i < seminfo.semmnu; i++) { + suptr = SEMU(i); + if (suptr->un_proc == NULL) { + suptr->un_next = semu_list; + semu_list = suptr; + suptr->un_cnt = 0; + suptr->un_proc = p; + return(suptr); + } + } + + /* + * We didn't find a free one, if this is the first attempt + * then try to free some structures. + */ + + if (attempt == 0) { + /* All the structures are in use - try to free some */ + int did_something = 0; + + supptr = &semu_list; + while ((suptr = *supptr) != NULL) { + if (suptr->un_cnt == 0) { + suptr->un_proc = NULL; + *supptr = suptr->un_next; + did_something = 1; + } else + supptr = &(suptr->un_next); + } + + /* If we didn't free anything then just give-up */ + if (!did_something) + return(NULL); + } else { + /* + * The second pass failed even though we freed + * something after the first pass! + * This is IMPOSSIBLE! + */ + panic("semu_alloc - second attempt failed"); + } + } + return (NULL); +} + +/* + * Adjust a particular entry for a particular proc + */ + +static int +semundo_adjust(p, supptr, semid, semnum, adjval) + register struct proc *p; + struct sem_undo **supptr; + int semid, semnum; + int adjval; +{ + register struct sem_undo *suptr; + register struct undo *sunptr; + int i; + + /* Look for and remember the sem_undo if the caller doesn't provide + it */ + + suptr = *supptr; + if (suptr == NULL) { + for (suptr = semu_list; suptr != NULL; + suptr = suptr->un_next) { + if (suptr->un_proc == p) { + *supptr = suptr; + break; + } + } + if (suptr == NULL) { + if (adjval == 0) + return(0); + suptr = semu_alloc(p); + if (suptr == NULL) + return(ENOSPC); + *supptr = suptr; + } + } + + /* + * Look for the requested entry and adjust it (delete if adjval becomes + * 0). 
+ */ + sunptr = &suptr->un_ent[0]; + for (i = 0; i < suptr->un_cnt; i++, sunptr++) { + if (sunptr->un_id != semid || sunptr->un_num != semnum) + continue; + if (adjval == 0) + sunptr->un_adjval = 0; + else + sunptr->un_adjval += adjval; + if (sunptr->un_adjval == 0) { + suptr->un_cnt--; + if (i < suptr->un_cnt) + suptr->un_ent[i] = + suptr->un_ent[suptr->un_cnt]; + } + return(0); + } + + /* Didn't find the right entry - create it */ + if (adjval == 0) + return(0); + if (suptr->un_cnt != SEMUME) { + sunptr = &suptr->un_ent[suptr->un_cnt]; + suptr->un_cnt++; + sunptr->un_adjval = adjval; + sunptr->un_id = semid; sunptr->un_num = semnum; + } else + return(EINVAL); + return(0); +} + +static void +semundo_clear(semid, semnum) + int semid, semnum; +{ + register struct sem_undo *suptr; + + for (suptr = semu_list; suptr != NULL; suptr = suptr->un_next) { + register struct undo *sunptr = &suptr->un_ent[0]; + register int i = 0; + + while (i < suptr->un_cnt) { + if (sunptr->un_id == semid) { + if (semnum == -1 || sunptr->un_num == semnum) { + suptr->un_cnt--; + if (i < suptr->un_cnt) { + suptr->un_ent[i] = + suptr->un_ent[suptr->un_cnt]; + continue; + } + } + if (semnum != -1) + break; + } + i++, sunptr++; + } + } +} + +/* + * Note that the user-mode half of this passes a union, not a pointer + */ +#ifndef _SYS_SYSPROTO_H_ +struct __semctl_args { + int semid; + int semnum; + int cmd; + union semun *arg; +}; +#endif + +int +__semctl(p, uap, retval) + struct proc *p; + register struct __semctl_args *uap; + int *retval; +{ + int semid = uap->semid; + int semnum = uap->semnum; + int cmd = uap->cmd; + union semun *arg = uap->arg; + union semun real_arg; + struct ucred *cred = p->p_ucred; + int i, rval, eval; + struct semid_ds sbuf; + register struct semid_ds *semaptr; + +#ifdef SEM_DEBUG + printf("call to semctl(%d, %d, %d, 0x%x)\n", semid, semnum, cmd, arg); +#endif + + semid = IPCID_TO_IX(semid); + if (semid < 0 || semid >= seminfo.semmsl) + return(EINVAL); + + semaptr = 
&sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || + semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) + return(EINVAL); + + eval = 0; + rval = 0; + + switch (cmd) { + case IPC_RMID: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M))) + return(eval); + semaptr->sem_perm.cuid = cred->cr_uid; + semaptr->sem_perm.uid = cred->cr_uid; + semtot -= semaptr->sem_nsems; + for (i = semaptr->sem_base - sem; i < semtot; i++) + sem[i] = sem[i + semaptr->sem_nsems]; + for (i = 0; i < seminfo.semmni; i++) { + if ((sema[i].sem_perm.mode & SEM_ALLOC) && + sema[i].sem_base > semaptr->sem_base) + sema[i].sem_base -= semaptr->sem_nsems; + } + semaptr->sem_perm.mode = 0; + semundo_clear(semid, -1); + wakeup((caddr_t)semaptr); + break; + + case IPC_SET: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_M))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + if ((eval = copyin(real_arg.buf, (caddr_t)&sbuf, + sizeof(sbuf))) != 0) + return(eval); + semaptr->sem_perm.uid = sbuf.sem_perm.uid; + semaptr->sem_perm.gid = sbuf.sem_perm.gid; + semaptr->sem_perm.mode = (semaptr->sem_perm.mode & ~0777) | + (sbuf.sem_perm.mode & 0777); + semaptr->sem_ctime = time.tv_sec; + break; + + case IPC_STAT: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + eval = copyout((caddr_t)semaptr, real_arg.buf, + sizeof(struct semid_ds)); + break; + + case GETNCNT: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].semncnt; + break; + + case GETPID: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].sempid; + break; + + case GETVAL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); 
+ if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].semval; + break; + + case GETALL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + for (i = 0; i < semaptr->sem_nsems; i++) { + eval = copyout((caddr_t)&semaptr->sem_base[i].semval, + &real_arg.array[i], sizeof(real_arg.array[0])); + if (eval != 0) + break; + } + break; + + case GETZCNT: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_R))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + rval = semaptr->sem_base[semnum].semzcnt; + break; + + case SETVAL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) + return(eval); + if (semnum < 0 || semnum >= semaptr->sem_nsems) + return(EINVAL); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + semaptr->sem_base[semnum].semval = real_arg.val; + semundo_clear(semid, semnum); + wakeup((caddr_t)semaptr); + break; + + case SETALL: + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) + return(eval); + if ((eval = copyin(arg, &real_arg, sizeof(real_arg))) != 0) + return(eval); + for (i = 0; i < semaptr->sem_nsems; i++) { + eval = copyin(&real_arg.array[i], + (caddr_t)&semaptr->sem_base[i].semval, + sizeof(real_arg.array[0])); + if (eval != 0) + break; + } + semundo_clear(semid, -1); + wakeup((caddr_t)semaptr); + break; + + default: + return(EINVAL); + } + + if (eval == 0) + *retval = rval; + return(eval); +} + +#ifndef _SYS_SYSPROTO_H_ +struct semget_args { + key_t key; + int nsems; + int semflg; +}; +#endif + +int +semget(p, uap, retval) + struct proc *p; + register struct semget_args *uap; + int *retval; +{ + int semid, eval; + int key = uap->key; + int nsems = uap->nsems; + int semflg = uap->semflg; + struct ucred *cred = p->p_ucred; + +#ifdef SEM_DEBUG + printf("semget(0x%x, %d, 0%o)\n", key, nsems, semflg); +#endif + + if (key != IPC_PRIVATE) 
{ + for (semid = 0; semid < seminfo.semmni; semid++) { + if ((sema[semid].sem_perm.mode & SEM_ALLOC) && + sema[semid].sem_perm.key == key) + break; + } + if (semid < seminfo.semmni) { +#ifdef SEM_DEBUG + printf("found public key\n"); +#endif + if ((eval = ipcperm(cred, &sema[semid].sem_perm, + semflg & 0700))) + return(eval); + if (nsems > 0 && sema[semid].sem_nsems < nsems) { +#ifdef SEM_DEBUG + printf("too small\n"); +#endif + return(EINVAL); + } + if ((semflg & IPC_CREAT) && (semflg & IPC_EXCL)) { +#ifdef SEM_DEBUG + printf("not exclusive\n"); +#endif + return(EEXIST); + } + goto found; + } + } + +#ifdef SEM_DEBUG + printf("need to allocate the semid_ds\n"); +#endif + if (key == IPC_PRIVATE || (semflg & IPC_CREAT)) { + if (nsems <= 0 || nsems > seminfo.semmsl) { +#ifdef SEM_DEBUG + printf("nsems out of range (0<%d<=%d)\n", nsems, + seminfo.semmsl); +#endif + return(EINVAL); + } + if (nsems > seminfo.semmns - semtot) { +#ifdef SEM_DEBUG + printf("not enough semaphores left (need %d, got %d)\n", + nsems, seminfo.semmns - semtot); +#endif + return(ENOSPC); + } + for (semid = 0; semid < seminfo.semmni; semid++) { + if ((sema[semid].sem_perm.mode & SEM_ALLOC) == 0) + break; + } + if (semid == seminfo.semmni) { +#ifdef SEM_DEBUG + printf("no more semid_ds's available\n"); +#endif + return(ENOSPC); + } +#ifdef SEM_DEBUG + printf("semid %d is available\n", semid); +#endif + sema[semid].sem_perm.key = key; + sema[semid].sem_perm.cuid = cred->cr_uid; + sema[semid].sem_perm.uid = cred->cr_uid; + sema[semid].sem_perm.cgid = cred->cr_gid; + sema[semid].sem_perm.gid = cred->cr_gid; + sema[semid].sem_perm.mode = (semflg & 0777) | SEM_ALLOC; + sema[semid].sem_perm.seq = + (sema[semid].sem_perm.seq + 1) & 0x7fff; + sema[semid].sem_nsems = nsems; + sema[semid].sem_otime = 0; + sema[semid].sem_ctime = time.tv_sec; + sema[semid].sem_base = &sem[semtot]; + semtot += nsems; + bzero(sema[semid].sem_base, + sizeof(sema[semid].sem_base[0])*nsems); +#ifdef SEM_DEBUG + printf("sembase = 
0x%x, next = 0x%x\n", sema[semid].sem_base, + &sem[semtot]); +#endif + } else { +#ifdef SEM_DEBUG + printf("didn't find it and wasn't asked to create it\n"); +#endif + return(ENOENT); + } + +found: + *retval = IXSEQ_TO_IPCID(semid, sema[semid].sem_perm); + return(0); +} + +#ifndef _SYS_SYSPROTO_H_ +struct semop_args { + int semid; + struct sembuf *sops; + int nsops; +}; +#endif + +int +semop(p, uap, retval) + struct proc *p; + register struct semop_args *uap; + int *retval; +{ + int semid = uap->semid; + int nsops = uap->nsops; + struct sembuf sops[MAX_SOPS]; + register struct semid_ds *semaptr; + register struct sembuf *sopptr; + register struct sem *semptr; + struct sem_undo *suptr = NULL; + struct ucred *cred = p->p_ucred; + int i, j, eval; + int do_wakeup, do_undos; + +#ifdef SEM_DEBUG + printf("call to semop(%d, 0x%x, %d)\n", semid, sops, nsops); +#endif + + semid = IPCID_TO_IX(semid); /* Convert back to zero origin */ + + if (semid < 0 || semid >= seminfo.semmsl) + return(EINVAL); + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) + return(EINVAL); + if (semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) + return(EINVAL); + + if ((eval = ipcperm(cred, &semaptr->sem_perm, IPC_W))) { +#ifdef SEM_DEBUG + printf("eval = %d from ipaccess\n", eval); +#endif + return(eval); + } + + if (nsops > MAX_SOPS) { +#ifdef SEM_DEBUG + printf("too many sops (max=%d, nsops=%d)\n", MAX_SOPS, nsops); +#endif + return(E2BIG); + } + + if ((eval = copyin(uap->sops, &sops, nsops * sizeof(sops[0]))) != 0) { +#ifdef SEM_DEBUG + printf("eval = %d from copyin(%08x, %08x, %d)\n", eval, + uap->sops, &sops, nsops * sizeof(sops[0])); +#endif + return(eval); + } + + /* + * Loop trying to satisfy the vector of requests. + * If we reach a point where we must wait, any requests already + * performed are rolled back and we go to sleep until some other + * process wakes us up. At this point, we start all over again. 
+ * + * This ensures that from the perspective of other tasks, a set + * of requests is atomic (never partially satisfied). + */ + do_undos = 0; + + for (;;) { + do_wakeup = 0; + + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + + if (sopptr->sem_num >= semaptr->sem_nsems) + return(EFBIG); + + semptr = &semaptr->sem_base[sopptr->sem_num]; + +#ifdef SEM_DEBUG + printf("semop: semaptr=%x, sem_base=%x, semptr=%x, sem[%d]=%d : op=%d, flag=%s\n", + semaptr, semaptr->sem_base, semptr, + sopptr->sem_num, semptr->semval, sopptr->sem_op, + (sopptr->sem_flg & IPC_NOWAIT) ? "nowait" : "wait"); +#endif + + if (sopptr->sem_op < 0) { + if (semptr->semval + sopptr->sem_op < 0) { +#ifdef SEM_DEBUG + printf("semop: can't do it now\n"); +#endif + break; + } else { + semptr->semval += sopptr->sem_op; + if (semptr->semval == 0 && + semptr->semzcnt > 0) + do_wakeup = 1; + } + if (sopptr->sem_flg & SEM_UNDO) + do_undos = 1; + } else if (sopptr->sem_op == 0) { + if (semptr->semval > 0) { +#ifdef SEM_DEBUG + printf("semop: not zero now\n"); +#endif + break; + } + } else { + if (semptr->semncnt > 0) + do_wakeup = 1; + semptr->semval += sopptr->sem_op; + if (sopptr->sem_flg & SEM_UNDO) + do_undos = 1; + } + } + + /* + * Did we get through the entire vector? + */ + if (i >= nsops) + goto done; + + /* + * No ... rollback anything that we've already done + */ +#ifdef SEM_DEBUG + printf("semop: rollback 0 through %d\n", i-1); +#endif + for (j = 0; j < i; j++) + semaptr->sem_base[sops[j].sem_num].semval -= + sops[j].sem_op; + + /* + * If the request that we couldn't satisfy has the + * NOWAIT flag set then return with EAGAIN. 
+ */ + if (sopptr->sem_flg & IPC_NOWAIT) + return(EAGAIN); + + if (sopptr->sem_op == 0) + semptr->semzcnt++; + else + semptr->semncnt++; + +#ifdef SEM_DEBUG + printf("semop: good night!\n"); +#endif + eval = tsleep((caddr_t)semaptr, (PZERO - 4) | PCATCH, + "semwait", 0); +#ifdef SEM_DEBUG + printf("semop: good morning (eval=%d)!\n", eval); +#endif + + suptr = NULL; /* sem_undo may have been reallocated */ + + if (eval != 0) + return(EINTR); +#ifdef SEM_DEBUG + printf("semop: good morning!\n"); +#endif + + /* + * Make sure that the semaphore still exists + */ + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0 || + semaptr->sem_perm.seq != IPCID_TO_SEQ(uap->semid)) { + /* The man page says to return EIDRM. */ + /* Unfortunately, BSD doesn't define that code! */ +#ifdef EIDRM + return(EIDRM); +#else + return(EINVAL); +#endif + } + + /* + * The semaphore is still alive. Readjust the count of + * waiting processes. + */ + if (sopptr->sem_op == 0) + semptr->semzcnt--; + else + semptr->semncnt--; + } + +done: + /* + * Process any SEM_UNDO requests. + */ + if (do_undos) { + for (i = 0; i < nsops; i++) { + /* + * We only need to deal with SEM_UNDO's for non-zero + * op's. + */ + int adjval; + + if ((sops[i].sem_flg & SEM_UNDO) == 0) + continue; + adjval = sops[i].sem_op; + if (adjval == 0) + continue; + eval = semundo_adjust(p, &suptr, semid, + sops[i].sem_num, -adjval); + if (eval == 0) + continue; + + /* + * Oh-Oh! We ran out of either sem_undo's or undo's. + * Rollback the adjustments to this point and then + * rollback the semaphore ups and down so we can return + * with an error with all structures restored. We + * rollback the undo's in the exact reverse order that + * we applied them. This guarantees that we won't run + * out of space as we roll things back out. 
+ */ + for (j = i - 1; j >= 0; j--) { + if ((sops[j].sem_flg & SEM_UNDO) == 0) + continue; + adjval = sops[j].sem_op; + if (adjval == 0) + continue; + if (semundo_adjust(p, &suptr, semid, + sops[j].sem_num, adjval) != 0) + panic("semop - can't undo undos"); + } + + for (j = 0; j < nsops; j++) + semaptr->sem_base[sops[j].sem_num].semval -= + sops[j].sem_op; + +#ifdef SEM_DEBUG + printf("eval = %d from semundo_adjust\n", eval); +#endif + return(eval); + } /* loop through the sops */ + } /* if (do_undos) */ + + /* We're definitely done - set the sempid's */ + for (i = 0; i < nsops; i++) { + sopptr = &sops[i]; + semptr = &semaptr->sem_base[sopptr->sem_num]; + semptr->sempid = p->p_pid; + } + + /* Do a wakeup if any semaphore was up'd. */ + if (do_wakeup) { +#ifdef SEM_DEBUG + printf("semop: doing wakeup\n"); +#ifdef SEM_WAKEUP + sem_wakeup((caddr_t)semaptr); +#else + wakeup((caddr_t)semaptr); +#endif + printf("semop: back from wakeup\n"); +#else + wakeup((caddr_t)semaptr); +#endif + } +#ifdef SEM_DEBUG + printf("semop: done\n"); +#endif + *retval = 0; + return(0); +} + +/* + * Go through the undo structures for this process and apply the adjustments to + * semaphores. + */ +void +semexit(p) + struct proc *p; +{ + register struct sem_undo *suptr; + register struct sem_undo **supptr; + int did_something; + + /* + * If somebody else is holding the global semaphore facility lock + * then sleep until it is released. + */ + while (semlock_holder != NULL && semlock_holder != p) { +#ifdef SEM_DEBUG + printf("semaphore facility locked - sleeping ...\n"); +#endif + (void) tsleep((caddr_t)&semlock_holder, (PZERO - 4), "semext", 0); + } + + did_something = 0; + + /* + * Go through the chain of undo vectors looking for one + * associated with this process. 
+ */ + + for (supptr = &semu_list; (suptr = *supptr) != NULL; + supptr = &suptr->un_next) { + if (suptr->un_proc == p) + break; + } + + if (suptr == NULL) + goto unlock; + +#ifdef SEM_DEBUG + printf("proc @%08x has undo structure with %d entries\n", p, + suptr->un_cnt); +#endif + + /* + * If there are any active undo elements then process them. + */ + if (suptr->un_cnt > 0) { + int ix; + + for (ix = 0; ix < suptr->un_cnt; ix++) { + int semid = suptr->un_ent[ix].un_id; + int semnum = suptr->un_ent[ix].un_num; + int adjval = suptr->un_ent[ix].un_adjval; + struct semid_ds *semaptr; + + semaptr = &sema[semid]; + if ((semaptr->sem_perm.mode & SEM_ALLOC) == 0) + panic("semexit - semid not allocated"); + if (semnum >= semaptr->sem_nsems) + panic("semexit - semnum out of range"); + +#ifdef SEM_DEBUG + printf("semexit: %08x id=%d num=%d(adj=%d) ; sem=%d\n", + suptr->un_proc, suptr->un_ent[ix].un_id, + suptr->un_ent[ix].un_num, + suptr->un_ent[ix].un_adjval, + semaptr->sem_base[semnum].semval); +#endif + + if (adjval < 0) { + if (semaptr->sem_base[semnum].semval < -adjval) + semaptr->sem_base[semnum].semval = 0; + else + semaptr->sem_base[semnum].semval += + adjval; + } else + semaptr->sem_base[semnum].semval += adjval; + +#ifdef SEM_WAKEUP + sem_wakeup((caddr_t)semaptr); +#else + wakeup((caddr_t)semaptr); +#endif +#ifdef SEM_DEBUG + printf("semexit: back from wakeup\n"); +#endif + } + } + + /* + * Deallocate the undo vector. + */ +#ifdef SEM_DEBUG + printf("removing vector\n"); +#endif + suptr->un_proc = NULL; + *supptr = suptr->un_next; + +unlock: + /* + * If the exiting process is holding the global semaphore facility + * lock then release it. 
+ */ + if (semlock_holder == p) { + semlock_holder = NULL; + wakeup((caddr_t)&semlock_holder); + } +} diff --git a/sys/kern/sysv_shm.c b/sys/kern/sysv_shm.c new file mode 100644 index 0000000..9e93923 --- /dev/null +++ b/sys/kern/sysv_shm.c @@ -0,0 +1,622 @@ +/* $Id$ */ +/* $NetBSD: sysv_shm.c,v 1.23 1994/07/04 23:25:12 glass Exp $ */ + +/* + * Copyright (c) 1994 Adam Glass and Charles Hannum. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by Adam Glass and Charles + * Hannum. + * 4. The names of the authors may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHORS ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "opt_sysvipc.h" +#include "opt_rlimit.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/shm.h> +#include <sys/proc.h> +#include <sys/malloc.h> +#include <sys/mman.h> +#include <sys/stat.h> +#include <sys/sysent.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_object.h> +#include <vm/vm_map.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> +#include <vm/vm_pager.h> +#include <vm/vm_inherit.h> + +#ifndef _SYS_SYSPROTO_H_ +struct shmat_args; +extern int shmat __P((struct proc *p, struct shmat_args *uap, int *retval)); +struct shmctl_args; +extern int shmctl __P((struct proc *p, struct shmctl_args *uap, int *retval)); +struct shmdt_args; +extern int shmdt __P((struct proc *p, struct shmdt_args *uap, int *retval)); +struct shmget_args; +extern int shmget __P((struct proc *p, struct shmget_args *uap, int *retval)); +#endif + +static void shminit __P((void *)); +SYSINIT(sysv_shm, SI_SUB_SYSV_SHM, SI_ORDER_FIRST, shminit, NULL) + +struct oshmctl_args; +static int oshmctl __P((struct proc *p, struct oshmctl_args *uap, int *retval)); +static int shmget_allocate_segment __P((struct proc *p, struct shmget_args *uap, int mode, int *retval)); +static int shmget_existing __P((struct proc *p, struct shmget_args *uap, int mode, int segnum, int *retval)); + +/* XXX casting to (sy_call_t *) is 
bogus, as usual. */ +sy_call_t *shmcalls[] = { + (sy_call_t *)shmat, (sy_call_t *)oshmctl, + (sy_call_t *)shmdt, (sy_call_t *)shmget, + (sy_call_t *)shmctl +}; + +#define SHMSEG_FREE 0x0200 +#define SHMSEG_REMOVED 0x0400 +#define SHMSEG_ALLOCATED 0x0800 +#define SHMSEG_WANTED 0x1000 + +static int shm_last_free, shm_nused, shm_committed; +struct shmid_ds *shmsegs; + +struct shm_handle { + /* vm_offset_t kva; */ + vm_object_t shm_object; +}; + +struct shmmap_state { + vm_offset_t va; + int shmid; +}; + +static void shm_deallocate_segment __P((struct shmid_ds *)); +static int shm_find_segment_by_key __P((key_t)); +static struct shmid_ds *shm_find_segment_by_shmid __P((int)); +static int shm_delete_mapping __P((struct proc *, struct shmmap_state *)); + +static int +shm_find_segment_by_key(key) + key_t key; +{ + int i; + + for (i = 0; i < shminfo.shmmni; i++) + if ((shmsegs[i].shm_perm.mode & SHMSEG_ALLOCATED) && + shmsegs[i].shm_perm.key == key) + return i; + return -1; +} + +static struct shmid_ds * +shm_find_segment_by_shmid(shmid) + int shmid; +{ + int segnum; + struct shmid_ds *shmseg; + + segnum = IPCID_TO_IX(shmid); + if (segnum < 0 || segnum >= shminfo.shmmni) + return NULL; + shmseg = &shmsegs[segnum]; + if ((shmseg->shm_perm.mode & (SHMSEG_ALLOCATED | SHMSEG_REMOVED)) + != SHMSEG_ALLOCATED || + shmseg->shm_perm.seq != IPCID_TO_SEQ(shmid)) + return NULL; + return shmseg; +} + +static void +shm_deallocate_segment(shmseg) + struct shmid_ds *shmseg; +{ + struct shm_handle *shm_handle; + size_t size; + + shm_handle = shmseg->shm_internal; + vm_object_deallocate(shm_handle->shm_object); + free((caddr_t)shm_handle, M_SHM); + shmseg->shm_internal = NULL; + size = round_page(shmseg->shm_segsz); + shm_committed -= btoc(size); + shm_nused--; + shmseg->shm_perm.mode = SHMSEG_FREE; +} + +static int +shm_delete_mapping(p, shmmap_s) + struct proc *p; + struct shmmap_state *shmmap_s; +{ + struct shmid_ds *shmseg; + int segnum, result; + size_t size; + + segnum = 
IPCID_TO_IX(shmmap_s->shmid); + shmseg = &shmsegs[segnum]; + size = round_page(shmseg->shm_segsz); + result = vm_map_remove(&p->p_vmspace->vm_map, shmmap_s->va, shmmap_s->va + size); + if (result != KERN_SUCCESS) + return EINVAL; + shmmap_s->shmid = -1; + shmseg->shm_dtime = time.tv_sec; + if ((--shmseg->shm_nattch <= 0) && + (shmseg->shm_perm.mode & SHMSEG_REMOVED)) { + shm_deallocate_segment(shmseg); + shm_last_free = segnum; + } + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmdt_args { + void *shmaddr; +}; +#endif + +int +shmdt(p, uap, retval) + struct proc *p; + struct shmdt_args *uap; + int *retval; +{ + struct shmmap_state *shmmap_s; + int i; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + if (shmmap_s == NULL) + return EINVAL; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1 && + shmmap_s->va == (vm_offset_t)uap->shmaddr) + break; + if (i == shminfo.shmseg) + return EINVAL; + return shm_delete_mapping(p, shmmap_s); +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmat_args { + int shmid; + void *shmaddr; + int shmflg; +}; +#endif + +int +shmat(p, uap, retval) + struct proc *p; + struct shmat_args *uap; + int *retval; +{ + int error, i, flags; + struct ucred *cred = p->p_ucred; + struct shmid_ds *shmseg; + struct shmmap_state *shmmap_s = NULL; + struct shm_handle *shm_handle; + vm_offset_t attach_va; + vm_prot_t prot; + vm_size_t size; + int rv; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + if (shmmap_s == NULL) { + size = shminfo.shmseg * sizeof(struct shmmap_state); + shmmap_s = malloc(size, M_SHM, M_WAITOK); + for (i = 0; i < shminfo.shmseg; i++) + shmmap_s[i].shmid = -1; + p->p_vmspace->vm_shm = (caddr_t)shmmap_s; + } + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) + return EINVAL; + error = ipcperm(cred, &shmseg->shm_perm, + (uap->shmflg & SHM_RDONLY) ? 
IPC_R : IPC_R|IPC_W); + if (error) + return error; + for (i = 0; i < shminfo.shmseg; i++) { + if (shmmap_s->shmid == -1) + break; + shmmap_s++; + } + if (i >= shminfo.shmseg) + return EMFILE; + size = round_page(shmseg->shm_segsz); + prot = VM_PROT_READ; + if ((uap->shmflg & SHM_RDONLY) == 0) + prot |= VM_PROT_WRITE; + flags = MAP_ANON | MAP_SHARED; + if (uap->shmaddr) { + flags |= MAP_FIXED; + if (uap->shmflg & SHM_RND) + attach_va = (vm_offset_t)uap->shmaddr & ~(SHMLBA-1); + else if (((vm_offset_t)uap->shmaddr & (SHMLBA-1)) == 0) + attach_va = (vm_offset_t)uap->shmaddr; + else + return EINVAL; + } else { + /* This is just a hint to vm_map_find() about where to put it. */ + attach_va = round_page(p->p_vmspace->vm_taddr + MAXTSIZ + MAXDSIZ); + } + + shm_handle = shmseg->shm_internal; + vm_object_reference(shm_handle->shm_object); + rv = vm_map_find(&p->p_vmspace->vm_map, shm_handle->shm_object, + 0, &attach_va, size, (flags & MAP_FIXED)?0:1, prot, prot, 0); + if (rv != KERN_SUCCESS) { + return ENOMEM; + } + vm_map_inherit(&p->p_vmspace->vm_map, + attach_va, attach_va + size, VM_INHERIT_SHARE); + + shmmap_s->va = attach_va; + shmmap_s->shmid = uap->shmid; + shmseg->shm_lpid = p->p_pid; + shmseg->shm_atime = time.tv_sec; + shmseg->shm_nattch++; + *retval = attach_va; + return 0; +} + +struct oshmid_ds { + struct ipc_perm shm_perm; /* operation perms */ + int shm_segsz; /* size of segment (bytes) */ + ushort shm_cpid; /* pid, creator */ + ushort shm_lpid; /* pid, last operation */ + short shm_nattch; /* no. 
of current attaches */ + time_t shm_atime; /* last attach time */ + time_t shm_dtime; /* last detach time */ + time_t shm_ctime; /* last change time */ + void *shm_handle; /* internal handle for shm segment */ +}; + +struct oshmctl_args { + int shmid; + int cmd; + struct oshmid_ds *ubuf; +}; + +static int +oshmctl(p, uap, retval) + struct proc *p; + struct oshmctl_args *uap; + int *retval; +{ +#ifdef COMPAT_43 + int error; + struct ucred *cred = p->p_ucred; + struct shmid_ds *shmseg; + struct oshmid_ds outbuf; + + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) + return EINVAL; + switch (uap->cmd) { + case IPC_STAT: + error = ipcperm(cred, &shmseg->shm_perm, IPC_R); + if (error) + return error; + outbuf.shm_perm = shmseg->shm_perm; + outbuf.shm_segsz = shmseg->shm_segsz; + outbuf.shm_cpid = shmseg->shm_cpid; + outbuf.shm_lpid = shmseg->shm_lpid; + outbuf.shm_nattch = shmseg->shm_nattch; + outbuf.shm_atime = shmseg->shm_atime; + outbuf.shm_dtime = shmseg->shm_dtime; + outbuf.shm_ctime = shmseg->shm_ctime; + outbuf.shm_handle = shmseg->shm_internal; + error = copyout((caddr_t)&outbuf, uap->ubuf, sizeof(outbuf)); + if (error) + return error; + break; + default: + /* XXX casting to (sy_call_t *) is bogus, as usual. 
*/ + return ((sy_call_t *)shmctl)(p, uap, retval); + } + return 0; +#else + return EINVAL; +#endif +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmctl_args { + int shmid; + int cmd; + struct shmid_ds *buf; +}; +#endif + +int +shmctl(p, uap, retval) + struct proc *p; + struct shmctl_args *uap; + int *retval; +{ + int error; + struct ucred *cred = p->p_ucred; + struct shmid_ds inbuf; + struct shmid_ds *shmseg; + + shmseg = shm_find_segment_by_shmid(uap->shmid); + if (shmseg == NULL) + return EINVAL; + switch (uap->cmd) { + case IPC_STAT: + error = ipcperm(cred, &shmseg->shm_perm, IPC_R); + if (error) + return error; + error = copyout((caddr_t)shmseg, uap->buf, sizeof(inbuf)); + if (error) + return error; + break; + case IPC_SET: + error = ipcperm(cred, &shmseg->shm_perm, IPC_M); + if (error) + return error; + error = copyin(uap->buf, (caddr_t)&inbuf, sizeof(inbuf)); + if (error) + return error; + shmseg->shm_perm.uid = inbuf.shm_perm.uid; + shmseg->shm_perm.gid = inbuf.shm_perm.gid; + shmseg->shm_perm.mode = + (shmseg->shm_perm.mode & ~ACCESSPERMS) | + (inbuf.shm_perm.mode & ACCESSPERMS); + shmseg->shm_ctime = time.tv_sec; + break; + case IPC_RMID: + error = ipcperm(cred, &shmseg->shm_perm, IPC_M); + if (error) + return error; + shmseg->shm_perm.key = IPC_PRIVATE; + shmseg->shm_perm.mode |= SHMSEG_REMOVED; + if (shmseg->shm_nattch <= 0) { + shm_deallocate_segment(shmseg); + shm_last_free = IPCID_TO_IX(uap->shmid); + } + break; +#if 0 + case SHM_LOCK: + case SHM_UNLOCK: +#endif + default: + return EINVAL; + } + return 0; +} + +#ifndef _SYS_SYSPROTO_H_ +struct shmget_args { + key_t key; + size_t size; + int shmflg; +}; +#endif + +static int +shmget_existing(p, uap, mode, segnum, retval) + struct proc *p; + struct shmget_args *uap; + int mode; + int segnum; + int *retval; +{ + struct shmid_ds *shmseg; + struct ucred *cred = p->p_ucred; + int error; + + shmseg = &shmsegs[segnum]; + if (shmseg->shm_perm.mode & SHMSEG_REMOVED) { + /* + * This segment is in the process of being 
allocated. Wait + * until it's done, and look the key up again (in case the + * allocation failed or it was freed). + */ + shmseg->shm_perm.mode |= SHMSEG_WANTED; + error = tsleep((caddr_t)shmseg, PLOCK | PCATCH, "shmget", 0); + if (error) + return error; + return EAGAIN; + } + error = ipcperm(cred, &shmseg->shm_perm, mode); + if (error) + return error; + if (uap->size && uap->size > shmseg->shm_segsz) + return EINVAL; + if ((uap->shmflg & (IPC_CREAT | IPC_EXCL)) == (IPC_CREAT | IPC_EXCL)) + return EEXIST; + *retval = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); + return 0; +} + +static int +shmget_allocate_segment(p, uap, mode, retval) + struct proc *p; + struct shmget_args *uap; + int mode; + int *retval; +{ + int i, segnum, shmid, size; + struct ucred *cred = p->p_ucred; + struct shmid_ds *shmseg; + struct shm_handle *shm_handle; + + if (uap->size < shminfo.shmmin || uap->size > shminfo.shmmax) + return EINVAL; + if (shm_nused >= shminfo.shmmni) /* any shmids left? */ + return ENOSPC; + size = round_page(uap->size); + if (shm_committed + btoc(size) > shminfo.shmall) + return ENOMEM; + if (shm_last_free < 0) { + for (i = 0; i < shminfo.shmmni; i++) + if (shmsegs[i].shm_perm.mode & SHMSEG_FREE) + break; + if (i == shminfo.shmmni) + panic("shmseg free count inconsistent"); + segnum = i; + } else { + segnum = shm_last_free; + shm_last_free = -1; + } + shmseg = &shmsegs[segnum]; + /* + * In case we sleep in malloc(), mark the segment present but deleted + * so that noone else tries to create the same key. + */ + shmseg->shm_perm.mode = SHMSEG_ALLOCATED | SHMSEG_REMOVED; + shmseg->shm_perm.key = uap->key; + shmseg->shm_perm.seq = (shmseg->shm_perm.seq + 1) & 0x7fff; + shm_handle = (struct shm_handle *) + malloc(sizeof(struct shm_handle), M_SHM, M_WAITOK); + shmid = IXSEQ_TO_IPCID(segnum, shmseg->shm_perm); + + /* + * We make sure that we have allocated a pager before we need + * to. 
+ */ + shm_handle->shm_object = + vm_pager_allocate(OBJT_SWAP, 0, OFF_TO_IDX(size), + VM_PROT_DEFAULT, 0); + shmseg->shm_internal = shm_handle; + shmseg->shm_perm.cuid = shmseg->shm_perm.uid = cred->cr_uid; + shmseg->shm_perm.cgid = shmseg->shm_perm.gid = cred->cr_gid; + shmseg->shm_perm.mode = (shmseg->shm_perm.mode & SHMSEG_WANTED) | + (mode & ACCESSPERMS) | SHMSEG_ALLOCATED; + shmseg->shm_segsz = uap->size; + shmseg->shm_cpid = p->p_pid; + shmseg->shm_lpid = shmseg->shm_nattch = 0; + shmseg->shm_atime = shmseg->shm_dtime = 0; + shmseg->shm_ctime = time.tv_sec; + shm_committed += btoc(size); + shm_nused++; + if (shmseg->shm_perm.mode & SHMSEG_WANTED) { + /* + * Somebody else wanted this key while we were asleep. Wake + * them up now. + */ + shmseg->shm_perm.mode &= ~SHMSEG_WANTED; + wakeup((caddr_t)shmseg); + } + *retval = shmid; + return 0; +} + +int +shmget(p, uap, retval) + struct proc *p; + struct shmget_args *uap; + int *retval; +{ + int segnum, mode, error; + + mode = uap->shmflg & ACCESSPERMS; + if (uap->key != IPC_PRIVATE) { + again: + segnum = shm_find_segment_by_key(uap->key); + if (segnum >= 0) { + error = shmget_existing(p, uap, mode, segnum, retval); + if (error == EAGAIN) + goto again; + return error; + } + if ((uap->shmflg & IPC_CREAT) == 0) + return ENOENT; + } + return shmget_allocate_segment(p, uap, mode, retval); +} + +int +shmsys(p, uap, retval) + struct proc *p; + /* XXX actually varargs. 
*/ + struct shmsys_args /* { + u_int which; + int a2; + int a3; + int a4; + } */ *uap; + int *retval; +{ + + if (uap->which >= sizeof(shmcalls)/sizeof(shmcalls[0])) + return EINVAL; + return ((*shmcalls[uap->which])(p, &uap->a2, retval)); +} + +void +shmfork(p1, p2) + struct proc *p1, *p2; +{ + struct shmmap_state *shmmap_s; + size_t size; + int i; + + size = shminfo.shmseg * sizeof(struct shmmap_state); + shmmap_s = malloc(size, M_SHM, M_WAITOK); + bcopy((caddr_t)p1->p_vmspace->vm_shm, (caddr_t)shmmap_s, size); + p2->p_vmspace->vm_shm = (caddr_t)shmmap_s; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shmsegs[IPCID_TO_IX(shmmap_s->shmid)].shm_nattch++; +} + +void +shmexit(p) + struct proc *p; +{ + struct shmmap_state *shmmap_s; + int i; + + shmmap_s = (struct shmmap_state *)p->p_vmspace->vm_shm; + for (i = 0; i < shminfo.shmseg; i++, shmmap_s++) + if (shmmap_s->shmid != -1) + shm_delete_mapping(p, shmmap_s); + free((caddr_t)p->p_vmspace->vm_shm, M_SHM); + p->p_vmspace->vm_shm = NULL; +} + +void +shminit(dummy) + void *dummy; +{ + int i; + for (i = 0; i < shminfo.shmmni; i++) { + shmsegs[i].shm_perm.mode = SHMSEG_FREE; + shmsegs[i].shm_perm.seq = 0; + } + shm_last_free = 0; + shm_nused = 0; + shm_committed = 0; +} diff --git a/sys/kern/tty.c b/sys/kern/tty.c index 5d698b1..f6e14f9 100644 --- a/sys/kern/tty.c +++ b/sys/kern/tty.c @@ -35,39 +35,82 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)tty.c 8.13 (Berkeley) 1/9/95 + * @(#)tty.c 8.8 (Berkeley) 1/21/94 + * $Id: tty.c,v 1.93 1997/03/23 03:36:26 bde Exp $ */ +/*- + * TODO: + * o Fix races for sending the start char in ttyflush(). + * o Handle inter-byte timeout for "MIN > 0, TIME > 0" in ttyselect(). + * With luck, there will be MIN chars before select() returns(). + * o Handle CLOCAL consistently for ptys. Perhaps disallow setting it. + * o Don't allow input in TS_ZOMBIE case. It would be visible through + * FIONREAD. 
+ * o Do the new sio locking stuff here and use it to avoid special + * case for EXTPROC? + * o Lock PENDIN too? + * o Move EXTPROC and/or PENDIN to t_state? + * o Wrap most of ttioctl in spltty/splx. + * o Implement TIOCNOTTY or remove it from <sys/ioctl.h>. + * o Send STOP if IXOFF is toggled off while TS_TBLOCK is set. + * o Don't allow certain termios flags to affect disciplines other + * than TTYDISC. Cancel their effects before switch disciplines + * and ignore them if they are set while we are in another + * discipline. + * o Handle c_ispeed = 0 to c_ispeed = c_ospeed conversion here instead + * of in drivers and fix drivers that write to tp->t_termios. + * o Check for TS_CARR_ON being set while everything is closed and not + * waiting for carrier. TS_CARR_ON isn't cleared if nothing is open, + * so it would live until the next open even if carrier drops. + * o Restore TS_WOPEN since it is useful in pstat. It must be cleared + * only when _all_ openers leave open(). + */ + +#include "snp.h" +#include "opt_uconsole.h" + #include <sys/param.h> #include <sys/systm.h> -#include <sys/ioctl.h> +#include <sys/filio.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif #include <sys/proc.h> #define TTYDEFCHARS #include <sys/tty.h> #undef TTYDEFCHARS -#include <sys/file.h> +#include <sys/fcntl.h> #include <sys/conf.h> #include <sys/dkstat.h> #include <sys/uio.h> #include <sys/kernel.h> #include <sys/vnode.h> #include <sys/syslog.h> +#include <sys/signalvar.h> +#include <sys/resourcevar.h> +#include <sys/malloc.h> +#if NSNP > 0 +#include <sys/snoop.h> +#endif #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <sys/lock.h> +#include <vm/pmap.h> +#include <vm/vm_map.h> static int proc_compare __P((struct proc *p1, struct proc *p2)); -static int ttnread __P((struct tty *)); -static void ttyblock __P((struct tty *tp)); -static void ttyecho __P((int, struct tty *tp)); -static void ttyrubo __P((struct tty *, 
int)); - -/* Symbolic sleep message strings. */ -char ttclos[] = "ttycls"; -char ttopen[] = "ttyopn"; -char ttybg[] = "ttybg"; -char ttybuf[] = "ttybuf"; -char ttyin[] = "ttyin"; -char ttyout[] = "ttyout"; +static int ttnread __P((struct tty *tp)); +static void ttyecho __P((int c, struct tty *tp)); +static int ttyoutput __P((int c, register struct tty *tp)); +static void ttypend __P((struct tty *tp)); +static void ttyretype __P((struct tty *tp)); +static void ttyrub __P((int c, struct tty *tp)); +static void ttyrubo __P((struct tty *tp, int cnt)); +static void ttyunblock __P((struct tty *tp)); +static int ttywflush __P((struct tty *tp)); /* * Table with character classes and parity. The 8th bit indicates parity, @@ -95,7 +138,7 @@ char ttyout[] = "ttyout"; #define TB TAB #define VT VTAB -char const char_type[] = { +static u_char const char_type[] = { E|CC, O|CC, O|CC, E|CC, O|CC, E|CC, E|CC, O|CC, /* nul - bel */ O|BS, E|TB, E|NL, O|CC, E|VT, O|CR, O|CC, E|CC, /* bs - si */ O|CC, E|CC, E|CC, O|CC, E|CC, O|CC, O|CC, E|CC, /* dle - etb */ @@ -148,6 +191,17 @@ char const char_type[] = { #define ISSET(t, f) ((t) & (f)) /* + * Input control starts when we would not be able to fit the maximum + * contents of the ping-pong buffers and finishes when we would be able + * to fit that much plus 1/8 more. + */ +#define I_HIGH_WATER (TTYHOG - 2 * 256) /* XXX */ +#define I_LOW_WATER ((TTYHOG - 2 * 256) * 7 / 8) /* XXX */ + +#undef MAX_INPUT /* XXX wrong in <sys/syslimits.h> */ +#define MAX_INPUT TTYHOG + +/* * Initial open of tty, or (re)entry to standard tty line discipline. */ int @@ -161,9 +215,20 @@ ttyopen(device, tp) tp->t_dev = device; if (!ISSET(tp->t_state, TS_ISOPEN)) { SET(tp->t_state, TS_ISOPEN); + if (ISSET(tp->t_cflag, CLOCAL)) + SET(tp->t_state, TS_CONNECTED); bzero(&tp->t_winsize, sizeof(tp->t_winsize)); } - CLR(tp->t_state, TS_WOPEN); + + /* + * Initialize or restore a cblock allocation policy suitable for + * the standard line discipline. 
+ */ + clist_alloc_cblocks(&tp->t_canq, TTYHOG, 512); + clist_alloc_cblocks(&tp->t_outq, TTMAXHIWAT + OBUFSIZ + 100, + TTMAXHIWAT + OBUFSIZ + 100); + clist_alloc_cblocks(&tp->t_rawq, TTYHOG, TTYHOG); + splx(s); return (0); } @@ -172,22 +237,36 @@ ttyopen(device, tp) * Handle close() on a tty line: flush and set to initial state, * bumping generation number so that pending read/write calls * can detect recycling of the tty. + * XXX our caller should have done `spltty(); l_close(); ttyclose();' + * and l_close() should have flushed, but we repeat the spltty() and + * the flush in case there are buggy callers. */ int ttyclose(tp) register struct tty *tp; { - extern struct tty *constty; /* Temporary virtual console. */ + int s; + s = spltty(); if (constty == tp) constty = NULL; ttyflush(tp, FREAD | FWRITE); + clist_free_cblocks(&tp->t_canq); + clist_free_cblocks(&tp->t_outq); + clist_free_cblocks(&tp->t_rawq); + +#if NSNP > 0 + if (ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpdown((struct snoop *)tp->t_sc); +#endif tp->t_gen++; + tp->t_line = TTYDISC; tp->t_pgrp = NULL; tp->t_session = NULL; tp->t_state = 0; + splx(s); return (0); } @@ -197,10 +276,10 @@ ttyclose(tp) } /* Is 'c' a line delimiter ("break" character)? */ -#define TTBREAKC(c) \ - ((c) == '\n' || ((c) == cc[VEOF] || \ - (c) == cc[VEOL] || (c) == cc[VEOL2]) && (c) != _POSIX_VDISABLE) - +#define TTBREAKC(c, lflag) \ + ((c) == '\n' || (((c) == cc[VEOF] || \ + (c) == cc[VEOL] || ((c) == cc[VEOL2] && lflag & IEXTEN)) && \ + (c) != _POSIX_VDISABLE)) /* * Process input of a single character received on a tty. 
@@ -210,8 +289,8 @@ ttyinput(c, tp) register int c; register struct tty *tp; { - register int iflag, lflag; - register u_char *cc; + register tcflag_t iflag, lflag; + register cc_t *cc; int i, err; /* @@ -232,26 +311,44 @@ ttyinput(c, tp) } ++tk_nin; + /* + * Block further input iff: + * current input > threshold AND input is available to user program + * AND input flow control is enabled and not yet invoked. + * The 3 is slop for PARMRK. + */ + iflag = tp->t_iflag; + if (tp->t_rawq.c_cc + tp->t_canq.c_cc > I_HIGH_WATER - 3 && + (!ISSET(lflag, ICANON) || tp->t_canq.c_cc != 0) && + (ISSET(tp->t_cflag, CRTS_IFLOW) || ISSET(iflag, IXOFF)) && + !ISSET(tp->t_state, TS_TBLOCK)) + ttyblock(tp); + /* Handle exceptional conditions (break, parity, framing). */ cc = tp->t_cc; - iflag = tp->t_iflag; - if (err = (ISSET(c, TTY_ERRORMASK))) { + err = (ISSET(c, TTY_ERRORMASK)); + if (err) { CLR(c, TTY_ERRORMASK); - if (ISSET(err, TTY_FE) && !c) { /* Break. */ + if (ISSET(err, TTY_BI)) { if (ISSET(iflag, IGNBRK)) + return (0); + if (ISSET(iflag, BRKINT)) { + ttyflush(tp, FREAD | FWRITE); + pgsignal(tp->t_pgrp, SIGINT, 1); goto endcase; - else if (ISSET(iflag, BRKINT) && - ISSET(lflag, ISIG) && - (cc[VINTR] != _POSIX_VDISABLE)) - c = cc[VINTR]; - else if (ISSET(iflag, PARMRK)) + } + if (ISSET(iflag, PARMRK)) goto parmrk; - } else if (ISSET(err, TTY_PE) && - ISSET(iflag, INPCK) || ISSET(err, TTY_FE)) { + } else if ((ISSET(err, TTY_PE) && ISSET(iflag, INPCK)) + || ISSET(err, TTY_FE)) { if (ISSET(iflag, IGNPAR)) - goto endcase; + return (0); else if (ISSET(iflag, PARMRK)) { -parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); +parmrk: + if (tp->t_rawq.c_cc + tp->t_canq.c_cc > + MAX_INPUT - 3) + goto input_overflow; + (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); (void)putc(0 | TTY_QUOTE, &tp->t_rawq); (void)putc(c | TTY_QUOTE, &tp->t_rawq); goto endcase; @@ -259,11 +356,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); c = 0; } } - /* - * In tandem mode, check high water mark. 
- */ - if (ISSET(iflag, IXOFF)) - ttyblock(tp); + if (!ISSET(tp->t_state, TS_TYPEN) && ISSET(iflag, ISTRIP)) CLR(c, 0x80); if (!ISSET(lflag, EXTPROC)) { @@ -341,7 +434,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); #ifdef sun4c /* XXX */ (*tp->t_stop)(tp, 0); #else - (*cdevsw[major(tp->t_dev)].d_stop)(tp, + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0); #endif return (0); @@ -361,7 +454,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); */ if (c == '\r') { if (ISSET(iflag, IGNCR)) - goto endcase; + return (0); else if (ISSET(iflag, ICRNL)) c = '\n'; } else if (c == '\n' && ISSET(iflag, INLCR)) @@ -403,8 +496,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); /* * word erase (^W) */ - if (CCEQ(cc[VWERASE], c)) { - int alt = ISSET(lflag, ALTWERASE); + if (CCEQ(cc[VWERASE], c) && ISSET(lflag, IEXTEN)) { int ctype; /* @@ -436,21 +528,21 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); if (c == -1) goto endcase; } while (c != ' ' && c != '\t' && - (alt == 0 || ISALPHA(c) == ctype)); + (!ISSET(lflag, ALTWERASE) || ISALPHA(c) == ctype)); (void)putc(c, &tp->t_rawq); goto endcase; } /* * reprint line (^R) */ - if (CCEQ(cc[VREPRINT], c)) { + if (CCEQ(cc[VREPRINT], c) && ISSET(lflag, IEXTEN)) { ttyretype(tp); goto endcase; } /* * ^T - kernel info and generate SIGINFO */ - if (CCEQ(cc[VSTATUS], c)) { + if (CCEQ(cc[VSTATUS], c) && ISSET(lflag, IEXTEN)) { if (ISSET(lflag, ISIG)) pgsignal(tp->t_pgrp, SIGINFO, 1); if (!ISSET(lflag, NOKERNINFO)) @@ -461,14 +553,19 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); /* * Check for input buffer overflow */ - if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= TTYHOG) { + if (tp->t_rawq.c_cc + tp->t_canq.c_cc >= MAX_INPUT) { +input_overflow: if (ISSET(iflag, IMAXBEL)) { if (tp->t_outq.c_cc < tp->t_hiwat) (void)ttyoutput(CTRL('g'), tp); - } else - ttyflush(tp, FREAD | FWRITE); + } goto endcase; } + + if ( c == 0377 && ISSET(iflag, PARMRK) && !ISSET(iflag, ISTRIP) + && ISSET(iflag, IGNBRK|IGNPAR) != (IGNBRK|IGNPAR)) + 
(void)putc(0377 | TTY_QUOTE, &tp->t_rawq); + /* * Put data char in q for user and * wakeup on seeing a line delimiter. @@ -479,7 +576,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); ttyecho(c, tp); goto endcase; } - if (TTBREAKC(c)) { + if (TTBREAKC(c, lflag)) { tp->t_rocount = 0; catq(&tp->t_rawq, &tp->t_canq); ttwakeup(tp); @@ -498,7 +595,7 @@ parmrk: (void)putc(0377 | TTY_QUOTE, &tp->t_rawq); /* * Place the cursor over the '^' of the ^D. */ - i = min(2, tp->t_column - i); + i = imin(2, tp->t_column - i); while (i > 0) { (void)ttyoutput('\b', tp); i--; @@ -525,13 +622,13 @@ startoutput: * Returns < 0 if succeeds, otherwise returns char to resend. * Must be recursive. */ -int +static int ttyoutput(c, tp) register int c; register struct tty *tp; { - register long oflag; - register int notout, col, s; + register tcflag_t oflag; + register int col, s; oflag = tp->t_oflag; if (!ISSET(oflag, OPOST)) { @@ -553,18 +650,15 @@ ttyoutput(c, tp) if (c == '\t' && ISSET(oflag, OXTABS) && !ISSET(tp->t_lflag, EXTPROC)) { c = 8 - (tp->t_column & 7); - if (ISSET(tp->t_lflag, FLUSHO)) { - notout = 0; - } else { + if (!ISSET(tp->t_lflag, FLUSHO)) { s = spltty(); /* Don't interrupt tabs. */ - notout = b_to_q(" ", c, &tp->t_outq); - c -= notout; + c -= b_to_q(" ", c, &tp->t_outq); tk_nout += c; tp->t_outcc += c; splx(s); } tp->t_column += c; - return (notout ? '\t' : -1); + return (c ? -1 : '\t'); } if (c == CEOT && ISSET(oflag, ONOEOT)) return (-1); @@ -616,12 +710,9 @@ ttyoutput(c, tp) int ttioctl(tp, cmd, data, flag) register struct tty *tp; - u_long cmd; + int cmd, flag; void *data; - int flag; { - extern struct tty *constty; /* Temporary virtual console. 
*/ - extern int nlinesw; register struct proc *p; int s, error; @@ -637,6 +728,7 @@ ttioctl(tp, cmd, data, flag) #ifdef notdef case TIOCSPGRP: #endif + case TIOCSTAT: case TIOCSTI: case TIOCSWINSZ: #if defined(COMPAT_43) || defined(COMPAT_SUNOS) @@ -649,13 +741,16 @@ ttioctl(tp, cmd, data, flag) case TIOCSETP: case TIOCSLTC: #endif - while (isbackground(curproc, tp) && - p->p_pgrp->pg_jobc && (p->p_flag & P_PPWAIT) == 0 && + while (isbackground(p, tp) && + (p->p_flag & P_PPWAIT) == 0 && (p->p_sigignore & sigmask(SIGTTOU)) == 0 && (p->p_sigmask & sigmask(SIGTTOU)) == 0) { + if (p->p_pgrp->pg_jobc == 0) + return (EIO); pgsignal(p->p_pgrp, SIGTTOU, 1); - if (error = ttysleep(tp, - &lbolt, TTOPRI | PCATCH, ttybg, 0)) + error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, "ttybg1", + 0); + if (error) return (error); } break; @@ -673,7 +768,9 @@ ttioctl(tp, cmd, data, flag) case FIONBIO: /* set/clear non-blocking i/o */ break; /* XXX: delete. */ case FIONREAD: /* get # bytes to read */ + s = spltty(); *(int *)data = ttnread(tp); + splx(s); break; case TIOCEXCL: /* set exclusive use of tty */ s = spltty(); @@ -693,8 +790,7 @@ ttioctl(tp, cmd, data, flag) case TIOCCONS: /* become virtual console */ if (*(int *)data) { if (constty && constty != tp && - ISSET(constty->t_state, TS_CARR_ON | TS_ISOPEN) == - (TS_CARR_ON | TS_ISOPEN)) + ISSET(constty->t_state, TS_CONNECTED)) return (EBUSY); #ifndef UCONSOLE if (error = suser(p->p_ucred, &p->p_acflag)) @@ -705,7 +801,8 @@ ttioctl(tp, cmd, data, flag) constty = NULL; break; case TIOCDRAIN: /* wait till output drained */ - if (error = ttywait(tp)) + error = ttywait(tp); + if (error) return (error); break; case TIOCGETA: { /* get termios struct */ @@ -745,9 +842,12 @@ ttioctl(tp, cmd, data, flag) case TIOCSETAF: { /* drn out, fls in, set */ register struct termios *t = (struct termios *)data; + if (t->c_ispeed < 0 || t->c_ospeed < 0) + return (EINVAL); s = spltty(); if (cmd == TIOCSETAW || cmd == TIOCSETAF) { - if (error = ttywait(tp)) { + 
error = ttywait(tp); + if (error) { splx(s); return (error); } @@ -761,35 +861,56 @@ ttioctl(tp, cmd, data, flag) if (tp->t_param && (error = (*tp->t_param)(tp, t))) { splx(s); return (error); - } else { - if (!ISSET(tp->t_state, TS_CARR_ON) && - ISSET(tp->t_cflag, CLOCAL) && - !ISSET(t->c_cflag, CLOCAL)) { - CLR(tp->t_state, TS_ISOPEN); - SET(tp->t_state, TS_WOPEN); - ttwakeup(tp); - } - tp->t_cflag = t->c_cflag; - tp->t_ispeed = t->c_ispeed; - tp->t_ospeed = t->c_ospeed; } + if (ISSET(t->c_cflag, CLOCAL) && + !ISSET(tp->t_cflag, CLOCAL)) { + /* + * XXX disconnections would be too hard to + * get rid of without this kludge. The only + * way to get rid of controlling terminals + * is to exit from the session leader. + */ + CLR(tp->t_state, TS_ZOMBIE); + + wakeup(TSA_CARR_ON(tp)); + ttwakeup(tp); + ttwwakeup(tp); + } + if ((ISSET(tp->t_state, TS_CARR_ON) || + ISSET(t->c_cflag, CLOCAL)) && + !ISSET(tp->t_state, TS_ZOMBIE)) + SET(tp->t_state, TS_CONNECTED); + else + CLR(tp->t_state, TS_CONNECTED); + tp->t_cflag = t->c_cflag; + tp->t_ispeed = t->c_ispeed; + tp->t_ospeed = t->c_ospeed; ttsetwater(tp); } - if (cmd != TIOCSETAF) { - if (ISSET(t->c_lflag, ICANON) != - ISSET(tp->t_lflag, ICANON)) - if (ISSET(t->c_lflag, ICANON)) { - SET(tp->t_lflag, PENDIN); - ttwakeup(tp); - } else { - struct clist tq; - + if (ISSET(t->c_lflag, ICANON) != ISSET(tp->t_lflag, ICANON) && + cmd != TIOCSETAF) { + if (ISSET(t->c_lflag, ICANON)) + SET(tp->t_lflag, PENDIN); + else { + /* + * XXX we really shouldn't allow toggling + * ICANON while we're in a non-termios line + * discipline. Now we have to worry about + * panicing for a null queue. + */ + if (tp->t_canq.c_cbreserved > 0 && + tp->t_rawq.c_cbreserved > 0) { catq(&tp->t_rawq, &tp->t_canq); - tq = tp->t_rawq; - tp->t_rawq = tp->t_canq; - tp->t_canq = tq; - CLR(tp->t_lflag, PENDIN); + /* + * XXX the queue limits may be + * different, so the old queue + * swapping method no longer works. 
+ */ + catq(&tp->t_canq, &tp->t_rawq); } + CLR(tp->t_lflag, PENDIN); + } + ttwakeup(tp); } tp->t_iflag = t->c_iflag; tp->t_oflag = t->c_oflag; @@ -801,6 +922,9 @@ ttioctl(tp, cmd, data, flag) else CLR(t->c_lflag, EXTPROC); tp->t_lflag = t->c_lflag | ISSET(tp->t_lflag, PENDIN); + if (t->c_cc[VMIN] != tp->t_cc[VMIN] || + t->c_cc[VTIME] != tp->t_cc[VTIME]) + ttwakeup(tp); bcopy(t->c_cc, tp->t_cc, sizeof(t->c_cc)); splx(s); break; @@ -840,7 +964,9 @@ ttioctl(tp, cmd, data, flag) return (EPERM); if (p->p_ucred->cr_uid && !isctty(p, tp)) return (EACCES); + s = spltty(); (*linesw[tp->t_line].l_rint)(*(u_char *)data, tp); + splx(s); break; case TIOCSTOP: /* stop output, like ^S */ s = spltty(); @@ -849,7 +975,7 @@ ttioctl(tp, cmd, data, flag) #ifdef sun4c /* XXX */ (*tp->t_stop)(tp, 0); #else - (*cdevsw[major(tp->t_dev)].d_stop)(tp, 0); + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0); #endif } splx(s); @@ -857,8 +983,8 @@ ttioctl(tp, cmd, data, flag) case TIOCSCTTY: /* become controlling tty */ /* Session ctty vnode pointer set in vnode layer. 
*/ if (!SESS_LEADER(p) || - (p->p_session->s_ttyvp || tp->t_session) && - (tp->t_session != p->p_session)) + ((p->p_session->s_ttyvp || tp->t_session) && + (tp->t_session != p->p_session))) return (EPERM); tp->t_session = p->p_session; tp->t_pgrp = p->p_pgrp; @@ -875,6 +1001,11 @@ ttioctl(tp, cmd, data, flag) tp->t_pgrp = pgrp; break; } + case TIOCSTAT: /* simulate control-T */ + s = spltty(); + ttyinfo(tp); + splx(s); + break; case TIOCSWINSZ: /* set window size */ if (bcmp((caddr_t)&tp->t_winsize, data, sizeof (struct winsize))) { @@ -882,6 +1013,17 @@ ttioctl(tp, cmd, data, flag) pgsignal(tp->t_pgrp, SIGWINCH, 1); } break; + case TIOCSDRAINWAIT: + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + tp->t_timeout = *(int *)data * hz; + wakeup(TSA_OCOMPLETE(tp)); + wakeup(TSA_OLOWAT(tp)); + break; + case TIOCGDRAINWAIT: + *(int *)data = tp->t_timeout / hz; + break; default: #if defined(COMPAT_43) || defined(COMPAT_SUNOS) return (ttcompat(tp, cmd, data, flag)); @@ -893,27 +1035,27 @@ ttioctl(tp, cmd, data, flag) } int -ttselect(device, rw, p) - dev_t device; +ttyselect(tp, rw, p) + struct tty *tp; int rw; struct proc *p; { - register struct tty *tp; - int nread, s; + int s; - tp = &cdevsw[major(device)].d_ttys[minor(device)]; + if (tp == NULL) + return (ENXIO); s = spltty(); switch (rw) { case FREAD: - nread = ttnread(tp); - if (nread > 0 || !ISSET(tp->t_cflag, CLOCAL) && - !ISSET(tp->t_state, TS_CARR_ON)) + if (ttnread(tp) > 0 || ISSET(tp->t_state, TS_ZOMBIE)) goto win; selrecord(p, &tp->t_rsel); break; case FWRITE: - if (tp->t_outq.c_cc <= tp->t_lowat) { + if ((tp->t_outq.c_cc <= tp->t_lowat && + ISSET(tp->t_state, TS_CONNECTED)) + || ISSET(tp->t_state, TS_ZOMBIE)) { win: splx(s); return (1); } @@ -924,6 +1066,22 @@ win: splx(s); return (0); } +/* + * This is a wrapper for compatibility with the select vector used by + * cdevsw. It relies on a proper xxxdevtotty routine. 
+ */ +int +ttselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + return ttyselect((*cdevsw[major(dev)]->d_devtotty)(dev), rw, p); +} + +/* + * Must be called at spltty(). + */ static int ttnread(tp) struct tty *tp; @@ -933,8 +1091,11 @@ ttnread(tp) if (ISSET(tp->t_lflag, PENDIN)) ttypend(tp); nread = tp->t_canq.c_cc; - if (!ISSET(tp->t_lflag, ICANON)) + if (!ISSET(tp->t_lflag, ICANON)) { nread += tp->t_rawq.c_cc; + if (nread < tp->t_cc[VMIN] && tp->t_cc[VTIME] == 0) + nread = 0; + } return (nread); } @@ -950,14 +1111,24 @@ ttywait(tp) error = 0; s = spltty(); while ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && - (ISSET(tp->t_state, TS_CARR_ON) || ISSET(tp->t_cflag, CLOCAL)) - && tp->t_oproc) { + ISSET(tp->t_state, TS_CONNECTED) && tp->t_oproc) { (*tp->t_oproc)(tp); - SET(tp->t_state, TS_ASLEEP); - if (error = ttysleep(tp, - &tp->t_outq, TTOPRI | PCATCH, ttyout, 0)) + if ((tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY)) && + ISSET(tp->t_state, TS_CONNECTED)) { + SET(tp->t_state, TS_SO_OCOMPLETE); + error = ttysleep(tp, TSA_OCOMPLETE(tp), + TTOPRI | PCATCH, "ttywai", + tp->t_timeout); + if (error) { + if (error == EWOULDBLOCK) + error = EIO; + break; + } + } else break; } + if (!error && (tp->t_outq.c_cc || ISSET(tp->t_state, TS_BUSY))) + error = EIO; splx(s); return (error); } @@ -965,7 +1136,7 @@ ttywait(tp) /* * Flush if successfully wait. 
*/ -int +static int ttywflush(tp) struct tty *tp; { @@ -987,24 +1158,66 @@ ttyflush(tp, rw) register int s; s = spltty(); +#if 0 +again: +#endif + if (rw & FWRITE) { + FLUSHQ(&tp->t_outq); + CLR(tp->t_state, TS_TTSTOP); + } +#ifdef sun4c /* XXX */ + (*tp->t_stop)(tp, rw); +#else + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, rw); +#endif if (rw & FREAD) { FLUSHQ(&tp->t_canq); FLUSHQ(&tp->t_rawq); + CLR(tp->t_lflag, PENDIN); tp->t_rocount = 0; tp->t_rocol = 0; CLR(tp->t_state, TS_LOCAL); ttwakeup(tp); + if (ISSET(tp->t_state, TS_TBLOCK)) { + if (rw & FWRITE) + FLUSHQ(&tp->t_outq); + ttyunblock(tp); + + /* + * Don't let leave any state that might clobber the + * next line discipline (although we should do more + * to send the START char). Not clearing the state + * may have caused the "putc to a clist with no + * reserved cblocks" panic/printf. + */ + CLR(tp->t_state, TS_TBLOCK); + +#if 0 /* forget it, sleeping isn't always safe and we don't know when it is */ + if (ISSET(tp->t_iflag, IXOFF)) { + /* + * XXX wait a bit in the hope that the stop + * character (if any) will go out. Waiting + * isn't good since it allows races. This + * will be fixed when the stop character is + * put in a special queue. Don't bother with + * the checks in ttywait() since the timeout + * will save us. + */ + SET(tp->t_state, TS_SO_OCOMPLETE); + ttysleep(tp, TSA_OCOMPLETE(tp), TTOPRI, + "ttyfls", hz / 10); + /* + * Don't try sending the stop character again. + */ + CLR(tp->t_state, TS_TBLOCK); + goto again; + } +#endif + } } if (rw & FWRITE) { - CLR(tp->t_state, TS_TTSTOP); -#ifdef sun4c /* XXX */ - (*tp->t_stop)(tp, rw); -#else - (*cdevsw[major(tp->t_dev)].d_stop)(tp, rw); -#endif FLUSHQ(&tp->t_outq); - wakeup((caddr_t)&tp->t_outq); - selwakeup(&tp->t_wsel); + ttwwakeup(tp); } splx(s); } @@ -1013,42 +1226,63 @@ ttyflush(tp, rw) * Copy in the default termios characters. */ void +termioschars(t) + struct termios *t; +{ + + bcopy(ttydefchars, t->c_cc, sizeof t->c_cc); +} + +/* + * Old interface. 
+ */ +void ttychars(tp) struct tty *tp; { - bcopy(ttydefchars, tp->t_cc, sizeof(ttydefchars)); + termioschars(&tp->t_termios); } /* - * Send stop character on input overflow. + * Handle input high water. Send stop character for the IXOFF case. Turn + * on our input flow control bit and propagate the changes to the driver. + * XXX the stop character should be put in a special high priority queue. */ -static void +void ttyblock(tp) - register struct tty *tp; + struct tty *tp; { - register int total; - total = tp->t_rawq.c_cc + tp->t_canq.c_cc; - if (tp->t_rawq.c_cc > TTYHOG) { - ttyflush(tp, FREAD | FWRITE); - CLR(tp->t_state, TS_TBLOCK); - } - /* - * Block further input iff: current input > threshold - * AND input is available to user program. - */ - if (total >= TTYHOG / 2 && - !ISSET(tp->t_state, TS_TBLOCK) && - !ISSET(tp->t_lflag, ICANON) || tp->t_canq.c_cc > 0 && - tp->t_cc[VSTOP] != _POSIX_VDISABLE) { - if (putc(tp->t_cc[VSTOP], &tp->t_outq) == 0) { - SET(tp->t_state, TS_TBLOCK); - ttstart(tp); - } - } + SET(tp->t_state, TS_TBLOCK); + if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTOP] != _POSIX_VDISABLE && + putc(tp->t_cc[VSTOP], &tp->t_outq) != 0) + CLR(tp->t_state, TS_TBLOCK); /* try again later */ + ttstart(tp); +} + +/* + * Handle input low water. Send start character for the IXOFF case. Turn + * off our input flow control bit and propagate the changes to the driver. + * XXX the start character should be put in a special high priority queue. + */ +static void +ttyunblock(tp) + struct tty *tp; +{ + + CLR(tp->t_state, TS_TBLOCK); + if (ISSET(tp->t_iflag, IXOFF) && tp->t_cc[VSTART] != _POSIX_VDISABLE && + putc(tp->t_cc[VSTART], &tp->t_outq) != 0) + SET(tp->t_state, TS_TBLOCK); /* try again later */ + ttstart(tp); } +#ifdef notyet +/* Not used by any current (i386) drivers. */ +/* + * Restart after an inter-char delay. 
+ */ void ttrstrt(tp_arg) void *tp_arg; @@ -1068,6 +1302,7 @@ ttrstrt(tp_arg) splx(s); } +#endif int ttstart(tp) @@ -1088,10 +1323,8 @@ ttylclose(tp, flag) int flag; { - if (flag & IO_NDELAY) + if (flag & FNONBLOCK || ttywflush(tp)) ttyflush(tp, FREAD | FWRITE); - else - ttywflush(tp); return (0); } @@ -1106,19 +1339,23 @@ ttymodem(tp, flag) int flag; { - if (!ISSET(tp->t_state, TS_WOPEN) && ISSET(tp->t_cflag, MDMBUF)) { + if (ISSET(tp->t_state, TS_CARR_ON) && ISSET(tp->t_cflag, MDMBUF)) { /* * MDMBUF: do flow control according to carrier flag + * XXX TS_CAR_OFLOW doesn't do anything yet. TS_TTSTOP + * works if IXON and IXANY are clear. */ if (flag) { + CLR(tp->t_state, TS_CAR_OFLOW); CLR(tp->t_state, TS_TTSTOP); ttstart(tp); - } else if (!ISSET(tp->t_state, TS_TTSTOP)) { + } else if (!ISSET(tp->t_state, TS_CAR_OFLOW)) { + SET(tp->t_state, TS_CAR_OFLOW); SET(tp->t_state, TS_TTSTOP); #ifdef sun4c /* XXX */ (*tp->t_stop)(tp, 0); #else - (*cdevsw[major(tp->t_dev)].d_stop)(tp, 0); + (*cdevsw[major(tp->t_dev)]->d_stop)(tp, 0); #endif } } else if (flag == 0) { @@ -1128,6 +1365,8 @@ ttymodem(tp, flag) CLR(tp->t_state, TS_CARR_ON); if (ISSET(tp->t_state, TS_ISOPEN) && !ISSET(tp->t_cflag, CLOCAL)) { + SET(tp->t_state, TS_ZOMBIE); + CLR(tp->t_state, TS_CONNECTED); if (tp->t_session && tp->t_session->s_leader) psignal(tp->t_session->s_leader, SIGHUP); ttyflush(tp, FREAD | FWRITE); @@ -1138,30 +1377,11 @@ ttymodem(tp, flag) * Carrier now on. */ SET(tp->t_state, TS_CARR_ON); + if (!ISSET(tp->t_state, TS_ZOMBIE)) + SET(tp->t_state, TS_CONNECTED); + wakeup(TSA_CARR_ON(tp)); ttwakeup(tp); - } - return (1); -} - -/* - * Default modem control routine (for other line disciplines). - * Return argument flag, to turn off device on carrier drop. 
- */ -int -nullmodem(tp, flag) - register struct tty *tp; - int flag; -{ - - if (flag) - SET(tp->t_state, TS_CARR_ON); - else { - CLR(tp->t_state, TS_CARR_ON); - if (!ISSET(tp->t_cflag, CLOCAL)) { - if (tp->t_session && tp->t_session->s_leader) - psignal(tp->t_session->s_leader, SIGHUP); - return (0); - } + ttwwakeup(tp); } return (1); } @@ -1170,18 +1390,25 @@ nullmodem(tp, flag) * Reinput pending characters after state switch * call at spltty(). */ -void +static void ttypend(tp) register struct tty *tp; { struct clist tq; - register c; + register int c; CLR(tp->t_lflag, PENDIN); SET(tp->t_state, TS_TYPEN); + /* + * XXX this assumes too much about clist internals. It may even + * fail if the cblock slush pool is empty. We can't allocate more + * cblocks here because we are called from an interrupt handler + * and clist_alloc_cblocks() can wait. + */ tq = tp->t_rawq; - tp->t_rawq.c_cc = 0; - tp->t_rawq.c_cf = tp->t_rawq.c_cl = 0; + bzero(&tp->t_rawq, sizeof tp->t_rawq); + tp->t_rawq.c_cbmax = tq.c_cbmax; + tp->t_rawq.c_cbreserved = tq.c_cbreserved; while ((c = getc(&tq)) >= 0) ttyinput(c, tp); CLR(tp->t_state, TS_TYPEN); @@ -1198,34 +1425,47 @@ ttread(tp, uio, flag) { register struct clist *qp; register int c; - register long lflag; - register u_char *cc = tp->t_cc; + register tcflag_t lflag; + register cc_t *cc = tp->t_cc; register struct proc *p = curproc; int s, first, error = 0; + int has_stime = 0, last_cc = 0; + long slp = 0; /* XXX this should be renamed `timo'. */ -loop: lflag = tp->t_lflag; +loop: s = spltty(); + lflag = tp->t_lflag; /* * take pending input first */ - if (ISSET(lflag, PENDIN)) + if (ISSET(lflag, PENDIN)) { ttypend(tp); - splx(s); + splx(s); /* reduce latency */ + s = spltty(); + lflag = tp->t_lflag; /* XXX ttypend() clobbers it */ + } /* * Hang process if it's in the background. 
*/ if (isbackground(p, tp)) { + splx(s); if ((p->p_sigignore & sigmask(SIGTTIN)) || (p->p_sigmask & sigmask(SIGTTIN)) || p->p_flag & P_PPWAIT || p->p_pgrp->pg_jobc == 0) return (EIO); pgsignal(p->p_pgrp, SIGTTIN, 1); - if (error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, ttybg, 0)) + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg2", 0); + if (error) return (error); goto loop; } + if (ISSET(tp->t_state, TS_ZOMBIE)) { + splx(s); + return (0); /* EOF */ + } + /* * If canonical, use the canonical queue, * else use the raw queue. @@ -1234,47 +1474,171 @@ loop: lflag = tp->t_lflag; */ qp = ISSET(lflag, ICANON) ? &tp->t_canq : &tp->t_rawq; - /* - * If there is no input, sleep on rawq - * awaiting hardware receipt and notification. - * If we have data, we don't need to check for carrier. - */ - s = spltty(); - if (qp->c_cc <= 0) { - int carrier; - - carrier = ISSET(tp->t_state, TS_CARR_ON) || - ISSET(tp->t_cflag, CLOCAL); - if (!carrier && ISSET(tp->t_state, TS_ISOPEN)) { + if (flag & IO_NDELAY) { + if (qp->c_cc > 0) + goto read; + if (!ISSET(lflag, ICANON) && cc[VMIN] == 0) { splx(s); - return (0); /* EOF */ + return (0); } - if (flag & IO_NDELAY) { + splx(s); + return (EWOULDBLOCK); + } + if (!ISSET(lflag, ICANON)) { + int m = cc[VMIN]; + long t = cc[VTIME]; + struct timeval stime, timecopy; + int x; + + /* + * Check each of the four combinations. + * (m > 0 && t == 0) is the normal read case. + * It should be fairly efficient, so we check that and its + * companion case (m == 0 && t == 0) first. + * For the other two cases, we compute the target sleep time + * into slp. + */ + if (t == 0) { + if (qp->c_cc < m) + goto sleep; + if (qp->c_cc > 0) + goto read; + + /* m, t and qp->c_cc are all 0. 0 is enough input. 
*/ splx(s); - return (EWOULDBLOCK); + return (0); + } + t *= 100000; /* time in us */ +#define diff(t1, t2) (((t1).tv_sec - (t2).tv_sec) * 1000000 + \ + ((t1).tv_usec - (t2).tv_usec)) + if (m > 0) { + if (qp->c_cc <= 0) + goto sleep; + if (qp->c_cc >= m) + goto read; + gettime(&timecopy); + if (!has_stime) { + /* first character, start timer */ + has_stime = 1; + stime = timecopy; + slp = t; + } else if (qp->c_cc > last_cc) { + /* got a character, restart timer */ + stime = timecopy; + slp = t; + } else { + /* nothing, check expiration */ + slp = t - diff(timecopy, stime); + if (slp <= 0) + goto read; + } + last_cc = qp->c_cc; + } else { /* m == 0 */ + if (qp->c_cc > 0) + goto read; + gettime(&timecopy); + if (!has_stime) { + has_stime = 1; + stime = timecopy; + slp = t; + } else { + slp = t - diff(timecopy, stime); + if (slp <= 0) { + /* Timed out, but 0 is enough input. */ + splx(s); + return (0); + } + } } - error = ttysleep(tp, &tp->t_rawq, TTIPRI | PCATCH, - carrier ? ttyin : ttopen, 0); +#undef diff + /* + * Rounding down may make us wake up just short + * of the target, so we round up. + * The formula is ceiling(slp * hz/1000000). + * 32-bit arithmetic is enough for hz < 169. + * XXX see hzto() for how to avoid overflow if hz + * is large (divide by `tick' and/or arrange to + * use hzto() if hz is large). + */ + slp = (long) (((u_long)slp * hz) + 999999) / 1000000; + goto sleep; + } + if (qp->c_cc <= 0) { +sleep: + /* + * There is no input, or not enough input and we can block. + */ + error = ttysleep(tp, TSA_HUP_OR_INPUT(tp), TTIPRI | PCATCH, + ISSET(tp->t_state, TS_CONNECTED) ? + "ttyin" : "ttyhup", (int)slp); splx(s); - if (error) + if (error == EWOULDBLOCK) + error = 0; + else if (error) return (error); + /* + * XXX what happens if another process eats some input + * while we are asleep (not just here)? It would be + * safest to detect changes and reset our state variables + * (has_stime and last_cc). 
+ */ + slp = 0; goto loop; } +read: splx(s); - /* * Input present, check for input mapping and processing. */ first = 1; - while ((c = getc(qp)) >= 0) { + if (ISSET(lflag, ICANON | ISIG)) + goto slowcase; + for (;;) { + char ibuf[IBUFSIZ]; + int icc; + + icc = imin(uio->uio_resid, IBUFSIZ); + icc = q_to_b(qp, ibuf, icc); + if (icc <= 0) { + if (first) + goto loop; + break; + } + error = uiomove(ibuf, icc, uio); + /* + * XXX if there was an error then we should ungetc() the + * unmoved chars and reduce icc here. + */ +#if NSNP > 0 + if (ISSET(tp->t_lflag, ECHO) && + ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpin((struct snoop *)tp->t_sc, ibuf, icc); +#endif + if (error) + break; + if (uio->uio_resid == 0) + break; + first = 0; + } + goto out; +slowcase: + for (;;) { + c = getc(qp); + if (c < 0) { + if (first) + goto loop; + break; + } /* * delayed suspend (^Y) */ - if (CCEQ(cc[VDSUSP], c) && ISSET(lflag, ISIG)) { + if (CCEQ(cc[VDSUSP], c) && + ISSET(lflag, IEXTEN | ISIG) == (IEXTEN | ISIG)) { pgsignal(tp->t_pgrp, SIGTSTP, 1); if (first) { - if (error = ttysleep(tp, - &lbolt, TTIPRI | PCATCH, ttybg, 0)) + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, + "ttybg3", 0); + if (error) break; goto loop; } @@ -1290,30 +1654,39 @@ loop: lflag = tp->t_lflag; */ error = ureadc(c, uio); if (error) + /* XXX should ungetc(c, qp). */ break; +#if NSNP > 0 + /* + * Only snoop directly on input in echo mode. Non-echoed + * input will be snooped later iff the application echoes it. + */ + if (ISSET(tp->t_lflag, ECHO) && + ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpinc((struct snoop *)tp->t_sc, (char)c); +#endif if (uio->uio_resid == 0) break; /* * In canonical mode check for a "break character" * marking the end of a "line of input". 
*/ - if (ISSET(lflag, ICANON) && TTBREAKC(c)) + if (ISSET(lflag, ICANON) && TTBREAKC(c, lflag)) break; first = 0; } + +out: /* - * Look to unblock output now that (presumably) + * Look to unblock input now that (presumably) * the input queue has gone down. */ s = spltty(); - if (ISSET(tp->t_state, TS_TBLOCK) && tp->t_rawq.c_cc < TTYHOG/5) { - if (cc[VSTART] != _POSIX_VDISABLE && - putc(cc[VSTART], &tp->t_outq) == 0) { - CLR(tp->t_state, TS_TBLOCK); - ttstart(tp); - } - } + if (ISSET(tp->t_state, TS_TBLOCK) && + tp->t_rawq.c_cc + tp->t_canq.c_cc <= I_LOW_WATER) + ttyunblock(tp); splx(s); + return (error); } @@ -1334,17 +1707,17 @@ ttycheckoutq(tp, wait) hiwat = tp->t_hiwat; s = spltty(); oldsig = wait ? curproc->p_siglist : 0; - if (tp->t_outq.c_cc > hiwat + 200) + if (tp->t_outq.c_cc > hiwat + OBUFSIZ + 100) while (tp->t_outq.c_cc > hiwat) { ttstart(tp); + if (tp->t_outq.c_cc <= hiwat) + break; if (wait == 0 || curproc->p_siglist != oldsig) { splx(s); return (0); } - timeout((void (*)__P((void *)))wakeup, - (void *)&tp->t_outq, hz); - SET(tp->t_state, TS_ASLEEP); - sleep((caddr_t)&tp->t_outq, PZERO - 1); + SET(tp->t_state, TS_SO_OLOWAT); + tsleep(TSA_OLOWAT(tp), PZERO - 1, "ttoutq", hz); } splx(s); return (1); @@ -1359,7 +1732,7 @@ ttwrite(tp, uio, flag) register struct uio *uio; int flag; { - register char *cp; + register char *cp = NULL; register int cc, ce; register struct proc *p; int i, hiwat, cnt, error, s; @@ -1371,24 +1744,24 @@ ttwrite(tp, uio, flag) cc = 0; loop: s = spltty(); - if (!ISSET(tp->t_state, TS_CARR_ON) && - !ISSET(tp->t_cflag, CLOCAL)) { - if (ISSET(tp->t_state, TS_ISOPEN)) { - splx(s); - return (EIO); - } else if (flag & IO_NDELAY) { + if (ISSET(tp->t_state, TS_ZOMBIE)) { + splx(s); + if (uio->uio_resid == cnt) + error = EIO; + goto out; + } + if (!ISSET(tp->t_state, TS_CONNECTED)) { + if (flag & IO_NDELAY) { splx(s); error = EWOULDBLOCK; goto out; - } else { - /* Sleep awaiting carrier. 
*/ - error = ttysleep(tp, - &tp->t_rawq, TTIPRI | PCATCH,ttopen, 0); - splx(s); - if (error) - goto out; - goto loop; } + error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, + "ttydcd", 0); + splx(s); + if (error) + goto out; + goto loop; } splx(s); /* @@ -1398,10 +1771,14 @@ loop: if (isbackground(p, tp) && ISSET(tp->t_lflag, TOSTOP) && (p->p_flag & P_PPWAIT) == 0 && (p->p_sigignore & sigmask(SIGTTOU)) == 0 && - (p->p_sigmask & sigmask(SIGTTOU)) == 0 && - p->p_pgrp->pg_jobc) { + (p->p_sigmask & sigmask(SIGTTOU)) == 0) { + if (p->p_pgrp->pg_jobc == 0) { + error = EIO; + goto out; + } pgsignal(p->p_pgrp, SIGTTOU, 1); - if (error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, ttybg, 0)) + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ttybg4", 0); + if (error) goto out; goto loop; } @@ -1422,13 +1799,17 @@ loop: * leftover from last time. */ if (cc == 0) { - cc = min(uio->uio_resid, OBUFSIZ); + cc = imin(uio->uio_resid, OBUFSIZ); cp = obuf; error = uiomove(cp, cc, uio); if (error) { cc = 0; break; } +#if NSNP > 0 + if (ISSET(tp->t_state, TS_SNOOP) && tp->t_sc != NULL) + snpin((struct snoop *)tp->t_sc, cp, cc); +#endif } /* * If nothing fancy need be done, grab those characters we @@ -1444,7 +1825,7 @@ loop: ce = cc; else { ce = cc - scanc((u_int)cc, (u_char *)cp, - (u_char *)char_type, CCLASSMASK); + char_type, CCLASSMASK); /* * If ce is zero, then we're processing * a special character through ttyoutput. @@ -1454,9 +1835,15 @@ loop: if (ttyoutput(*cp, tp) >= 0) { /* No Clists, wait a bit. */ ttstart(tp); - if (error = ttysleep(tp, &lbolt, - TTOPRI | PCATCH, ttybuf, 0)) - break; + if (flag & IO_NDELAY) { + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, &lbolt, + TTOPRI|PCATCH, + "ttybf1", 0); + if (error) + goto out; goto loop; } cp++; @@ -1484,9 +1871,14 @@ loop: if (i > 0) { /* No Clists, wait a bit. 
*/ ttstart(tp); - if (error = ttysleep(tp, - &lbolt, TTOPRI | PCATCH, ttybuf, 0)) - break; + if (flag & IO_NDELAY) { + error = EWOULDBLOCK; + goto out; + } + error = ttysleep(tp, &lbolt, TTOPRI | PCATCH, + "ttybf2", 0); + if (error) + goto out; goto loop; } if (ISSET(tp->t_lflag, FLUSHO) || @@ -1520,9 +1912,12 @@ ovhiwat: uio->uio_resid += cc; return (uio->uio_resid == cnt ? EWOULDBLOCK : 0); } - SET(tp->t_state, TS_ASLEEP); - error = ttysleep(tp, &tp->t_outq, TTOPRI | PCATCH, ttyout, 0); + SET(tp->t_state, TS_SO_OLOWAT); + error = ttysleep(tp, TSA_OLOWAT(tp), TTOPRI | PCATCH, "ttywri", + tp->t_timeout); splx(s); + if (error == EWOULDBLOCK) + error = EIO; if (error) goto out; goto loop; @@ -1532,7 +1927,7 @@ ovhiwat: * Rubout one character from the rawq of tp * as cleanly as possible. */ -void +static void ttyrub(c, tp) register int c; register struct tty *tp; @@ -1635,7 +2030,7 @@ ttyrubo(tp, cnt) * Reprint the rawq line. Note, it is assumed that c_cc has already * been checked. */ -void +static void ttyretype(tp) register struct tty *tp; { @@ -1679,11 +2074,11 @@ ttyecho(c, tp) if (!ISSET(tp->t_state, TS_CNTTB)) CLR(tp->t_lflag, FLUSHO); if ((!ISSET(tp->t_lflag, ECHO) && - (!ISSET(tp->t_lflag, ECHONL) || c == '\n')) || + (c != '\n' || !ISSET(tp->t_lflag, ECHONL))) || ISSET(tp->t_lflag, EXTPROC)) return; if (ISSET(tp->t_lflag, ECHOCTL) && - (ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n' || + ((ISSET(c, TTY_CHARMASK) <= 037 && c != '\t' && c != '\n') || ISSET(c, TTY_CHARMASK) == 0177)) { (void)ttyoutput('^', tp); CLR(c, ~TTY_CHARMASK); @@ -1703,10 +2098,33 @@ ttwakeup(tp) register struct tty *tp; { - selwakeup(&tp->t_rsel); + if (tp->t_rsel.si_pid != 0) + selwakeup(&tp->t_rsel); if (ISSET(tp->t_state, TS_ASYNC)) pgsignal(tp->t_pgrp, SIGIO, 1); - wakeup((caddr_t)&tp->t_rawq); + wakeup(TSA_HUP_OR_INPUT(tp)); +} + +/* + * Wake up any writers on a tty. 
+ */ +void +ttwwakeup(tp) + register struct tty *tp; +{ + + if (tp->t_wsel.si_pid != 0 && tp->t_outq.c_cc <= tp->t_lowat) + selwakeup(&tp->t_wsel); + if (ISSET(tp->t_state, TS_BUSY | TS_SO_OCOMPLETE) == + TS_SO_OCOMPLETE && tp->t_outq.c_cc == 0) { + CLR(tp->t_state, TS_SO_OCOMPLETE); + wakeup(TSA_OCOMPLETE(tp)); + } + if (ISSET(tp->t_state, TS_SO_OLOWAT) && + tp->t_outq.c_cc <= tp->t_lowat) { + CLR(tp->t_state, TS_SO_OLOWAT); + wakeup(TSA_OLOWAT(tp)); + } } /* @@ -1786,15 +2204,15 @@ ttyinfo(tp) /* Print user time. */ ttyprintf(tp, "%d.%02du ", - utime.tv_sec, (utime.tv_usec + 5000) / 10000); + utime.tv_sec, utime.tv_usec / 10000); /* Print system time. */ ttyprintf(tp, "%d.%02ds ", - stime.tv_sec, (stime.tv_usec + 5000) / 10000); + stime.tv_sec, stime.tv_usec / 10000); -#define pgtok(a) (((a) * NBPG) / 1024) +#define pgtok(a) (((a) * PAGE_SIZE) / 1024) /* Print percentage cpu, resident set size. */ - tmp = pick->p_pctcpu * 10000 + FSCALE / 2 >> FSHIFT; + tmp = (pick->p_pctcpu * 10000 + FSCALE / 2) >> FSHIFT; ttyprintf(tp, "%d%% %dk\n", tmp / 100, pick->p_stat == SIDL || pick->p_stat == SZOMB ? 0 : @@ -1891,8 +2309,7 @@ tputchar(c, tp) register int s; s = spltty(); - if (ISSET(tp->t_state, - TS_CARR_ON | TS_ISOPEN) != (TS_CARR_ON | TS_ISOPEN)) { + if (!ISSET(tp->t_state, TS_CONNECTED)) { splx(s); return (-1); } @@ -1906,7 +2323,7 @@ tputchar(c, tp) /* * Sleep on chan, returning ERESTART if tty changed while we napped and - * returning any errors (e.g. EINTR/ETIMEDOUT) reported by tsleep. If + * returning any errors (e.g. EINTR/EWOULDBLOCK) reported by tsleep. If * the tty is revoked, restarting a pending call will redo validation done * at the start of the call. */ @@ -1918,10 +2335,44 @@ ttysleep(tp, chan, pri, wmesg, timo) char *wmesg; { int error; - short gen; + int gen; gen = tp->t_gen; - if (error = tsleep(chan, pri, wmesg, timo)) + error = tsleep(chan, pri, wmesg, timo); + if (error) return (error); return (tp->t_gen == gen ? 
0 : ERESTART); } + +#ifdef notyet +/* + * XXX this is usable not useful or used. Most tty drivers have + * ifdefs for using ttymalloc() but assume a different interface. + */ +/* + * Allocate a tty struct. Clists in the struct will be allocated by + * ttyopen(). + */ +struct tty * +ttymalloc() +{ + struct tty *tp; + + tp = malloc(sizeof *tp, M_TTYS, M_WAITOK); + bzero(tp, sizeof *tp); + return (tp); +} +#endif + +#if 0 /* XXX not yet usable: session leader holds a ref (see kern_exit.c). */ +/* + * Free a tty struct. Clists in the struct should have been freed by + * ttyclose(). + */ +void +ttyfree(tp) + struct tty *tp; +{ + free(tp, M_TTYS); +} +#endif /* 0 */ diff --git a/sys/kern/tty_compat.c b/sys/kern/tty_compat.c index ce95853..ed58c6a 100644 --- a/sys/kern/tty_compat.c +++ b/sys/kern/tty_compat.c @@ -30,28 +30,39 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)tty_compat.c 8.2 (Berkeley) 1/9/95 + * @(#)tty_compat.c 8.1 (Berkeley) 6/10/93 + * $Id: tty_compat.c,v 1.21 1997/02/22 09:39:24 peter Exp $ */ -/* +/* * mapping routines for old line discipline (yuck) */ #if defined(COMPAT_43) || defined(COMPAT_SUNOS) #include <sys/param.h> #include <sys/systm.h> -#include <sys/ioctl.h> +#include <sys/ioctl_compat.h> #include <sys/proc.h> #include <sys/tty.h> #include <sys/termios.h> #include <sys/file.h> #include <sys/conf.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/syslog.h> -int ttydebug = 0; +static int ttcompatgetflags __P((struct tty *tp)); +static void ttcompatsetflags __P((struct tty *tp, struct termios *t)); +static void ttcompatsetlflags __P((struct tty *tp, struct termios *t)); +static int ttcompatspeedtab __P((int speed, struct speedtab *table)); + +static int ttydebug = 0; +SYSCTL_INT(_debug, OID_AUTO, ttydebug, CTLFLAG_RW, &ttydebug, 0, ""); static struct speedtab compatspeeds[] = { +#define MAX_SPEED 17 + { 115200, 17 }, + { 57600, 16 }, { 38400, 15 }, { 19200, 14 }, { 9600, 13 }, 
@@ -70,78 +81,61 @@ static struct speedtab compatspeeds[] = { { 0, 0 }, { -1, -1 }, }; -static int compatspcodes[16] = { +static int compatspcodes[] = { 0, 50, 75, 110, 134, 150, 200, 300, 600, 1200, - 1800, 2400, 4800, 9600, 19200, 38400, + 1800, 2400, 4800, 9600, 19200, 38400, 57600, 115200, }; -/*ARGSUSED*/ -ttcompat(tp, com, data, flag) +static int +ttcompatspeedtab(speed, table) + int speed; + register struct speedtab *table; +{ + if (speed == 0) + return (0); /* hangup */ + for ( ; table->sp_speed > 0; table++) + if (table->sp_speed <= speed) /* nearest one, rounded down */ + return (table->sp_code); + return (1); /* 50, min and not hangup */ +} + +int +ttsetcompat(tp, com, data, term) register struct tty *tp; - u_long com; + int *com; caddr_t data; - int flag; + struct termios *term; { - - switch (com) { - case TIOCGETP: { - register struct sgttyb *sg = (struct sgttyb *)data; - register u_char *cc = tp->t_cc; - register speed; - - speed = ttspeedtab(tp->t_ospeed, compatspeeds); - sg->sg_ospeed = (speed == -1) ? 15 : speed; - if (tp->t_ispeed == 0) - sg->sg_ispeed = sg->sg_ospeed; - else { - speed = ttspeedtab(tp->t_ispeed, compatspeeds); - sg->sg_ispeed = (speed == -1) ? 
15 : speed; - } - sg->sg_erase = cc[VERASE]; - sg->sg_kill = cc[VKILL]; - sg->sg_flags = ttcompatgetflags(tp); - break; - } - + switch (*com) { case TIOCSETP: case TIOCSETN: { register struct sgttyb *sg = (struct sgttyb *)data; - struct termios term; int speed; - term = tp->t_termios; - if ((speed = sg->sg_ispeed) > 15 || speed < 0) - term.c_ispeed = speed; + if ((speed = sg->sg_ispeed) > MAX_SPEED || speed < 0) + return(EINVAL); + else if (speed != ttcompatspeedtab(tp->t_ispeed, compatspeeds)) + term->c_ispeed = compatspcodes[speed]; else - term.c_ispeed = compatspcodes[speed]; - if ((speed = sg->sg_ospeed) > 15 || speed < 0) - term.c_ospeed = speed; + term->c_ispeed = tp->t_ispeed; + if ((speed = sg->sg_ospeed) > MAX_SPEED || speed < 0) + return(EINVAL); + else if (speed != ttcompatspeedtab(tp->t_ospeed, compatspeeds)) + term->c_ospeed = compatspcodes[speed]; else - term.c_ospeed = compatspcodes[speed]; - term.c_cc[VERASE] = sg->sg_erase; - term.c_cc[VKILL] = sg->sg_kill; - tp->t_flags = tp->t_flags&0xffff0000 | sg->sg_flags&0xffff; - ttcompatsetflags(tp, &term); - return (ttioctl(tp, com == TIOCSETP ? TIOCSETAF : TIOCSETA, - &term, flag)); - } - - case TIOCGETC: { - struct tchars *tc = (struct tchars *)data; - register u_char *cc = tp->t_cc; - - tc->t_intrc = cc[VINTR]; - tc->t_quitc = cc[VQUIT]; - tc->t_startc = cc[VSTART]; - tc->t_stopc = cc[VSTOP]; - tc->t_eofc = cc[VEOF]; - tc->t_brkc = cc[VEOL]; + term->c_ospeed = tp->t_ospeed; + term->c_cc[VERASE] = sg->sg_erase; + term->c_cc[VKILL] = sg->sg_kill; + tp->t_flags = (tp->t_flags&0xffff0000) | (sg->sg_flags&0xffff); + ttcompatsetflags(tp, term); + *com = (*com == TIOCSETP) ? 
TIOCSETAF : TIOCSETA; break; } case TIOCSETC: { struct tchars *tc = (struct tchars *)data; - register u_char *cc = tp->t_cc; + register cc_t *cc; + cc = term->c_cc; cc[VINTR] = tc->t_intrc; cc[VQUIT] = tc->t_quitc; cc[VSTART] = tc->t_startc; @@ -150,23 +144,96 @@ ttcompat(tp, com, data, flag) cc[VEOL] = tc->t_brkc; if (tc->t_brkc == -1) cc[VEOL2] = _POSIX_VDISABLE; + *com = TIOCSETA; break; } case TIOCSLTC: { struct ltchars *ltc = (struct ltchars *)data; - register u_char *cc = tp->t_cc; + register cc_t *cc; + cc = term->c_cc; cc[VSUSP] = ltc->t_suspc; cc[VDSUSP] = ltc->t_dsuspc; cc[VREPRINT] = ltc->t_rprntc; cc[VDISCARD] = ltc->t_flushc; cc[VWERASE] = ltc->t_werasc; cc[VLNEXT] = ltc->t_lnextc; + *com = TIOCSETA; + break; + } + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: + if (*com == TIOCLSET) + tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16; + else { + tp->t_flags = + (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff); + if (*com == TIOCLBIS) + tp->t_flags |= *(int *)data<<16; + else + tp->t_flags &= ~(*(int *)data<<16); + } + ttcompatsetlflags(tp, term); + *com = TIOCSETA; + break; + } + return 0; +} + +/*ARGSUSED*/ +int +ttcompat(tp, com, data, flag) + register struct tty *tp; + int com; + caddr_t data; + int flag; +{ + switch (com) { + case TIOCSETP: + case TIOCSETN: + case TIOCSETC: + case TIOCSLTC: + case TIOCLBIS: + case TIOCLBIC: + case TIOCLSET: { + struct termios term; + int error; + + term = tp->t_termios; + if ((error = ttsetcompat(tp, &com, data, &term)) != 0) + return error; + return ttioctl(tp, com, &term, flag); + } + case TIOCGETP: { + register struct sgttyb *sg = (struct sgttyb *)data; + register cc_t *cc = tp->t_cc; + + sg->sg_ospeed = ttcompatspeedtab(tp->t_ospeed, compatspeeds); + if (tp->t_ispeed == 0) + sg->sg_ispeed = sg->sg_ospeed; + else + sg->sg_ispeed = ttcompatspeedtab(tp->t_ispeed, compatspeeds); + sg->sg_erase = cc[VERASE]; + sg->sg_kill = cc[VKILL]; + sg->sg_flags = tp->t_flags = ttcompatgetflags(tp); + break; + } + 
case TIOCGETC: { + struct tchars *tc = (struct tchars *)data; + register cc_t *cc = tp->t_cc; + + tc->t_intrc = cc[VINTR]; + tc->t_quitc = cc[VQUIT]; + tc->t_startc = cc[VSTART]; + tc->t_stopc = cc[VSTOP]; + tc->t_eofc = cc[VEOF]; + tc->t_brkc = cc[VEOL]; break; } case TIOCGLTC: { struct ltchars *ltc = (struct ltchars *)data; - register u_char *cc = tp->t_cc; + register cc_t *cc = tp->t_cc; ltc->t_suspc = cc[VSUSP]; ltc->t_dsuspc = cc[VDSUSP]; @@ -176,27 +243,11 @@ ttcompat(tp, com, data, flag) ltc->t_lnextc = cc[VLNEXT]; break; } - case TIOCLBIS: - case TIOCLBIC: - case TIOCLSET: { - struct termios term; - - term = tp->t_termios; - if (com == TIOCLSET) - tp->t_flags = (tp->t_flags&0xffff) | *(int *)data<<16; - else { - tp->t_flags = - (ttcompatgetflags(tp)&0xffff0000)|(tp->t_flags&0xffff); - if (com == TIOCLBIS) - tp->t_flags |= *(int *)data<<16; - else - tp->t_flags &= ~(*(int *)data<<16); - } - ttcompatsetlflags(tp, &term); - return (ttioctl(tp, TIOCSETA, &term, flag)); - } case TIOCLGET: - *(int *)data = ttcompatgetflags(tp)>>16; + tp->t_flags = + (ttcompatgetflags(tp) & 0xffff0000UL) + | (tp->t_flags & 0xffff); + *(int *)data = tp->t_flags>>16; if (ttydebug) printf("CLGET: returning %x\n", *(int *)data); break; @@ -208,7 +259,7 @@ ttcompat(tp, com, data, flag) case OTIOCSETD: { int ldisczero = 0; - return (ttioctl(tp, TIOCSETD, + return (ttioctl(tp, TIOCSETD, *(int *)data == 2 ? 
(caddr_t)&ldisczero : data, flag)); } @@ -222,20 +273,26 @@ ttcompat(tp, com, data, flag) return (0); } +static int ttcompatgetflags(tp) register struct tty *tp; { - register long iflag = tp->t_iflag; - register long lflag = tp->t_lflag; - register long oflag = tp->t_oflag; - register long cflag = tp->t_cflag; + register tcflag_t iflag = tp->t_iflag; + register tcflag_t lflag = tp->t_lflag; + register tcflag_t oflag = tp->t_oflag; + register tcflag_t cflag = tp->t_cflag; register flags = 0; if (iflag&IXOFF) flags |= TANDEM; if (iflag&ICRNL || oflag&ONLCR) flags |= CRMOD; - if (cflag&PARENB) { + if ((cflag&CSIZE) == CS8) { + flags |= PASS8; + if (iflag&ISTRIP) + flags |= ANYP; + } + else if (cflag&PARENB) { if (iflag&INPCK) { if (cflag&PARODD) flags |= ODDP; @@ -243,20 +300,18 @@ ttcompatgetflags(tp) flags |= EVENP; } else flags |= EVENP | ODDP; - } else { - if ((tp->t_flags&LITOUT) && !(oflag&OPOST)) - flags |= LITOUT; - if (tp->t_flags&PASS8) - flags |= PASS8; } - - if ((lflag&ICANON) == 0) { + + if ((lflag&ICANON) == 0) { /* fudge */ - if (iflag&IXON || lflag&ISIG || lflag&IEXTEN || cflag&PARENB) + if (iflag&(INPCK|ISTRIP|IXON) || lflag&(IEXTEN|ISIG) + || cflag&(CSIZE|PARENB) != CS8) flags |= CBREAK; else flags |= RAW; } + if (!(flags&RAW) && !(oflag&OPOST) && cflag&(CSIZE|PARENB) == CS8) + flags |= LITOUT; if (cflag&MDMBUF) flags |= MDMBUF; if ((cflag&HUPCL) == 0) @@ -274,28 +329,28 @@ ttcompatgetflags(tp) if ((iflag&IXANY) == 0) flags |= DECCTQ; flags |= lflag&(ECHO|TOSTOP|FLUSHO|PENDIN|NOFLSH); -if (ttydebug) - printf("getflags: %x\n", flags); + if (ttydebug) + printf("getflags: %x\n", flags); return (flags); } +static void ttcompatsetflags(tp, t) register struct tty *tp; register struct termios *t; { register flags = tp->t_flags; - register long iflag = t->c_iflag; - register long oflag = t->c_oflag; - register long lflag = t->c_lflag; - register long cflag = t->c_cflag; + register tcflag_t iflag = t->c_iflag; + register tcflag_t oflag = t->c_oflag; + 
register tcflag_t lflag = t->c_lflag; + register tcflag_t cflag = t->c_cflag; if (flags & RAW) { - iflag &= IXOFF; - oflag &= ~OPOST; + iflag = IGNBRK; lflag &= ~(ECHOCTL|ISIG|ICANON|IEXTEN); } else { + iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR); iflag |= BRKINT|IXON|IMAXBEL; - oflag |= OPOST; lflag |= ISIG|IEXTEN|ECHOCTL; /* XXX was echoctl on ? */ if (flags & XTABS) oflag |= OXTABS; @@ -317,49 +372,59 @@ ttcompatsetflags(tp, t) lflag |= ECHO; else lflag &= ~ECHO; - + + cflag &= ~(CSIZE|PARENB); if (flags&(RAW|LITOUT|PASS8)) { - cflag &= ~(CSIZE|PARENB); cflag |= CS8; - if ((flags&(RAW|PASS8)) == 0) + if (!(flags&(RAW|PASS8)) + || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP)) iflag |= ISTRIP; else iflag &= ~ISTRIP; + if (flags&(RAW|LITOUT)) + oflag &= ~OPOST; + else + oflag |= OPOST; } else { - cflag &= ~CSIZE; cflag |= CS7|PARENB; iflag |= ISTRIP; + oflag |= OPOST; } + /* XXX don't set INPCK if RAW or PASS8? */ if ((flags&(EVENP|ODDP)) == EVENP) { iflag |= INPCK; cflag &= ~PARODD; } else if ((flags&(EVENP|ODDP)) == ODDP) { iflag |= INPCK; cflag |= PARODD; - } else + } else iflag &= ~INPCK; - if (flags&LITOUT) - oflag &= ~OPOST; /* move earlier ? 
*/ if (flags&TANDEM) iflag |= IXOFF; else iflag &= ~IXOFF; + if ((flags&DECCTQ) == 0) + iflag |= IXANY; + else + iflag &= ~IXANY; t->c_iflag = iflag; t->c_oflag = oflag; t->c_lflag = lflag; t->c_cflag = cflag; } +static void ttcompatsetlflags(tp, t) register struct tty *tp; register struct termios *t; { register flags = tp->t_flags; - register long iflag = t->c_iflag; - register long oflag = t->c_oflag; - register long lflag = t->c_lflag; - register long cflag = t->c_cflag; + register tcflag_t iflag = t->c_iflag; + register tcflag_t oflag = t->c_oflag; + register tcflag_t lflag = t->c_lflag; + register tcflag_t cflag = t->c_cflag; + iflag &= ~(PARMRK|IGNPAR|IGNCR|INLCR); if (flags&CRTERA) lflag |= ECHOE; else @@ -376,6 +441,10 @@ ttcompatsetlflags(tp, t) lflag |= ECHOCTL; else lflag &= ~ECHOCTL; + if (flags&TANDEM) + iflag |= IXOFF; + else + iflag &= ~IXOFF; if ((flags&DECCTQ) == 0) iflag |= IXANY; else @@ -390,17 +459,30 @@ ttcompatsetlflags(tp, t) cflag |= HUPCL; lflag &= ~(TOSTOP|FLUSHO|PENDIN|NOFLSH); lflag |= flags&(TOSTOP|FLUSHO|PENDIN|NOFLSH); - if (flags&(LITOUT|PASS8)) { - iflag &= ~ISTRIP; - cflag &= ~(CSIZE|PARENB); + + /* + * The next if-else statement is copied from above so don't bother + * checking it separately. We could avoid fiddlling with the + * character size if the mode is already RAW or if neither the + * LITOUT bit or the PASS8 bit is being changed, but the delta of + * the change is not available here and skipping the RAW case would + * make the code different from above. 
+ */ + cflag &= ~(CSIZE|PARENB); + if (flags&(RAW|LITOUT|PASS8)) { cflag |= CS8; - if (flags&LITOUT) - oflag &= ~OPOST; - if ((flags&(PASS8|RAW)) == 0) + if (!(flags&(RAW|PASS8)) + || (flags&(RAW|PASS8|ANYP)) == (PASS8|ANYP)) iflag |= ISTRIP; - } else if ((flags&RAW) == 0) { - cflag &= ~CSIZE; + else + iflag &= ~ISTRIP; + if (flags&(RAW|LITOUT)) + oflag &= ~OPOST; + else + oflag |= OPOST; + } else { cflag |= CS7|PARENB; + iflag |= ISTRIP; oflag |= OPOST; } t->c_iflag = iflag; diff --git a/sys/kern/tty_conf.c b/sys/kern/tty_conf.c index 1453675..2e765c8 100644 --- a/sys/kern/tty_conf.c +++ b/sys/kern/tty_conf.c @@ -35,92 +35,174 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)tty_conf.c 8.5 (Berkeley) 1/9/95 + * @(#)tty_conf.c 8.4 (Berkeley) 1/21/94 + * $Id$ */ #include <sys/param.h> #include <sys/systm.h> -#include <sys/buf.h> -#include <sys/ioctl.h> -#include <sys/proc.h> #include <sys/tty.h> #include <sys/conf.h> -#define ttynodisc ((int (*) __P((dev_t, struct tty *)))enodev) -#define ttyerrclose ((int (*) __P((struct tty *, int flags)))enodev) -#define ttyerrio ((int (*) __P((struct tty *, struct uio *, int)))enodev) -#define ttyerrinput ((int (*) __P((int c, struct tty *)))enodev) -#define ttyerrstart ((int (*) __P((struct tty *)))enodev) - -int nullioctl __P((struct tty *tp, u_long cmd, caddr_t data, - int flag, struct proc *p)); - -#include "tb.h" -#if NTB > 0 -int tbopen __P((dev_t dev, struct tty *tp)); -int tbclose __P((struct tty *tp, int flags)); -int tbread __P((struct tty *, struct uio *, int flags)); -int tbioctl __P((struct tty *tp, u_long cmd, caddr_t data, - int flag, struct proc *p)); -int tbinput __P((int c, struct tty *tp)); +#ifndef MAXLDISC +#define MAXLDISC 8 #endif -#include "sl.h" -#if NSL > 0 -int slopen __P((dev_t dev, struct tty *tp)); -int slclose __P((struct tty *tp, int flags)); -int sltioctl __P((struct tty *tp, u_long cmd, caddr_t data, - int flag, struct proc *p)); -int slinput 
__P((int c, struct tty *tp)); -int slstart __P((struct tty *tp)); +static l_open_t l_noopen; +static l_close_t l_noclose; +static l_ioctl_t l_nullioctl; +static l_rint_t l_norint; +static l_start_t l_nostart; + +/* + * XXX it probably doesn't matter what the entries other than the l_open + * entry are here. The l_nullioctl and ttymodem entries still look fishy. + * Reconsider the removal of nullmodem anyway. It was too much like + * ttymodem, but a completely null version might be useful. + */ +#define NODISC(n) \ + { l_noopen, l_noclose, l_noread, l_nowrite, \ + l_nullioctl, l_norint, l_nostart, ttymodem } + +struct linesw linesw[MAXLDISC] = +{ + /* 0- termios */ + { ttyopen, ttylclose, ttread, ttwrite, + l_nullioctl, ttyinput, ttstart, ttymodem }, + NODISC(1), /* 1- defunct */ + /* 2- NTTYDISC */ +#ifdef COMPAT_43 + { ttyopen, ttylclose, ttread, ttwrite, + l_nullioctl, ttyinput, ttstart, ttymodem }, +#else + NODISC(2), #endif + NODISC(3), /* TABLDISC */ + NODISC(4), /* SLIPDISC */ + NODISC(5), /* PPPDISC */ + NODISC(6), /* loadable */ + NODISC(7), /* loadable */ +}; +int nlinesw = sizeof (linesw) / sizeof (linesw[0]); + +static struct linesw nodisc = NODISC(0); -struct linesw linesw[] = +#define LOADABLE_LDISC 6 +/* + * ldisc_register: Register a line discipline. + * + * discipline: Index for discipline to load, or LDISC_LOAD for us to choose. + * linesw_p: Pointer to linesw_p. + * + * Returns: Index used or -1 on failure. 
+ */ +int +ldisc_register(discipline, linesw_p) + int discipline; + struct linesw *linesw_p; { - { ttyopen, ttylclose, ttread, ttwrite, nullioctl, - ttyinput, ttstart, ttymodem }, /* 0- termios */ + int slot = -1; - { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl, - ttyerrinput, ttyerrstart, nullmodem }, /* 1- defunct */ + if (discipline == LDISC_LOAD) { + int i; + for (i = LOADABLE_LDISC; i < MAXLDISC; i++) + if (bcmp(linesw + i, &nodisc, sizeof(nodisc)) == 0) { + slot = i; + } + } + else if (discipline >= 0 && discipline < MAXLDISC) { + slot = discipline; + } - { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl, - ttyerrinput, ttyerrstart, nullmodem }, /* 2- defunct */ + if (slot != -1 && linesw_p) + linesw[slot] = *linesw_p; -#if NTB > 0 - { tbopen, tbclose, tbread, enodev, tbioctl, - tbinput, ttstart, nullmodem }, /* 3- TABLDISC */ -#else - { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl, - ttyerrinput, ttyerrstart, nullmodem }, -#endif + return slot; +} -#if NSL > 0 - { slopen, slclose, ttyerrio, ttyerrio, sltioctl, - slinput, slstart, nullmodem }, /* 4- SLIPDISC */ -#else - { ttynodisc, ttyerrclose, ttyerrio, ttyerrio, nullioctl, - ttyerrinput, ttyerrstart, nullmodem }, -#endif -}; +/* + * ldisc_deregister: Deregister a line discipline obtained with + * ldisc_register. Can only deregister "loadable" ones now. + * + * discipline: Index for discipline to unload. 
+ */ +void +ldisc_deregister(discipline) + int discipline; +{ + if (discipline >= LOADABLE_LDISC && discipline < MAXLDISC) { + linesw[discipline] = nodisc; + } +} -int nlinesw = sizeof (linesw) / sizeof (linesw[0]); +static int +l_noopen(dev, tp) + dev_t dev; + struct tty *tp; +{ + + return (ENODEV); +} + +static int +l_noclose(tp, flag) + struct tty *tp; + int flag; +{ + + return (ENODEV); +} + +int +l_noread(tp, uio, flag) + struct tty *tp; + struct uio *uio; + int flag; +{ + + return (ENODEV); +} + +int +l_nowrite(tp, uio, flag) + struct tty *tp; + struct uio *uio; + int flag; +{ + + return (ENODEV); +} + +static int +l_norint(c, tp) + int c; + struct tty *tp; +{ + + return (ENODEV); +} + +static int +l_nostart(tp) + struct tty *tp; +{ + + return (ENODEV); +} /* * Do nothing specific version of line * discipline specific ioctl command. */ -/*ARGSUSED*/ -nullioctl(tp, cmd, data, flags, p) +static int +l_nullioctl(tp, cmd, data, flags, p) struct tty *tp; - u_long cmd; + int cmd; char *data; int flags; struct proc *p; { -#ifdef lint - tp = tp; data = data; flags = flags; p = p; -#endif return (-1); } diff --git a/sys/kern/tty_cons.c b/sys/kern/tty_cons.c new file mode 100644 index 0000000..1a56c85 --- /dev/null +++ b/sys/kern/tty_cons.c @@ -0,0 +1,353 @@ +/* + * Copyright (c) 1988 University of Utah. + * Copyright (c) 1991 The Regents of the University of California. + * All rights reserved. + * + * This code is derived from software contributed to Berkeley by + * the Systems Programming Group of the University of Utah Computer + * Science Department. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * from: @(#)cons.c 7.2 (Berkeley) 5/9/91 + * $Id$ + */ + +#include <sys/param.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ +#include <sys/systm.h> +#include <sys/conf.h> +#include <sys/kernel.h> +#include <sys/reboot.h> +#include <sys/sysctl.h> +#include <sys/proc.h> +#include <sys/tty.h> + +#include <machine/cpu.h> +#include <machine/cons.h> + +/* XXX this should be config(8)ed. 
*/ +#include "sc.h" +#include "vt.h" +#include "sio.h" +static struct consdev constab[] = { +#if NSC > 0 + { sccnprobe, sccninit, sccngetc, sccncheckc, sccnputc }, +#endif +#if NVT > 0 + { pccnprobe, pccninit, pccngetc, pccncheckc, pccnputc }, +#endif +#if NSIO > 0 + { siocnprobe, siocninit, siocngetc, siocncheckc, siocnputc }, +#endif + { 0 }, +}; + +static d_open_t cnopen; +static d_close_t cnclose; +static d_read_t cnread; +static d_write_t cnwrite; +static d_ioctl_t cnioctl; +static d_select_t cnselect; + +#define CDEV_MAJOR 0 +static struct cdevsw cn_cdevsw = + { cnopen, cnclose, cnread, cnwrite, /*0*/ + cnioctl, nullstop, nullreset, nodevtotty,/* console */ + cnselect, nommap, NULL, "console", NULL, -1 }; + +struct tty *constty = 0; /* virtual console output device */ + +static dev_t cn_dev_t; +SYSCTL_OPAQUE(_machdep, CPU_CONSDEV, consdev, CTLTYPE_OPAQUE|CTLFLAG_RD, + &cn_dev_t, sizeof cn_dev_t, "T,dev_t", ""); +static int cn_mute; +SYSCTL_INT(_kern, OID_AUTO, consmute, CTLFLAG_RW, &cn_mute, 0, ""); + +int cons_unavail = 0; /* XXX: + * physical console not available for + * input (i.e., it is in graphics mode) + */ + +static u_char cn_is_open; /* nonzero if logical console is open */ +static u_char cn_phys_is_open; /* nonzero if physical console is open */ +static d_close_t *cn_phys_close; /* physical device close function */ +static d_open_t *cn_phys_open; /* physical device open function */ +static struct consdev *cn_tab; /* physical console device info */ +static struct tty *cn_tp; /* physical console tty struct */ +#ifdef DEVFS +void *cn_devfs_token; /* represents the devfs entry */ +#endif /* DEVFS */ + +void +cninit() +{ + struct consdev *best_cp, *cp; + + /* + * Find the first console with the highest priority. 
+ */ + best_cp = NULL; + for (cp = constab; cp->cn_probe; cp++) { + (*cp->cn_probe)(cp); + if (cp->cn_pri > CN_DEAD && + (best_cp == NULL || cp->cn_pri > best_cp->cn_pri)) + best_cp = cp; + } + + /* + * Check if we should mute the console (for security reasons perhaps) + * It can be changes dynamically using sysctl kern.consmute + * once we are up and going. + * + */ + cn_mute = ((boothowto & (RB_MUTE + |RB_SINGLE + |RB_VERBOSE + |RB_ASKNAME + |RB_CONFIG)) == RB_MUTE); + + /* + * If no console, give up. + */ + if (best_cp == NULL) { + cn_tab = best_cp; + return; + } + + /* + * Initialize console, then attach to it. This ordering allows + * debugging using the previous console, if any. + * XXX if there was a previous console, then its driver should + * be informed when we forget about it. + */ + (*best_cp->cn_init)(best_cp); + cn_tab = best_cp; +} + +void +cninit_finish() +{ + struct cdevsw *cdp; + + if (cn_tab == NULL) + return; + + /* + * Hook the open and close functions. + */ + cdp = cdevsw[major(cn_tab->cn_dev)]; + cn_phys_close = cdp->d_close; + cdp->d_close = cnclose; + cn_phys_open = cdp->d_open; + cdp->d_open = cnopen; + cn_tp = (*cdp->d_devtotty)(cn_tab->cn_dev); + cn_dev_t = cn_tp->t_dev; +} + +static int +cnopen(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + dev_t cndev, physdev; + int retval; + + if (cn_tab == NULL) + return (0); + cndev = cn_tab->cn_dev; + physdev = (major(dev) == major(cndev) ? 
dev : cndev); + retval = (*cn_phys_open)(physdev, flag, mode, p); + if (retval == 0) { + if (dev == cndev) + cn_phys_is_open = 1; + else if (physdev == cndev) + cn_is_open = 1; + } + return (retval); +} + +static int +cnclose(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + dev_t cndev; + + if (cn_tab == NULL) + return (0); + cndev = cn_tab->cn_dev; + if (dev == cndev) { + /* the physical device is about to be closed */ + cn_phys_is_open = 0; + if (cn_is_open) { + if (cn_tp) { + /* perform a ttyhalfclose() */ + /* reset session and proc group */ + cn_tp->t_pgrp = NULL; + cn_tp->t_session = NULL; + } + return (0); + } + } else if (major(dev) != major(cndev)) { + /* the logical console is about to be closed */ + cn_is_open = 0; + if (cn_phys_is_open) + return (0); + dev = cndev; + } + return ((*cn_phys_close)(dev, flag, mode, p)); +} + +static int +cnread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + if ((cn_tab == NULL) || cn_mute) + return (0); + dev = cn_tab->cn_dev; + return ((*cdevsw[major(dev)]->d_read)(dev, uio, flag)); +} + +static int +cnwrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + if ((cn_tab == NULL) || cn_mute) + return (0); + if (constty) + dev = constty->t_dev; + else + dev = cn_tab->cn_dev; + return ((*cdevsw[major(dev)]->d_write)(dev, uio, flag)); +} + +static int +cnioctl(dev, cmd, data, flag, p) + dev_t dev; + int cmd; + caddr_t data; + int flag; + struct proc *p; +{ + int error; + + if ((cn_tab == NULL) || cn_mute) + return (0); + /* + * Superuser can always use this to wrest control of console + * output from the "virtual" console. 
+ */ + if (cmd == TIOCCONS && constty) { + error = suser(p->p_ucred, (u_short *) NULL); + if (error) + return (error); + constty = NULL; + return (0); + } + dev = cn_tab->cn_dev; + return ((*cdevsw[major(dev)]->d_ioctl)(dev, cmd, data, flag, p)); +} + +static int +cnselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + if ((cn_tab == NULL) || cn_mute) + return (1); + + dev = cn_tab->cn_dev; + + return ((*cdevsw[major(dev)]->d_select)(dev, rw, p)); +} + +int +cngetc() +{ + int c; + if ((cn_tab == NULL) || cn_mute) + return (-1); + c = (*cn_tab->cn_getc)(cn_tab->cn_dev); + if (c == '\r') c = '\n'; /* console input is always ICRNL */ + return (c); +} + +int +cncheckc() +{ + if ((cn_tab == NULL) || cn_mute) + return (-1); + return ((*cn_tab->cn_checkc)(cn_tab->cn_dev)); +} + +void +cnputc(c) + register int c; +{ + if ((cn_tab == NULL) || cn_mute) + return; + if (c) { + if (c == '\n') + (*cn_tab->cn_putc)(cn_tab->cn_dev, '\r'); + (*cn_tab->cn_putc)(cn_tab->cn_dev, c); + } +} + +static cn_devsw_installed = 0; + +static void +cn_drvinit(void *unused) +{ + dev_t dev; + + if( ! cn_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&cn_cdevsw,NULL); + cn_devsw_installed = 1; +#ifdef DEVFS + cn_devfs_token = devfs_add_devswf(&cn_cdevsw, 0, DV_CHR, + UID_ROOT, GID_WHEEL, 0600, + "console"); +#endif + } +} + +SYSINIT(cndev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,cn_drvinit,NULL) + + diff --git a/sys/kern/tty_pty.c b/sys/kern/tty_pty.c index 2c37984..ee0b653 100644 --- a/sys/kern/tty_pty.c +++ b/sys/kern/tty_pty.c @@ -31,6 +31,7 @@ * SUCH DAMAGE. 
* * @(#)tty_pty.c 8.4 (Berkeley) 2/20/95 + * $Id: tty_pty.c,v 1.42 1997/03/23 03:36:28 bde Exp $ */ /* @@ -41,14 +42,53 @@ #include <sys/param.h> #include <sys/systm.h> -#include <sys/ioctl.h> +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +#include <sys/ioctl_compat.h> +#endif #include <sys/proc.h> #include <sys/tty.h> #include <sys/conf.h> -#include <sys/file.h> +#include <sys/fcntl.h> #include <sys/uio.h> #include <sys/kernel.h> #include <sys/vnode.h> +#include <sys/signalvar.h> + +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +#ifdef notyet +static void ptyattach __P((int n)); +#endif +static void ptsstart __P((struct tty *tp)); +static void ptcwakeup __P((struct tty *tp, int flag)); + +static d_open_t ptsopen; +static d_close_t ptsclose; +static d_read_t ptsread; +static d_write_t ptswrite; +static d_ioctl_t ptyioctl; +static d_stop_t ptsstop; +static d_devtotty_t ptydevtotty; +static d_open_t ptcopen; +static d_close_t ptcclose; +static d_read_t ptcread; +static d_write_t ptcwrite; +static d_select_t ptcselect; + +#define CDEV_MAJOR_S 5 +#define CDEV_MAJOR_C 6 +static struct cdevsw pts_cdevsw = + { ptsopen, ptsclose, ptsread, ptswrite, /*5*/ + ptyioctl, ptsstop, nullreset, ptydevtotty,/* ttyp */ + ttselect, nommap, NULL, "pts", NULL, -1 }; + +static struct cdevsw ptc_cdevsw = + { ptcopen, ptcclose, ptcread, ptcwrite, /*6*/ + ptyioctl, nullstop, nullreset, ptydevtotty,/* ptyp */ + ptcselect, nommap, NULL, "ptc", NULL, -1 }; + #if NPTY == 1 #undef NPTY @@ -58,17 +98,17 @@ #define BUFSIZ 100 /* Chunk size iomoved to/from user */ /* - * pts == /dev/tty[pqrs]? - * ptc == /dev/pty[pqrs]? 
+ * pts == /dev/tty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv] + * ptc == /dev/pty[pqrsPQRS][0123456789abcdefghijklmnopqrstuv] */ -struct tty pt_tty[NPTY]; /* XXX */ -struct pt_ioctl { +static struct tty pt_tty[NPTY]; /* XXX */ +static struct pt_ioctl { int pt_flags; struct selinfo pt_selr, pt_selw; u_char pt_send; u_char pt_ucntl; } pt_ioctl[NPTY]; /* XXX */ -int npty = NPTY; /* for pstat -t */ +static int npty = NPTY; /* for pstat -t */ #define PF_PKT 0x08 /* packet mode */ #define PF_STOPPED 0x10 /* user told stopped */ @@ -76,18 +116,16 @@ int npty = NPTY; /* for pstat -t */ #define PF_NOSTOP 0x40 #define PF_UCNTL 0x80 /* user control mode */ -void ptsstop __P((struct tty *, int)); - +#ifdef notyet /* * Establish n (or default if n is 1) ptys in the system. * * XXX cdevsw & pstat require the array `pty[]' to be an array */ -void +static void ptyattach(n) int n; { -#ifdef notyet char *mem; register u_long ntb; #define DEFAULT_NPTY 32 @@ -102,10 +140,11 @@ ptyattach(n) mem = (char *)ALIGN(mem + ntb); pt_ioctl = (struct pt_ioctl *)mem; npty = n; -#endif } +#endif /*ARGSUSED*/ +static int ptsopen(dev, flag, devtype, p) dev_t dev; int flag, devtype; @@ -118,7 +157,6 @@ ptsopen(dev, flag, devtype, p) return (ENXIO); tp = &pt_tty[minor(dev)]; if ((tp->t_state & TS_ISOPEN) == 0) { - tp->t_state |= TS_WOPEN; ttychars(tp); /* Set up default chars */ tp->t_iflag = TTYDEF_IFLAG; tp->t_oflag = TTYDEF_OFLAG; @@ -129,20 +167,22 @@ ptsopen(dev, flag, devtype, p) } else if (tp->t_state&TS_XCLUDE && p->p_ucred->cr_uid != 0) return (EBUSY); if (tp->t_oproc) /* Ctrlr still around. 
*/ - tp->t_state |= TS_CARR_ON; + (void)(*linesw[tp->t_line].l_modem)(tp, 1); while ((tp->t_state & TS_CARR_ON) == 0) { - tp->t_state |= TS_WOPEN; if (flag&FNONBLOCK) break; - if (error = ttysleep(tp, (caddr_t)&tp->t_rawq, TTIPRI | PCATCH, - ttopen, 0)) + error = ttysleep(tp, TSA_CARR_ON(tp), TTIPRI | PCATCH, + "ptsopn", 0); + if (error) return (error); } error = (*linesw[tp->t_line].l_open)(dev, tp); - ptcwakeup(tp, FREAD|FWRITE); + if (error == 0) + ptcwakeup(tp, FREAD|FWRITE); return (error); } +static int ptsclose(dev, flag, mode, p) dev_t dev; int flag, mode; @@ -153,11 +193,12 @@ ptsclose(dev, flag, mode, p) tp = &pt_tty[minor(dev)]; err = (*linesw[tp->t_line].l_close)(tp, flag); - err |= ttyclose(tp); - ptcwakeup(tp, FREAD|FWRITE); + ptsstop(tp, FREAD|FWRITE); + (void) ttyclose(tp); return (err); } +static int ptsread(dev, uio, flag) dev_t dev; struct uio *uio; @@ -177,15 +218,17 @@ again: p->p_flag & P_PPWAIT) return (EIO); pgsignal(p->p_pgrp, SIGTTIN, 1); - if (error = ttysleep(tp, (caddr_t)&lbolt, - TTIPRI | PCATCH, ttybg, 0)) + error = ttysleep(tp, &lbolt, TTIPRI | PCATCH, "ptsbg", + 0); + if (error) return (error); } if (tp->t_canq.c_cc == 0) { if (flag & IO_NDELAY) return (EWOULDBLOCK); - if (error = ttysleep(tp, (caddr_t)&tp->t_canq, - TTIPRI | PCATCH, ttyin, 0)) + error = ttysleep(tp, TSA_PTS_READ(tp), TTIPRI | PCATCH, + "ptsin", 0); + if (error) return (error); goto again; } @@ -210,6 +253,7 @@ again: * Wakeups of controlling tty will happen * indirectly, when tty driver calls ptsstart. */ +static int ptswrite(dev, uio, flag) dev_t dev; struct uio *uio; @@ -227,7 +271,7 @@ ptswrite(dev, uio, flag) * Start output on pseudo-tty. * Wake up process selecting or sleeping for input from controlling tty. 
*/ -void +static void ptsstart(tp) struct tty *tp; { @@ -242,6 +286,7 @@ ptsstart(tp) ptcwakeup(tp, FREAD); } +static void ptcwakeup(tp, flag) struct tty *tp; int flag; @@ -250,23 +295,19 @@ ptcwakeup(tp, flag) if (flag & FREAD) { selwakeup(&pti->pt_selr); - wakeup((caddr_t)&tp->t_outq.c_cf); + wakeup(TSA_PTC_READ(tp)); } if (flag & FWRITE) { selwakeup(&pti->pt_selw); - wakeup((caddr_t)&tp->t_rawq.c_cf); + wakeup(TSA_PTC_WRITE(tp)); } } -/*ARGSUSED*/ -#ifdef __STDC__ -ptcopen(dev_t dev, int flag, int devtype, struct proc *p) -#else +static int ptcopen(dev, flag, devtype, p) dev_t dev; int flag, devtype; struct proc *p; -#endif { register struct tty *tp; struct pt_ioctl *pti; @@ -289,19 +330,37 @@ ptcopen(dev, flag, devtype, p) return (0); } -ptcclose(dev) +static int +ptcclose(dev, flags, fmt, p) dev_t dev; + int flags; + int fmt; + struct proc *p; { register struct tty *tp; tp = &pt_tty[minor(dev)]; (void)(*linesw[tp->t_line].l_modem)(tp, 0); - tp->t_state &= ~TS_CARR_ON; + + /* + * XXX MDMBUF makes no sense for ptys but would inhibit the above + * l_modem(). CLOCAL makes sense but isn't supported. Special + * l_modem()s that ignore carrier drop make no sense for ptys but + * may be in use because other parts of the line discipline make + * sense for ptys. Recover by doing everything that a normal + * ttymodem() would have done except for sending a SIGHUP. 
+ */ + if (tp->t_state & TS_ISOPEN) { + tp->t_state &= ~(TS_CARR_ON | TS_CONNECTED); + tp->t_state |= TS_ZOMBIE; + ttyflush(tp, FREAD | FWRITE); + } + tp->t_oproc = 0; /* mark closed */ - tp->t_session = 0; return (0); } +static int ptcread(dev, uio, flag) dev_t dev; struct uio *uio; @@ -327,7 +386,8 @@ ptcread(dev, uio, flag) if (pti->pt_send & TIOCPKT_IOCTL) { cc = min(uio->uio_resid, sizeof(tp->t_termios)); - uiomove(&tp->t_termios, cc, uio); + uiomove((caddr_t)&tp->t_termios, cc, + uio); } pti->pt_send = 0; return (0); @@ -342,12 +402,12 @@ ptcread(dev, uio, flag) if (tp->t_outq.c_cc && (tp->t_state&TS_TTSTOP) == 0) break; } - if ((tp->t_state&TS_CARR_ON) == 0) + if ((tp->t_state & TS_CONNECTED) == 0) return (0); /* EOF */ if (flag & IO_NDELAY) return (EWOULDBLOCK); - if (error = tsleep((caddr_t)&tp->t_outq.c_cf, TTIPRI | PCATCH, - ttyin, 0)) + error = tsleep(TSA_PTC_READ(tp), TTIPRI | PCATCH, "ptcin", 0); + if (error) return (error); } if (pti->pt_flags & (PF_PKT|PF_UCNTL)) @@ -358,17 +418,11 @@ ptcread(dev, uio, flag) break; error = uiomove(buf, cc, uio); } - if (tp->t_outq.c_cc <= tp->t_lowat) { - if (tp->t_state&TS_ASLEEP) { - tp->t_state &= ~TS_ASLEEP; - wakeup((caddr_t)&tp->t_outq); - } - selwakeup(&tp->t_wsel); - } + ttwwakeup(tp); return (error); } -void +static void ptsstop(tp, flush) register struct tty *tp; int flush; @@ -392,6 +446,7 @@ ptsstop(tp, flush) ptcwakeup(tp, flag); } +static int ptcselect(dev, rw, p) dev_t dev; int rw; @@ -401,7 +456,7 @@ ptcselect(dev, rw, p) struct pt_ioctl *pti = &pt_ioctl[minor(dev)]; int s; - if ((tp->t_state&TS_CARR_ON) == 0) + if ((tp->t_state & TS_CONNECTED) == 0) return (1); switch (rw) { @@ -420,8 +475,8 @@ ptcselect(dev, rw, p) case 0: /* exceptional */ if ((tp->t_state&TS_ISOPEN) && - (pti->pt_flags&PF_PKT && pti->pt_send || - pti->pt_flags&PF_UCNTL && pti->pt_ucntl)) + ((pti->pt_flags&PF_PKT && pti->pt_send) || + (pti->pt_flags&PF_UCNTL && pti->pt_ucntl))) return (1); selrecord(p, &pti->pt_selr); break; @@ 
-446,13 +501,14 @@ ptcselect(dev, rw, p) return (0); } +static int ptcwrite(dev, uio, flag) dev_t dev; register struct uio *uio; int flag; { register struct tty *tp = &pt_tty[minor(dev)]; - register u_char *cp; + register u_char *cp = 0; register int cc = 0; u_char locbuf[BUFSIZ]; int cnt = 0; @@ -465,7 +521,8 @@ again: if (pti->pt_flags & PF_REMOTE) { if (tp->t_canq.c_cc) goto block; - while (uio->uio_resid > 0 && tp->t_canq.c_cc < TTYHOG - 1) { + while ((uio->uio_resid > 0 || cc > 0) && + tp->t_canq.c_cc < TTYHOG - 1) { if (cc == 0) { cc = min(uio->uio_resid, BUFSIZ); cc = min(cc, TTYHOG - 1 - tp->t_canq.c_cc); @@ -474,19 +531,34 @@ again: if (error) return (error); /* check again for safety */ - if ((tp->t_state&TS_ISOPEN) == 0) + if ((tp->t_state & TS_ISOPEN) == 0) { + /* adjust as usual */ + uio->uio_resid += cc; return (EIO); + } + } + if (cc > 0) { + cc = b_to_q((char *)cp, cc, &tp->t_canq); + /* + * XXX we don't guarantee that the canq size + * is >= TTYHOG, so the above b_to_q() may + * leave some bytes uncopied. However, space + * is guaranteed for the null terminator if + * we don't fail here since (TTYHOG - 1) is + * not a multiple of CBSIZE. 
+ */ + if (cc > 0) + break; } - if (cc) - (void) b_to_q((char *)cp, cc, &tp->t_canq); - cc = 0; } + /* adjust for data copied in but not written */ + uio->uio_resid += cc; (void) putc(0, &tp->t_canq); ttwakeup(tp); - wakeup((caddr_t)&tp->t_canq); + wakeup(TSA_PTS_READ(tp)); return (0); } - while (uio->uio_resid > 0) { + while (uio->uio_resid > 0 || cc > 0) { if (cc == 0) { cc = min(uio->uio_resid, BUFSIZ); cp = locbuf; @@ -494,13 +566,16 @@ again: if (error) return (error); /* check again for safety */ - if ((tp->t_state&TS_ISOPEN) == 0) + if ((tp->t_state & TS_ISOPEN) == 0) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; return (EIO); + } } while (cc > 0) { if ((tp->t_rawq.c_cc + tp->t_canq.c_cc) >= TTYHOG - 2 && (tp->t_canq.c_cc > 0 || !(tp->t_iflag&ICANON))) { - wakeup((caddr_t)&tp->t_rawq); + wakeup(TSA_HUP_OR_INPUT(tp)); goto block; } (*linesw[tp->t_line].l_rint)(*cp++, tp); @@ -513,10 +588,13 @@ again: block: /* * Come here to wait for slave to open, for space - * in outq, or space in rawq. + * in outq, or space in rawq, or an empty canq. 
*/ - if ((tp->t_state&TS_CARR_ON) == 0) + if ((tp->t_state & TS_CONNECTED) == 0) { + /* adjust for data copied in but not written */ + uio->uio_resid += cc; return (EIO); + } if (flag & IO_NDELAY) { /* adjust for data copied in but not written */ uio->uio_resid += cc; @@ -524,8 +602,8 @@ block: return (EWOULDBLOCK); return (0); } - if (error = tsleep((caddr_t)&tp->t_rawq.c_cf, TTOPRI | PCATCH, - ttyout, 0)) { + error = tsleep(TSA_PTC_WRITE(tp), TTOPRI | PCATCH, "ptcout", 0); + if (error) { /* adjust for data copied in but not written */ uio->uio_resid += cc; return (error); @@ -533,10 +611,21 @@ block: goto again; } +static struct tty * +ptydevtotty(dev) + dev_t dev; +{ + if (minor(dev) >= npty) + return (NULL); + + return &pt_tty[minor(dev)]; +} + /*ARGSUSED*/ +static int ptyioctl(dev, cmd, data, flag, p) dev_t dev; - u_long cmd; + int cmd; caddr_t data; int flag; struct proc *p; @@ -572,7 +661,7 @@ ptyioctl(dev, cmd, data, flag, p) } return(0); } else - if (cdevsw[major(dev)].d_open == ptcopen) + if (cdevsw[major(dev)]->d_open == ptcopen) switch (cmd) { case TIOCGPGRP: @@ -610,7 +699,7 @@ ptyioctl(dev, cmd, data, flag, p) return (0); #ifdef COMPAT_43 - case TIOCSETP: + case TIOCSETP: case TIOCSETN: #endif case TIOCSETD: @@ -670,7 +759,7 @@ ptyioctl(dev, cmd, data, flag, p) break; } } - stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s')) + stop = (tp->t_iflag & IXON) && CCEQ(cc[VSTOP], CTRL('s')) && CCEQ(cc[VSTART], CTRL('q')); if (pti->pt_flags & PF_NOSTOP) { if (stop) { @@ -689,3 +778,49 @@ ptyioctl(dev, cmd, data, flag, p) } return (error); } + +static ptc_devsw_installed = 0; +#ifdef DEVFS +#define MAXUNITS (8 * 32) +static void *devfs_token_pts[MAXUNITS]; +static void *devfs_token_ptc[MAXUNITS]; +static const char jnames[] = "pqrsPQRS"; +#endif + +static void +ptc_drvinit(void *unused) +{ +#ifdef DEVFS + int i,j,k; +#endif + dev_t dev; + + if( ! 
ptc_devsw_installed ) { + dev = makedev(CDEV_MAJOR_S, 0); + cdevsw_add(&dev, &pts_cdevsw, NULL); + dev = makedev(CDEV_MAJOR_C, 0); + cdevsw_add(&dev, &ptc_cdevsw, NULL); + ptc_devsw_installed = 1; +#ifdef DEVFS +/*XXX*/ +#if NPTY > MAXUNITS +#undef NPTY +#define NPTY MAXUNITS +#endif + for ( i = 0 ; i<NPTY ; i++ ) { + j = i / 32; + k = i % 32; + devfs_token_pts[i] = + devfs_add_devswf(&pts_cdevsw,i, + DV_CHR,0,0,0666, + "tty%c%n",jnames[j],k); + devfs_token_ptc[i] = + devfs_add_devswf(&ptc_cdevsw,i, + DV_CHR,0,0,0666, + "pty%c%n",jnames[j],k); + } +#endif + } +} + +SYSINIT(ptcdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR_C,ptc_drvinit,NULL) diff --git a/sys/kern/tty_snoop.c b/sys/kern/tty_snoop.c new file mode 100644 index 0000000..6e2bf5d --- /dev/null +++ b/sys/kern/tty_snoop.c @@ -0,0 +1,548 @@ +/* + * Copyright (c) 1995 Ugen J.S.Antsilevich + * + * Redistribution and use in source forms, with and without modification, + * are permitted provided that this entire comment appears intact. + * + * Redistribution in binary form may occur without any restrictions. + * Obviously, it would be nice if you gave credit where credit is due + * but requiring it would be too onerous. + * + * This software is provided ``AS IS'' without any warranties of any kind. + * + * Snoop stuff. 
+ */ + +#include "snp.h" + +#if NSNP > 0 + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/filio.h> +#include <sys/ioctl_compat.h> /* Oooh..We need O/NTTYDISC */ +#include <sys/proc.h> +#include <sys/tty.h> +#include <sys/fcntl.h> +#include <sys/conf.h> +#include <sys/uio.h> +#include <sys/kernel.h> +#include <sys/malloc.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +#include <sys/snoop.h> + +static d_open_t snpopen; +static d_close_t snpclose; +static d_read_t snpread; +static d_write_t snpwrite; +static d_ioctl_t snpioctl; +static d_select_t snpselect; + +#define CDEV_MAJOR 53 +static struct cdevsw snp_cdevsw = + { snpopen, snpclose, snpread, snpwrite, /*53*/ + snpioctl, nostop, nullreset, nodevtotty,/* snoop */ + snpselect, nommap, NULL, "snp", NULL, -1 }; + + +#ifndef MIN +#define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +static struct snoop snoopsw[NSNP]; + +static struct tty *snpdevtotty __P((dev_t dev)); +static int snp_detach __P((struct snoop *snp)); + +static struct tty * +snpdevtotty (dev) + dev_t dev; +{ + struct cdevsw *cdp; + int maj; + + maj = major(dev); + if ((u_int)maj >= nchrdev) + return (NULL); + cdp = cdevsw[maj]; + if (cdp == NULL) + return (NULL); + return ((*cdp->d_devtotty)(dev)); +} + +#define SNP_INPUT_BUF 5 /* This is even too much,the maximal + * interactive mode write is 3 bytes + * length for function keys... 
+ */ + +static int +snpwrite(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + int unit = minor(dev), len, i, error; + struct snoop *snp = &snoopsw[unit]; + struct tty *tp; + char c[SNP_INPUT_BUF]; + + if (snp->snp_tty == NULL) + return (EIO); + + tp = snp->snp_tty; + + if ((tp->t_sc == snp) && (tp->t_state & TS_SNOOP) && + (tp->t_line == OTTYDISC || tp->t_line == NTTYDISC)) + goto tty_input; + + printf("Snoop: attempt to write to bad tty.\n"); + return (EIO); + +tty_input: + if (!(tp->t_state & TS_ISOPEN)) + return (EIO); + + while (uio->uio_resid > 0) { + len = MIN(uio->uio_resid,SNP_INPUT_BUF); + if ((error = uiomove(c, len, uio)) != 0) + return (error); + for (i=0;i<len;i++) { + if (ttyinput(c[i] , tp)) + return (EIO); + } + } + return 0; + +} + + +static int +snpread(dev, uio, flag) + dev_t dev; + struct uio *uio; + int flag; +{ + int unit = minor(dev), s; + struct snoop *snp = &snoopsw[unit]; + int len, n, nblen, error = 0; + caddr_t from; + char *nbuf; + +#ifdef DIAGNOSTIC + if ((snp->snp_len + snp->snp_base) > snp->snp_blen) + panic("snoop buffer error"); +#endif + + if (snp->snp_tty == NULL) + return (EIO); + + snp->snp_flags &= ~SNOOP_RWAIT; + + do { + if (snp->snp_len == 0) { + if (snp->snp_flags & SNOOP_NBIO) { + return EWOULDBLOCK; + } + snp->snp_flags |= SNOOP_RWAIT; + tsleep((caddr_t) snp, (PZERO + 1) | PCATCH, "snoopread", 0); + } + } while (snp->snp_len == 0); + + n = snp->snp_len; + + while (snp->snp_len > 0 && uio->uio_resid > 0 && error == 0) { + len = MIN(uio->uio_resid, snp->snp_len); + from = (caddr_t) (snp->snp_buf + snp->snp_base); + if (len == 0) + break; + + error = uiomove(from, len, uio); + snp->snp_base += len; + snp->snp_len -= len; + } + if ((snp->snp_flags & SNOOP_OFLOW) && (n < snp->snp_len)) { + snp->snp_flags &= ~SNOOP_OFLOW; + } + s = spltty(); + nblen = snp->snp_blen; + if (((nblen / 2) >= SNOOP_MINLEN) && (nblen / 2) >= snp->snp_len) { + while (((nblen / 2) >= snp->snp_len) && ((nblen / 2) >= SNOOP_MINLEN)) + 
nblen = nblen / 2; + if (nbuf = malloc(nblen, M_TTYS, M_NOWAIT)) { + bcopy(snp->snp_buf + snp->snp_base, nbuf, snp->snp_len); + free(snp->snp_buf, M_TTYS); + snp->snp_buf = nbuf; + snp->snp_blen = nblen; + snp->snp_base = 0; + } + } + splx(s); + + return error; +} + +int +snpinc(snp, c) + struct snoop *snp; + char c; +{ + char buf[1]; + + buf[0]=c; + return (snpin(snp,buf,1)); +} + + +int +snpin(snp, buf, n) + struct snoop *snp; + char *buf; + int n; +{ + int s_free, s_tail; + int s, len, nblen; + caddr_t from, to; + char *nbuf; + + + if (n == 0) + return 0; + +#ifdef DIAGNOSTIC + if (n < 0) + panic("bad snoop char count"); + + if (!(snp->snp_flags & SNOOP_OPEN)) { + printf("Snoop: data coming to closed device.\n"); + return 0; + } +#endif + if (snp->snp_flags & SNOOP_DOWN) { + printf("Snoop: more data to down interface.\n"); + return 0; + } + + if (snp->snp_flags & SNOOP_OFLOW) { + printf("Snoop: buffer overflow.\n"); + /* + * On overflow we just repeat the standart close + * procedure...yes , this is waste of space but.. Then next + * read from device will fail if one would recall he is + * snooping and retry... 
+ */ + + return (snpdown(snp)); + } + s_tail = snp->snp_blen - (snp->snp_len + snp->snp_base); + s_free = snp->snp_blen - snp->snp_len; + + + if (n > s_free) { + s = spltty(); + nblen = snp->snp_blen; + while ((n > s_free) && ((nblen * 2) <= SNOOP_MAXLEN)) { + nblen = snp->snp_blen * 2; + s_free = nblen - (snp->snp_len + snp->snp_base); + } + if ((n <= s_free) && (nbuf = malloc(nblen, M_TTYS, M_NOWAIT))) { + bcopy(snp->snp_buf + snp->snp_base, nbuf, snp->snp_len); + free(snp->snp_buf, M_TTYS); + snp->snp_buf = nbuf; + snp->snp_blen = nblen; + snp->snp_base = 0; + } else { + snp->snp_flags |= SNOOP_OFLOW; + if (snp->snp_flags & SNOOP_RWAIT) { + snp->snp_flags &= ~SNOOP_RWAIT; + wakeup((caddr_t) snp); + } + splx(s); + return 0; + } + splx(s); + } + if (n > s_tail) { + from = (caddr_t) (snp->snp_buf + snp->snp_base); + to = (caddr_t) (snp->snp_buf); + len = snp->snp_len; + bcopy(from, to, len); + snp->snp_base = 0; + } + to = (caddr_t) (snp->snp_buf + snp->snp_base + snp->snp_len); + bcopy(buf, to, n); + snp->snp_len += n; + + if (snp->snp_flags & SNOOP_RWAIT) { + snp->snp_flags &= ~SNOOP_RWAIT; + wakeup((caddr_t) snp); + } + selwakeup(&snp->snp_sel); + snp->snp_sel.si_pid = 0; + + return n; +} + +static int +snpopen(dev, flag, mode, p) + dev_t dev; + int flag, mode; + struct proc *p; +{ + struct snoop *snp; + register int unit, error; + + if (error = suser(p->p_ucred, &p->p_acflag)) + return (error); + + if ((unit = minor(dev)) >= NSNP) + return (ENXIO); + + snp = &snoopsw[unit]; + + if (snp->snp_flags & SNOOP_OPEN) + return (ENXIO); + + /* + * We intentionally do not OR flags with SNOOP_OPEN,but set them so + * all previous settings (especially SNOOP_OFLOW) will be cleared. + */ + snp->snp_flags = SNOOP_OPEN; + + snp->snp_buf = malloc(SNOOP_MINLEN, M_TTYS, M_WAITOK); + snp->snp_blen = SNOOP_MINLEN; + snp->snp_base = 0; + snp->snp_len = 0; + + /* + * snp_tty == NULL is for inactive snoop devices. 
+ */ + snp->snp_tty = NULL; + snp->snp_target = -1; + return (0); +} + + +static int +snp_detach(snp) + struct snoop *snp; +{ + struct tty *tp; + + snp->snp_base = 0; + snp->snp_len = 0; + + /* + * If line disc. changed we do not touch this pointer,SLIP/PPP will + * change it anyway. + */ + + if (snp->snp_tty == NULL) + goto detach_notty; + + tp = snp->snp_tty; + + if (tp && (tp->t_sc == snp) && (tp->t_state & TS_SNOOP) && + (tp->t_line == OTTYDISC || tp->t_line == NTTYDISC)) { + tp->t_sc = NULL; + tp->t_state &= ~TS_SNOOP; + } else + printf("Snoop: bad attached tty data.\n"); + + snp->snp_tty = NULL; + snp->snp_target = -1; + +detach_notty: + selwakeup(&snp->snp_sel); + snp->snp_sel.si_pid = 0; + + return (0); +} + +static int +snpclose(dev, flags, fmt, p) + dev_t dev; + int flags; + int fmt; + struct proc *p; +{ + register int unit = minor(dev); + struct snoop *snp = &snoopsw[unit]; + + snp->snp_blen = 0; + free(snp->snp_buf, M_TTYS); + snp->snp_flags &= ~SNOOP_OPEN; + + return (snp_detach(snp)); +} + +int +snpdown(snp) + struct snoop *snp; +{ + snp->snp_blen = SNOOP_MINLEN; + free(snp->snp_buf, M_TTYS); + snp->snp_buf = malloc(SNOOP_MINLEN, M_TTYS, M_WAITOK); + snp->snp_flags |= SNOOP_DOWN; + + return (snp_detach(snp)); +} + + +static int +snpioctl(dev, cmd, data, flags, p) + dev_t dev; + int cmd; + caddr_t data; + int flags; + struct proc *p; +{ + int unit = minor(dev), s; + dev_t tdev; + struct snoop *snp = &snoopsw[unit]; + struct tty *tp, *tpo; + + switch (cmd) { + case SNPSTTY: + tdev = *((dev_t *) data); + if (tdev == -1) + return (snpdown(snp)); + + tp = snpdevtotty(tdev); + if (!tp) + return (EINVAL); + + if ((tp->t_sc != (caddr_t) snp) && (tp->t_state & TS_SNOOP)) + return (EBUSY); + + if ((tp->t_line != OTTYDISC) && (tp->t_line != NTTYDISC)) + return (EBUSY); + + s = spltty(); + + if (snp->snp_target == -1) { + tpo = snp->snp_tty; + if (tpo) + tpo->t_state &= ~TS_SNOOP; + } + + tp->t_sc = (caddr_t) snp; + tp->t_state |= TS_SNOOP; + snp->snp_tty = tp; + 
snp->snp_target = tdev; + + /* + * Clean overflow and down flags - + * we'll have a chance to get them in the future :))) + */ + snp->snp_flags &= ~SNOOP_OFLOW; + snp->snp_flags &= ~SNOOP_DOWN; + splx(s); + break; + + case SNPGTTY: + /* + * We keep snp_target field specially to make + * SNPGTTY happy,else we can't know what is device + * major/minor for tty. + */ + *((dev_t *) data) = snp->snp_target; + break; + + case FIONBIO: + if (*(int *) data) + snp->snp_flags |= SNOOP_NBIO; + else + snp->snp_flags &= ~SNOOP_NBIO; + break; + + case FIOASYNC: + if (*(int *) data) + snp->snp_flags |= SNOOP_ASYNC; + else + snp->snp_flags &= ~SNOOP_ASYNC; + break; + + case FIONREAD: + s = spltty(); + if (snp->snp_tty != NULL) + *(int *) data = snp->snp_len; + else + if (snp->snp_flags & SNOOP_DOWN) { + if (snp->snp_flags & SNOOP_OFLOW) + *(int *) data = SNP_OFLOW; + else + *(int *) data = SNP_TTYCLOSE; + } else { + *(int *) data = SNP_DETACH; + } + splx(s); + break; + + default: + return (ENOTTY); + } + return (0); +} + + +static int +snpselect(dev, rw, p) + dev_t dev; + int rw; + struct proc *p; +{ + int unit = minor(dev); + struct snoop *snp = &snoopsw[unit]; + + if (rw != FREAD) + return 1; + + if (snp->snp_len > 0) + return 1; + + /* + * If snoop is down,we don't want to select() forever so we return 1. + * Caller should see if we down via FIONREAD ioctl().The last should + * return -1 to indicate down state. + */ + if (snp->snp_flags & SNOOP_DOWN) + return 1; + + selrecord(p, &snp->snp_sel); + return 0; +} + +#ifdef DEVFS +static void *snp_devfs_token[NSNP]; +#endif +static snp_devsw_installed = 0; + +static void +snp_drvinit(void *unused) +{ + dev_t dev; +#ifdef DEVFS + int i; +#endif + + if( ! 
snp_devsw_installed ) { + dev = makedev(CDEV_MAJOR, 0); + cdevsw_add(&dev,&snp_cdevsw, NULL); + snp_devsw_installed = 1; +#ifdef DEVFS + for ( i = 0 ; i < NSNP ; i++) { + snp_devfs_token[i] = + devfs_add_devswf(&snp_cdevsw, i, DV_CHR, 0, 0, + 0600, "snp%d", i); + } +#endif + } +} + +SYSINIT(snpdev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,snp_drvinit,NULL) + + +#endif diff --git a/sys/kern/tty_subr.c b/sys/kern/tty_subr.c index fe8f000..d907b47 100644 --- a/sys/kern/tty_subr.c +++ b/sys/kern/tty_subr.c @@ -1,32 +1,21 @@ -/*- - * Copyright (c) 1982, 1986, 1993 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. - * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. +/* + * Copyright (c) 1994, David Greenman + * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. + * notice unmodified, this list of conditions, and the following + * disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. 
Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) @@ -35,125 +24,671 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * from: @(#)tty_subr.c 8.2 (Berkeley) 9/5/93 + * $Id: tty_subr.c,v 1.26 1997/03/05 16:45:01 bde Exp $ + */ + +/* + * clist support routines */ #include <sys/param.h> -#include <sys/ioctl.h> +#include <sys/kernel.h> +#include <sys/systm.h> #include <sys/tty.h> +#include <sys/clist.h> +#include <sys/malloc.h> -char cwaiting; -struct cblock *cfree, *cfreelist; -int cfreecount, nclist; +static void clist_init __P((void *)); +SYSINIT(clist, SI_SUB_CLIST, SI_ORDER_FIRST, clist_init, NULL) -void -clist_init() +static struct cblock *cfreelist = 0; +int cfreecount = 0; +static int cslushcount; +static int ctotcount; + +#ifndef INITIAL_CBLOCKS +#define INITIAL_CBLOCKS 50 +#endif + +static struct cblock *cblock_alloc __P((void)); +static void cblock_alloc_cblocks __P((int number)); +static void cblock_free __P((struct cblock *cblockp)); +static void cblock_free_cblocks __P((int number)); + +#include "opt_ddb.h" +#ifdef DDB +#include <ddb/ddb.h> + +DB_SHOW_COMMAND(cbstat, cbstat) { + printf( + "tot = %d (active = 
%d, free = %d (reserved = %d, slush = %d))\n", + ctotcount * CBSIZE, ctotcount * CBSIZE - cfreecount, cfreecount, + cfreecount - cslushcount * CBSIZE, cslushcount * CBSIZE); +} +#endif /* DDB */ +/* + * Called from init_main.c + */ +/* ARGSUSED*/ +static void +clist_init(dummy) + void *dummy; +{ /* - * Body deleted. + * Allocate an initial base set of cblocks as a 'slush'. + * We allocate non-slush cblocks with each initial ttyopen() and + * deallocate them with each ttyclose(). + * We should adjust the slush allocation. This can't be done in + * the i/o routines because they are sometimes called from + * interrupt handlers when it may be unsafe to call malloc(). */ - return; + cblock_alloc_cblocks(cslushcount = INITIAL_CBLOCKS); } -getc(a1) - struct clist *a1; +/* + * Remove a cblock from the cfreelist queue and return a pointer + * to it. + */ +static inline struct cblock * +cblock_alloc() { + struct cblock *cblockp; - /* - * Body deleted. - */ - return ((char)0); + cblockp = cfreelist; + if (cblockp == NULL) + panic("clist reservation botch"); + cfreelist = cblockp->c_next; + cblockp->c_next = NULL; + cfreecount -= CBSIZE; + return (cblockp); } -q_to_b(a1, a2, a3) - struct clist *a1; - char *a2; - int a3; +/* + * Add a cblock to the cfreelist queue. + */ +static inline void +cblock_free(cblockp) + struct cblock *cblockp; { + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) + bzero(cblockp->c_quote, sizeof cblockp->c_quote); + cblockp->c_next = cfreelist; + cfreelist = cblockp; + cfreecount += CBSIZE; +} - /* - * Body deleted. - */ - return (0); +/* + * Allocate some cblocks for the cfreelist queue. 
+ */ +static void +cblock_alloc_cblocks(number) + int number; +{ + int i; + struct cblock *cbp; + + for (i = 0; i < number; ++i) { + cbp = malloc(sizeof *cbp, M_TTYS, M_NOWAIT); + if (cbp == NULL) { + printf( +"clist_alloc_cblocks: M_NOWAIT malloc failed, trying M_WAITOK\n"); + cbp = malloc(sizeof *cbp, M_TTYS, M_WAITOK); + } + /* + * Freed cblocks have zero quotes and garbage elsewhere. + * Set the may-have-quote bit to force zeroing the quotes. + */ + setbit(cbp->c_quote, CBQSIZE * NBBY - 1); + cblock_free(cbp); + } + ctotcount += number; } -ndqb(a1, a2) - struct clist *a1; - int a2; +/* + * Set the cblock allocation policy for a a clist. + * Must be called in process context at spltty(). + */ +void +clist_alloc_cblocks(clistp, ccmax, ccreserved) + struct clist *clistp; + int ccmax; + int ccreserved; { + int dcbr; /* - * Body deleted. + * Allow for wasted space at the head. */ - return (0); + if (ccmax != 0) + ccmax += CBSIZE - 1; + if (ccreserved != 0) + ccreserved += CBSIZE - 1; + + clistp->c_cbmax = roundup(ccmax, CBSIZE) / CBSIZE; + dcbr = roundup(ccreserved, CBSIZE) / CBSIZE - clistp->c_cbreserved; + if (dcbr >= 0) + cblock_alloc_cblocks(dcbr); + else { + if (clistp->c_cbreserved + dcbr < clistp->c_cbcount) + dcbr = clistp->c_cbcount - clistp->c_cbreserved; + cblock_free_cblocks(-dcbr); + } + clistp->c_cbreserved += dcbr; } +/* + * Free some cblocks from the cfreelist queue back to the + * system malloc pool. + */ +static void +cblock_free_cblocks(number) + int number; +{ + int i; + + for (i = 0; i < number; ++i) + free(cblock_alloc(), M_TTYS); + ctotcount -= number; +} + +/* + * Free the cblocks reserved for a clist. + * Must be called at spltty(). + */ void -ndflush(a1, a2) - struct clist *a1; - int a2; +clist_free_cblocks(clistp) + struct clist *clistp; { + if (clistp->c_cbcount != 0) + panic("freeing active clist cblocks"); + cblock_free_cblocks(clistp->c_cbreserved); + clistp->c_cbmax = 0; + clistp->c_cbreserved = 0; +} - /* - * Body deleted. 
- */ - return; +/* + * Get a character from the head of a clist. + */ +int +getc(clistp) + struct clist *clistp; +{ + int chr = -1; + int s; + struct cblock *cblockp; + + s = spltty(); + + /* If there are characters in the list, get one */ + if (clistp->c_cc) { + cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + chr = (u_char)*clistp->c_cf; + + /* + * If this char is quoted, set the flag. + */ + if (isset(cblockp->c_quote, clistp->c_cf - (char *)cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * Advance to next character. + */ + clistp->c_cf++; + clistp->c_cc--; + /* + * If we have advanced the 'first' character pointer + * past the end of this cblock, advance to the next one. + * If there are no more characters, set the first and + * last pointers to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)(cblockp+1)) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (chr); +} + +/* + * Copy 'amount' of chars, beginning at head of clist 'clistp' to + * destination linear buffer 'dest'. Return number of characters + * actually copied. + */ +int +q_to_b(clistp, dest, amount) + struct clist *clistp; + char *dest; + int amount; +{ + struct cblock *cblockp; + struct cblock *cblockn; + char *dest_orig = dest; + int numc; + int s; + + s = spltty(); + + while (clistp && amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + bcopy(clistp->c_cf, dest, numc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + dest += numc; + /* + * If this cblock has been emptied, advance to the next + * one. 
If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); + return (dest - dest_orig); } -putc(a1, a2) - char a1; - struct clist *a2; +/* + * Flush 'amount' of chars, beginning at head of clist 'clistp'. + */ +void +ndflush(clistp, amount) + struct clist *clistp; + int amount; { + struct cblock *cblockp; + struct cblock *cblockn; + int numc; + int s; + + s = spltty(); + + while (amount && (clistp->c_cc > 0)) { + cblockp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + cblockn = cblockp + 1; /* pointer arithmetic! */ + numc = min(amount, (char *)cblockn - clistp->c_cf); + numc = min(numc, clistp->c_cc); + amount -= numc; + clistp->c_cf += numc; + clistp->c_cc -= numc; + /* + * If this cblock has been emptied, advance to the next + * one. If there are no more characters, set the first + * and last pointer to NULL. In either case, free the + * current cblock. + */ + if ((clistp->c_cf >= (char *)cblockn) || (clistp->c_cc == 0)) { + if (clistp->c_cc > 0) { + clistp->c_cf = cblockp->c_next->c_info; + } else { + clistp->c_cf = clistp->c_cl = NULL; + } + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + } + } + + splx(s); +} + +/* + * Add a character to the end of a clist. Return -1 is no + * more clists, or 0 for success. 
+ */ +int +putc(chr, clistp) + int chr; + struct clist *clistp; +{ + struct cblock *cblockp; + int s; + + s = spltty(); + + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("putc to a clist with no reserved cblocks\n"); + return (-1); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + if (((long)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = (cblockp - 1); + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (-1); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + } /* - * Body deleted. + * If this character is quoted, set the quote bit, if not, clear it. */ + if (chr & TTY_QUOTE) { + setbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + /* + * Use one of the spare quote bits to record that something + * may be quoted. + */ + setbit(cblockp->c_quote, CBQSIZE * NBBY - 1); + } else + clrbit(cblockp->c_quote, clistp->c_cl - (char *)cblockp->c_info); + + *clistp->c_cl++ = chr; + clistp->c_cc++; + + splx(s); return (0); } -b_to_q(a1, a2, a3) - char *a1; - int a2; - struct clist *a3; +/* + * Copy data from linear buffer to clist chain. Return the + * number of characters not copied. + */ +int +b_to_q(src, amount, clistp) + char *src; + int amount; + struct clist *clistp; { + struct cblock *cblockp; + char *firstbyte, *lastbyte; + u_char startmask, endmask; + int startbit, endbit, num_between, numc; + int s; /* - * Body deleted. + * Avoid allocating an initial cblock and then not using it. + * c_cc == 0 must imply c_cbount == 0. 
*/ - return (0); + if (amount <= 0) + return (amount); + + s = spltty(); + + /* + * If there are no cblocks assigned to this clist yet, + * then get one. + */ + if (clistp->c_cl == NULL) { + if (clistp->c_cbreserved < 1) { + splx(s); + printf("b_to_q to a clist with no reserved cblocks.\n"); + return (amount); /* nothing done */ + } + cblockp = cblock_alloc(); + clistp->c_cbcount = 1; + clistp->c_cf = clistp->c_cl = cblockp->c_info; + clistp->c_cc = 0; + } else { + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + } + + while (amount) { + /* + * Get another cblock if needed. + */ + if (((long)clistp->c_cl & CROUND) == 0) { + struct cblock *prev = cblockp - 1; + + if (clistp->c_cbcount >= clistp->c_cbreserved) { + if (clistp->c_cbcount >= clistp->c_cbmax + || cslushcount <= 0) { + splx(s); + return (amount); + } + --cslushcount; + } + cblockp = cblock_alloc(); + clistp->c_cbcount++; + prev->c_next = cblockp; + clistp->c_cl = cblockp->c_info; + } + + /* + * Copy a chunk of the linear buffer up to the end + * of this cblock. + */ + numc = min(amount, (char *)(cblockp + 1) - clistp->c_cl); + bcopy(src, clistp->c_cl, numc); + + /* + * Clear quote bits if they aren't known to be clear. + * The following could probably be made into a seperate + * "bitzero()" routine, but why bother? + */ + if (isset(cblockp->c_quote, CBQSIZE * NBBY - 1)) { + startbit = clistp->c_cl - (char *)cblockp->c_info; + endbit = startbit + numc - 1; + + firstbyte = (u_char *)cblockp->c_quote + (startbit / NBBY); + lastbyte = (u_char *)cblockp->c_quote + (endbit / NBBY); + + /* + * Calculate mask of bits to preserve in first and + * last bytes. 
+ */ + startmask = NBBY - (startbit % NBBY); + startmask = 0xff >> startmask; + endmask = (endbit % NBBY); + endmask = 0xff << (endmask + 1); + + if (firstbyte != lastbyte) { + *firstbyte &= startmask; + *lastbyte &= endmask; + + num_between = lastbyte - firstbyte - 1; + if (num_between) + bzero(firstbyte + 1, num_between); + } else { + *firstbyte &= (startmask | endmask); + } + } + + /* + * ...and update pointer for the next chunk. + */ + src += numc; + clistp->c_cl += numc; + clistp->c_cc += numc; + amount -= numc; + /* + * If we go through the loop again, it's always + * for data in the next cblock, so by adding one (cblock), + * (which makes the pointer 1 beyond the end of this + * cblock) we prepare for the assignment of 'prev' + * above. + */ + cblockp += 1; + + } + + splx(s); + return (amount); } +/* + * Get the next character in the clist. Store it at dst. Don't + * advance any clist pointers, but return a pointer to the next + * character position. + */ char * -nextc(a1, a2, a3) - struct clist *a1; - char *a2; - int *a3; +nextc(clistp, cp, dst) + struct clist *clistp; + char *cp; + int *dst; { + struct cblock *cblockp; + ++cp; /* - * Body deleted. + * See if the next character is beyond the end of + * the clist. */ - return ((char *)0); + if (clistp->c_cc && (cp != clistp->c_cl)) { + /* + * If the next character is beyond the end of this + * cblock, advance to the next cblock. + */ + if (((long)cp & CROUND) == 0) + cp = ((struct cblock *)cp - 1)->c_next->c_info; + cblockp = (struct cblock *)((long)cp & ~CROUND); + + /* + * Get the character. Set the quote flag if this character + * is quoted. + */ + *dst = (u_char)*cp | (isset(cblockp->c_quote, cp - (char *)cblockp->c_info) ? TTY_QUOTE : 0); + + return (cp); + } + + return (NULL); } -unputc(a1) - struct clist *a1; +/* + * "Unput" a character from a clist. 
+ */ +int +unputc(clistp) + struct clist *clistp; { + struct cblock *cblockp = 0, *cbp = 0; + int s; + int chr = -1; + + + s = spltty(); + + if (clistp->c_cc) { + --clistp->c_cc; + --clistp->c_cl; + + chr = (u_char)*clistp->c_cl; + + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + + /* + * Set quote flag if this character was quoted. + */ + if (isset(cblockp->c_quote, (u_char *)clistp->c_cl - cblockp->c_info)) + chr |= TTY_QUOTE; + + /* + * If all of the characters have been unput in this + * cblock, then find the previous one and free this + * one. + */ + if (clistp->c_cc && (clistp->c_cl <= (char *)cblockp->c_info)) { + cbp = (struct cblock *)((long)clistp->c_cf & ~CROUND); + + while (cbp->c_next != cblockp) + cbp = cbp->c_next; + + /* + * When the previous cblock is at the end, the 'last' + * pointer always points (invalidly) one past. + */ + clistp->c_cl = (char *)(cbp+1); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + cbp->c_next = NULL; + } + } /* - * Body deleted. + * If there are no more characters on the list, then + * free the last cblock. */ - return ((char)0); + if ((clistp->c_cc == 0) && clistp->c_cl) { + cblockp = (struct cblock *)((long)clistp->c_cl & ~CROUND); + cblock_free(cblockp); + if (--clistp->c_cbcount >= clistp->c_cbreserved) + ++cslushcount; + clistp->c_cf = clistp->c_cl = NULL; + } + + splx(s); + return (chr); } +/* + * Move characters in source clist to destination clist, + * preserving quote bits. + */ void -catq(a1, a2) - struct clist *a1, *a2; +catq(src_clistp, dest_clistp) + struct clist *src_clistp, *dest_clistp; { + int chr, s; + + s = spltty(); + /* + * If the destination clist is empty (has no cblocks atttached), + * and there are no possible complications with the resource counters, + * then we simply assign the current clist to the destination. 
+ */ + if (!dest_clistp->c_cf + && src_clistp->c_cbcount <= src_clistp->c_cbmax + && src_clistp->c_cbcount <= dest_clistp->c_cbmax) { + dest_clistp->c_cf = src_clistp->c_cf; + dest_clistp->c_cl = src_clistp->c_cl; + src_clistp->c_cf = src_clistp->c_cl = NULL; + + dest_clistp->c_cc = src_clistp->c_cc; + src_clistp->c_cc = 0; + dest_clistp->c_cbcount = src_clistp->c_cbcount; + src_clistp->c_cbcount = 0; + + splx(s); + return; + } + + splx(s); /* - * Body deleted. + * XXX This should probably be optimized to more than one + * character at a time. */ - return; + while ((chr = getc(src_clistp)) != -1) + putc(chr, dest_clistp); } diff --git a/sys/kern/tty_tb.c b/sys/kern/tty_tb.c index 05a46ba..8f4c84c 100644 --- a/sys/kern/tty_tb.c +++ b/sys/kern/tty_tb.c @@ -30,7 +30,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)tty_tb.c 8.2 (Berkeley) 1/9/95 + * @(#)tty_tb.c 8.1 (Berkeley) 6/10/93 + * $Id$ */ #include "tb.h" @@ -310,9 +311,7 @@ poldecode(tc, cp, polpos) /*ARGSUSED*/ tbioctl(tp, cmd, data, flag) struct tty *tp; - u_long cmd; caddr_t data; - int flag; { register struct tb *tbp = (struct tb *)tp->T_LINEP; diff --git a/sys/kern/tty_tty.c b/sys/kern/tty_tty.c index d9dd1b4..be164d5 100644 --- a/sys/kern/tty_tty.c +++ b/sys/kern/tty_tty.c @@ -1,5 +1,5 @@ /*- - * Copyright (c) 1982, 1986, 1991, 1993, 1995 + * Copyright (c) 1982, 1986, 1991, 1993 * The Regents of the University of California. All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -30,7 +30,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)tty_tty.c 8.4 (Berkeley) 5/14/95 + * @(#)tty_tty.c 8.2 (Berkeley) 9/23/93 + * $Id: tty_tty.c,v 1.15 1997/03/23 03:36:30 bde Exp $ */ /* @@ -39,15 +40,33 @@ #include <sys/param.h> #include <sys/systm.h> #include <sys/conf.h> -#include <sys/ioctl.h> #include <sys/proc.h> #include <sys/tty.h> #include <sys/vnode.h> -#include <sys/file.h> +#include <sys/fcntl.h> +#include <sys/kernel.h> +#ifdef DEVFS +#include <sys/devfsext.h> +#endif /*DEVFS*/ + +static d_open_t cttyopen; +static d_read_t cttyread; +static d_write_t cttywrite; +static d_ioctl_t cttyioctl; +static d_select_t cttyselect; + +#define CDEV_MAJOR 1 +/* Don't make static, fdesc_vnops uses this. */ +struct cdevsw ctty_cdevsw = + { cttyopen, nullclose, cttyread, cttywrite, /*1*/ + cttyioctl, nullstop, nullreset, nodevtotty,/* tty */ + cttyselect, nommap, NULL, "ctty", NULL, -1 }; + #define cttyvp(p) ((p)->p_flag & P_CONTROLT ? (p)->p_session->s_ttyvp : NULL) /*ARGSUSED*/ +static int cttyopen(dev, flag, mode, p) dev_t dev; int flag, mode; @@ -64,7 +83,7 @@ cttyopen(dev, flag, mode, p) * Since group is tty and mode is 620 on most terminal lines * and since sessions protect terminals from processes outside * your session, this check is probably no longer necessary. - * Since it inhibits setuid root programs that later switch + * Since it inhibits setuid root programs that later switch * to another user from accessing /dev/tty, we have decided * to delete this test. 
(mckusick 5/93) */ @@ -78,6 +97,7 @@ cttyopen(dev, flag, mode, p) } /*ARGSUSED*/ +static int cttyread(dev, uio, flag) dev_t dev; struct uio *uio; @@ -96,6 +116,7 @@ cttyread(dev, uio, flag) } /*ARGSUSED*/ +static int cttywrite(dev, uio, flag) dev_t dev; struct uio *uio; @@ -114,9 +135,10 @@ cttywrite(dev, uio, flag) } /*ARGSUSED*/ +static int cttyioctl(dev, cmd, addr, flag, p) dev_t dev; - u_long cmd; + int cmd; caddr_t addr; int flag; struct proc *p; @@ -125,6 +147,8 @@ cttyioctl(dev, cmd, addr, flag, p) if (ttyvp == NULL) return (EIO); + if (cmd == TIOCSCTTY) /* don't allow controlling tty to be set */ + return EINVAL; /* to controlling tty -- infinite recursion */ if (cmd == TIOCNOTTY) { if (!SESS_LEADER(p)) { p->p_flag &= ~P_CONTROLT; @@ -136,6 +160,7 @@ cttyioctl(dev, cmd, addr, flag, p) } /*ARGSUSED*/ +static int cttyselect(dev, flag, p) dev_t dev; int flag; @@ -147,3 +172,27 @@ cttyselect(dev, flag, p) return (1); /* try operation to get EOF/failure */ return (VOP_SELECT(ttyvp, flag, FREAD|FWRITE, NOCRED, p)); } + +static ctty_devsw_installed = 0; +#ifdef DEVFS +static void *ctty_devfs_token; +#endif + +static void +ctty_drvinit(void *unused) +{ + dev_t dev; + + if( ! ctty_devsw_installed ) { + dev = makedev(CDEV_MAJOR,0); + cdevsw_add(&dev,&ctty_cdevsw,NULL); + ctty_devsw_installed = 1; +#ifdef DEVFS + ctty_devfs_token = + devfs_add_devswf(&ctty_cdevsw, 0, DV_CHR, 0, 0, + 0666, "tty"); +#endif + } +} + +SYSINIT(cttydev,SI_SUB_DRIVERS,SI_ORDER_MIDDLE+CDEV_MAJOR,ctty_drvinit,NULL) diff --git a/sys/kern/uipc_domain.c b/sys/kern/uipc_domain.c index 1c91f2a..a2c3477 100644 --- a/sys/kern/uipc_domain.c +++ b/sys/kern/uipc_domain.c @@ -30,7 +30,8 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)uipc_domain.c 8.3 (Berkeley) 2/14/95 + * @(#)uipc_domain.c 8.2 (Berkeley) 10/18/93 + * $Id$ */ #include <sys/param.h> @@ -38,69 +39,120 @@ #include <sys/protosw.h> #include <sys/domain.h> #include <sys/mbuf.h> -#include <sys/time.h> #include <sys/kernel.h> #include <sys/systm.h> -#include <sys/proc.h> -#include <vm/vm.h> -#include <sys/sysctl.h> -void pffasttimo __P((void *)); -void pfslowtimo __P((void *)); +/* + * System initialization + * + * Note: domain initialization wants to take place on a per domain basis + * as a result of traversing a linker set. Most likely, each domain + * want to call a registration function rather than being handled here + * in domaininit(). Probably this will look like: + * + * SYSINIT(unique, SI_SUB_PROTO_DOMAI, SI_ORDER_ANY, domain_add, xxx) + * + * Where 'xxx' is replaced by the address of a parameter struct to be + * passed to the doamin_add() function. + */ + +static int x_save_spl; /* used by kludge*/ +static void kludge_splimp __P((void *)); +static void kludge_splx __P((void *)); +static void domaininit __P((void *)); +SYSINIT(splimp, SI_SUB_PROTO_BEGIN, SI_ORDER_FIRST, kludge_splimp, &x_save_spl) +SYSINIT(domain, SI_SUB_PROTO_DOMAIN, SI_ORDER_FIRST, domaininit, NULL) +SYSINIT(splx, SI_SUB_PROTO_END, SI_ORDER_FIRST, kludge_splx, &x_save_spl) + +static void pffasttimo __P((void *)); +static void pfslowtimo __P((void *)); + +struct domain *domains; #define ADDDOMAIN(x) { \ - extern struct domain __CONCAT(x,domain); \ __CONCAT(x,domain.dom_next) = domains; \ domains = &__CONCAT(x,domain); \ } -void -domaininit() +extern struct linker_set domain_set; + +/* ARGSUSED*/ +static void +domaininit(dummy) + void *dummy; { - register struct domain *dp; + register struct domain *dp, **dpp; register struct protosw *pr; -#undef unix -#ifndef lint - ADDDOMAIN(unix); - ADDDOMAIN(route); -#ifdef INET - ADDDOMAIN(inet); -#endif -#ifdef NS - ADDDOMAIN(ns); -#endif -#ifdef ISO - ADDDOMAIN(iso); -#endif -#ifdef CCITT - ADDDOMAIN(ccitt); 
-#endif -#include "imp.h" -#if NIMP > 0 - ADDDOMAIN(imp); -#endif + /* + * NB - local domain is always present. + */ + ADDDOMAIN(local); + + for (dpp = (struct domain **)domain_set.ls_items; *dpp; dpp++) { + (**dpp).dom_next = domains; + domains = *dpp; + } + +/* - not in our sources +#ifdef ISDN + ADDDOMAIN(isdn); #endif +*/ for (dp = domains; dp; dp = dp->dom_next) { if (dp->dom_init) (*dp->dom_init)(); - for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) + for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++){ +#ifdef PRU_OLDSTYLE + /* See comments in uipc_socket2.c. */ + if (pr->pr_usrreqs == 0 && pr->pr_ousrreq) + pr->pr_usrreqs = &pru_oldstyle; +#endif if (pr->pr_init) (*pr->pr_init)(); + } } -if (max_linkhdr < 16) /* XXX */ -max_linkhdr = 16; + if (max_linkhdr < 16) /* XXX */ + max_linkhdr = 16; max_hdr = max_linkhdr + max_protohdr; max_datalen = MHLEN - max_hdr; - timeout(pffasttimo, NULL, 1); - timeout(pfslowtimo, NULL, 1); + timeout(pffasttimo, (void *)0, 1); + timeout(pfslowtimo, (void *)0, 1); } + +/* + * The following two operations are kludge code. Most likely, they should + * be done as a "domainpreinit()" for the first function and then rolled + * in as the last act of "domaininit()" for the second. + * + * In point of fact, it is questionable why other initialization prior + * to this does not also take place at splimp by default. 
+ */ +static void +kludge_splimp(udata) + void *udata; +{ + int *savesplp = udata; + + *savesplp = splimp(); +} + +static void +kludge_splx(udata) + void *udata; +{ + int *savesplp = udata; + + splx( *savesplp); +} + + + struct protosw * -pffindtype(family, type) - int family, type; +pffindtype(int family, int type) { register struct domain *dp; register struct protosw *pr; @@ -117,8 +169,7 @@ found: } struct protosw * -pffindproto(family, protocol, type) - int family, protocol, type; +pffindproto(int family, int protocol, int type) { register struct domain *dp; register struct protosw *pr; @@ -142,44 +193,6 @@ found: return (maybe); } -int -net_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; -{ - register struct domain *dp; - register struct protosw *pr; - int family, protocol; - - /* - * All sysctl names at this level are nonterminal; - * next two components are protocol family and protocol number, - * then at least one addition component. 
- */ - if (namelen < 3) - return (EISDIR); /* overloaded */ - family = name[0]; - protocol = name[1]; - - if (family == 0) - return (0); - for (dp = domains; dp; dp = dp->dom_next) - if (dp->dom_family == family) - goto found; - return (ENOPROTOOPT); -found: - for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) - if (pr->pr_protocol == protocol && pr->pr_sysctl) - return ((*pr->pr_sysctl)(name + 2, namelen - 2, - oldp, oldlenp, newp, newlen)); - return (ENOPROTOOPT); -} - void pfctlinput(cmd, sa) int cmd; @@ -191,10 +204,10 @@ pfctlinput(cmd, sa) for (dp = domains; dp; dp = dp->dom_next) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_ctlinput) - (*pr->pr_ctlinput)(cmd, sa, (caddr_t)0); + (*pr->pr_ctlinput)(cmd, sa, (void *)0); } -void +static void pfslowtimo(arg) void *arg; { @@ -205,10 +218,10 @@ pfslowtimo(arg) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_slowtimo) (*pr->pr_slowtimo)(); - timeout(pfslowtimo, NULL, hz/2); + timeout(pfslowtimo, (void *)0, hz/2); } -void +static void pffasttimo(arg) void *arg; { @@ -219,5 +232,5 @@ pffasttimo(arg) for (pr = dp->dom_protosw; pr < dp->dom_protoswNPROTOSW; pr++) if (pr->pr_fasttimo) (*pr->pr_fasttimo)(); - timeout(pffasttimo, NULL, hz/5); + timeout(pffasttimo, (void *)0, hz/5); } diff --git a/sys/kern/uipc_mbuf.c b/sys/kern/uipc_mbuf.c index 62abfd5..7032e44 100644 --- a/sys/kern/uipc_mbuf.c +++ b/sys/kern/uipc_mbuf.c @@ -30,35 +30,81 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)uipc_mbuf.c 8.4 (Berkeley) 2/14/95 + * @(#)uipc_mbuf.c 8.2 (Berkeley) 1/4/94 + * $Id: uipc_mbuf.c,v 1.28 1997/02/18 20:43:05 wollman Exp $ */ #include <sys/param.h> #include <sys/systm.h> #include <sys/proc.h> #include <sys/malloc.h> -#include <sys/map.h> #define MBTYPES #include <sys/mbuf.h> #include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/syslog.h> #include <sys/domain.h> #include <sys/protosw.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_kern.h> +#include <vm/vm_extern.h> -extern vm_map_t mb_map; -struct mbuf *mbutl; -char *mclrefcnt; +static void mbinit __P((void *)); +SYSINIT(mbuf, SI_SUB_MBUF, SI_ORDER_FIRST, mbinit, NULL) -void -mbinit() +struct mbuf *mbutl; +char *mclrefcnt; +struct mbstat mbstat; +struct mbuf *mmbfree; +union mcluster *mclfree; +int max_linkhdr; +int max_protohdr; +int max_hdr; +int max_datalen; + +SYSCTL_INT(_kern_ipc, KIPC_MAX_LINKHDR, max_linkhdr, CTLFLAG_RW, + &max_linkhdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_PROTOHDR, max_protohdr, CTLFLAG_RW, + &max_protohdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_HDR, max_hdr, CTLFLAG_RW, &max_hdr, 0, ""); +SYSCTL_INT(_kern_ipc, KIPC_MAX_DATALEN, max_datalen, CTLFLAG_RW, + &max_datalen, 0, ""); +SYSCTL_STRUCT(_kern_ipc, KIPC_MBSTAT, mbstat, CTLFLAG_RW, &mbstat, mbstat, ""); + +static void m_reclaim __P((void)); + +/* "number of clusters of pages" */ +#define NCL_INIT 1 + +#define NMB_INIT 16 + +/* ARGSUSED*/ +static void +mbinit(dummy) + void *dummy; { int s; + mmbfree = NULL; mclfree = NULL; + mbstat.m_msize = MSIZE; + mbstat.m_mclbytes = MCLBYTES; + mbstat.m_minclsize = MINCLSIZE; + mbstat.m_mlen = MLEN; + mbstat.m_mhlen = MHLEN; + s = splimp(); - if (m_clalloc(max(4096/CLBYTES, 1), M_DONTWAIT) == 0) + if (m_mballoc(NMB_INIT, M_DONTWAIT) == 0) goto bad; +#if MCLBYTES <= PAGE_SIZE + if (m_clalloc(NCL_INIT, M_DONTWAIT) == 0) + goto bad; +#else + /* It's OK to call contigmalloc in this context. 
*/ + if (m_clalloc(16, 0) == 0) + goto bad; +#endif splx(s); return; bad: @@ -66,6 +112,80 @@ bad: } /* + * Allocate at least nmb mbufs and place on mbuf free list. + * Must be called at splimp. + */ +/* ARGSUSED */ +int +m_mballoc(nmb, nowait) + register int nmb; + int nowait; +{ + register caddr_t p; + register int i; + int nbytes; + + /* Once we run out of map space, it will be impossible to get + * any more (nothing is ever freed back to the map) (XXX which + * is dumb). (however you are not dead as m_reclaim might + * still be able to free a substantial amount of space). + */ + if (mb_map_full) + return (0); + + nbytes = round_page(nmb * MSIZE); + p = (caddr_t)kmem_malloc(mb_map, nbytes, M_NOWAIT); + if (p == 0 && !nowait) { + mbstat.m_wait++; + p = (caddr_t)kmem_malloc(mb_map, nbytes, M_WAITOK); + } + + /* + * Either the map is now full, or this is nowait and there + * are no pages left. + */ + if (p == NULL) + return (0); + + nmb = nbytes / MSIZE; + for (i = 0; i < nmb; i++) { + ((struct mbuf *)p)->m_next = mmbfree; + mmbfree = (struct mbuf *)p; + p += MSIZE; + } + mbstat.m_mbufs += nmb; + return (1); +} + +#if MCLBYTES > PAGE_SIZE +static int i_want_my_mcl; + +static void +kproc_mclalloc(void) +{ + int status; + + while (1) { + tsleep(&i_want_my_mcl, PVM, "mclalloc", 0); + + for (; i_want_my_mcl; i_want_my_mcl--) { + if (m_clalloc(1, 0) == 0) + printf("m_clalloc failed even in process context!\n"); + } + } +} + +static struct proc *mclallocproc; +static struct kproc_desc mclalloc_kp = { + "mclalloc", + kproc_mclalloc, + &mclallocproc +}; +SYSINIT_KT(mclallocproc, SI_SUB_KTHREAD_UPDATE, SI_ORDER_ANY, kproc_start, + &mclalloc_kp); +#endif + +/* * Allocate some number of mbuf clusters * and place on cluster free list. * Must be called at splimp. 
@@ -76,21 +196,45 @@ m_clalloc(ncl, nowait) register int ncl; int nowait; { - static int logged; register caddr_t p; register int i; int npg; - npg = ncl * CLSIZE; - p = (caddr_t)kmem_malloc(mb_map, ctob(npg), !nowait); + /* + * Once we run out of map space, it will be impossible + * to get any more (nothing is ever freed back to the + * map). + */ + if (mb_map_full) { + mbstat.m_drops++; + return (0); + } + +#if MCLBYTES > PAGE_SIZE + if (nowait) { + i_want_my_mcl += ncl; + wakeup(&i_want_my_mcl); + mbstat.m_wait++; + p = 0; + } else { + p = contigmalloc1(MCLBYTES * ncl, M_DEVBUF, M_WAITOK, 0ul, + ~0ul, PAGE_SIZE, 0, mb_map); + } +#else + npg = ncl; + p = (caddr_t)kmem_malloc(mb_map, ctob(npg), + nowait ? M_NOWAIT : M_WAITOK); + ncl = ncl * PAGE_SIZE / MCLBYTES; +#endif + /* + * Either the map is now full, or this is nowait and there + * are no pages left. + */ if (p == NULL) { - if (logged == 0) { - logged++; - log(LOG_ERR, "mb_map full\n"); - } + mbstat.m_drops++; return (0); } - ncl = ncl * CLBYTES / MCLBYTES; + for (i = 0; i < ncl; i++) { ((union mcluster *)p)->mcl_next = mclfree; mclfree = (union mcluster *)p; @@ -115,6 +259,10 @@ m_retry(i, t) #define m_retry(i, t) (struct mbuf *)0 MGET(m, i, t); #undef m_retry + if (m != NULL) + mbstat.m_wait++; + else + mbstat.m_drops++; return (m); } @@ -131,10 +279,14 @@ m_retryhdr(i, t) #define m_retryhdr(i, t) (struct mbuf *)0 MGETHDR(m, i, t); #undef m_retryhdr + if (m != NULL) + mbstat.m_wait++; + else + mbstat.m_drops++; return (m); } -void +static void m_reclaim() { register struct domain *dp; @@ -207,7 +359,8 @@ m_freem(m) return; do { MFREE(m, n); - } while (m = n); + m = n; + } while (m); } /* @@ -248,7 +401,7 @@ m_prepend(m, len, how) * continuing for "len" bytes. If len is M_COPYALL, copy to end of mbuf. * The wait parameter is a choice of M_WAIT/M_DONTWAIT from caller. 
*/ -int MCFail; +#define MCFail (mbstat.m_mcfail) struct mbuf * m_copym(m, off0, len, wait) @@ -296,7 +449,11 @@ m_copym(m, off0, len, wait) n->m_len = min(len, m->m_len - off); if (m->m_flags & M_EXT) { n->m_data = m->m_data + off; - mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + if(!m->m_ext.ext_ref) + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + else + (*(m->m_ext.ext_ref))(m->m_ext.ext_buf, + m->m_ext.ext_size); n->m_ext = m->m_ext; n->m_flags |= M_EXT; } else @@ -318,6 +475,61 @@ nospace: } /* + * Copy an entire packet, including header (which must be present). + * An optimization of the common case `m_copym(m, 0, M_COPYALL, how)'. + */ +struct mbuf * +m_copypacket(m, how) + struct mbuf *m; + int how; +{ + struct mbuf *top, *n, *o; + + MGET(n, how, m->m_type); + top = n; + if (!n) + goto nospace; + + M_COPY_PKTHDR(n, m); + n->m_len = m->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data; + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + } else { + bcopy(mtod(m, char *), mtod(n, char *), n->m_len); + } + + m = m->m_next; + while (m) { + MGET(o, how, m->m_type); + if (!o) + goto nospace; + + n->m_next = o; + n = n->m_next; + + n->m_len = m->m_len; + if (m->m_flags & M_EXT) { + n->m_data = m->m_data; + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + n->m_ext = m->m_ext; + n->m_flags |= M_EXT; + } else { + bcopy(mtod(m, char *), mtod(n, char *), n->m_len); + } + + m = m->m_next; + } + return top; +nospace: + m_freem(top); + MCFail++; + return 0; +} + +/* * Copy data from an mbuf chain starting "off" bytes from the beginning, * continuing for "len" bytes, into the indicated buffer. */ @@ -447,8 +659,8 @@ m_adj(mp, req_len) } count -= m->m_len; } - while (m = m->m_next) - m->m_len = 0; + while (m->m_next) + (m = m->m_next) ->m_len = 0; } } @@ -460,7 +672,7 @@ m_adj(mp, req_len) * If there is room, it will add up to max_protohdr-len extra bytes to the * contiguous region in an attempt to avoid being called next time. 
*/ -int MPFail; +#define MPFail (mbstat.m_mpfail) struct mbuf * m_pullup(n, len) @@ -573,7 +785,11 @@ extpacket: if (m->m_flags & M_EXT) { n->m_flags |= M_EXT; n->m_ext = m->m_ext; - mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + if(!m->m_ext.ext_ref) + mclrefcnt[mtocl(m->m_ext.ext_buf)]++; + else + (*(m->m_ext.ext_ref))(m->m_ext.ext_buf, + m->m_ext.ext_size); m->m_ext.ext_size = 0; /* For Accounting XXXXXX danger */ n->m_data = m->m_data + len; } else { @@ -593,7 +809,7 @@ m_devget(buf, totlen, off0, ifp, copy) char *buf; int totlen, off0; struct ifnet *ifp; - void (*copy)(); + void (*copy) __P((char *from, caddr_t to, u_int len)); { register struct mbuf *m; struct mbuf *top = 0, **mp = ⊤ @@ -604,12 +820,8 @@ m_devget(buf, totlen, off0, ifp, copy) cp = buf; epkt = cp + totlen; if (off) { - /* - * If 'off' is non-zero, packet is trailer-encapsulated, - * so we have to skip the type and length fields. - */ - cp += off + 2 * sizeof(u_int16_t); - totlen -= 2 * sizeof(u_int16_t); + cp += off + 2 * sizeof(u_short); + totlen -= 2 * sizeof(u_short); } MGETHDR(m, M_DONTWAIT, MT_DATA); if (m == 0) @@ -658,3 +870,56 @@ m_devget(buf, totlen, off0, ifp, copy) } return (top); } + +/* + * Copy data from a buffer back into the indicated mbuf chain, + * starting "off" bytes from the beginning, extending the mbuf + * chain if necessary. 
+ */ +void +m_copyback(m0, off, len, cp) + struct mbuf *m0; + register int off; + register int len; + caddr_t cp; +{ + register int mlen; + register struct mbuf *m = m0, *n; + int totlen = 0; + + if (m0 == 0) + return; + while (off > (mlen = m->m_len)) { + off -= mlen; + totlen += mlen; + if (m->m_next == 0) { + n = m_getclr(M_DONTWAIT, m->m_type); + if (n == 0) + goto out; + n->m_len = min(MLEN, len + off); + m->m_next = n; + } + m = m->m_next; + } + while (len > 0) { + mlen = min (m->m_len - off, len); + bcopy(cp, off + mtod(m, caddr_t), (unsigned)mlen); + cp += mlen; + len -= mlen; + mlen += off; + off = 0; + totlen += mlen; + if (len == 0) + break; + if (m->m_next == 0) { + n = m_get(M_DONTWAIT, m->m_type); + if (n == 0) + break; + n->m_len = min(MLEN, len); + m->m_next = n; + } + m = m->m_next; + } +out: if (((m = m0)->m_flags & M_PKTHDR) && (m->m_pkthdr.len < totlen)) + m->m_pkthdr.len = totlen; +} diff --git a/sys/kern/uipc_proto.c b/sys/kern/uipc_proto.c index e89a84c..f652ce3 100644 --- a/sys/kern/uipc_proto.c +++ b/sys/kern/uipc_proto.c @@ -30,43 +30,47 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)uipc_proto.c 8.2 (Berkeley) 2/14/95 + * @(#)uipc_proto.c 8.1 (Berkeley) 6/10/93 + * $Id: uipc_proto.c,v 1.9 1997/02/22 09:39:27 peter Exp $ */ #include <sys/param.h> -#include <sys/socket.h> -#include <sys/protosw.h> +#include <sys/kernel.h> #include <sys/domain.h> #include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/socket.h> +#include <sys/sysctl.h> +#include <sys/un.h> + +#include <net/raw_cb.h> /* - * Definitions of protocols supported in the UNIX domain. + * Definitions of protocols supported in the LOCAL domain. 
*/ -int uipc_usrreq(), raw_usrreq(); -void raw_init(), raw_input(), raw_ctlinput(); -extern struct domain unixdomain; /* or at least forward */ - -struct protosw unixsw[] = { -{ SOCK_STREAM, &unixdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, +static struct protosw localsw[] = { +{ SOCK_STREAM, &localdomain, 0, PR_CONNREQUIRED|PR_WANTRCVD|PR_RIGHTS, 0, 0, 0, 0, uipc_usrreq, 0, 0, 0, 0, }, -{ SOCK_DGRAM, &unixdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS, +{ SOCK_DGRAM, &localdomain, 0, PR_ATOMIC|PR_ADDR|PR_RIGHTS, 0, 0, 0, 0, uipc_usrreq, 0, 0, 0, 0, }, { 0, 0, 0, 0, - raw_input, 0, raw_ctlinput, 0, + 0, 0, raw_ctlinput, 0, raw_usrreq, raw_init, 0, 0, 0, } }; -int unp_externalize(), unp_dispose(); +struct domain localdomain = + { AF_LOCAL, "local", 0, unp_externalize, unp_dispose, + localsw, &localsw[sizeof(localsw)/sizeof(localsw[0])] }; -struct domain unixdomain = - { AF_UNIX, "unix", 0, unp_externalize, unp_dispose, - unixsw, &unixsw[sizeof(unixsw)/sizeof(unixsw[0])] }; +SYSCTL_NODE(_net, PF_LOCAL, local, CTLFLAG_RW, 0, "Local domain"); +SYSCTL_NODE(_net_local, SOCK_STREAM, stream, CTLFLAG_RW, 0, "SOCK_STREAM"); +SYSCTL_NODE(_net_local, SOCK_DGRAM, dgram, CTLFLAG_RW, 0, "SOCK_DGRAM"); diff --git a/sys/kern/uipc_sockbuf.c b/sys/kern/uipc_sockbuf.c new file mode 100644 index 0000000..e19db0c --- /dev/null +++ b/sys/kern/uipc_sockbuf.c @@ -0,0 +1,1018 @@ +/* + * Copyright (c) 1982, 1986, 1988, 1990, 1993 + * The Regents of the University of California. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + * $Id: uipc_socket2.c,v 1.21 1997/02/19 19:15:43 wollman Exp $ + */ + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/proc.h> +#include <sys/file.h> +#include <sys/buf.h> +#include <sys/malloc.h> +#include <sys/mbuf.h> +#include <sys/protosw.h> +#include <sys/stat.h> +#include <sys/socket.h> +#include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> + +/* + * Primitive routines for operating on sockets and socket buffers + */ + +u_long sb_max = SB_MAX; /* XXX should be static */ + +static u_long sb_efficiency = 8; /* parameter for sbreserve() */ + +/* + * Procedures to manipulate state flags of socket + * and do appropriate wakeups. Normal sequence from the + * active (originating) side is that soisconnecting() is + * called during processing of connect() call, + * resulting in an eventual call to soisconnected() if/when the + * connection is established. When the connection is torn down + * soisdisconnecting() is called during processing of disconnect() call, + * and soisdisconnected() is called when the connection to the peer + * is totally severed. The semantics of these routines are such that + * connectionless protocols can call soisconnected() and soisdisconnected() + * only, bypassing the in-progress calls when setting up a ``connection'' + * takes no time. + * + * From the passive side, a socket is created with + * two queues of sockets: so_q0 for connections in progress + * and so_q for connections already made and awaiting user acceptance. + * As a protocol is preparing incoming connections, it creates a socket + * structure queued on so_q0 by calling sonewconn(). When the connection + * is established, soisconnected() is called, and transfers the + * socket structure to so_q, making it available to accept(). + * + * If a socket is closed with sockets on either + * so_q0 or so_q, these sockets are dropped. 
+ * + * If higher level protocols are implemented in + * the kernel, the wakeups done here will sometimes + * cause software-interrupt process scheduling. + */ + +void +soisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= SS_ISCONNECTING; +} + +void +soisconnected(so) + register struct socket *so; +{ + register struct socket *head = so->so_head; + + so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); + so->so_state |= SS_ISCONNECTED; + if (head && (so->so_state & SS_INCOMP)) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_state &= ~SS_INCOMP; + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + sorwakeup(head); + wakeup((caddr_t)&head->so_timeo); + } else { + wakeup((caddr_t)&so->so_timeo); + sorwakeup(so); + sowwakeup(so); + } +} + +void +soisdisconnecting(so) + register struct socket *so; +{ + + so->so_state &= ~SS_ISCONNECTING; + so->so_state |= (SS_ISDISCONNECTING|SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +void +soisdisconnected(so) + register struct socket *so; +{ + + so->so_state &= ~(SS_ISCONNECTING|SS_ISCONNECTED|SS_ISDISCONNECTING); + so->so_state |= (SS_CANTRCVMORE|SS_CANTSENDMORE); + wakeup((caddr_t)&so->so_timeo); + sowwakeup(so); + sorwakeup(so); +} + +/* + * Return a random connection that hasn't been serviced yet and + * is eligible for discard. There is a one in qlen chance that + * we will return a null, saying that there are no dropable + * requests. In this case, the protocol specific code should drop + * the new request. This insures fairness. + * + * This may be used in conjunction with protocol specific queue + * congestion routines. 
+ */ +struct socket * +sodropablereq(head) + register struct socket *head; +{ + register struct socket *so; + unsigned int i, j, qlen; + + static int rnd; + static long old_mono_secs; + static unsigned int cur_cnt, old_cnt; + + if ((i = (mono_time.tv_sec - old_mono_secs)) != 0) { + old_mono_secs = mono_time.tv_sec; + old_cnt = cur_cnt / i; + cur_cnt = 0; + } + + so = TAILQ_FIRST(&head->so_incomp); + if (!so) + return (so); + + qlen = head->so_incqlen; + if (++cur_cnt > qlen || old_cnt > qlen) { + rnd = (314159 * rnd + 66329) & 0xffff; + j = ((qlen + 1) * rnd) >> 16; + + while (j-- && so) + so = TAILQ_NEXT(so, so_list); + } + + return (so); +} + +/* + * When an attempt at a new connection is noted on a socket + * which accepts connections, sonewconn is called. If the + * connection is possible (subject to space constraints, etc.) + * then we allocate a new structure, propoerly linked into the + * data structure of the original socket, and return this. + * Connstatus may be 0, or SO_ISCONFIRMING, or SO_ISCONNECTED. + * + * Currently, sonewconn() is defined as sonewconn1() in socketvar.h + * to catch calls that are missing the (new) second parameter. 
+ */ +struct socket * +sonewconn1(head, connstatus) + register struct socket *head; + int connstatus; +{ + register struct socket *so; + + if (head->so_qlen > 3 * head->so_qlimit / 2) + return ((struct socket *)0); + MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_DONTWAIT); + if (so == NULL) + return ((struct socket *)0); + bzero((caddr_t)so, sizeof(*so)); + so->so_head = head; + so->so_type = head->so_type; + so->so_options = head->so_options &~ SO_ACCEPTCONN; + so->so_linger = head->so_linger; + so->so_state = head->so_state | SS_NOFDREF; + so->so_proto = head->so_proto; + so->so_timeo = head->so_timeo; + so->so_pgid = head->so_pgid; + (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat); + + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0)) { + (void) free((caddr_t)so, M_SOCKET); + return ((struct socket *)0); + } + + if (connstatus) { + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + } else { + TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); + so->so_state |= SS_INCOMP; + head->so_incqlen++; + } + head->so_qlen++; + if (connstatus) { + sorwakeup(head); + wakeup((caddr_t)&head->so_timeo); + so->so_state |= connstatus; + } + return (so); +} + +/* + * Socantsendmore indicates that no more data will be sent on the + * socket; it would normally be applied to a socket when the user + * informs the system that no more data is to be sent, by the protocol + * code (in case PRU_SHUTDOWN). Socantrcvmore indicates that no more data + * will be received, and will normally be applied to the socket by a + * protocol when it detects that the peer will send no more data. + * Data queued for reading in the socket may yet be read. + */ + +void +socantsendmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTSENDMORE; + sowwakeup(so); +} + +void +socantrcvmore(so) + struct socket *so; +{ + + so->so_state |= SS_CANTRCVMORE; + sorwakeup(so); +} + +/* + * Wait for data to arrive at/drain from a socket buffer. 
+ */ +int +sbwait(sb) + struct sockbuf *sb; +{ + + sb->sb_flags |= SB_WAIT; + return (tsleep((caddr_t)&sb->sb_cc, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", + sb->sb_timeo)); +} + +/* + * Lock a sockbuf already known to be locked; + * return any error returned from sleep (EINTR). + */ +int +sb_lock(sb) + register struct sockbuf *sb; +{ + int error; + + while (sb->sb_flags & SB_LOCK) { + sb->sb_flags |= SB_WANT; + error = tsleep((caddr_t)&sb->sb_flags, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, + "sblock", 0); + if (error) + return (error); + } + sb->sb_flags |= SB_LOCK; + return (0); +} + +/* + * Wakeup processes waiting on a socket buffer. + * Do asynchronous notification via SIGIO + * if the socket has the SS_ASYNC flag set. + */ +void +sowakeup(so, sb) + register struct socket *so; + register struct sockbuf *sb; +{ + struct proc *p; + + selwakeup(&sb->sb_sel); + sb->sb_flags &= ~SB_SEL; + if (sb->sb_flags & SB_WAIT) { + sb->sb_flags &= ~SB_WAIT; + wakeup((caddr_t)&sb->sb_cc); + } + if (so->so_state & SS_ASYNC) { + if (so->so_pgid < 0) + gsignal(-so->so_pgid, SIGIO); + else if (so->so_pgid > 0 && (p = pfind(so->so_pgid)) != 0) + psignal(p, SIGIO); + } +} + +/* + * Socket buffer (struct sockbuf) utility routines. + * + * Each socket contains two socket buffers: one for sending data and + * one for receiving data. Each buffer contains a queue of mbufs, + * information about the number of mbufs and amount of data in the + * queue, and other fields allowing select() statements and notification + * on data availability to be implemented. + * + * Data stored in a socket buffer is maintained as a list of records. + * Each record is a list of mbufs chained together with the m_next + * field. Records are chained together with the m_nextpkt field. The upper + * level routine soreceive() expects the following conventions to be + * observed when placing information in the receive buffer: + * + * 1. 
If the protocol requires each message be preceded by the sender's + * name, then a record containing that name must be present before + * any associated data (mbuf's must be of type MT_SONAME). + * 2. If the protocol supports the exchange of ``access rights'' (really + * just additional data associated with the message), and there are + * ``rights'' to be received, then a record containing this data + * should be present (mbuf's must be of type MT_RIGHTS). + * 3. If a name or rights record exists, then it must be followed by + * a data record, perhaps of zero length. + * + * Before using a new socket structure it is first necessary to reserve + * buffer space to the socket, by calling sbreserve(). This should commit + * some of the available buffer space in the system buffer pool for the + * socket (currently, it does nothing but enforce limits). The space + * should be released by calling sbrelease() when the socket is destroyed. + */ + +int +soreserve(so, sndcc, rcvcc) + register struct socket *so; + u_long sndcc, rcvcc; +{ + + if (sbreserve(&so->so_snd, sndcc) == 0) + goto bad; + if (sbreserve(&so->so_rcv, rcvcc) == 0) + goto bad2; + if (so->so_rcv.sb_lowat == 0) + so->so_rcv.sb_lowat = 1; + if (so->so_snd.sb_lowat == 0) + so->so_snd.sb_lowat = MCLBYTES; + if (so->so_snd.sb_lowat > so->so_snd.sb_hiwat) + so->so_snd.sb_lowat = so->so_snd.sb_hiwat; + return (0); +bad2: + sbrelease(&so->so_snd); +bad: + return (ENOBUFS); +} + +/* + * Allot mbufs to a sockbuf. + * Attempt to scale mbmax so that mbcnt doesn't become limiting + * if buffering efficiency is near the normal case. + */ +int +sbreserve(sb, cc) + struct sockbuf *sb; + u_long cc; +{ + if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES)) + return (0); + sb->sb_hiwat = cc; + sb->sb_mbmax = min(cc * sb_efficiency, sb_max); + if (sb->sb_lowat > sb->sb_hiwat) + sb->sb_lowat = sb->sb_hiwat; + return (1); +} + +/* + * Free mbufs held by a socket, and reserved mbuf space. 
+ */ +void +sbrelease(sb) + struct sockbuf *sb; +{ + + sbflush(sb); + sb->sb_hiwat = sb->sb_mbmax = 0; +} + +/* + * Routines to add and remove + * data from an mbuf queue. + * + * The routines sbappend() or sbappendrecord() are normally called to + * append new mbufs to a socket buffer, after checking that adequate + * space is available, comparing the function sbspace() with the amount + * of data to be added. sbappendrecord() differs from sbappend() in + * that data supplied is treated as the beginning of a new record. + * To place a sender's address, optional access rights, and data in a + * socket receive buffer, sbappendaddr() should be used. To place + * access rights and data in a socket receive buffer, sbappendrights() + * should be used. In either case, the new data begins a new record. + * Note that unlike sbappend() and sbappendrecord(), these routines check + * for the caller that there will be enough space to store the data. + * Each fails if there is not enough space, or if it cannot find mbufs + * to store additional information in. + * + * Reliable protocols may use the socket send buffer to hold data + * awaiting acknowledgement. Data is normally copied from a socket + * send buffer in a protocol with m_copy for output to a peer, + * and then removing the data from the socket buffer with sbdrop() + * or sbdroprecord() when the data is acknowledged by the peer. + */ + +/* + * Append mbuf chain m to the last record in the + * socket buffer sb. The additional space associated + * the mbuf chain is recorded in sb. Empty mbufs are + * discarded and mbufs are compacted where possible. + */ +void +sbappend(sb, m) + struct sockbuf *sb; + struct mbuf *m; +{ + register struct mbuf *n; + + if (m == 0) + return; + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + do { + if (n->m_flags & M_EOR) { + sbappendrecord(sb, m); /* XXXXXX!!!! 
*/ + return; + } + } while (n->m_next && (n = n->m_next)); + } + sbcompress(sb, m, n); +} + +#ifdef SOCKBUF_DEBUG +void +sbcheck(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m; + register int len = 0, mbcnt = 0; + + for (m = sb->sb_mb; m; m = m->m_next) { + len += m->m_len; + mbcnt += MSIZE; + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ + mbcnt += m->m_ext.ext_size; + if (m->m_nextpkt) + panic("sbcheck nextpkt"); + } + if (len != sb->sb_cc || mbcnt != sb->sb_mbcnt) { + printf("cc %d != %d || mbcnt %d != %d\n", len, sb->sb_cc, + mbcnt, sb->sb_mbcnt); + panic("sbcheck"); + } +} +#endif + +/* + * As above, except the mbuf chain + * begins a new record. + */ +void +sbappendrecord(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + + if (m0 == 0) + return; + m = sb->sb_mb; + if (m) + while (m->m_nextpkt) + m = m->m_nextpkt; + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. + */ + sballoc(sb, m0); + if (m) + m->m_nextpkt = m0; + else + sb->sb_mb = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * As above except that OOB data + * is inserted at the beginning of the sockbuf, + * but after any other OOB data. + */ +void +sbinsertoob(sb, m0) + register struct sockbuf *sb; + register struct mbuf *m0; +{ + register struct mbuf *m; + register struct mbuf **mp; + + if (m0 == 0) + return; + for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { + m = *mp; + again: + switch (m->m_type) { + + case MT_OOBDATA: + continue; /* WANT next train */ + + case MT_CONTROL: + m = m->m_next; + if (m) + goto again; /* inspect THIS train further */ + } + break; + } + /* + * Put the first mbuf on the queue. + * Note this permits zero length records. 
+ */ + sballoc(sb, m0); + m0->m_nextpkt = *mp; + *mp = m0; + m = m0->m_next; + m0->m_next = 0; + if (m && (m0->m_flags & M_EOR)) { + m0->m_flags &= ~M_EOR; + m->m_flags |= M_EOR; + } + sbcompress(sb, m, m0); +} + +/* + * Append address and data, and optionally, control (ancillary) data + * to the receive queue of a socket. If present, + * m0 must include a packet header with total length. + * Returns 0 if no space in sockbuf or insufficient mbufs. + */ +int +sbappendaddr(sb, asa, m0, control) + register struct sockbuf *sb; + struct sockaddr *asa; + struct mbuf *m0, *control; +{ + register struct mbuf *m, *n; + int space = asa->sa_len; + +if (m0 && (m0->m_flags & M_PKTHDR) == 0) +panic("sbappendaddr"); + if (m0) + space += m0->m_pkthdr.len; + for (n = control; n; n = n->m_next) { + space += n->m_len; + if (n->m_next == 0) /* keep pointer to last control buf */ + break; + } + if (space > sbspace(sb)) + return (0); + if (asa->sa_len > MLEN) + return (0); + MGET(m, M_DONTWAIT, MT_SONAME); + if (m == 0) + return (0); + m->m_len = asa->sa_len; + bcopy((caddr_t)asa, mtod(m, caddr_t), asa->sa_len); + if (n) + n->m_next = m0; /* concatenate data to control */ + else + control = m0; + m->m_next = control; + for (n = m; n; n = n->m_next) + sballoc(sb, n); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = n->m_nextpkt; + n->m_nextpkt = m; + } else + sb->sb_mb = m; + return (1); +} + +int +sbappendcontrol(sb, m0, control) + struct sockbuf *sb; + struct mbuf *control, *m0; +{ + register struct mbuf *m, *n; + int space = 0; + + if (control == 0) + panic("sbappendcontrol"); + for (m = control; ; m = m->m_next) { + space += m->m_len; + if (m->m_next == 0) + break; + } + n = m; /* save pointer to last control buffer */ + for (m = m0; m; m = m->m_next) + space += m->m_len; + if (space > sbspace(sb)) + return (0); + n->m_next = m0; /* concatenate data to control */ + for (m = control; m; m = m->m_next) + sballoc(sb, m); + n = sb->sb_mb; + if (n) { + while (n->m_nextpkt) + n = 
n->m_nextpkt; + n->m_nextpkt = control; + } else + sb->sb_mb = control; + return (1); +} + +/* + * Compress mbuf chain m into the socket + * buffer sb following mbuf n. If n + * is null, the buffer is presumed empty. + */ +void +sbcompress(sb, m, n) + register struct sockbuf *sb; + register struct mbuf *m, *n; +{ + register int eor = 0; + register struct mbuf *o; + + while (m) { + eor |= m->m_flags & M_EOR; + if (m->m_len == 0 && + (eor == 0 || + (((o = m->m_next) || (o = n)) && + o->m_type == m->m_type))) { + m = m_free(m); + continue; + } + if (n && (n->m_flags & (M_EXT | M_EOR)) == 0 && + (n->m_data + n->m_len + m->m_len) < &n->m_dat[MLEN] && + n->m_type == m->m_type) { + bcopy(mtod(m, caddr_t), mtod(n, caddr_t) + n->m_len, + (unsigned)m->m_len); + n->m_len += m->m_len; + sb->sb_cc += m->m_len; + m = m_free(m); + continue; + } + if (n) + n->m_next = m; + else + sb->sb_mb = m; + sballoc(sb, m); + n = m; + m->m_flags &= ~M_EOR; + m = m->m_next; + n->m_next = 0; + } + if (eor) { + if (n) + n->m_flags |= eor; + else + printf("semi-panic: sbcompress\n"); + } +} + +/* + * Free all mbufs in a sockbuf. + * Check that all resources are reclaimed. + */ +void +sbflush(sb) + register struct sockbuf *sb; +{ + + if (sb->sb_flags & SB_LOCK) + panic("sbflush"); + while (sb->sb_mbcnt) + sbdrop(sb, (int)sb->sb_cc); + if (sb->sb_cc || sb->sb_mb) + panic("sbflush 2"); +} + +/* + * Drop data from (the front of) a sockbuf. + */ +void +sbdrop(sb, len) + register struct sockbuf *sb; + register int len; +{ + register struct mbuf *m, *mn; + struct mbuf *next; + + next = (m = sb->sb_mb) ? 
m->m_nextpkt : 0; + while (len > 0) { + if (m == 0) { + if (next == 0) + panic("sbdrop"); + m = next; + next = m->m_nextpkt; + continue; + } + if (m->m_len > len) { + m->m_len -= len; + m->m_data += len; + sb->sb_cc -= len; + break; + } + len -= m->m_len; + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + while (m && m->m_len == 0) { + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } + if (m) { + sb->sb_mb = m; + m->m_nextpkt = next; + } else + sb->sb_mb = next; +} + +/* + * Drop a record off the front of a sockbuf + * and move the next record to the front. + */ +void +sbdroprecord(sb) + register struct sockbuf *sb; +{ + register struct mbuf *m, *mn; + + m = sb->sb_mb; + if (m) { + sb->sb_mb = m->m_nextpkt; + do { + sbfree(sb, m); + MFREE(m, mn); + m = mn; + } while (m); + } +} + +/* + * Create a "control" mbuf containing the specified data + * with the specified type for presentation on a socket buffer. + */ +struct mbuf * +sbcreatecontrol(p, size, type, level) + caddr_t p; + register int size; + int type, level; +{ + register struct cmsghdr *cp; + struct mbuf *m; + + if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + return ((struct mbuf *) NULL); + cp = mtod(m, struct cmsghdr *); + /* XXX check size? */ + (void)memcpy(CMSG_DATA(cp), p, size); + size += sizeof(*cp); + m->m_len = size; + cp->cmsg_len = size; + cp->cmsg_level = level; + cp->cmsg_type = type; + return (m); +} + +#ifdef PRU_OLDSTYLE +/* + * The following routines mediate between the old-style `pr_usrreq' + * protocol implementations and the new-style `struct pr_usrreqs' + * calling convention. 
+ */ + +/* syntactic sugar */ +#define nomb (struct mbuf *)0 + +static int +old_abort(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_ABORT, nomb, nomb, nomb); +} + +static int +old_accept(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_ACCEPT, nomb, nam, nomb); +} + +static int +old_attach(struct socket *so, int proto) +{ + return so->so_proto->pr_ousrreq(so, PRU_ATTACH, nomb, + (struct mbuf *)proto, /* XXX */ + nomb); +} + +static int +old_bind(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_BIND, nomb, nam, nomb); +} + +static int +old_connect(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_CONNECT, nomb, nam, nomb); +} + +static int +old_connect2(struct socket *so1, struct socket *so2) +{ + return so1->so_proto->pr_ousrreq(so1, PRU_CONNECT2, nomb, + (struct mbuf *)so2, nomb); +} + +static int +old_control(struct socket *so, int cmd, caddr_t data, struct ifnet *ifp) +{ + return so->so_proto->pr_ousrreq(so, PRU_CONTROL, (struct mbuf *)cmd, + (struct mbuf *)data, + (struct mbuf *)ifp); +} + +static int +old_detach(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_DETACH, nomb, nomb, nomb); +} + +static int +old_disconnect(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_DISCONNECT, nomb, nomb, nomb); +} + +static int +old_listen(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_LISTEN, nomb, nomb, nomb); +} + +static int +old_peeraddr(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_PEERADDR, nomb, nam, nomb); +} + +static int +old_rcvd(struct socket *so, int flags) +{ + return so->so_proto->pr_ousrreq(so, PRU_RCVD, nomb, + (struct mbuf *)flags, /* XXX */ + nomb); +} + +static int +old_rcvoob(struct socket *so, struct mbuf *m, int flags) +{ + return so->so_proto->pr_ousrreq(so, PRU_RCVOOB, m, + (struct mbuf *)flags, /* XXX */ + nomb); +} + +static int +old_send(struct 
socket *so, int flags, struct mbuf *m, struct mbuf *addr, + struct mbuf *control) +{ + int req; + + if (flags & PRUS_OOB) { + req = PRU_SENDOOB; + } else if(flags & PRUS_EOF) { + req = PRU_SEND_EOF; + } else { + req = PRU_SEND; + } + return so->so_proto->pr_ousrreq(so, req, m, addr, control); +} + +static int +old_sense(struct socket *so, struct stat *sb) +{ + return so->so_proto->pr_ousrreq(so, PRU_SENSE, (struct mbuf *)sb, + nomb, nomb); +} + +static int +old_shutdown(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_SHUTDOWN, nomb, nomb, nomb); +} + +static int +old_sockaddr(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_SOCKADDR, nomb, nam, nomb); +} + +struct pr_usrreqs pru_oldstyle = { + old_abort, old_accept, old_attach, old_bind, old_connect, + old_connect2, old_control, old_detach, old_disconnect, + old_listen, old_peeraddr, old_rcvd, old_rcvoob, old_send, + old_sense, old_shutdown, old_sockaddr +}; + +#endif /* PRU_OLDSTYLE */ + +/* + * Some routines that return EOPNOTSUPP for entry points that are not + * supported by a protocol. Fill in as needed. + */ +int +pru_accept_notsupp(struct socket *so, struct mbuf *nam) +{ + return EOPNOTSUPP; +} + +int +pru_connect2_notsupp(struct socket *so1, struct socket *so2) +{ + return EOPNOTSUPP; +} + +int +pru_listen_notsupp(struct socket *so) +{ + return EOPNOTSUPP; +} + +int +pru_rcvd_notsupp(struct socket *so, int flags) +{ + return EOPNOTSUPP; +} + +int +pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) +{ + return EOPNOTSUPP; +} + +/* + * This isn't really a ``null'' operation, but it's the default one + * and doesn't do anything destructive. + */ +int +pru_sense_null(struct socket *so, struct stat *sb) +{ + sb->st_blksize = so->so_snd.sb_hiwat; + return 0; +} + +/* + * Here is the definition of some of the basic objects in the kern.ipc + * branch of the MIB. 
+ */ +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); + +/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ +static int dummy; +SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); + +SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, &sb_max, 0, "") +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, + &sb_efficiency, 0, ""); diff --git a/sys/kern/uipc_socket.c b/sys/kern/uipc_socket.c index a9c5453..9f70207 100644 --- a/sys/kern/uipc_socket.c +++ b/sys/kern/uipc_socket.c @@ -30,13 +30,15 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)uipc_socket.c 8.6 (Berkeley) 5/2/95 + * @(#)uipc_socket.c 8.3 (Berkeley) 4/15/94 + * $Id: uipc_socket.c,v 1.24 1997/02/24 20:30:56 wollman Exp $ */ #include <sys/param.h> +#include <sys/queue.h> #include <sys/systm.h> #include <sys/proc.h> -#include <sys/file.h> +#include <sys/fcntl.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/domain.h> @@ -45,6 +47,12 @@ #include <sys/socket.h> #include <sys/socketvar.h> #include <sys/resourcevar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> + +static int somaxconn = SOMAXCONN; +SYSCTL_INT(_kern_ipc, KIPC_SOMAXCONN, somaxconn, CTLFLAG_RW, &somaxconn, + 0, ""); /* * Socket operation routines. 
@@ -55,13 +63,13 @@ */ /*ARGSUSED*/ int -socreate(dom, aso, type, proto) +socreate(dom, aso, type, proto, p) int dom; struct socket **aso; register int type; int proto; + struct proc *p; { - struct proc *p = curproc; /* XXX */ register struct protosw *prp; register struct socket *so; register int error; @@ -70,18 +78,19 @@ socreate(dom, aso, type, proto) prp = pffindproto(dom, proto, type); else prp = pffindtype(dom, type); - if (prp == 0 || prp->pr_usrreq == 0) + if (prp == 0 || prp->pr_usrreqs == 0) return (EPROTONOSUPPORT); if (prp->pr_type != type) return (EPROTOTYPE); MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_WAIT); bzero((caddr_t)so, sizeof(*so)); + TAILQ_INIT(&so->so_incomp); + TAILQ_INIT(&so->so_comp); so->so_type = type; if (p->p_ucred->cr_uid == 0) so->so_state = SS_PRIV; so->so_proto = prp; - error = (*prp->pr_usrreq)(so, PRU_ATTACH, (struct mbuf *)0, - (struct mbuf *)(long)proto, (struct mbuf *)0); + error = (*prp->pr_usrreqs->pru_attach)(so, proto); if (error) { so->so_state |= SS_NOFDREF; sofree(so); @@ -99,9 +108,7 @@ sobind(so, nam) int s = splnet(); int error; - error = - (*so->so_proto->pr_usrreq)(so, PRU_BIND, - (struct mbuf *)0, nam, (struct mbuf *)0); + error = (*so->so_proto->pr_usrreqs->pru_bind)(so, nam); splx(s); return (error); } @@ -113,33 +120,40 @@ solisten(so, backlog) { int s = splnet(), error; - error = - (*so->so_proto->pr_usrreq)(so, PRU_LISTEN, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); + error = (*so->so_proto->pr_usrreqs->pru_listen)(so); if (error) { splx(s); return (error); } - if (so->so_q == 0) + if (so->so_comp.tqh_first == NULL) so->so_options |= SO_ACCEPTCONN; - if (backlog < 0) - backlog = 0; - so->so_qlimit = min(backlog, SOMAXCONN); + if (backlog < 0 || backlog > somaxconn) + backlog = somaxconn; + so->so_qlimit = backlog; splx(s); return (0); } -int +void sofree(so) register struct socket *so; { + struct socket *head = so->so_head; if (so->so_pcb || (so->so_state & SS_NOFDREF) == 0) return; - 
if (so->so_head) { - if (!soqremque(so, 0) && !soqremque(so, 1)) - panic("sofree dq"); - so->so_head = 0; + if (head != NULL) { + if (so->so_state & SS_INCOMP) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + } else if (so->so_state & SS_COMP) { + TAILQ_REMOVE(&head->so_comp, so, so_list); + } else { + panic("sofree: not queued"); + } + head->so_qlen--; + so->so_state &= ~(SS_INCOMP|SS_COMP); + so->so_head = NULL; } sbrelease(&so->so_snd); sorflush(so); @@ -159,10 +173,16 @@ soclose(so) int error = 0; if (so->so_options & SO_ACCEPTCONN) { - while (so->so_q0) - (void) soabort(so->so_q0); - while (so->so_q) - (void) soabort(so->so_q); + struct socket *sp, *sonext; + + for (sp = so->so_incomp.tqh_first; sp != NULL; sp = sonext) { + sonext = sp->so_list.tqe_next; + (void) soabort(sp); + } + for (sp = so->so_comp.tqh_first; sp != NULL; sp = sonext) { + sonext = sp->so_list.tqe_next; + (void) soabort(sp); + } } if (so->so_pcb == 0) goto discard; @@ -176,17 +196,17 @@ soclose(so) if ((so->so_state & SS_ISDISCONNECTING) && (so->so_state & SS_NBIO)) goto drop; - while (so->so_state & SS_ISCONNECTED) - if (error = tsleep((caddr_t)&so->so_timeo, - PSOCK | PCATCH, netcls, so->so_linger * hz)) + while (so->so_state & SS_ISCONNECTED) { + error = tsleep((caddr_t)&so->so_timeo, + PSOCK | PCATCH, "soclos", so->so_linger); + if (error) break; + } } } drop: if (so->so_pcb) { - int error2 = - (*so->so_proto->pr_usrreq)(so, PRU_DETACH, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); + int error2 = (*so->so_proto->pr_usrreqs->pru_detach)(so); if (error == 0) error = error2; } @@ -207,9 +227,7 @@ soabort(so) struct socket *so; { - return ( - (*so->so_proto->pr_usrreq)(so, PRU_ABORT, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)); + return (*so->so_proto->pr_usrreqs->pru_abort)(so); } int @@ -223,8 +241,7 @@ soaccept(so, nam) if ((so->so_state & SS_NOFDREF) == 0) panic("soaccept: !NOFDREF"); so->so_state &= ~SS_NOFDREF; - error = 
(*so->so_proto->pr_usrreq)(so, PRU_ACCEPT, - (struct mbuf *)0, nam, (struct mbuf *)0); + error = (*so->so_proto->pr_usrreqs->pru_accept)(so, nam); splx(s); return (error); } @@ -251,8 +268,7 @@ soconnect(so, nam) (error = sodisconnect(so)))) error = EISCONN; else - error = (*so->so_proto->pr_usrreq)(so, PRU_CONNECT, - (struct mbuf *)0, nam, (struct mbuf *)0); + error = (*so->so_proto->pr_usrreqs->pru_connect)(so, nam); splx(s); return (error); } @@ -265,8 +281,7 @@ soconnect2(so1, so2) int s = splnet(); int error; - error = (*so1->so_proto->pr_usrreq)(so1, PRU_CONNECT2, - (struct mbuf *)0, (struct mbuf *)so2, (struct mbuf *)0); + error = (*so1->so_proto->pr_usrreqs->pru_connect2)(so1, so2); splx(s); return (error); } @@ -286,8 +301,7 @@ sodisconnect(so) error = EALREADY; goto bad; } - error = (*so->so_proto->pr_usrreq)(so, PRU_DISCONNECT, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0); + error = (*so->so_proto->pr_usrreqs->pru_disconnect)(so); bad: splx(s); return (error); @@ -349,7 +363,8 @@ sosend(so, addr, uio, top, control, flags) #define snderr(errno) { error = errno; splx(s); goto release; } restart: - if (error = sblock(&so->so_snd, SBLOCKWAIT(flags))) + error = sblock(&so->so_snd, SBLOCKWAIT(flags)); + if (error) goto out; do { s = splnet(); @@ -358,17 +373,25 @@ restart: if (so->so_error) snderr(so->so_error); if ((so->so_state & SS_ISCONNECTED) == 0) { - if (so->so_proto->pr_flags & PR_CONNREQUIRED) { + /* + * `sendto' and `sendmsg' is allowed on a connection- + * based socket if it supports implied connect. + * Return ENOTCONN if not connected and no address is + * supplied. + */ + if ((so->so_proto->pr_flags & PR_CONNREQUIRED) && + (so->so_proto->pr_flags & PR_IMPLOPCL) == 0) { if ((so->so_state & SS_ISCONFIRMING) == 0 && !(resid == 0 && clen != 0)) snderr(ENOTCONN); } else if (addr == 0) - snderr(EDESTADDRREQ); + snderr(so->so_proto->pr_flags & PR_CONNREQUIRED ? 
+ ENOTCONN : EDESTADDRREQ); } space = sbspace(&so->so_snd); if (flags & MSG_OOB) space += 1024; - if (atomic && resid > so->so_snd.sb_hiwat || + if ((atomic && resid > so->so_snd.sb_hiwat) || clen > so->so_snd.sb_hiwat) snderr(EMSGSIZE); if (space < resid + clen && uio && @@ -403,25 +426,15 @@ restart: MGET(m, M_WAIT, MT_DATA); mlen = MLEN; } - if (resid >= MINCLSIZE && space >= MCLBYTES) { + if (resid >= MINCLSIZE) { MCLGET(m, M_WAIT); if ((m->m_flags & M_EXT) == 0) goto nopages; mlen = MCLBYTES; -#ifdef MAPPED_MBUFS - len = min(MCLBYTES, resid); -#else - if (atomic && top == 0) { - len = min(MCLBYTES - max_hdr, resid); - m->m_data += max_hdr; - } else - len = min(MCLBYTES, resid); -#endif - space -= MCLBYTES; + len = min(min(mlen, resid), space); } else { nopages: len = min(min(mlen, resid), space); - space -= len; /* * For datagram protocols, leave room * for protocol headers in first mbuf. @@ -429,6 +442,7 @@ nopages: if (atomic && top == 0 && len < mlen) MH_ALIGN(m, len); } + space -= len; error = uiomove(mtod(m, caddr_t), (int)len, uio); resid = uio->uio_resid; m->m_len = len; @@ -446,8 +460,17 @@ nopages: if (dontroute) so->so_options |= SO_DONTROUTE; s = splnet(); /* XXX */ - error = (*so->so_proto->pr_usrreq)(so, - (flags & MSG_OOB) ? PRU_SENDOOB : PRU_SEND, + error = (*so->so_proto->pr_usrreqs->pru_send)(so, + (flags & MSG_OOB) ? PRUS_OOB : + /* + * If the user set MSG_EOF, the protocol + * understands this flag and nothing left to + * send then use PRU_SEND_EOF instead of PRU_SEND. + */ + ((flags & MSG_EOF) && + (so->so_proto->pr_flags & PR_IMPLOPCL) && + (resid <= 0)) ? 
+ PRUS_EOF : 0, top, addr, control); splx(s); if (dontroute) @@ -500,7 +523,7 @@ soreceive(so, paddr, uio, mp0, controlp, flagsp) register int flags, len, error, s, offset; struct protosw *pr = so->so_proto; struct mbuf *nextrecord; - int moff, type; + int moff, type = 0; int orig_resid = uio->uio_resid; mp = mp0; @@ -514,8 +537,7 @@ soreceive(so, paddr, uio, mp0, controlp, flagsp) flags = 0; if (flags & MSG_OOB) { m = m_get(M_WAIT, MT_DATA); - error = (*pr->pr_usrreq)(so, PRU_RCVOOB, m, - (struct mbuf *)(long)(flags & MSG_PEEK), (struct mbuf *)0); + error = (*pr->pr_usrreqs->pru_rcvoob)(so, m, flags & MSG_PEEK); if (error) goto bad; do { @@ -531,11 +553,11 @@ bad: if (mp) *mp = (struct mbuf *)0; if (so->so_state & SS_ISCONFIRMING && uio->uio_resid) - (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)0, (struct mbuf *)0); + (*pr->pr_usrreqs->pru_rcvd)(so, 0); restart: - if (error = sblock(&so->so_rcv, SBLOCKWAIT(flags))) + error = sblock(&so->so_rcv, SBLOCKWAIT(flags)); + if (error) return (error); s = splnet(); @@ -545,17 +567,17 @@ restart: * (subject to any timeout) if: * 1. the current count is less than the low water mark, or * 2. MSG_WAITALL is set, and it is possible to do the entire - * receive operation at once if we block (resid <= hiwat), or - * 3. MSG_DONTWAIT is not set. + * receive operation at once if we block (resid <= hiwat). + * 3. MSG_DONTWAIT is not set * If MSG_WAITALL is set but resid is larger than the receive buffer, * we have to do the receive in sections, and thus risk returning * a short count if a timeout or signal occurs after we start. 
*/ - if (m == 0 || ((flags & MSG_DONTWAIT) == 0 && + if (m == 0 || (((flags & MSG_DONTWAIT) == 0 && so->so_rcv.sb_cc < uio->uio_resid) && (so->so_rcv.sb_cc < so->so_rcv.sb_lowat || ((flags & MSG_WAITALL) && uio->uio_resid <= so->so_rcv.sb_hiwat)) && - m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0) { + m->m_nextpkt == 0 && (pr->pr_flags & PR_ATOMIC) == 0)) { #ifdef DIAGNOSTIC if (m == 0 && so->so_rcv.sb_cc) panic("receive 1"); @@ -687,6 +709,8 @@ dontblock: splx(s); error = uiomove(mtod(m, caddr_t) + moff, (int)len, uio); s = splnet(); + if (error) + goto release; } else uio->uio_resid -= len; if (len == m->m_len - moff) { @@ -753,7 +777,8 @@ dontblock: splx(s); return (0); } - if (m = so->so_rcv.sb_mb) + m = so->so_rcv.sb_mb; + if (m) nextrecord = m->m_nextpkt; } } @@ -767,9 +792,7 @@ dontblock: if (m == 0) so->so_rcv.sb_mb = nextrecord; if (pr->pr_flags & PR_WANTRCVD && so->so_pcb) - (*pr->pr_usrreq)(so, PRU_RCVD, (struct mbuf *)0, - (struct mbuf *)(long)flags, (struct mbuf *)0, - (struct mbuf *)0); + (*pr->pr_usrreqs->pru_rcvd)(so, flags); } if (orig_resid == uio->uio_resid && orig_resid && (flags & MSG_EOR) == 0 && (so->so_state & SS_CANTRCVMORE) == 0) { @@ -777,7 +800,7 @@ dontblock: splx(s); goto restart; } - + if (flagsp) *flagsp |= flags; release: @@ -797,8 +820,7 @@ soshutdown(so, how) if (how & FREAD) sorflush(so); if (how & FWRITE) - return ((*pr->pr_usrreq)(so, PRU_SHUTDOWN, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)); + return ((*pr->pr_usrreqs->pru_shutdown)(so)); return (0); } @@ -857,6 +879,7 @@ sosetopt(so, level, optname, m0) case SO_REUSEADDR: case SO_REUSEPORT: case SO_OOBINLINE: + case SO_TIMESTAMP: if (m == NULL || m->m_len < sizeof (int)) { error = EINVAL; goto bad; @@ -907,7 +930,7 @@ sosetopt(so, level, optname, m0) goto bad; } tv = mtod(m, struct timeval *); - if (tv->tv_sec * hz + tv->tv_usec / tick > SHRT_MAX) { + if (tv->tv_sec > SHRT_MAX / hz - hz) { error = EDOM; goto bad; } @@ -925,6 +948,11 @@ sosetopt(so, level, 
optname, m0) break; } + case SO_PRIVSTATE: + /* we don't care what the parameter is... */ + so->so_state &= ~SS_PRIV; + break; + default: error = ENOPROTOOPT; break; @@ -976,9 +1004,14 @@ sogetopt(so, level, optname, mp) case SO_REUSEPORT: case SO_BROADCAST: case SO_OOBINLINE: + case SO_TIMESTAMP: *mtod(m, int *) = so->so_options & optname; break; + case SO_PRIVSTATE: + *mtod(m, int *) = so->so_state & SS_PRIV; + break; + case SO_TYPE: *mtod(m, int *) = so->so_type; break; diff --git a/sys/kern/uipc_socket2.c b/sys/kern/uipc_socket2.c index 865108a..e19db0c 100644 --- a/sys/kern/uipc_socket2.c +++ b/sys/kern/uipc_socket2.c @@ -30,30 +30,32 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)uipc_socket2.c 8.2 (Berkeley) 2/14/95 + * @(#)uipc_socket2.c 8.1 (Berkeley) 6/10/93 + * $Id: uipc_socket2.c,v 1.21 1997/02/19 19:15:43 wollman Exp $ */ #include <sys/param.h> #include <sys/systm.h> +#include <sys/kernel.h> #include <sys/proc.h> #include <sys/file.h> #include <sys/buf.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/protosw.h> +#include <sys/stat.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/sysctl.h> /* * Primitive routines for operating on sockets and socket buffers */ -/* strings for sleep message: */ -char netio[] = "netio"; -char netcon[] = "netcon"; -char netcls[] = "netcls"; +u_long sb_max = SB_MAX; /* XXX should be static */ -u_long sb_max = SB_MAX; /* patchable */ +static u_long sb_efficiency = 8; /* parameter for sbreserve() */ /* * Procedures to manipulate state flags of socket @@ -76,7 +78,7 @@ u_long sb_max = SB_MAX; /* patchable */ * structure queued on so_q0 by calling sonewconn(). When the connection * is established, soisconnected() is called, and transfers the * socket structure to so_q, making it available to accept(). - * + * * If a socket is closed with sockets on either * so_q0 or so_q, these sockets are dropped. 
* @@ -102,8 +104,12 @@ soisconnected(so) so->so_state &= ~(SS_ISCONNECTING|SS_ISDISCONNECTING|SS_ISCONFIRMING); so->so_state |= SS_ISCONNECTED; - if (head && soqremque(so, 0)) { - soqinsque(head, so, 1); + if (head && (so->so_state & SS_INCOMP)) { + TAILQ_REMOVE(&head->so_incomp, so, so_list); + head->so_incqlen--; + so->so_state &= ~SS_INCOMP; + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; sorwakeup(head); wakeup((caddr_t)&head->so_timeo); } else { @@ -138,6 +144,49 @@ soisdisconnected(so) } /* + * Return a random connection that hasn't been serviced yet and + * is eligible for discard. There is a one in qlen chance that + * we will return a null, saying that there are no dropable + * requests. In this case, the protocol specific code should drop + * the new request. This insures fairness. + * + * This may be used in conjunction with protocol specific queue + * congestion routines. + */ +struct socket * +sodropablereq(head) + register struct socket *head; +{ + register struct socket *so; + unsigned int i, j, qlen; + + static int rnd; + static long old_mono_secs; + static unsigned int cur_cnt, old_cnt; + + if ((i = (mono_time.tv_sec - old_mono_secs)) != 0) { + old_mono_secs = mono_time.tv_sec; + old_cnt = cur_cnt / i; + cur_cnt = 0; + } + + so = TAILQ_FIRST(&head->so_incomp); + if (!so) + return (so); + + qlen = head->so_incqlen; + if (++cur_cnt > qlen || old_cnt > qlen) { + rnd = (314159 * rnd + 66329) & 0xffff; + j = ((qlen + 1) * rnd) >> 16; + + while (j-- && so) + so = TAILQ_NEXT(so, so_list); + } + + return (so); +} + +/* * When an attempt at a new connection is noted on a socket * which accepts connections, sonewconn is called. If the * connection is possible (subject to space constraints, etc.) @@ -154,14 +203,14 @@ sonewconn1(head, connstatus) int connstatus; { register struct socket *so; - int soqueue = connstatus ? 
1 : 0; - if (head->so_qlen + head->so_q0len > 3 * head->so_qlimit / 2) + if (head->so_qlen > 3 * head->so_qlimit / 2) return ((struct socket *)0); MALLOC(so, struct socket *, sizeof(*so), M_SOCKET, M_DONTWAIT); - if (so == NULL) + if (so == NULL) return ((struct socket *)0); bzero((caddr_t)so, sizeof(*so)); + so->so_head = head; so->so_type = head->so_type; so->so_options = head->so_options &~ SO_ACCEPTCONN; so->so_linger = head->so_linger; @@ -170,13 +219,21 @@ sonewconn1(head, connstatus) so->so_timeo = head->so_timeo; so->so_pgid = head->so_pgid; (void) soreserve(so, head->so_snd.sb_hiwat, head->so_rcv.sb_hiwat); - soqinsque(head, so, soqueue); - if ((*so->so_proto->pr_usrreq)(so, PRU_ATTACH, - (struct mbuf *)0, (struct mbuf *)0, (struct mbuf *)0)) { - (void) soqremque(so, soqueue); + + if ((*so->so_proto->pr_usrreqs->pru_attach)(so, 0)) { (void) free((caddr_t)so, M_SOCKET); return ((struct socket *)0); } + + if (connstatus) { + TAILQ_INSERT_TAIL(&head->so_comp, so, so_list); + so->so_state |= SS_COMP; + } else { + TAILQ_INSERT_TAIL(&head->so_incomp, so, so_list); + so->so_state |= SS_INCOMP; + head->so_incqlen++; + } + head->so_qlen++; if (connstatus) { sorwakeup(head); wakeup((caddr_t)&head->so_timeo); @@ -185,57 +242,6 @@ sonewconn1(head, connstatus) return (so); } -void -soqinsque(head, so, q) - register struct socket *head, *so; - int q; -{ - - register struct socket **prev; - so->so_head = head; - if (q == 0) { - head->so_q0len++; - so->so_q0 = 0; - for (prev = &(head->so_q0); *prev; ) - prev = &((*prev)->so_q0); - } else { - head->so_qlen++; - so->so_q = 0; - for (prev = &(head->so_q); *prev; ) - prev = &((*prev)->so_q); - } - *prev = so; -} - -int -soqremque(so, q) - register struct socket *so; - int q; -{ - register struct socket *head, *prev, *next; - - head = so->so_head; - prev = head; - for (;;) { - next = q ? 
prev->so_q : prev->so_q0; - if (next == so) - break; - if (next == 0) - return (0); - prev = next; - } - if (q == 0) { - prev->so_q0 = next->so_q0; - head->so_q0len--; - } else { - prev->so_q = next->so_q; - head->so_qlen--; - } - next->so_q0 = next->so_q = 0; - next->so_head = 0; - return (1); -} - /* * Socantsendmore indicates that no more data will be sent on the * socket; it would normally be applied to a socket when the user @@ -274,11 +280,11 @@ sbwait(sb) sb->sb_flags |= SB_WAIT; return (tsleep((caddr_t)&sb->sb_cc, - (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, netio, + (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK | PCATCH, "sbwait", sb->sb_timeo)); } -/* +/* * Lock a sockbuf already known to be locked; * return any error returned from sleep (EINTR). */ @@ -290,9 +296,10 @@ sb_lock(sb) while (sb->sb_flags & SB_LOCK) { sb->sb_flags |= SB_WANT; - if (error = tsleep((caddr_t)&sb->sb_flags, + error = tsleep((caddr_t)&sb->sb_flags, (sb->sb_flags & SB_NOINTR) ? PSOCK : PSOCK|PCATCH, - netio, 0)) + "sblock", 0); + if (error) return (error); } sb->sb_flags |= SB_LOCK; @@ -390,11 +397,10 @@ sbreserve(sb, cc) struct sockbuf *sb; u_long cc; { - - if (cc > sb_max * MCLBYTES / (MSIZE + MCLBYTES)) + if ((u_quad_t)cc > (u_quad_t)sb_max * MCLBYTES / (MSIZE + MCLBYTES)) return (0); sb->sb_hiwat = cc; - sb->sb_mbmax = min(cc * 2, sb_max); + sb->sb_mbmax = min(cc * sb_efficiency, sb_max); if (sb->sb_lowat > sb->sb_hiwat) sb->sb_lowat = sb->sb_hiwat; return (1); @@ -452,7 +458,8 @@ sbappend(sb, m) if (m == 0) return; - if (n = sb->sb_mb) { + n = sb->sb_mb; + if (n) { while (n->m_nextpkt) n = n->m_nextpkt; do { @@ -476,7 +483,7 @@ sbcheck(sb) for (m = sb->sb_mb; m; m = m->m_next) { len += m->m_len; mbcnt += MSIZE; - if (m->m_flags & M_EXT) + if (m->m_flags & M_EXT) /*XXX*/ /* pretty sure this is bogus */ mbcnt += m->m_ext.ext_size; if (m->m_nextpkt) panic("sbcheck nextpkt"); @@ -502,7 +509,8 @@ sbappendrecord(sb, m0) if (m0 == 0) return; - if (m = sb->sb_mb) + m = 
sb->sb_mb; + if (m) while (m->m_nextpkt) m = m->m_nextpkt; /* @@ -538,7 +546,8 @@ sbinsertoob(sb, m0) if (m0 == 0) return; - for (mp = &sb->sb_mb; m = *mp; mp = &((*mp)->m_nextpkt)) { + for (mp = &sb->sb_mb; *mp ; mp = &((*mp)->m_nextpkt)) { + m = *mp; again: switch (m->m_type) { @@ -546,7 +555,8 @@ sbinsertoob(sb, m0) continue; /* WANT next train */ case MT_CONTROL: - if (m = m->m_next) + m = m->m_next; + if (m) goto again; /* inspect THIS train further */ } break; @@ -607,7 +617,8 @@ panic("sbappendaddr"); m->m_next = control; for (n = m; n; n = n->m_next) sballoc(sb, n); - if (n = sb->sb_mb) { + n = sb->sb_mb; + if (n) { while (n->m_nextpkt) n = n->m_nextpkt; n->m_nextpkt = m; @@ -619,7 +630,7 @@ panic("sbappendaddr"); int sbappendcontrol(sb, m0, control) struct sockbuf *sb; - struct mbuf *m0, *control; + struct mbuf *control, *m0; { register struct mbuf *m, *n; int space = 0; @@ -639,7 +650,8 @@ sbappendcontrol(sb, m0, control) n->m_next = m0; /* concatenate data to control */ for (m = control; m; m = m->m_next) sballoc(sb, m); - if (n = sb->sb_mb) { + n = sb->sb_mb; + if (n) { while (n->m_nextpkt) n = n->m_nextpkt; n->m_nextpkt = control; @@ -774,6 +786,233 @@ sbdroprecord(sb) do { sbfree(sb, m); MFREE(m, mn); - } while (m = mn); + m = mn; + } while (m); } } + +/* + * Create a "control" mbuf containing the specified data + * with the specified type for presentation on a socket buffer. + */ +struct mbuf * +sbcreatecontrol(p, size, type, level) + caddr_t p; + register int size; + int type, level; +{ + register struct cmsghdr *cp; + struct mbuf *m; + + if ((m = m_get(M_DONTWAIT, MT_CONTROL)) == NULL) + return ((struct mbuf *) NULL); + cp = mtod(m, struct cmsghdr *); + /* XXX check size? 
*/ + (void)memcpy(CMSG_DATA(cp), p, size); + size += sizeof(*cp); + m->m_len = size; + cp->cmsg_len = size; + cp->cmsg_level = level; + cp->cmsg_type = type; + return (m); +} + +#ifdef PRU_OLDSTYLE +/* + * The following routines mediate between the old-style `pr_usrreq' + * protocol implementations and the new-style `struct pr_usrreqs' + * calling convention. + */ + +/* syntactic sugar */ +#define nomb (struct mbuf *)0 + +static int +old_abort(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_ABORT, nomb, nomb, nomb); +} + +static int +old_accept(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_ACCEPT, nomb, nam, nomb); +} + +static int +old_attach(struct socket *so, int proto) +{ + return so->so_proto->pr_ousrreq(so, PRU_ATTACH, nomb, + (struct mbuf *)proto, /* XXX */ + nomb); +} + +static int +old_bind(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_BIND, nomb, nam, nomb); +} + +static int +old_connect(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_CONNECT, nomb, nam, nomb); +} + +static int +old_connect2(struct socket *so1, struct socket *so2) +{ + return so1->so_proto->pr_ousrreq(so1, PRU_CONNECT2, nomb, + (struct mbuf *)so2, nomb); +} + +static int +old_control(struct socket *so, int cmd, caddr_t data, struct ifnet *ifp) +{ + return so->so_proto->pr_ousrreq(so, PRU_CONTROL, (struct mbuf *)cmd, + (struct mbuf *)data, + (struct mbuf *)ifp); +} + +static int +old_detach(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_DETACH, nomb, nomb, nomb); +} + +static int +old_disconnect(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_DISCONNECT, nomb, nomb, nomb); +} + +static int +old_listen(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_LISTEN, nomb, nomb, nomb); +} + +static int +old_peeraddr(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_PEERADDR, nomb, nam, nomb); +} + 
+static int +old_rcvd(struct socket *so, int flags) +{ + return so->so_proto->pr_ousrreq(so, PRU_RCVD, nomb, + (struct mbuf *)flags, /* XXX */ + nomb); +} + +static int +old_rcvoob(struct socket *so, struct mbuf *m, int flags) +{ + return so->so_proto->pr_ousrreq(so, PRU_RCVOOB, m, + (struct mbuf *)flags, /* XXX */ + nomb); +} + +static int +old_send(struct socket *so, int flags, struct mbuf *m, struct mbuf *addr, + struct mbuf *control) +{ + int req; + + if (flags & PRUS_OOB) { + req = PRU_SENDOOB; + } else if(flags & PRUS_EOF) { + req = PRU_SEND_EOF; + } else { + req = PRU_SEND; + } + return so->so_proto->pr_ousrreq(so, req, m, addr, control); +} + +static int +old_sense(struct socket *so, struct stat *sb) +{ + return so->so_proto->pr_ousrreq(so, PRU_SENSE, (struct mbuf *)sb, + nomb, nomb); +} + +static int +old_shutdown(struct socket *so) +{ + return so->so_proto->pr_ousrreq(so, PRU_SHUTDOWN, nomb, nomb, nomb); +} + +static int +old_sockaddr(struct socket *so, struct mbuf *nam) +{ + return so->so_proto->pr_ousrreq(so, PRU_SOCKADDR, nomb, nam, nomb); +} + +struct pr_usrreqs pru_oldstyle = { + old_abort, old_accept, old_attach, old_bind, old_connect, + old_connect2, old_control, old_detach, old_disconnect, + old_listen, old_peeraddr, old_rcvd, old_rcvoob, old_send, + old_sense, old_shutdown, old_sockaddr +}; + +#endif /* PRU_OLDSTYLE */ + +/* + * Some routines that return EOPNOTSUPP for entry points that are not + * supported by a protocol. Fill in as needed. 
+ */ +int +pru_accept_notsupp(struct socket *so, struct mbuf *nam) +{ + return EOPNOTSUPP; +} + +int +pru_connect2_notsupp(struct socket *so1, struct socket *so2) +{ + return EOPNOTSUPP; +} + +int +pru_listen_notsupp(struct socket *so) +{ + return EOPNOTSUPP; +} + +int +pru_rcvd_notsupp(struct socket *so, int flags) +{ + return EOPNOTSUPP; +} + +int +pru_rcvoob_notsupp(struct socket *so, struct mbuf *m, int flags) +{ + return EOPNOTSUPP; +} + +/* + * This isn't really a ``null'' operation, but it's the default one + * and doesn't do anything destructive. + */ +int +pru_sense_null(struct socket *so, struct stat *sb) +{ + sb->st_blksize = so->so_snd.sb_hiwat; + return 0; +} + +/* + * Here is the definition of some of the basic objects in the kern.ipc + * branch of the MIB. + */ +SYSCTL_NODE(_kern, KERN_IPC, ipc, CTLFLAG_RW, 0, "IPC"); + +/* This takes the place of kern.maxsockbuf, which moved to kern.ipc. */ +static int dummy; +SYSCTL_INT(_kern, KERN_DUMMY, dummy, CTLFLAG_RW, &dummy, 0, ""); + +SYSCTL_INT(_kern_ipc, KIPC_MAXSOCKBUF, maxsockbuf, CTLFLAG_RW, &sb_max, 0, "") +SYSCTL_INT(_kern_ipc, KIPC_SOCKBUF_WASTE, sockbuf_waste_factor, CTLFLAG_RW, + &sb_efficiency, 0, ""); diff --git a/sys/kern/uipc_syscalls.c b/sys/kern/uipc_syscalls.c index 800434c..e3aca30 100644 --- a/sys/kern/uipc_syscalls.c +++ b/sys/kern/uipc_syscalls.c @@ -30,26 +30,43 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)uipc_syscalls.c 8.6 (Berkeley) 2/14/95 + * @(#)uipc_syscalls.c 8.4 (Berkeley) 2/21/94 + * $Id: uipc_syscalls.c,v 1.22 1997/02/22 09:39:29 peter Exp $ */ +#include "opt_ktrace.h" + #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> #include <sys/filedesc.h> #include <sys/proc.h> +#include <sys/fcntl.h> #include <sys/file.h> #include <sys/buf.h> #include <sys/malloc.h> #include <sys/mbuf.h> #include <sys/protosw.h> +#include <sys/stat.h> #include <sys/socket.h> #include <sys/socketvar.h> +#include <sys/signalvar.h> +#include <sys/un.h> #ifdef KTRACE #include <sys/ktrace.h> #endif -#include <sys/mount.h> -#include <sys/syscallargs.h> +extern int sendit __P((struct proc *p, int s, struct msghdr *mp, int flags, + int *retsize)); +extern int recvit __P((struct proc *p, int s, struct msghdr *mp, + caddr_t namelenp, int *retsize)); + +static int accept1 __P((struct proc *p, struct accept_args *uap, int *retval, + int compat)); +static int getsockname1 __P((struct proc *p, struct getsockname_args *uap, + int *retval, int compat)); +static int getpeername1 __P((struct proc *p, struct getpeername_args *uap, + int *retval, int compat)); /* * System call interface to the socket abstraction. 
@@ -64,24 +81,25 @@ int socket(p, uap, retval) struct proc *p; register struct socket_args /* { - syscallarg(int) domain; - syscallarg(int) type; - syscallarg(int) protocol; + int domain; + int type; + int protocol; } */ *uap; - register_t *retval; + int *retval; { struct filedesc *fdp = p->p_fd; struct socket *so; struct file *fp; int fd, error; - if (error = falloc(p, &fp, &fd)) + error = falloc(p, &fp, &fd); + if (error) return (error); fp->f_flag = FREAD|FWRITE; fp->f_type = DTYPE_SOCKET; fp->f_ops = &socketops; - if (error = socreate(SCARG(uap, domain), &so, SCARG(uap, type), - SCARG(uap, protocol))) { + error = socreate(uap->domain, &so, uap->type, uap->protocol, p); + if (error) { fdp->fd_ofiles[fd] = 0; ffree(fp); } else { @@ -96,20 +114,21 @@ int bind(p, uap, retval) struct proc *p; register struct bind_args /* { - syscallarg(int) s; - syscallarg(caddr_t) name; - syscallarg(int) namelen; + int s; + caddr_t name; + int namelen; } */ *uap; - register_t *retval; + int *retval; { struct file *fp; struct mbuf *nam; int error; - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); - if (error = sockargs(&nam, SCARG(uap, name), SCARG(uap, namelen), - MT_SONAME)) + error = sockargs(&nam, uap->name, uap->namelen, MT_SONAME); + if (error) return (error); error = sobind((struct socket *)fp->f_data, nam); m_freem(nam); @@ -121,159 +140,161 @@ int listen(p, uap, retval) struct proc *p; register struct listen_args /* { - syscallarg(int) s; - syscallarg(int) backlog; + int s; + int backlog; } */ *uap; - register_t *retval; + int *retval; { struct file *fp; int error; - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); - return (solisten((struct socket *)fp->f_data, SCARG(uap, backlog))); + return (solisten((struct socket *)fp->f_data, uap->backlog)); } -#ifdef COMPAT_OLDSOCK -int -accept(p, uap, retval) - struct proc *p; - struct 
accept_args /* { - syscallarg(int) s; - syscallarg(caddr_t) name; - syscallarg(int *) anamelen; - } */ *uap; - register_t *retval; -{ - - return (accept1(p, uap, retval, 0)); -} - -int -compat_43_accept(p, uap, retval) - struct proc *p; - struct accept_args /* { - syscallarg(int) s; - syscallarg(caddr_t) name; - syscallarg(int *) anamelen; - } */ *uap; - register_t *retval; -{ - - return (accept1(p, uap, retval, 1)); -} -#else /* COMPAT_OLDSOCK */ - -#define accept1 accept -#endif - -int -accept1(p, uap, retval, compat_43) +static int +accept1(p, uap, retval, compat) struct proc *p; register struct accept_args /* { - syscallarg(int) s; - syscallarg(caddr_t) name; - syscallarg(int *) anamelen; + int s; + caddr_t name; + int *anamelen; } */ *uap; - register_t *retval; - int compat_43; + int *retval; + int compat; { struct file *fp; struct mbuf *nam; - int namelen, error, s, tmpfd; - register struct socket *so; - - if (SCARG(uap, name) && (error = copyin((caddr_t)SCARG(uap, anamelen), - (caddr_t)&namelen, sizeof (namelen)))) - return (error); - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + int namelen, error, s; + struct socket *head, *so; + short fflag; /* type must match fp->f_flag */ + + if (uap->name) { + error = copyin((caddr_t)uap->anamelen, (caddr_t)&namelen, + sizeof (namelen)); + if(error) + return (error); + } + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); s = splnet(); - so = (struct socket *)fp->f_data; - if ((so->so_options & SO_ACCEPTCONN) == 0) { + head = (struct socket *)fp->f_data; + if ((head->so_options & SO_ACCEPTCONN) == 0) { splx(s); return (EINVAL); } - if ((so->so_state & SS_NBIO) && so->so_qlen == 0) { + if ((head->so_state & SS_NBIO) && head->so_comp.tqh_first == NULL) { splx(s); return (EWOULDBLOCK); } - while (so->so_qlen == 0 && so->so_error == 0) { - if (so->so_state & SS_CANTRCVMORE) { - so->so_error = ECONNABORTED; + while (head->so_comp.tqh_first == NULL && head->so_error == 0) { + if (head->so_state & 
SS_CANTRCVMORE) { + head->so_error = ECONNABORTED; break; } - if (error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, - netcon, 0)) { + error = tsleep((caddr_t)&head->so_timeo, PSOCK | PCATCH, + "accept", 0); + if (error) { splx(s); return (error); } } - if (so->so_error) { - error = so->so_error; - so->so_error = 0; + if (head->so_error) { + error = head->so_error; + head->so_error = 0; splx(s); return (error); } - if (error = falloc(p, &fp, &tmpfd)) { + fflag = fp->f_flag; + error = falloc(p, &fp, retval); + if (error) { splx(s); return (error); } - *retval = tmpfd; - { struct socket *aso = so->so_q; - if (soqremque(aso, 1) == 0) - panic("accept"); - so = aso; - } + + so = head->so_comp.tqh_first; + if (so == NULL) + panic("accept: nothing queued"); + TAILQ_REMOVE(&head->so_comp, so, so_list); + so->so_state &= ~SS_COMP; + so->so_head = NULL; + head->so_qlen--; + fp->f_type = DTYPE_SOCKET; - fp->f_flag = FREAD|FWRITE; + fp->f_flag = fflag; fp->f_ops = &socketops; fp->f_data = (caddr_t)so; nam = m_get(M_WAIT, MT_SONAME); (void) soaccept(so, nam); - if (SCARG(uap, name)) { + if (uap->name) { #ifdef COMPAT_OLDSOCK - if (compat_43) + if (compat) mtod(nam, struct osockaddr *)->sa_family = mtod(nam, struct sockaddr *)->sa_family; #endif if (namelen > nam->m_len) namelen = nam->m_len; /* SHOULD COPY OUT A CHAIN HERE */ - if ((error = copyout(mtod(nam, caddr_t), - (caddr_t)SCARG(uap, name), (u_int)namelen)) == 0) + error = copyout(mtod(nam, caddr_t), (caddr_t)uap->name, + (u_int)namelen); + if (!error) error = copyout((caddr_t)&namelen, - (caddr_t)SCARG(uap, anamelen), - sizeof (*SCARG(uap, anamelen))); + (caddr_t)uap->anamelen, sizeof (*uap->anamelen)); } m_freem(nam); splx(s); return (error); } +int +accept(p, uap, retval) + struct proc *p; + struct accept_args *uap; + int *retval; +{ + + return (accept1(p, uap, retval, 0)); +} + +#ifdef COMPAT_OLDSOCK +int +oaccept(p, uap, retval) + struct proc *p; + struct accept_args *uap; + int *retval; +{ + + return (accept1(p, 
uap, retval, 1)); +} +#endif /* COMPAT_OLDSOCK */ + /* ARGSUSED */ int connect(p, uap, retval) struct proc *p; register struct connect_args /* { - syscallarg(int) s; - syscallarg(caddr_t) name; - syscallarg(int) namelen; + int s; + caddr_t name; + int namelen; } */ *uap; - register_t *retval; + int *retval; { struct file *fp; register struct socket *so; struct mbuf *nam; int error, s; - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); so = (struct socket *)fp->f_data; if ((so->so_state & SS_NBIO) && (so->so_state & SS_ISCONNECTING)) return (EALREADY); - if (error = sockargs(&nam, SCARG(uap, name), SCARG(uap, namelen), - MT_SONAME)) + error = sockargs(&nam, uap->name, uap->namelen, MT_SONAME); + if (error) return (error); error = soconnect(so, nam); if (error) @@ -283,10 +304,12 @@ connect(p, uap, retval) return (EINPROGRESS); } s = splnet(); - while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) - if (error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, - netcon, 0)) + while ((so->so_state & SS_ISCONNECTING) && so->so_error == 0) { + error = tsleep((caddr_t)&so->so_timeo, PSOCK | PCATCH, + "connec", 0); + if (error) break; + } if (error == 0) { error = so->so_error; so->so_error = 0; @@ -304,51 +327,56 @@ int socketpair(p, uap, retval) struct proc *p; register struct socketpair_args /* { - syscallarg(int) domain; - syscallarg(int) type; - syscallarg(int) protocol; - syscallarg(int *) rsv; + int domain; + int type; + int protocol; + int *rsv; } */ *uap; - register_t *retval; + int retval[]; { register struct filedesc *fdp = p->p_fd; struct file *fp1, *fp2; struct socket *so1, *so2; int fd, error, sv[2]; - if (error = socreate(SCARG(uap, domain), &so1, SCARG(uap, type), - SCARG(uap, protocol))) + error = socreate(uap->domain, &so1, uap->type, uap->protocol, p); + if (error) return (error); - if (error = socreate(SCARG(uap, domain), &so2, SCARG(uap, type), - SCARG(uap, protocol))) + 
error = socreate(uap->domain, &so2, uap->type, uap->protocol, p); + if (error) goto free1; - if (error = falloc(p, &fp1, &fd)) + error = falloc(p, &fp1, &fd); + if (error) goto free2; sv[0] = fd; fp1->f_flag = FREAD|FWRITE; fp1->f_type = DTYPE_SOCKET; fp1->f_ops = &socketops; fp1->f_data = (caddr_t)so1; - if (error = falloc(p, &fp2, &fd)) + error = falloc(p, &fp2, &fd); + if (error) goto free3; fp2->f_flag = FREAD|FWRITE; fp2->f_type = DTYPE_SOCKET; fp2->f_ops = &socketops; fp2->f_data = (caddr_t)so2; sv[1] = fd; - if (error = soconnect2(so1, so2)) + error = soconnect2(so1, so2); + if (error) goto free4; - if (SCARG(uap, type) == SOCK_DGRAM) { + if (uap->type == SOCK_DGRAM) { /* * Datagram socket connection is asymmetric. */ - if (error = soconnect2(so2, so1)) + error = soconnect2(so2, so1); + if (error) goto free4; } - error = copyout((caddr_t)sv, (caddr_t)SCARG(uap, rsv), - 2 * sizeof (int)); + error = copyout((caddr_t)sv, (caddr_t)uap->rsv, 2 * sizeof (int)); +#if 0 /* old pipe(2) syscall compatability, unused these days */ retval[0] = sv[0]; /* XXX ??? */ retval[1] = sv[1]; /* XXX ??? 
*/ +#endif return (error); free4: ffree(fp2); @@ -364,145 +392,11 @@ free1: } int -sendto(p, uap, retval) - struct proc *p; - register struct sendto_args /* { - syscallarg(int) s; - syscallarg(caddr_t) buf; - syscallarg(size_t) len; - syscallarg(int) flags; - syscallarg(caddr_t) to; - syscallarg(int) tolen; - } */ *uap; - register_t *retval; -{ - struct msghdr msg; - struct iovec aiov; - - msg.msg_name = SCARG(uap, to); - msg.msg_namelen = SCARG(uap, tolen); - msg.msg_iov = &aiov; - msg.msg_iovlen = 1; - msg.msg_control = 0; -#ifdef COMPAT_OLDSOCK - msg.msg_flags = 0; -#endif - aiov.iov_base = SCARG(uap, buf); - aiov.iov_len = SCARG(uap, len); - return (sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval)); -} - -#ifdef COMPAT_OLDSOCK -int -compat_43_send(p, uap, retval) - struct proc *p; - register struct compat_43_send_args /* { - syscallarg(int) s; - syscallarg(caddr_t) buf; - syscallarg(int) len; - syscallarg(int) flags; - } */ *uap; - register_t *retval; -{ - struct msghdr msg; - struct iovec aiov; - - msg.msg_name = 0; - msg.msg_namelen = 0; - msg.msg_iov = &aiov; - msg.msg_iovlen = 1; - aiov.iov_base = SCARG(uap, buf); - aiov.iov_len = SCARG(uap, len); - msg.msg_control = 0; - msg.msg_flags = 0; - return (sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval)); -} - -#define MSG_COMPAT 0x8000 -int -compat_43_sendmsg(p, uap, retval) - struct proc *p; - register struct compat_43_sendmsg_args /* { - syscallarg(int) s; - syscallarg(caddr_t) msg; - syscallarg(int) flags; - } */ *uap; - register_t *retval; -{ - struct msghdr msg; - struct iovec aiov[UIO_SMALLIOV], *iov; - int error; - - if (error = copyin(SCARG(uap, msg), (caddr_t)&msg, - sizeof (struct omsghdr))) - return (error); - if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { - if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) - return (EMSGSIZE); - MALLOC(iov, struct iovec *, - sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, - M_WAITOK); - } else - iov = aiov; - if (error = copyin((caddr_t)msg.msg_iov, 
(caddr_t)iov, - (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))) - goto done; - msg.msg_flags = MSG_COMPAT; - msg.msg_iov = iov; - error = sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval); -done: - if (iov != aiov) - FREE(iov, M_IOV); - return (error); -} -#endif - -int -sendmsg(p, uap, retval) - struct proc *p; - register struct sendmsg_args /* { - syscallarg(int) s; - syscallarg(caddr_t) msg; - syscallarg(int) flags; - } */ *uap; - register_t *retval; -{ - struct msghdr msg; - struct iovec aiov[UIO_SMALLIOV], *iov; - int error; - - if (error = copyin(SCARG(uap, msg), (caddr_t)&msg, sizeof (msg))) - return (error); - if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { - if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) - return (EMSGSIZE); - MALLOC(iov, struct iovec *, - sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, - M_WAITOK); - } else - iov = aiov; - if (msg.msg_iovlen && - (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, - (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) - goto done; - msg.msg_iov = iov; -#ifdef COMPAT_OLDSOCK - msg.msg_flags = 0; -#endif - error = sendit(p, SCARG(uap, s), &msg, SCARG(uap, flags), retval); -done: - if (iov != aiov) - FREE(iov, M_IOV); - return (error); -} - -int sendit(p, s, mp, flags, retsize) register struct proc *p; int s; register struct msghdr *mp; - int flags; - register_t *retsize; + int flags, *retsize; { struct file *fp; struct uio auio; @@ -513,8 +407,9 @@ sendit(p, s, mp, flags, retsize) #ifdef KTRACE struct iovec *ktriov = NULL; #endif - - if (error = getsock(p->p_fd, s, &fp)) + + error = getsock(p->p_fd, s, &fp); + if (error) return (error); auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; @@ -525,13 +420,12 @@ sendit(p, s, mp, flags, retsize) auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { - if (auio.uio_resid + iov->iov_len < auio.uio_resid) + if ((auio.uio_resid += iov->iov_len) < 0) return (EINVAL); - auio.uio_resid += iov->iov_len; } if 
(mp->msg_name) { - if (error = sockargs(&to, mp->msg_name, mp->msg_namelen, - MT_SONAME)) + error = sockargs(&to, mp->msg_name, mp->msg_namelen, MT_SONAME); + if (error) return (error); } else to = 0; @@ -544,8 +438,9 @@ sendit(p, s, mp, flags, retsize) error = EINVAL; goto bad; } - if (error = sockargs(&control, mp->msg_control, - mp->msg_controllen, MT_CONTROL)) + error = sockargs(&control, mp->msg_control, + mp->msg_controllen, MT_CONTROL); + if (error) goto bad; #ifdef COMPAT_OLDSOCK if (mp->msg_flags == MSG_COMPAT) { @@ -574,8 +469,9 @@ sendit(p, s, mp, flags, retsize) } #endif len = auio.uio_resid; - if (error = sosend((struct socket *)fp->f_data, to, &auio, - (struct mbuf *)0, control, flags)) { + error = sosend((struct socket *)fp->f_data, to, &auio, + (struct mbuf *)0, control, flags); + if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; @@ -598,71 +494,46 @@ bad: return (error); } -#ifdef COMPAT_OLDSOCK int -compat_43_recvfrom(p, uap, retval) - struct proc *p; - struct recvfrom_args /* { - syscallarg(int) s; - syscallarg(caddr_t) buf; - syscallarg(size_t) len; - syscallarg(int) flags; - syscallarg(caddr_t) from; - syscallarg(int *) fromlenaddr; - } */ *uap; - register_t *retval; -{ - - SCARG(uap, flags) |= MSG_COMPAT; - return (recvfrom(p, uap, retval)); -} -#endif - -int -recvfrom(p, uap, retval) +sendto(p, uap, retval) struct proc *p; - register struct recvfrom_args /* { - syscallarg(int) s; - syscallarg(caddr_t) buf; - syscallarg(size_t) len; - syscallarg(int) flags; - syscallarg(caddr_t) from; - syscallarg(int *) fromlenaddr; + register struct sendto_args /* { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t to; + int tolen; } */ *uap; - register_t *retval; + int *retval; { struct msghdr msg; struct iovec aiov; - int error; - if (SCARG(uap, fromlenaddr)) { - if (error = copyin((caddr_t)SCARG(uap, fromlenaddr), - (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen))) - return 
(error); - } else - msg.msg_namelen = 0; - msg.msg_name = SCARG(uap, from); + msg.msg_name = uap->to; + msg.msg_namelen = uap->tolen; msg.msg_iov = &aiov; msg.msg_iovlen = 1; - aiov.iov_base = SCARG(uap, buf); - aiov.iov_len = SCARG(uap, len); msg.msg_control = 0; - msg.msg_flags = SCARG(uap, flags); - return (recvit(p, SCARG(uap, s), &msg, - (caddr_t)SCARG(uap, fromlenaddr), retval)); +#ifdef COMPAT_OLDSOCK + msg.msg_flags = 0; +#endif + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + return (sendit(p, uap->s, &msg, uap->flags, retval)); } #ifdef COMPAT_OLDSOCK int -compat_43_recv(p, uap, retval) +osend(p, uap, retval) struct proc *p; - register struct compat_43_recv_args /* { - syscallarg(int) s; - syscallarg(caddr_t) buf; - syscallarg(int) len; - syscallarg(int) flags; + register struct osend_args /* { + int s; + caddr_t buf; + int len; + int flags; } */ *uap; - register_t *retval; + int *retval; { struct msghdr msg; struct iovec aiov; @@ -671,34 +542,29 @@ compat_43_recv(p, uap, retval) msg.msg_namelen = 0; msg.msg_iov = &aiov; msg.msg_iovlen = 1; - aiov.iov_base = SCARG(uap, buf); - aiov.iov_len = SCARG(uap, len); + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; msg.msg_control = 0; - msg.msg_flags = SCARG(uap, flags); - return (recvit(p, SCARG(uap, s), &msg, (caddr_t)0, retval)); + msg.msg_flags = 0; + return (sendit(p, uap->s, &msg, uap->flags, retval)); } -/* - * Old recvmsg. This code takes advantage of the fact that the old msghdr - * overlays the new one, missing only the flags, and with the (old) access - * rights where the control fields are now. 
- */ int -compat_43_recvmsg(p, uap, retval) +osendmsg(p, uap, retval) struct proc *p; - register struct compat_43_recvmsg_args /* { - syscallarg(int) s; - syscallarg(struct omsghdr *) msg; - syscallarg(int) flags; + register struct osendmsg_args /* { + int s; + caddr_t msg; + int flags; } */ *uap; - register_t *retval; + int *retval; { struct msghdr msg; struct iovec aiov[UIO_SMALLIOV], *iov; int error; - if (error = copyin((caddr_t)SCARG(uap, msg), (caddr_t)&msg, - sizeof (struct omsghdr))) + error = copyin(uap->msg, (caddr_t)&msg, sizeof (struct omsghdr)); + if (error) return (error); if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) @@ -708,17 +574,13 @@ compat_43_recvmsg(p, uap, retval) M_WAITOK); } else iov = aiov; - msg.msg_flags = SCARG(uap, flags) | MSG_COMPAT; - if (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, - (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))) + error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) goto done; + msg.msg_flags = MSG_COMPAT; msg.msg_iov = iov; - error = recvit(p, SCARG(uap, s), &msg, - (caddr_t)&SCARG(uap, msg)->msg_namelen, retval); - - if (msg.msg_controllen && error == 0) - error = copyout((caddr_t)&msg.msg_controllen, - (caddr_t)&SCARG(uap, msg)->msg_accrightslen, sizeof (int)); + error = sendit(p, uap->s, &msg, uap->flags, retval); done: if (iov != aiov) FREE(iov, M_IOV); @@ -727,21 +589,21 @@ done: #endif int -recvmsg(p, uap, retval) +sendmsg(p, uap, retval) struct proc *p; - register struct recvmsg_args /* { - syscallarg(int) s; - syscallarg(struct msghdr *) msg; - syscallarg(int) flags; + register struct sendmsg_args /* { + int s; + caddr_t msg; + int flags; } */ *uap; - register_t *retval; + int *retval; { struct msghdr msg; - struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; - register int error; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; - if (error = copyin((caddr_t)SCARG(uap, msg), (caddr_t)&msg, 
- sizeof (msg))) + error = copyin(uap->msg, (caddr_t)&msg, sizeof (msg)); + if (error) return (error); if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) @@ -751,21 +613,15 @@ recvmsg(p, uap, retval) M_WAITOK); } else iov = aiov; + if (msg.msg_iovlen && + (error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))))) + goto done; + msg.msg_iov = iov; #ifdef COMPAT_OLDSOCK - msg.msg_flags = SCARG(uap, flags) &~ MSG_COMPAT; -#else - msg.msg_flags = SCARG(uap, flags); + msg.msg_flags = 0; #endif - uiov = msg.msg_iov; - msg.msg_iov = iov; - if (error = copyin((caddr_t)uiov, (caddr_t)iov, - (unsigned)(msg.msg_iovlen * sizeof (struct iovec)))) - goto done; - if ((error = recvit(p, SCARG(uap, s), &msg, (caddr_t)0, retval)) == 0) { - msg.msg_iov = uiov; - error = copyout((caddr_t)&msg, (caddr_t)SCARG(uap, msg), - sizeof(msg)); - } + error = sendit(p, uap->s, &msg, uap->flags, retval); done: if (iov != aiov) FREE(iov, M_IOV); @@ -778,19 +634,21 @@ recvit(p, s, mp, namelenp, retsize) int s; register struct msghdr *mp; caddr_t namelenp; - register_t *retsize; + int *retsize; { struct file *fp; struct uio auio; register struct iovec *iov; register int i; int len, error; - struct mbuf *from = 0, *control = 0; + struct mbuf *m, *from = 0, *control = 0; + caddr_t ctlbuf; #ifdef KTRACE struct iovec *ktriov = NULL; #endif - - if (error = getsock(p->p_fd, s, &fp)) + + error = getsock(p->p_fd, s, &fp); + if (error) return (error); auio.uio_iov = mp->msg_iov; auio.uio_iovcnt = mp->msg_iovlen; @@ -801,9 +659,8 @@ recvit(p, s, mp, namelenp, retsize) auio.uio_resid = 0; iov = mp->msg_iov; for (i = 0; i < mp->msg_iovlen; i++, iov++) { - if (auio.uio_resid + iov->iov_len < auio.uio_resid) + if ((auio.uio_resid += iov->iov_len) < 0) return (EINVAL); - auio.uio_resid += iov->iov_len; } #ifdef KTRACE if (KTRPOINT(p, KTR_GENIO)) { @@ -814,9 +671,10 @@ recvit(p, s, mp, namelenp, retsize) } #endif len = 
auio.uio_resid; - if (error = soreceive((struct socket *)fp->f_data, &from, &auio, + error = soreceive((struct socket *)fp->f_data, &from, &auio, (struct mbuf **)0, mp->msg_control ? &control : (struct mbuf **)0, - &mp->msg_flags)) { + &mp->msg_flags); + if (error) { if (auio.uio_resid != len && (error == ERESTART || error == EINTR || error == EWOULDBLOCK)) error = 0; @@ -845,8 +703,9 @@ recvit(p, s, mp, namelenp, retsize) if (len > from->m_len) len = from->m_len; /* else if len < from->m_len ??? */ - if (error = copyout(mtod(from, caddr_t), - (caddr_t)mp->msg_name, (unsigned)len)) + error = copyout(mtod(from, caddr_t), + (caddr_t)mp->msg_name, (unsigned)len); + if (error) goto out; } mp->msg_namelen = len; @@ -882,17 +741,29 @@ recvit(p, s, mp, namelenp, retsize) } #endif len = mp->msg_controllen; - if (len <= 0 || control == 0) - len = 0; - else { - if (len >= control->m_len) - len = control->m_len; - else + m = control; + mp->msg_controllen = 0; + ctlbuf = (caddr_t) mp->msg_control; + + while (m && len > 0) { + unsigned int tocopy; + + if (len >= m->m_len) + tocopy = m->m_len; + else { mp->msg_flags |= MSG_CTRUNC; - error = copyout((caddr_t)mtod(control, caddr_t), - (caddr_t)mp->msg_control, (unsigned)len); + tocopy = len; + } + + if (error = copyout((caddr_t)mtod(m, caddr_t), + ctlbuf, tocopy)) + goto out; + + ctlbuf += tocopy; + len -= tocopy; + m = m->m_next; } - mp->msg_controllen = len; + mp->msg_controllen = ctlbuf - mp->msg_control; } out: if (from) @@ -902,22 +773,193 @@ out: return (error); } +int +recvfrom(p, uap, retval) + struct proc *p; + register struct recvfrom_args /* { + int s; + caddr_t buf; + size_t len; + int flags; + caddr_t from; + int *fromlenaddr; + } */ *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov; + int error; + + if (uap->fromlenaddr) { + error = copyin((caddr_t)uap->fromlenaddr, + (caddr_t)&msg.msg_namelen, sizeof (msg.msg_namelen)); + if (error) + return (error); + } else + msg.msg_namelen = 0; + msg.msg_name = 
uap->from; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = uap->flags; + return (recvit(p, uap->s, &msg, (caddr_t)uap->fromlenaddr, retval)); +} + +#ifdef COMPAT_OLDSOCK +int +orecvfrom(p, uap, retval) + struct proc *p; + struct recvfrom_args *uap; + int *retval; +{ + + uap->flags |= MSG_COMPAT; + return (recvfrom(p, uap, retval)); +} +#endif + + +#ifdef COMPAT_OLDSOCK +int +orecv(p, uap, retval) + struct proc *p; + register struct orecv_args /* { + int s; + caddr_t buf; + int len; + int flags; + } */ *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov; + + msg.msg_name = 0; + msg.msg_namelen = 0; + msg.msg_iov = &aiov; + msg.msg_iovlen = 1; + aiov.iov_base = uap->buf; + aiov.iov_len = uap->len; + msg.msg_control = 0; + msg.msg_flags = uap->flags; + return (recvit(p, uap->s, &msg, (caddr_t)0, retval)); +} + +/* + * Old recvmsg. This code takes advantage of the fact that the old msghdr + * overlays the new one, missing only the flags, and with the (old) access + * rights where the control fields are now. 
+ */ +int +orecvmsg(p, uap, retval) + struct proc *p; + register struct orecvmsg_args /* { + int s; + struct omsghdr *msg; + int flags; + } */ *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *iov; + int error; + + error = copyin((caddr_t)uap->msg, (caddr_t)&msg, + sizeof (struct omsghdr)); + if (error) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; + msg.msg_flags = uap->flags | MSG_COMPAT; + error = copyin((caddr_t)msg.msg_iov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + msg.msg_iov = iov; + error = recvit(p, uap->s, &msg, (caddr_t)&uap->msg->msg_namelen, retval); + + if (msg.msg_controllen && error == 0) + error = copyout((caddr_t)&msg.msg_controllen, + (caddr_t)&uap->msg->msg_accrightslen, sizeof (int)); +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} +#endif + +int +recvmsg(p, uap, retval) + struct proc *p; + register struct recvmsg_args /* { + int s; + struct msghdr *msg; + int flags; + } */ *uap; + int *retval; +{ + struct msghdr msg; + struct iovec aiov[UIO_SMALLIOV], *uiov, *iov; + register int error; + + error = copyin((caddr_t)uap->msg, (caddr_t)&msg, sizeof (msg)); + if (error) + return (error); + if ((u_int)msg.msg_iovlen >= UIO_SMALLIOV) { + if ((u_int)msg.msg_iovlen >= UIO_MAXIOV) + return (EMSGSIZE); + MALLOC(iov, struct iovec *, + sizeof(struct iovec) * (u_int)msg.msg_iovlen, M_IOV, + M_WAITOK); + } else + iov = aiov; +#ifdef COMPAT_OLDSOCK + msg.msg_flags = uap->flags &~ MSG_COMPAT; +#else + msg.msg_flags = uap->flags; +#endif + uiov = msg.msg_iov; + msg.msg_iov = iov; + error = copyin((caddr_t)uiov, (caddr_t)iov, + (unsigned)(msg.msg_iovlen * sizeof (struct iovec))); + if (error) + goto done; + error = recvit(p, uap->s, &msg, (caddr_t)0, retval); 
+ if (!error) { + msg.msg_iov = uiov; + error = copyout((caddr_t)&msg, (caddr_t)uap->msg, sizeof(msg)); + } +done: + if (iov != aiov) + FREE(iov, M_IOV); + return (error); +} + /* ARGSUSED */ int shutdown(p, uap, retval) struct proc *p; register struct shutdown_args /* { - syscallarg(int) s; - syscallarg(int) how; + int s; + int how; } */ *uap; - register_t *retval; + int *retval; { struct file *fp; int error; - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); - return (soshutdown((struct socket *)fp->f_data, SCARG(uap, how))); + return (soshutdown((struct socket *)fp->f_data, uap->how)); } /* ARGSUSED */ @@ -925,35 +967,36 @@ int setsockopt(p, uap, retval) struct proc *p; register struct setsockopt_args /* { - syscallarg(int) s; - syscallarg(int) level; - syscallarg(int) name; - syscallarg(caddr_t) val; - syscallarg(int) valsize; + int s; + int level; + int name; + caddr_t val; + int valsize; } */ *uap; - register_t *retval; + int *retval; { struct file *fp; struct mbuf *m = NULL; int error; - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); - if (SCARG(uap, valsize) > MLEN) + if (uap->valsize > MLEN) return (EINVAL); - if (SCARG(uap, val)) { + if (uap->val) { m = m_get(M_WAIT, MT_SOOPTS); if (m == NULL) return (ENOBUFS); - if (error = copyin(SCARG(uap, val), mtod(m, caddr_t), - (u_int)SCARG(uap, valsize))) { + error = copyin(uap->val, mtod(m, caddr_t), (u_int)uap->valsize); + if (error) { (void) m_free(m); return (error); } - m->m_len = SCARG(uap, valsize); + m->m_len = uap->valsize; } - return (sosetopt((struct socket *)fp->f_data, SCARG(uap, level), - SCARG(uap, name), m)); + return (sosetopt((struct socket *)fp->f_data, uap->level, + uap->name, m)); } /* ARGSUSED */ @@ -961,73 +1004,88 @@ int getsockopt(p, uap, retval) struct proc *p; register struct getsockopt_args /* { - syscallarg(int) s; - syscallarg(int) level; - 
syscallarg(int) name; - syscallarg(caddr_t) val; - syscallarg(int *) avalsize; + int s; + int level; + int name; + caddr_t val; + int *avalsize; } */ *uap; - register_t *retval; + int *retval; { struct file *fp; - struct mbuf *m = NULL; - int valsize, error; + struct mbuf *m = NULL, *m0; + int op, i, valsize, error; - if (error = getsock(p->p_fd, SCARG(uap, s), &fp)) + error = getsock(p->p_fd, uap->s, &fp); + if (error) return (error); - if (SCARG(uap, val)) { - if (error = copyin((caddr_t)SCARG(uap, avalsize), - (caddr_t)&valsize, sizeof (valsize))) + if (uap->val) { + error = copyin((caddr_t)uap->avalsize, (caddr_t)&valsize, + sizeof (valsize)); + if (error) return (error); } else valsize = 0; - if ((error = sogetopt((struct socket *)fp->f_data, SCARG(uap, level), - SCARG(uap, name), &m)) == 0 && SCARG(uap, val) && valsize && - m != NULL) { - if (valsize > m->m_len) - valsize = m->m_len; - error = copyout(mtod(m, caddr_t), SCARG(uap, val), - (u_int)valsize); + if ((error = sogetopt((struct socket *)fp->f_data, uap->level, + uap->name, &m)) == 0 && uap->val && valsize && m != NULL) { + op = 0; + while (m && !error && op < valsize) { + i = min(m->m_len, (valsize - op)); + error = copyout(mtod(m, caddr_t), uap->val, (u_int)i); + op += i; + uap->val += i; + m0 = m; + MFREE(m0,m); + } + valsize = op; if (error == 0) error = copyout((caddr_t)&valsize, - (caddr_t)SCARG(uap, avalsize), sizeof (valsize)); + (caddr_t)uap->avalsize, sizeof (valsize)); } if (m != NULL) (void) m_free(m); return (error); } +#ifdef OLD_PIPE /* ARGSUSED */ int pipe(p, uap, retval) struct proc *p; - void *uap; - register_t *retval; + struct pipe_args /* { + int dummy; + } */ *uap; + int retval[]; { register struct filedesc *fdp = p->p_fd; struct file *rf, *wf; struct socket *rso, *wso; int fd, error; - if (error = socreate(AF_UNIX, &rso, SOCK_STREAM, 0)) + error = socreate(AF_UNIX, &rso, SOCK_STREAM, 0, p); + if (error) return (error); - if (error = socreate(AF_UNIX, &wso, SOCK_STREAM, 0)) + 
error = socreate(AF_UNIX, &wso, SOCK_STREAM, 0, p); + if (error) goto free1; - if (error = falloc(p, &rf, &fd)) + error = falloc(p, &rf, &fd); + if (error) goto free2; retval[0] = fd; - rf->f_flag = FREAD; + rf->f_flag = FREAD | FWRITE; rf->f_type = DTYPE_SOCKET; rf->f_ops = &socketops; rf->f_data = (caddr_t)rso; - if (error = falloc(p, &wf, &fd)) + error = falloc(p, &wf, &fd); + if (error) goto free3; - wf->f_flag = FWRITE; + wf->f_flag = FREAD | FWRITE; wf->f_type = DTYPE_SOCKET; wf->f_ops = &socketops; wf->f_data = (caddr_t)wso; retval[1] = fd; - if (error = unp_connect2(wso, rso)) + error = unp_connect2(wso, rso); + if (error) goto free4; return (0); free4: @@ -1042,170 +1100,153 @@ free1: (void)soclose(rso); return (error); } - +#endif /* * Get socket name. */ -#ifdef COMPAT_OLDSOCK -int -getsockname(p, uap, retval) - struct proc *p; - struct getsockname_args /* { - syscallarg(int) fdes; - syscallarg(caddr_t) asa; - syscallarg(int *) alen; - } */ *uap; - register_t *retval; -{ - - return (getsockname1(p, uap, retval, 0)); -} - -int -compat_43_getsockname(p, uap, retval) - struct proc *p; - struct getsockname_args /* { - syscallarg(int) fdes; - syscallarg(caddr_t) asa; - syscallarg(int *) alen; - } */ *uap; - register_t *retval; -{ - - return (getsockname1(p, uap, retval, 1)); -} -#else /* COMPAT_OLDSOCK */ - -#define getsockname1 getsockname -#endif - /* ARGSUSED */ -int -getsockname1(p, uap, retval, compat_43) +static int +getsockname1(p, uap, retval, compat) struct proc *p; register struct getsockname_args /* { - syscallarg(int) fdes; - syscallarg(caddr_t) asa; - syscallarg(int *) alen; + int fdes; + caddr_t asa; + int *alen; } */ *uap; - register_t *retval; - int compat_43; + int *retval; + int compat; { struct file *fp; register struct socket *so; struct mbuf *m; int len, error; - if (error = getsock(p->p_fd, SCARG(uap, fdes), &fp)) + error = getsock(p->p_fd, uap->fdes, &fp); + if (error) return (error); - if (error = copyin((caddr_t)SCARG(uap, alen), 
(caddr_t)&len, - sizeof (len))) + error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); + if (error) return (error); so = (struct socket *)fp->f_data; m = m_getclr(M_WAIT, MT_SONAME); if (m == NULL) return (ENOBUFS); - if (error = (*so->so_proto->pr_usrreq)(so, PRU_SOCKADDR, 0, m, 0)) + error = (*so->so_proto->pr_usrreqs->pru_sockaddr)(so, m); + if (error) goto bad; if (len > m->m_len) len = m->m_len; #ifdef COMPAT_OLDSOCK - if (compat_43) + if (compat) mtod(m, struct osockaddr *)->sa_family = mtod(m, struct sockaddr *)->sa_family; #endif - error = copyout(mtod(m, caddr_t), (caddr_t)SCARG(uap, asa), (u_int)len); + error = copyout(mtod(m, caddr_t), (caddr_t)uap->asa, (u_int)len); if (error == 0) - error = copyout((caddr_t)&len, (caddr_t)SCARG(uap, alen), + error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); bad: m_freem(m); return (error); } -/* - * Get name of peer for connected socket. - */ -#ifdef COMPAT_OLDSOCK int -getpeername(p, uap, retval) +getsockname(p, uap, retval) struct proc *p; - struct getpeername_args /* { - syscallarg(int) fdes; - syscallarg(caddr_t) asa; - syscallarg(int *) alen; - } */ *uap; - register_t *retval; + struct getsockname_args *uap; + int *retval; { - return (getpeername1(p, uap, retval, 0)); + return (getsockname1(p, uap, retval, 0)); } +#ifdef COMPAT_OLDSOCK int -compat_43_getpeername(p, uap, retval) +ogetsockname(p, uap, retval) struct proc *p; - struct getpeername_args /* { - syscallarg(int) fdes; - syscallarg(caddr_t) asa; - syscallarg(int *) alen; - } */ *uap; - register_t *retval; + struct getsockname_args *uap; + int *retval; { - return (getpeername1(p, uap, retval, 1)); + return (getsockname1(p, uap, retval, 1)); } -#else /* COMPAT_OLDSOCK */ - -#define getpeername1 getpeername -#endif +#endif /* COMPAT_OLDSOCK */ +/* + * Get name of peer for connected socket. 
+ */ /* ARGSUSED */ -int -getpeername1(p, uap, retval, compat_43) +static int +getpeername1(p, uap, retval, compat) struct proc *p; register struct getpeername_args /* { - syscallarg(int) fdes; - syscallarg(caddr_t) asa; - syscallarg(int *) alen; + int fdes; + caddr_t asa; + int *alen; } */ *uap; - register_t *retval; - int compat_43; + int *retval; + int compat; { struct file *fp; register struct socket *so; struct mbuf *m; int len, error; - if (error = getsock(p->p_fd, SCARG(uap, fdes), &fp)) + error = getsock(p->p_fd, uap->fdes, &fp); + if (error) return (error); so = (struct socket *)fp->f_data; if ((so->so_state & (SS_ISCONNECTED|SS_ISCONFIRMING)) == 0) return (ENOTCONN); - if (error = - copyin((caddr_t)SCARG(uap, alen), (caddr_t)&len, sizeof (len))) + error = copyin((caddr_t)uap->alen, (caddr_t)&len, sizeof (len)); + if (error) return (error); m = m_getclr(M_WAIT, MT_SONAME); if (m == NULL) return (ENOBUFS); - if (error = (*so->so_proto->pr_usrreq)(so, PRU_PEERADDR, 0, m, 0)) + error = (*so->so_proto->pr_usrreqs->pru_peeraddr)(so, m); + if (error) goto bad; if (len > m->m_len) len = m->m_len; #ifdef COMPAT_OLDSOCK - if (compat_43) + if (compat) mtod(m, struct osockaddr *)->sa_family = mtod(m, struct sockaddr *)->sa_family; #endif - if (error = - copyout(mtod(m, caddr_t), (caddr_t)SCARG(uap, asa), (u_int)len)) + error = copyout(mtod(m, caddr_t), (caddr_t)uap->asa, (u_int)len); + if (error) goto bad; - error = copyout((caddr_t)&len, (caddr_t)SCARG(uap, alen), sizeof (len)); + error = copyout((caddr_t)&len, (caddr_t)uap->alen, sizeof (len)); bad: m_freem(m); return (error); } int +getpeername(p, uap, retval) + struct proc *p; + struct getpeername_args *uap; + int *retval; +{ + + return (getpeername1(p, uap, retval, 0)); +} + +#ifdef COMPAT_OLDSOCK +int +ogetpeername(p, uap, retval) + struct proc *p; + struct ogetpeername_args *uap; + int *retval; +{ + + /* XXX uap should have type `getpeername_args *' to begin with. 
*/ + return (getpeername1(p, (struct getpeername_args *)uap, retval, 1)); +} +#endif /* COMPAT_OLDSOCK */ + +int sockargs(mp, buf, buflen, type) struct mbuf **mp; caddr_t buf; @@ -1228,21 +1269,21 @@ sockargs(mp, buf, buflen, type) return (ENOBUFS); m->m_len = buflen; error = copyin(buf, mtod(m, caddr_t), (u_int)buflen); - if (error) { + if (error) (void) m_free(m); - return (error); - } - *mp = m; - if (type == MT_SONAME) { - sa = mtod(m, struct sockaddr *); + else { + *mp = m; + if (type == MT_SONAME) { + sa = mtod(m, struct sockaddr *); #if defined(COMPAT_OLDSOCK) && BYTE_ORDER != BIG_ENDIAN - if (sa->sa_family == 0 && sa->sa_len < AF_MAX) - sa->sa_family = sa->sa_len; + if (sa->sa_family == 0 && sa->sa_len < AF_MAX) + sa->sa_family = sa->sa_len; #endif - sa->sa_len = buflen; + sa->sa_len = buflen; + } } - return (0); + return (error); } int diff --git a/sys/kern/uipc_usrreq.c b/sys/kern/uipc_usrreq.c index c6bcbfd..0a47414 100644 --- a/sys/kern/uipc_usrreq.c +++ b/sys/kern/uipc_usrreq.c @@ -30,24 +30,29 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)uipc_usrreq.c 8.9 (Berkeley) 5/14/95 + * From: @(#)uipc_usrreq.c 8.3 (Berkeley) 1/4/94 + * $Id: uipc_usrreq.c,v 1.21 1997/03/21 16:12:32 wpaul Exp $ */ #include <sys/param.h> +#include <sys/queue.h> #include <sys/systm.h> -#include <sys/proc.h> -#include <sys/filedesc.h> +#include <sys/kernel.h> #include <sys/domain.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/filedesc.h> +#include <sys/mbuf.h> +#include <sys/namei.h> +#include <sys/proc.h> #include <sys/protosw.h> #include <sys/socket.h> #include <sys/socketvar.h> -#include <sys/unpcb.h> +#include <sys/stat.h> +#include <sys/sysctl.h> #include <sys/un.h> -#include <sys/namei.h> +#include <sys/unpcb.h> #include <sys/vnode.h> -#include <sys/file.h> -#include <sys/stat.h> -#include <sys/mbuf.h> /* * Unix communications domain. 
@@ -57,8 +62,22 @@ * rethink name space problems * need a proper out-of-band */ -struct sockaddr sun_noname = { sizeof(sun_noname), AF_UNIX }; -ino_t unp_ino; /* prototype for fake inode numbers */ +static struct sockaddr sun_noname = { sizeof(sun_noname), AF_LOCAL }; +static ino_t unp_ino; /* prototype for fake inode numbers */ + +static int unp_attach __P((struct socket *)); +static void unp_detach __P((struct unpcb *)); +static int unp_bind __P((struct unpcb *,struct mbuf *, struct proc *)); +static int unp_connect __P((struct socket *,struct mbuf *, struct proc *)); +static void unp_disconnect __P((struct unpcb *)); +static void unp_shutdown __P((struct unpcb *)); +static void unp_drop __P((struct unpcb *, int)); +static void unp_gc __P((void)); +static void unp_scan __P((struct mbuf *, void (*)(struct file *))); +static void unp_mark __P((struct file *)); +static void unp_discard __P((struct file *)); +static int unp_internalize __P((struct mbuf *, struct proc *)); + /*ARGSUSED*/ int @@ -170,6 +189,7 @@ uipc_usrreq(so, req, m, nam, control) break; case PRU_SEND: + case PRU_SEND_EOF: if (control && (error = unp_internalize(control, p))) break; switch (so->so_type) { @@ -210,6 +230,22 @@ uipc_usrreq(so, req, m, nam, control) case SOCK_STREAM: #define rcv (&so2->so_rcv) #define snd (&so->so_snd) + /* Connect if not connected yet. */ + /* + * Note: A better implementation would complain + * if not equal to the peer's address. + */ + if ((so->so_state & SS_ISCONNECTED) == 0) { + if (nam) { + error = unp_connect(so, nam, p); + if (error) + break; /* XXX */ + } else { + error = ENOTCONN; + break; + } + } + if (so->so_state & SS_CANTSENDMORE) { error = EPIPE; break; @@ -241,6 +277,14 @@ uipc_usrreq(so, req, m, nam, control) default: panic("uipc 4"); } + /* + * SEND_EOF is equivalent to a SEND followed by + * a SHUTDOWN. 
+ */ + if (req == PRU_SEND_EOF) { + socantsendmore(so); + unp_shutdown(unp); + } break; case PRU_ABORT: @@ -306,22 +350,34 @@ release: * and don't really want to reserve the sendspace. Their recvspace should * be large enough for at least one max-size datagram plus address. */ -#define PIPSIZ 4096 -u_long unpst_sendspace = PIPSIZ; -u_long unpst_recvspace = PIPSIZ; -u_long unpdg_sendspace = 2*1024; /* really max datagram size */ -u_long unpdg_recvspace = 4*1024; - -int unp_rights; /* file descriptors in flight */ - -int +#ifndef PIPSIZ +#define PIPSIZ 8192 +#endif +static u_long unpst_sendspace = PIPSIZ; +static u_long unpst_recvspace = PIPSIZ; +static u_long unpdg_sendspace = 2*1024; /* really max datagram size */ +static u_long unpdg_recvspace = 4*1024; + +static int unp_rights; /* file descriptors in flight */ + +SYSCTL_INT(_net_local_stream, OID_AUTO, sendspace, CTLFLAG_RW, + &unpst_sendspace, 0, ""); +SYSCTL_INT(_net_local_stream, OID_AUTO, recvspace, CTLFLAG_RW, + &unpst_recvspace, 0, ""); +SYSCTL_INT(_net_local_dgram, OID_AUTO, maxdgram, CTLFLAG_RW, + &unpdg_sendspace, 0, ""); +SYSCTL_INT(_net_local_dgram, OID_AUTO, recvspace, CTLFLAG_RW, + &unpdg_recvspace, 0, ""); +SYSCTL_INT(_net_local, OID_AUTO, inflight, CTLFLAG_RD, &unp_rights, 0, ""); + +static int unp_attach(so) struct socket *so; { register struct mbuf *m; register struct unpcb *unp; int error; - + if (so->so_snd.sb_hiwat == 0 || so->so_rcv.sb_hiwat == 0) { switch (so->so_type) { @@ -348,11 +404,11 @@ unp_attach(so) return (0); } -void +static void unp_detach(unp) register struct unpcb *unp; { - + if (unp->unp_vnode) { unp->unp_vnode->v_socket = 0; vrele(unp->unp_vnode); @@ -364,8 +420,6 @@ unp_detach(unp) unp_drop(unp->unp_refs, ECONNRESET); soisdisconnected(unp->unp_socket); unp->unp_socket->so_pcb = 0; - m_freem(unp->unp_addr); - (void) m_free(dtom(unp)); if (unp_rights) { /* * Normally the receive buffer is flushed later, @@ -377,9 +431,11 @@ unp_detach(unp) sorflush(unp->unp_socket); unp_gc(); } 
+ m_freem(unp->unp_addr); + (void) m_free(dtom(unp)); } -int +static int unp_bind(unp, nam, p) struct unpcb *unp; struct mbuf *nam; @@ -401,7 +457,8 @@ unp_bind(unp, nam, p) } else *(mtod(nam, caddr_t) + nam->m_len) = 0; /* SHOULD BE ABLE TO ADOPT EXISTING AND wakeup() ALA FIFO's */ - if (error = namei(&nd)) + error = namei(&nd); + if (error) return (error); vp = nd.ni_vp; if (vp != NULL) { @@ -427,7 +484,7 @@ unp_bind(unp, nam, p) return (0); } -int +static int unp_connect(so, nam, p) struct socket *so; struct mbuf *nam; @@ -446,14 +503,16 @@ unp_connect(so, nam, p) return (EMSGSIZE); } else *(mtod(nam, caddr_t) + nam->m_len) = 0; - if (error = namei(&nd)) + error = namei(&nd); + if (error) return (error); vp = nd.ni_vp; if (vp->v_type != VSOCK) { error = ENOTSOCK; goto bad; } - if (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) + error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p); + if (error) goto bad; so2 = vp->v_socket; if (so2 == 0) { @@ -515,7 +574,7 @@ unp_connect2(so, so2) return (0); } -void +static void unp_disconnect(unp) struct unpcb *unp; { @@ -562,7 +621,7 @@ unp_abort(unp) } #endif -void +static void unp_shutdown(unp) struct unpcb *unp; { @@ -573,7 +632,7 @@ unp_shutdown(unp) socantrcvmore(so); } -void +static void unp_drop(unp, errno) struct unpcb *unp; int errno; @@ -591,6 +650,7 @@ unp_drop(unp, errno) } #ifdef notdef +void unp_drain() { @@ -609,6 +669,9 @@ unp_externalize(rights) int newfds = (cm->cmsg_len - sizeof(*cm)) / sizeof (int); int f; + /* + * if the new FD's will not fit, then we free them all + */ if (!fdavail(p, newfds)) { for (i = 0; i < newfds; i++) { fp = *rp; @@ -617,6 +680,12 @@ unp_externalize(rights) } return (EMSGSIZE); } + /* + * now change each pointer to an fd in the global table to + * an integer that is the index to the local fd table entry + * that we set up to point to the global one we are transferring. + * XXX this assumes a pointer and int are the same size...! 
+ */ for (i = 0; i < newfds; i++) { if (fdalloc(p, 0, &f)) panic("unp_externalize"); @@ -629,7 +698,11 @@ unp_externalize(rights) return (0); } -int +#ifndef MIN +#define MIN(a,b) (((a)<(b))?(a):(b)) +#endif + +static int unp_internalize(control, p) struct mbuf *control; struct proc *p; @@ -639,12 +712,34 @@ unp_internalize(control, p) register struct file **rp; register struct file *fp; register int i, fd; + register struct cmsgcred *cmcred; int oldfds; - if (cm->cmsg_type != SCM_RIGHTS || cm->cmsg_level != SOL_SOCKET || - cm->cmsg_len != control->m_len) + if ((cm->cmsg_type != SCM_RIGHTS && cm->cmsg_type != SCM_CREDS) || + cm->cmsg_level != SOL_SOCKET || cm->cmsg_len != control->m_len) return (EINVAL); + + /* + * Fill in credential information. + */ + if (cm->cmsg_type == SCM_CREDS) { + cmcred = (struct cmsgcred *)(cm + 1); + cmcred->cmcred_pid = p->p_pid; + cmcred->cmcred_uid = p->p_cred->p_ruid; + cmcred->cmcred_gid = p->p_cred->p_rgid; + cmcred->cmcred_euid = p->p_ucred->cr_uid; + cmcred->cmcred_ngroups = MIN(p->p_ucred->cr_ngroups, + CMGROUP_MAX); + for (i = 0; i < cmcred->cmcred_ngroups; i++) + cmcred->cmcred_groups[i] = p->p_ucred->cr_groups[i]; + return(0); + } + oldfds = (cm->cmsg_len - sizeof (*cm)) / sizeof (int); + /* + * check that all the FDs passed in refer to legal OPEN files + * If not, reject the entire operation. + */ rp = (struct file **)(cm + 1); for (i = 0; i < oldfds; i++) { fd = *(int *)rp++; @@ -652,6 +747,11 @@ unp_internalize(control, p) fdp->fd_ofiles[fd] == NULL) return (EBADF); } + /* + * Now replace the integer FDs with pointers to + * the associated global file table entry.. + * XXX this assumes a pointer and an int are the same size! 
+ */ rp = (struct file **)(cm + 1); for (i = 0; i < oldfds; i++) { fp = fdp->fd_ofiles[*(int *)rp]; @@ -663,10 +763,9 @@ unp_internalize(control, p) return (0); } -int unp_defer, unp_gcing; -extern struct domain unixdomain; +static int unp_defer, unp_gcing; -void +static void unp_gc() { register struct file *fp, *nextfp; @@ -678,26 +777,56 @@ unp_gc() return; unp_gcing = 1; unp_defer = 0; + /* + * before going through all this, set all FDs to + * be NOT defered and NOT externally accessible + */ for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) fp->f_flag &= ~(FMARK|FDEFER); do { for (fp = filehead.lh_first; fp != 0; fp = fp->f_list.le_next) { + /* + * If the file is not open, skip it + */ if (fp->f_count == 0) continue; + /* + * If we already marked it as 'defer' in a + * previous pass, then try process it this time + * and un-mark it + */ if (fp->f_flag & FDEFER) { fp->f_flag &= ~FDEFER; unp_defer--; } else { + /* + * if it's not defered, then check if it's + * already marked.. if so skip it + */ if (fp->f_flag & FMARK) continue; + /* + * If all references are from messages + * in transit, then skip it. it's not + * externally accessible. + */ if (fp->f_count == fp->f_msgcount) continue; + /* + * If it got this far then it must be + * externally accessible. + */ fp->f_flag |= FMARK; } + /* + * either it was defered, or it is externally + * accessible and not already marked so. + * Now check if it is possibly one of OUR sockets. + */ if (fp->f_type != DTYPE_SOCKET || (so = (struct socket *)fp->f_data) == 0) continue; - if (so->so_proto->pr_domain != &unixdomain || + if (so->so_proto->pr_domain != &localdomain || (so->so_proto->pr_flags&PR_RIGHTS) == 0) continue; #ifdef notdef @@ -716,6 +845,13 @@ unp_gc() goto restart; } #endif + /* + * So, Ok, it's one of our sockets and it IS externally + * accessible (or was defered). Now we look + * to see if we hold any file descriptors in it's + * message buffers. 
Follow those links and mark them + * as accessible too. + */ unp_scan(so->so_rcv.sb_mb, unp_mark); } } while (unp_defer); @@ -762,18 +898,30 @@ unp_gc() for (nunref = 0, fp = filehead.lh_first, fpp = extra_ref; fp != 0; fp = nextfp) { nextfp = fp->f_list.le_next; + /* + * If it's not open, skip it + */ if (fp->f_count == 0) continue; + /* + * If all refs are from msgs, and it's not marked accessible + * then it must be referenced from some unreachable cycle + * of (shut-down) FDs, so include it in our + * list of FDs to remove + */ if (fp->f_count == fp->f_msgcount && !(fp->f_flag & FMARK)) { *fpp++ = fp; nunref++; fp->f_count++; } } + /* + * for each FD on our hit list, do the following two things + */ for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) sorflush((struct socket *)(*fpp)->f_data); for (i = nunref, fpp = extra_ref; --i >= 0; ++fpp) - closef(*fpp, (struct proc *)NULL); + closef(*fpp, (struct proc *) NULL); free((caddr_t)extra_ref, M_FILE); unp_gcing = 0; } @@ -787,7 +935,7 @@ unp_dispose(m) unp_scan(m, unp_discard); } -void +static void unp_scan(m0, op) register struct mbuf *m0; void (*op) __P((struct file *)); @@ -817,7 +965,7 @@ unp_scan(m0, op) } } -void +static void unp_mark(fp) struct file *fp; { @@ -828,7 +976,7 @@ unp_mark(fp) fp->f_flag |= (FMARK|FDEFER); } -void +static void unp_discard(fp) struct file *fp; { diff --git a/sys/kern/vfs_bio.c b/sys/kern/vfs_bio.c index ec5c962..494a53d 100644 --- a/sys/kern/vfs_bio.c +++ b/sys/kern/vfs_bio.c @@ -1,181 +1,377 @@ -/*- - * Copyright (c) 1982, 1986, 1989, 1993 - * The Regents of the University of California. All rights reserved. - * (c) UNIX System Laboratories, Inc. - * All or some portions of this file are derived from material licensed - * to the University of California by American Telephone and Telegraph - * Co. or Unix System Laboratories, Inc. and are reproduced herein with - * the permission of UNIX System Laboratories, Inc. +/* + * Copyright (c) 1994 John S. Dyson + * All rights reserved. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer. + * notice immediately at the beginning of the file, without modification, + * this list of conditions, and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. - * 3. All advertising materials mentioning features or use of this software - * must display the following acknowledgement: - * This product includes software developed by the University of - * California, Berkeley and its contributors. - * 4. Neither the name of the University nor the names of its contributors - * may be used to endorse or promote products derived from this software - * without specific prior written permission. + * 3. Absolutely no warranty of function or purpose is made by the author + * John S. Dyson. + * 4. This work was done expressly for inclusion into FreeBSD. Other use + * is allowed if this notation is included. + * 5. Modifications may be freely made to this file if the above conditions + * are met. * - * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND - * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE - * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE - * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE - * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT - * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY - * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF - * SUCH DAMAGE. + * $Id$ + */ + +/* + * this file contains a new buffer I/O scheme implementing a coherent + * VM object and buffer cache scheme. Pains have been taken to make + * sure that the performance degradation associated with schemes such + * as this is not realized. * - * from: @(#)vfs_bio.c 8.6 (Berkeley) 1/11/94 + * Author: John S. Dyson + * Significant help during the development and debugging phases + * had been provided by David Greenman, also of the FreeBSD core team. 
*/ +#include "opt_bounce.h" + +#define VMIO #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysproto.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/proc.h> -#include <sys/buf.h> #include <sys/vnode.h> +#include <sys/vmmeter.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <vm/vm_kern.h> +#include <vm/vm_pageout.h> +#include <vm/vm_page.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/lock.h> +#include <vm/vm_map.h> +#include <sys/buf.h> #include <sys/mount.h> -#include <sys/trace.h> #include <sys/malloc.h> #include <sys/resourcevar.h> +#include <sys/proc.h> + +#include <miscfs/specfs/specdev.h> + +static void vfs_update __P((void)); +static struct proc *updateproc; +static struct kproc_desc up_kp = { + "update", + vfs_update, + &updateproc +}; +SYSINIT_KT(update, SI_SUB_KTHREAD_UPDATE, SI_ORDER_FIRST, kproc_start, &up_kp) + +struct buf *buf; /* buffer header pool */ +struct swqueue bswlist; + +int count_lock_queue __P((void)); +static void vm_hold_free_pages(struct buf * bp, vm_offset_t from, + vm_offset_t to); +static void vm_hold_load_pages(struct buf * bp, vm_offset_t from, + vm_offset_t to); +static void vfs_clean_pages(struct buf * bp); +static void vfs_setdirty(struct buf *bp); +static void vfs_vmio_release(struct buf *bp); + +int needsbuffer; /* - * Definitions for the buffer hash lists. + * Internal update daemon, process 3 + * The variable vfs_update_wakeup allows for internal syncs. */ -#define BUFHASH(dvp, lbn) \ - (&bufhashtbl[((int)(dvp) / sizeof(*(dvp)) + (int)(lbn)) & bufhash]) -LIST_HEAD(bufhashhdr, buf) *bufhashtbl, invalhash; -u_long bufhash; +int vfs_update_wakeup; + /* - * Insq/Remq for the buffer hash lists. + * buffers base kva */ -#define binshash(bp, dp) LIST_INSERT_HEAD(dp, bp, b_hash) -#define bremhash(bp) LIST_REMOVE(bp, b_hash) /* - * Definitions for the buffer free lists. 
+ * bogus page -- for I/O to/from partially complete buffers + * this is a temporary solution to the problem, but it is not + * really that bad. it would be better to split the buffer + * for input in the case of buffers partially already in memory, + * but the code is intricate enough already. */ -#define BQUEUES 4 /* number of free buffer queues */ +vm_page_t bogus_page; +static vm_offset_t bogus_offset; -#define BQ_LOCKED 0 /* super-blocks &c */ -#define BQ_LRU 1 /* lru, useful buffers */ -#define BQ_AGE 2 /* rubbish */ -#define BQ_EMPTY 3 /* buffer headers with no memory */ +static int bufspace, maxbufspace, vmiospace, maxvmiobufspace, + bufmallocspace, maxbufmallocspace; -TAILQ_HEAD(bqueues, buf) bufqueues[BQUEUES]; -int needbuffer; +static struct bufhashhdr bufhashtbl[BUFHSZ], invalhash; +static struct bqueues bufqueues[BUFFER_QUEUES]; -/* - * Insq/Remq for the buffer free lists. - */ -#define binsheadfree(bp, dp) TAILQ_INSERT_HEAD(dp, bp, b_freelist) -#define binstailfree(bp, dp) TAILQ_INSERT_TAIL(dp, bp, b_freelist) +extern int vm_swap_size; -void -bremfree(bp) - struct buf *bp; -{ - struct bqueues *dp = NULL; - - /* - * We only calculate the head of the freelist when removing - * the last element of the list as that is the only time that - * it is needed (e.g. to reset the tail pointer). - * - * NB: This makes an assumption about how tailq's are implemented. - */ - if (bp->b_freelist.tqe_next == NULL) { - for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) - if (dp->tqh_last == &bp->b_freelist.tqe_next) - break; - if (dp == &bufqueues[BQUEUES]) - panic("bremfree: lost tail"); - } - TAILQ_REMOVE(dp, bp, b_freelist); -} +#define BUF_MAXUSE 16 /* - * Initialize buffers and hash links for buffers. + * Initialize buffer headers and related structures. 
*/ void bufinit() { - register struct buf *bp; - struct bqueues *dp; - register int i; - int base, residual; - - for (dp = bufqueues; dp < &bufqueues[BQUEUES]; dp++) - TAILQ_INIT(dp); - bufhashtbl = hashinit(nbuf, M_CACHE, &bufhash); - base = bufpages / nbuf; - residual = bufpages % nbuf; + struct buf *bp; + int i; + + TAILQ_INIT(&bswlist); + LIST_INIT(&invalhash); + + /* first, make a null hash table */ + for (i = 0; i < BUFHSZ; i++) + LIST_INIT(&bufhashtbl[i]); + + /* next, make a null set of free lists */ + for (i = 0; i < BUFFER_QUEUES; i++) + TAILQ_INIT(&bufqueues[i]); + + /* finally, initialize each buffer header and stick on empty q */ for (i = 0; i < nbuf; i++) { bp = &buf[i]; - bzero((char *)bp, sizeof *bp); + bzero(bp, sizeof *bp); + bp->b_flags = B_INVAL; /* we're just an empty header */ bp->b_dev = NODEV; bp->b_rcred = NOCRED; bp->b_wcred = NOCRED; + bp->b_qindex = QUEUE_EMPTY; bp->b_vnbufs.le_next = NOLIST; - bp->b_data = buffers + i * MAXBSIZE; - if (i < residual) - bp->b_bufsize = (base + 1) * CLBYTES; - else - bp->b_bufsize = base * CLBYTES; - bp->b_flags = B_INVAL; - dp = bp->b_bufsize ? &bufqueues[BQ_AGE] : &bufqueues[BQ_EMPTY]; - binsheadfree(bp, dp); - binshash(bp, &invalhash); + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_EMPTY], bp, b_freelist); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); } +/* + * maxbufspace is currently calculated to support all filesystem blocks + * to be 8K. If you happen to use a 16K filesystem, the size of the buffer + * cache is still the same as it would be for 8K filesystems. This + * keeps the size of the buffer cache "in check" for big block filesystems. + */ + maxbufspace = (nbuf + 8) * DFLTBSIZE; +/* + * reserve 1/3 of the buffers for metadata (VDIR) which might not be VMIO'ed + */ + maxvmiobufspace = 2 * maxbufspace / 3; +/* + * Limit the amount of malloc memory since it is wired permanently into + * the kernel space. 
Even though this is accounted for in the buffer + * allocation, we don't want the malloced region to grow uncontrolled. + * The malloc scheme improves memory utilization significantly on average + * (small) directories. + */ + maxbufmallocspace = maxbufspace / 20; + + bogus_offset = kmem_alloc_pageable(kernel_map, PAGE_SIZE); + bogus_page = vm_page_alloc(kernel_object, + ((bogus_offset - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), + VM_ALLOC_NORMAL); + } -bread(a1, a2, a3, a4, a5) - struct vnode *a1; - daddr_t a2; - int a3; - struct ucred *a4; - struct buf **a5; +/* + * Free the kva allocation for a buffer + * Must be called only at splbio or higher, + * as this is the only locking for buffer_map. + */ +static void +bfreekva(struct buf * bp) { + if (bp->b_kvasize == 0) + return; + + vm_map_delete(buffer_map, + (vm_offset_t) bp->b_kvabase, + (vm_offset_t) bp->b_kvabase + bp->b_kvasize); + + bp->b_kvasize = 0; - /* - * Body deleted. - */ - return (EIO); } -breadn(a1, a2, a3, a4, a5, a6, a7, a8) - struct vnode *a1; - daddr_t a2; int a3; - daddr_t a4[]; int a5[]; - int a6; - struct ucred *a7; - struct buf **a8; +/* + * remove the buffer from the appropriate free list + */ +void +bremfree(struct buf * bp) { + int s = splbio(); - /* - * Body deleted. - */ - return (EIO); + if (bp->b_qindex != QUEUE_NONE) { + TAILQ_REMOVE(&bufqueues[bp->b_qindex], bp, b_freelist); + bp->b_qindex = QUEUE_NONE; + } else { + panic("bremfree: removing a buffer when not on a queue"); + } + splx(s); } -bwrite(a1) - struct buf *a1; +/* + * Get a buffer with the specified data. Look in the cache first. 
+ */ +int +bread(struct vnode * vp, daddr_t blkno, int size, struct ucred * cred, + struct buf ** bpp) { + struct buf *bp; + + bp = getblk(vp, blkno, size, 0, 0); + *bpp = bp; + + /* if not found in cache, do some I/O */ + if ((bp->b_flags & B_CACHE) == 0) { + if (curproc != NULL) + curproc->p_stats->p_ru.ru_inblock++; + bp->b_flags |= B_READ; + bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + if (bp->b_rcred == NOCRED) { + if (cred != NOCRED) + crhold(cred); + bp->b_rcred = cred; + } + vfs_busy_pages(bp, 0); + VOP_STRATEGY(bp); + return (biowait(bp)); + } + return (0); +} + +/* + * Operates like bread, but also starts asynchronous I/O on + * read-ahead blocks. + */ +int +breadn(struct vnode * vp, daddr_t blkno, int size, + daddr_t * rablkno, int *rabsize, + int cnt, struct ucred * cred, struct buf ** bpp) +{ + struct buf *bp, *rabp; + int i; + int rv = 0, readwait = 0; + + *bpp = bp = getblk(vp, blkno, size, 0, 0); + + /* if not found in cache, do some I/O */ + if ((bp->b_flags & B_CACHE) == 0) { + if (curproc != NULL) + curproc->p_stats->p_ru.ru_inblock++; + bp->b_flags |= B_READ; + bp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + if (bp->b_rcred == NOCRED) { + if (cred != NOCRED) + crhold(cred); + bp->b_rcred = cred; + } + vfs_busy_pages(bp, 0); + VOP_STRATEGY(bp); + ++readwait; + } + for (i = 0; i < cnt; i++, rablkno++, rabsize++) { + if (inmem(vp, *rablkno)) + continue; + rabp = getblk(vp, *rablkno, *rabsize, 0, 0); + + if ((rabp->b_flags & B_CACHE) == 0) { + if (curproc != NULL) + curproc->p_stats->p_ru.ru_inblock++; + rabp->b_flags |= B_READ | B_ASYNC; + rabp->b_flags &= ~(B_DONE | B_ERROR | B_INVAL); + if (rabp->b_rcred == NOCRED) { + if (cred != NOCRED) + crhold(cred); + rabp->b_rcred = cred; + } + vfs_busy_pages(rabp, 0); + VOP_STRATEGY(rabp); + } else { + brelse(rabp); + } + } + + if (readwait) { + rv = biowait(bp); + } + return (rv); +} + +/* + * Write, release buffer on completion. (Done by iodone + * if async.) 
+ */ +int +bwrite(struct buf * bp) +{ + int oldflags = bp->b_flags; + + if (bp->b_flags & B_INVAL) { + brelse(bp); + return (0); + } + if (!(bp->b_flags & B_BUSY)) + panic("bwrite: buffer is not busy???"); + + bp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); + bp->b_flags |= B_WRITEINPROG; + + if ((oldflags & (B_ASYNC|B_DELWRI)) == (B_ASYNC|B_DELWRI)) { + reassignbuf(bp, bp->b_vp); + } + + bp->b_vp->v_numoutput++; + vfs_busy_pages(bp, 1); + if (curproc != NULL) + curproc->p_stats->p_ru.ru_oublock++; + VOP_STRATEGY(bp); /* - * Body deleted. + * Handle ordered writes here. + * If the write was originally flagged as ordered, + * then we check to see if it was converted to async. + * If it was converted to async, and is done now, then + * we release the buffer. Otherwise we clear the + * ordered flag because it is not needed anymore. + * + * Note that biodone has been modified so that it does + * not release ordered buffers. This allows us to have + * a chance to determine whether or not the driver + * has set the async flag in the strategy routine. Otherwise + * if biodone was not modified, then the buffer may have been + * reused before we have had a chance to check the flag. */ - return (EIO); + + if ((oldflags & B_ORDERED) == B_ORDERED) { + int s; + s = splbio(); + if (bp->b_flags & B_ASYNC) { + if ((bp->b_flags & B_DONE)) { + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) + brelse(bp); + else + bqrelse(bp); + } + splx(s); + return (0); + } else { + bp->b_flags &= ~B_ORDERED; + } + splx(s); + } + + if ((oldflags & B_ASYNC) == 0) { + int rtval = biowait(bp); + + if (oldflags & B_DELWRI) { + reassignbuf(bp, bp->b_vp); + } + brelse(bp); + return (rtval); + } + return (0); } int @@ -185,155 +381,1566 @@ vn_bwrite(ap) return (bwrite(ap->a_bp)); } -bdwrite(a1) - struct buf *a1; +/* + * Delayed write. (Buffer is marked dirty). 
+ */ +void +bdwrite(struct buf * bp) { + if ((bp->b_flags & B_BUSY) == 0) { + panic("bdwrite: buffer is not busy"); + } + if (bp->b_flags & B_INVAL) { + brelse(bp); + return; + } + if (bp->b_flags & B_TAPE) { + bawrite(bp); + return; + } + bp->b_flags &= ~(B_READ|B_RELBUF); + if ((bp->b_flags & B_DELWRI) == 0) { + bp->b_flags |= B_DONE | B_DELWRI; + reassignbuf(bp, bp->b_vp); + } + /* - * Body deleted. + * This bmap keeps the system from needing to do the bmap later, + * perhaps when the system is attempting to do a sync. Since it + * is likely that the indirect block -- or whatever other datastructure + * that the filesystem needs is still in memory now, it is a good + * thing to do this. Note also, that if the pageout daemon is + * requesting a sync -- there might not be enough memory to do + * the bmap then... So, this is important to do. */ - return; -} + if( bp->b_lblkno == bp->b_blkno) { + VOP_BMAP(bp->b_vp, bp->b_lblkno, NULL, &bp->b_blkno, NULL, NULL); + } -bawrite(a1) - struct buf *a1; -{ + /* + * Set the *dirty* buffer range based upon the VM system dirty pages. + */ + vfs_setdirty(bp); /* - * Body deleted. + * We need to do this here to satisfy the vnode_pager and the + * pageout daemon, so that it thinks that the pages have been + * "cleaned". Note that since the pages are in a delayed write + * buffer -- the VFS layer "will" see that the pages get written + * out on the next sync, or perhaps the cluster will be completed. */ + vfs_clean_pages(bp); + bqrelse(bp); return; } -brelse(a1) - struct buf *a1; +/* + * Asynchronous write. + * Start output on a buffer, but do not wait for it to complete. + * The buffer is released when the output completes. + */ +void +bawrite(struct buf * bp) +{ + bp->b_flags |= B_ASYNC; + (void) VOP_BWRITE(bp); +} + +/* + * Ordered write. + * Start output on a buffer, but only wait for it to complete if the + * output device cannot guarantee ordering in some other way. 
Devices + * that can perform asynchronous ordered writes will set the B_ASYNC + * flag in their strategy routine. + * The buffer is released when the output completes. + */ +int +bowrite(struct buf * bp) { + bp->b_flags |= B_ORDERED; + return (VOP_BWRITE(bp)); +} + +/* + * Release a buffer. + */ +void +brelse(struct buf * bp) +{ + int s; + + if (bp->b_flags & B_CLUSTER) { + relpbuf(bp); + return; + } + /* anyone need a "free" block? */ + s = splbio(); + + /* anyone need this block? */ + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~(B_WANTED | B_AGE); + wakeup(bp); + } + + if (bp->b_flags & B_LOCKED) + bp->b_flags &= ~B_ERROR; + + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR)) || + (bp->b_bufsize <= 0)) { + bp->b_flags |= B_INVAL; + bp->b_flags &= ~(B_DELWRI | B_CACHE); + if (((bp->b_flags & B_VMIO) == 0) && bp->b_vp) { + if (bp->b_bufsize) + allocbuf(bp, 0); + brelvp(bp); + } + } /* - * Body deleted. + * VMIO buffer rundown. It is not very necessary to keep a VMIO buffer + * constituted, so the B_INVAL flag is used to *invalidate* the buffer, + * but the VM object is kept around. The B_NOCACHE flag is used to + * invalidate the pages in the VM object. 
*/ - return; + if (bp->b_flags & B_VMIO) { + vm_ooffset_t foff; + vm_object_t obj; + int i, resid; + vm_page_t m; + struct vnode *vp; + int iototal = bp->b_bufsize; + + vp = bp->b_vp; + if (!vp) + panic("brelse: missing vp"); + + if (bp->b_npages) { + vm_pindex_t poff; + obj = (vm_object_t) vp->v_object; + if (vp->v_type == VBLK) + foff = ((vm_ooffset_t) bp->b_lblkno) << DEV_BSHIFT; + else + foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; + poff = OFF_TO_IDX(foff); + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + if (m == bogus_page) { + m = vm_page_lookup(obj, poff + i); + if (!m) { + panic("brelse: page missing\n"); + } + bp->b_pages[i] = m; + pmap_qenter(trunc_page(bp->b_data), + bp->b_pages, bp->b_npages); + } + resid = IDX_TO_OFF(m->pindex+1) - foff; + if (resid > iototal) + resid = iototal; + if (resid > 0) { + /* + * Don't invalidate the page if the local machine has already + * modified it. This is the lesser of two evils, and should + * be fixed. 
+ */ + if (bp->b_flags & (B_NOCACHE | B_ERROR)) { + vm_page_test_dirty(m); + if (m->dirty == 0) { + vm_page_set_invalid(m, (vm_offset_t) foff, resid); + if (m->valid == 0) + vm_page_protect(m, VM_PROT_NONE); + } + } + if (resid >= PAGE_SIZE) { + if ((m->valid & VM_PAGE_BITS_ALL) != VM_PAGE_BITS_ALL) { + bp->b_flags |= B_INVAL; + } + } else { + if (!vm_page_is_valid(m, + (((vm_offset_t) bp->b_data) & PAGE_MASK), resid)) { + bp->b_flags |= B_INVAL; + } + } + } + foff += resid; + iototal -= resid; + } + } + if (bp->b_flags & (B_INVAL | B_RELBUF)) + vfs_vmio_release(bp); + } + if (bp->b_qindex != QUEUE_NONE) + panic("brelse: free buffer onto another queue???"); + + /* enqueue */ + /* buffers with no memory */ + if (bp->b_bufsize == 0) { + bp->b_qindex = QUEUE_EMPTY; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_EMPTY], bp, b_freelist); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + bp->b_dev = NODEV; + /* + * Get rid of the kva allocation *now* + */ + bfreekva(bp); + if (needsbuffer) { + wakeup(&needsbuffer); + needsbuffer=0; + } + /* buffers with junk contents */ + } else if (bp->b_flags & (B_ERROR | B_INVAL | B_NOCACHE | B_RELBUF)) { + bp->b_qindex = QUEUE_AGE; + TAILQ_INSERT_HEAD(&bufqueues[QUEUE_AGE], bp, b_freelist); + LIST_REMOVE(bp, b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + bp->b_dev = NODEV; + if (needsbuffer) { + wakeup(&needsbuffer); + needsbuffer=0; + } + /* buffers that are locked */ + } else if (bp->b_flags & B_LOCKED) { + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + /* buffers with stale but valid contents */ + } else if (bp->b_flags & B_AGE) { + bp->b_qindex = QUEUE_AGE; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_AGE], bp, b_freelist); + if (needsbuffer) { + wakeup(&needsbuffer); + needsbuffer=0; + } + /* buffers with valid and quite potentially reuseable contents */ + } else { + bp->b_qindex = QUEUE_LRU; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + if (needsbuffer) 
{ + wakeup(&needsbuffer); + needsbuffer=0; + } + } + + /* unlock */ + bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY | + B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + splx(s); } +/* + * Release a buffer. + */ +void +bqrelse(struct buf * bp) +{ + int s; + + s = splbio(); + + + /* anyone need this block? */ + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~(B_WANTED | B_AGE); + wakeup(bp); + } + + if (bp->b_qindex != QUEUE_NONE) + panic("bqrelse: free buffer onto another queue???"); + + if (bp->b_flags & B_LOCKED) { + bp->b_flags &= ~B_ERROR; + bp->b_qindex = QUEUE_LOCKED; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LOCKED], bp, b_freelist); + /* buffers with stale but valid contents */ + } else { + bp->b_qindex = QUEUE_LRU; + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + if (needsbuffer) { + wakeup(&needsbuffer); + needsbuffer=0; + } + } + + /* unlock */ + bp->b_flags &= ~(B_ORDERED | B_WANTED | B_BUSY | + B_ASYNC | B_NOCACHE | B_AGE | B_RELBUF); + splx(s); +} + +static void +vfs_vmio_release(bp) + struct buf *bp; +{ + int i; + vm_page_t m; + + for (i = 0; i < bp->b_npages; i++) { + m = bp->b_pages[i]; + bp->b_pages[i] = NULL; + vm_page_unwire(m); + /* + * We don't mess with busy pages, it is + * the responsibility of the process that + * busied the pages to deal with them. + */ + if ((m->flags & PG_BUSY) || (m->busy != 0)) + continue; + + if (m->wire_count == 0) { + + if (m->flags & PG_WANTED) { + m->flags &= ~PG_WANTED; + wakeup(m); + } + + /* + * If this is an async free -- we cannot place + * pages onto the cache queue, so our policy for + * such buffers is to avoid the cache queue, and + * only modify the active queue or free queue. + */ + if ((bp->b_flags & B_ASYNC) == 0) { + + /* + * In the case of sync buffer frees, we can do pretty much + * anything to any of the memory queues. Specifically, + * the cache queue is free to be modified. 
+ */ + if (m->valid) { + if(m->dirty == 0) + vm_page_test_dirty(m); + /* + * this keeps pressure off of the process memory + */ + if ((vm_swap_size == 0) || + (cnt.v_free_count < cnt.v_free_min)) { + if ((m->dirty == 0) && + (m->hold_count == 0)) + vm_page_cache(m); + else + vm_page_deactivate(m); + } + } else if (m->hold_count == 0) { + vm_page_protect(m, VM_PROT_NONE); + vm_page_free(m); + } + } else { + /* + * If async, then at least we clear the + * act_count. + */ + m->act_count = 0; + } + } + } + bufspace -= bp->b_bufsize; + vmiospace -= bp->b_bufsize; + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); + bp->b_npages = 0; + bp->b_bufsize = 0; + bp->b_flags &= ~B_VMIO; + if (bp->b_vp) + brelvp(bp); +} + +/* + * Check to see if a block is currently memory resident. + */ struct buf * -incore(a1, a2) - struct vnode *a1; - daddr_t a2; +gbincore(struct vnode * vp, daddr_t blkno) { + struct buf *bp; + struct bufhashhdr *bh; + bh = BUFHASH(vp, blkno); + bp = bh->lh_first; + + /* Search hash chain */ + while (bp != NULL) { + /* hit */ + if (bp->b_vp == vp && bp->b_lblkno == blkno && + (bp->b_flags & B_INVAL) == 0) { + break; + } + bp = bp->b_hash.le_next; + } + return (bp); +} + +/* + * this routine implements clustered async writes for + * clearing out B_DELWRI buffers... This is much better + * than the old way of writing only one buffer at a time. + */ +int +vfs_bio_awrite(struct buf * bp) +{ + int i; + daddr_t lblkno = bp->b_lblkno; + struct vnode *vp = bp->b_vp; + int s; + int ncl; + struct buf *bpa; + int nwritten; + + s = splbio(); /* - * Body deleted. 
+ * right now we support clustered writing only to regular files */ - return (0); + if ((vp->v_type == VREG) && + (vp->v_mount != 0) && /* Only on nodes that have the size info */ + (bp->b_flags & (B_CLUSTEROK | B_INVAL)) == B_CLUSTEROK) { + int size; + int maxcl; + + size = vp->v_mount->mnt_stat.f_iosize; + maxcl = MAXPHYS / size; + + for (i = 1; i < maxcl; i++) { + if ((bpa = gbincore(vp, lblkno + i)) && + ((bpa->b_flags & (B_BUSY | B_DELWRI | B_CLUSTEROK | B_INVAL)) == + (B_DELWRI | B_CLUSTEROK)) && + (bpa->b_bufsize == size)) { + if ((bpa->b_blkno == bpa->b_lblkno) || + (bpa->b_blkno != bp->b_blkno + ((i * size) >> DEV_BSHIFT))) + break; + } else { + break; + } + } + ncl = i; + /* + * this is a possible cluster write + */ + if (ncl != 1) { + nwritten = cluster_wbuild(vp, size, lblkno, ncl); + splx(s); + return nwritten; + } + } + bremfree(bp); + splx(s); + /* + * default (old) behavior, writing out only one block + */ + bp->b_flags |= B_BUSY | B_ASYNC; + nwritten = bp->b_bufsize; + (void) VOP_BWRITE(bp); + return nwritten; } -struct buf * -getblk(a1, a2, a3, a4, a5) - struct vnode *a1; - daddr_t a2; - int a3, a4, a5; + +/* + * Find a buffer header which is available for use. + */ +static struct buf * +getnewbuf(int slpflag, int slptimeo, int size, int maxsize) { + struct buf *bp; + int nbyteswritten = 0; + vm_offset_t addr; + +start: + if (bufspace >= maxbufspace) + goto trytofreespace; + + /* can we constitute a new buffer? */ + if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_EMPTY]))) { + if (bp->b_qindex != QUEUE_EMPTY) + panic("getnewbuf: inconsistent EMPTY queue, qindex=%d", + bp->b_qindex); + bp->b_flags |= B_BUSY; + bremfree(bp); + goto fillbuf; + } +trytofreespace: + /* + * We keep the file I/O from hogging metadata I/O + * This is desirable because file data is cached in the + * VM/Buffer cache even if a buffer is freed. 
+ */ + if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_AGE]))) { + if (bp->b_qindex != QUEUE_AGE) + panic("getnewbuf: inconsistent AGE queue, qindex=%d", + bp->b_qindex); + } else if ((bp = TAILQ_FIRST(&bufqueues[QUEUE_LRU]))) { + if (bp->b_qindex != QUEUE_LRU) + panic("getnewbuf: inconsistent LRU queue, qindex=%d", + bp->b_qindex); + } + if (!bp) { + /* wait for a free buffer of any kind */ + needsbuffer = 1; + tsleep(&needsbuffer, + (PRIBIO + 1) | slpflag, "newbuf", slptimeo); + return (0); + } + +#if defined(DIAGNOSTIC) + if (bp->b_flags & B_BUSY) { + panic("getnewbuf: busy buffer on free list\n"); + } +#endif /* - * Body deleted. + * We are fairly aggressive about freeing VMIO buffers, but since + * the buffering is intact without buffer headers, there is not + * much loss. We gain by maintaining non-VMIOed metadata in buffers. */ - return ((struct buf *)0); + if ((bp->b_qindex == QUEUE_LRU) && (bp->b_usecount > 0)) { + if ((bp->b_flags & B_VMIO) == 0 || + (vmiospace < maxvmiobufspace)) { + --bp->b_usecount; + TAILQ_REMOVE(&bufqueues[QUEUE_LRU], bp, b_freelist); + if (TAILQ_FIRST(&bufqueues[QUEUE_LRU]) != NULL) { + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + goto start; + } + TAILQ_INSERT_TAIL(&bufqueues[QUEUE_LRU], bp, b_freelist); + } + } + + /* if we are a delayed write, convert to an async write */ + if ((bp->b_flags & (B_DELWRI | B_INVAL)) == B_DELWRI) { + nbyteswritten += vfs_bio_awrite(bp); + if (!slpflag && !slptimeo) { + return (0); + } + goto start; + } + + if (bp->b_flags & B_WANTED) { + bp->b_flags &= ~B_WANTED; + wakeup(bp); + } + bremfree(bp); + bp->b_flags |= B_BUSY; + + if (bp->b_flags & B_VMIO) { + bp->b_flags &= ~B_ASYNC; + vfs_vmio_release(bp); + } + + if (bp->b_vp) + brelvp(bp); + +fillbuf: + /* we are not free, nor do we contain interesting data */ + if (bp->b_rcred != NOCRED) { + crfree(bp->b_rcred); + bp->b_rcred = NOCRED; + } + if (bp->b_wcred != NOCRED) { + crfree(bp->b_wcred); + bp->b_wcred = NOCRED; + } + + LIST_REMOVE(bp, 
b_hash); + LIST_INSERT_HEAD(&invalhash, bp, b_hash); + if (bp->b_bufsize) { + allocbuf(bp, 0); + } + bp->b_flags = B_BUSY; + bp->b_dev = NODEV; + bp->b_vp = NULL; + bp->b_blkno = bp->b_lblkno = 0; + bp->b_iodone = 0; + bp->b_error = 0; + bp->b_resid = 0; + bp->b_bcount = 0; + bp->b_npages = 0; + bp->b_dirtyoff = bp->b_dirtyend = 0; + bp->b_validoff = bp->b_validend = 0; + bp->b_usecount = 4; + + maxsize = (maxsize + PAGE_MASK) & ~PAGE_MASK; + + /* + * we assume that buffer_map is not at address 0 + */ + addr = 0; + if (maxsize != bp->b_kvasize) { + bfreekva(bp); + + /* + * See if we have buffer kva space + */ + if (vm_map_findspace(buffer_map, + vm_map_min(buffer_map), maxsize, &addr)) { + bp->b_flags |= B_INVAL; + brelse(bp); + goto trytofreespace; + } + } + + /* + * See if we are below are allocated minimum + */ + if (bufspace >= (maxbufspace + nbyteswritten)) { + bp->b_flags |= B_INVAL; + brelse(bp); + goto trytofreespace; + } + + /* + * create a map entry for the buffer -- in essence + * reserving the kva space. + */ + if (addr) { + vm_map_insert(buffer_map, NULL, 0, + addr, addr + maxsize, + VM_PROT_ALL, VM_PROT_ALL, MAP_NOFAULT); + + bp->b_kvabase = (caddr_t) addr; + bp->b_kvasize = maxsize; + } + bp->b_data = bp->b_kvabase; + + return (bp); } +/* + * Check to see if a block is currently memory resident. + */ struct buf * -geteblk(a1) - int a1; +incore(struct vnode * vp, daddr_t blkno) { + struct buf *bp; - /* - * Body deleted. - */ - return ((struct buf *)0); + int s = splbio(); + bp = gbincore(vp, blkno); + splx(s); + return (bp); } -allocbuf(a1, a2) - struct buf *a1; - int a2; +/* + * Returns true if no I/O is needed to access the + * associated VM object. This is like incore except + * it also hunts around in the VM system for the data. 
+ */ + +int +inmem(struct vnode * vp, daddr_t blkno) { + vm_object_t obj; + vm_offset_t toff, tinc; + vm_page_t m; + vm_ooffset_t off; + + if (incore(vp, blkno)) + return 1; + if (vp->v_mount == NULL) + return 0; + if ((vp->v_object == NULL) || (vp->v_flag & VVMIO) == 0) + return 0; + + obj = vp->v_object; + tinc = PAGE_SIZE; + if (tinc > vp->v_mount->mnt_stat.f_iosize) + tinc = vp->v_mount->mnt_stat.f_iosize; + off = blkno * vp->v_mount->mnt_stat.f_iosize; + + for (toff = 0; toff < vp->v_mount->mnt_stat.f_iosize; toff += tinc) { + + m = vm_page_lookup(obj, OFF_TO_IDX(off + toff)); + if (!m) + return 0; + if (vm_page_is_valid(m, (vm_offset_t) (toff + off), tinc) == 0) + return 0; + } + return 1; +} +/* + * now we set the dirty range for the buffer -- + * for NFS -- if the file is mapped and pages have + * been written to, let it know. We want the + * entire range of the buffer to be marked dirty if + * any of the pages have been written to for consistancy + * with the b_validoff, b_validend set in the nfs write + * code, and used by the nfs read code. + */ +static void +vfs_setdirty(struct buf *bp) { + int i; + vm_object_t object; + vm_offset_t boffset, offset; /* - * Body deleted. + * We qualify the scan for modified pages on whether the + * object has been flushed yet. The OBJ_WRITEABLE flag + * is not cleared simply by protecting pages off. */ - return (0); + if ((bp->b_flags & B_VMIO) && + ((object = bp->b_pages[0]->object)->flags & (OBJ_WRITEABLE|OBJ_CLEANING))) { + /* + * test the pages to see if they have been modified directly + * by users through the VM system. 
+ */ + for (i = 0; i < bp->b_npages; i++) + vm_page_test_dirty(bp->b_pages[i]); + + /* + * scan forwards for the first page modified + */ + for (i = 0; i < bp->b_npages; i++) { + if (bp->b_pages[i]->dirty) { + break; + } + } + boffset = (i << PAGE_SHIFT); + if (boffset < bp->b_dirtyoff) { + bp->b_dirtyoff = boffset; + } + + /* + * scan backwards for the last page modified + */ + for (i = bp->b_npages - 1; i >= 0; --i) { + if (bp->b_pages[i]->dirty) { + break; + } + } + boffset = (i + 1); + offset = boffset + bp->b_pages[0]->pindex; + if (offset >= object->size) + boffset = object->size - bp->b_pages[0]->pindex; + if (bp->b_dirtyend < (boffset << PAGE_SHIFT)) + bp->b_dirtyend = (boffset << PAGE_SHIFT); + } } +/* + * Get a block given a specified block and offset into a file/device. + */ struct buf * -getnewbuf(a1, a2) - int a1, a2; +getblk(struct vnode * vp, daddr_t blkno, int size, int slpflag, int slptimeo) { + struct buf *bp; + int s; + struct bufhashhdr *bh; + int maxsize; - /* - * Body deleted. - */ - return ((struct buf *)0); + if (vp->v_mount) { + maxsize = vp->v_mount->mnt_stat.f_iosize; + /* + * This happens on mount points. + */ + if (maxsize < size) + maxsize = size; + } else { + maxsize = size; + } + + if (size > MAXBSIZE) + panic("getblk: size(%d) > MAXBSIZE(%d)\n", size, MAXBSIZE); + + s = splbio(); +loop: + if ((bp = gbincore(vp, blkno))) { + if (bp->b_flags & B_BUSY) { + bp->b_flags |= B_WANTED; + if (bp->b_usecount < BUF_MAXUSE) + ++bp->b_usecount; + if (!tsleep(bp, + (PRIBIO + 1) | slpflag, "getblk", slptimeo)) + goto loop; + + splx(s); + return (struct buf *) NULL; + } + bp->b_flags |= B_BUSY | B_CACHE; + bremfree(bp); + + /* + * check for size inconsistancies (note that they shouldn't happen + * but do when filesystems don't handle the size changes correctly.) + * We are conservative on metadata and don't just extend the buffer + * but write and re-constitute it. 
+ */ + + if (bp->b_bcount != size) { + if ((bp->b_flags & B_VMIO) && (size <= bp->b_kvasize)) { + allocbuf(bp, size); + } else { + bp->b_flags |= B_NOCACHE; + VOP_BWRITE(bp); + goto loop; + } + } + + if (bp->b_usecount < BUF_MAXUSE) + ++bp->b_usecount; + splx(s); + return (bp); + } else { + vm_object_t obj; + + if ((bp = getnewbuf(slpflag, slptimeo, size, maxsize)) == 0) { + if (slpflag || slptimeo) { + splx(s); + return NULL; + } + goto loop; + } + + /* + * This code is used to make sure that a buffer is not + * created while the getnewbuf routine is blocked. + * Normally the vnode is locked so this isn't a problem. + * VBLK type I/O requests, however, don't lock the vnode. + */ + if (!VOP_ISLOCKED(vp) && gbincore(vp, blkno)) { + bp->b_flags |= B_INVAL; + brelse(bp); + goto loop; + } + + /* + * Insert the buffer into the hash, so that it can + * be found by incore. + */ + bp->b_blkno = bp->b_lblkno = blkno; + bgetvp(vp, bp); + LIST_REMOVE(bp, b_hash); + bh = BUFHASH(vp, blkno); + LIST_INSERT_HEAD(bh, bp, b_hash); + + if ((obj = vp->v_object) && (vp->v_flag & VVMIO)) { + bp->b_flags |= (B_VMIO | B_CACHE); +#if defined(VFS_BIO_DEBUG) + if (vp->v_type != VREG && vp->v_type != VBLK) + printf("getblk: vmioing file type %d???\n", vp->v_type); +#endif + } else { + bp->b_flags &= ~B_VMIO; + } + splx(s); + + allocbuf(bp, size); +#ifdef PC98 + /* + * 1024byte/sector support + */ +#define B_XXX2 0x8000000 + if (vp->v_flag & 0x10000) bp->b_flags |= B_XXX2; +#endif + return (bp); + } } -biowait(a1) - struct buf *a1; +/* + * Get an empty, disassociated buffer of given size. + */ +struct buf * +geteblk(int size) { + struct buf *bp; + int s; - /* - * Body deleted. 
- */ - return (EIO); + s = splbio(); + while ((bp = getnewbuf(0, 0, size, MAXBSIZE)) == 0); + splx(s); + allocbuf(bp, size); + bp->b_flags |= B_INVAL; + return (bp); } + +/* + * This code constitutes the buffer memory from either anonymous system + * memory (in the case of non-VMIO operations) or from an associated + * VM object (in the case of VMIO operations). + * + * Note that this code is tricky, and has many complications to resolve + * deadlock or inconsistant data situations. Tread lightly!!! + * + * Modify the length of a buffer's underlying buffer storage without + * destroying information (unless, of course the buffer is shrinking). + */ +int +allocbuf(struct buf * bp, int size) +{ + + int s; + int newbsize, mbsize; + int i; + + if (!(bp->b_flags & B_BUSY)) + panic("allocbuf: buffer not busy"); + + if (bp->b_kvasize < size) + panic("allocbuf: buffer too small"); + + if ((bp->b_flags & B_VMIO) == 0) { + caddr_t origbuf; + int origbufsize; + /* + * Just get anonymous memory from the kernel + */ + mbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); +#if !defined(NO_B_MALLOC) + if (bp->b_flags & B_MALLOC) + newbsize = mbsize; + else +#endif + newbsize = round_page(size); + + if (newbsize < bp->b_bufsize) { +#if !defined(NO_B_MALLOC) + /* + * malloced buffers are not shrunk + */ + if (bp->b_flags & B_MALLOC) { + if (newbsize) { + bp->b_bcount = size; + } else { + free(bp->b_data, M_BIOBUF); + bufspace -= bp->b_bufsize; + bufmallocspace -= bp->b_bufsize; + bp->b_data = bp->b_kvabase; + bp->b_bufsize = 0; + bp->b_bcount = 0; + bp->b_flags &= ~B_MALLOC; + } + return 1; + } +#endif + vm_hold_free_pages( + bp, + (vm_offset_t) bp->b_data + newbsize, + (vm_offset_t) bp->b_data + bp->b_bufsize); + } else if (newbsize > bp->b_bufsize) { +#if !defined(NO_B_MALLOC) + /* + * We only use malloced memory on the first allocation. + * and revert to page-allocated memory when the buffer grows. 
+ */ + if ( (bufmallocspace < maxbufmallocspace) && + (bp->b_bufsize == 0) && + (mbsize <= PAGE_SIZE/2)) { + + bp->b_data = malloc(mbsize, M_BIOBUF, M_WAITOK); + bp->b_bufsize = mbsize; + bp->b_bcount = size; + bp->b_flags |= B_MALLOC; + bufspace += mbsize; + bufmallocspace += mbsize; + return 1; + } +#endif + origbuf = NULL; + origbufsize = 0; +#if !defined(NO_B_MALLOC) + /* + * If the buffer is growing on it's other-than-first allocation, + * then we revert to the page-allocation scheme. + */ + if (bp->b_flags & B_MALLOC) { + origbuf = bp->b_data; + origbufsize = bp->b_bufsize; + bp->b_data = bp->b_kvabase; + bufspace -= bp->b_bufsize; + bufmallocspace -= bp->b_bufsize; + bp->b_bufsize = 0; + bp->b_flags &= ~B_MALLOC; + newbsize = round_page(newbsize); + } +#endif + vm_hold_load_pages( + bp, + (vm_offset_t) bp->b_data + bp->b_bufsize, + (vm_offset_t) bp->b_data + newbsize); +#if !defined(NO_B_MALLOC) + if (origbuf) { + bcopy(origbuf, bp->b_data, origbufsize); + free(origbuf, M_BIOBUF); + } +#endif + } + } else { + vm_page_t m; + int desiredpages; + + newbsize = (size + DEV_BSIZE - 1) & ~(DEV_BSIZE - 1); + desiredpages = (round_page(newbsize) >> PAGE_SHIFT); + +#if !defined(NO_B_MALLOC) + if (bp->b_flags & B_MALLOC) + panic("allocbuf: VMIO buffer can't be malloced"); +#endif + + if (newbsize < bp->b_bufsize) { + if (desiredpages < bp->b_npages) { + for (i = desiredpages; i < bp->b_npages; i++) { + /* + * the page is not freed here -- it + * is the responsibility of vnode_pager_setsize + */ + m = bp->b_pages[i]; +#if defined(DIAGNOSTIC) + if (m == bogus_page) + panic("allocbuf: bogus page found"); +#endif + s = splvm(); + while ((m->flags & PG_BUSY) || (m->busy != 0)) { + m->flags |= PG_WANTED; + tsleep(m, PVM, "biodep", 0); + } + splx(s); + + bp->b_pages[i] = NULL; + vm_page_unwire(m); + } + pmap_qremove((vm_offset_t) trunc_page(bp->b_data) + + (desiredpages << PAGE_SHIFT), (bp->b_npages - desiredpages)); + bp->b_npages = desiredpages; + } + } else if (newbsize > 
bp->b_bufsize) { + vm_object_t obj; + vm_offset_t tinc, toff; + vm_ooffset_t off; + vm_pindex_t objoff; + int pageindex, curbpnpages; + struct vnode *vp; + int bsize; + + vp = bp->b_vp; + + if (vp->v_type == VBLK) + bsize = DEV_BSIZE; + else + bsize = vp->v_mount->mnt_stat.f_iosize; + + if (bp->b_npages < desiredpages) { + obj = vp->v_object; + tinc = PAGE_SIZE; + if (tinc > bsize) + tinc = bsize; + off = (vm_ooffset_t) bp->b_lblkno * bsize; + curbpnpages = bp->b_npages; + doretry: + bp->b_flags |= B_CACHE; + for (toff = 0; toff < newbsize; toff += tinc) { + int bytesinpage; + + pageindex = toff >> PAGE_SHIFT; + objoff = OFF_TO_IDX(off + toff); + if (pageindex < curbpnpages) { + + m = bp->b_pages[pageindex]; +#ifdef VFS_BIO_DIAG + if (m->pindex != objoff) + panic("allocbuf: page changed offset??!!!?"); +#endif + bytesinpage = tinc; + if (tinc > (newbsize - toff)) + bytesinpage = newbsize - toff; + if ((bp->b_flags & B_CACHE) && + !vm_page_is_valid(m, + (vm_offset_t) ((toff + off) & PAGE_MASK), + bytesinpage)) { + bp->b_flags &= ~B_CACHE; + } + continue; + } + m = vm_page_lookup(obj, objoff); + if (!m) { + m = vm_page_alloc(obj, objoff, VM_ALLOC_NORMAL); + if (!m) { + VM_WAIT; + goto doretry; + } + /* + * Normally it is unwise to clear PG_BUSY without + * PAGE_WAKEUP -- but it is okay here, as there is + * no chance for blocking between here and vm_page_alloc + */ + m->flags &= ~PG_BUSY; + vm_page_wire(m); + bp->b_flags &= ~B_CACHE; + } else if (m->flags & PG_BUSY) { + s = splvm(); + if (m->flags & PG_BUSY) { + m->flags |= PG_WANTED; + tsleep(m, PVM, "pgtblk", 0); + } + splx(s); + goto doretry; + } else { + if ((curproc != pageproc) && + ((m->queue - m->pc) == PQ_CACHE) && + ((cnt.v_free_count + cnt.v_cache_count) < + (cnt.v_free_min + cnt.v_cache_min))) { + pagedaemon_wakeup(); + } + bytesinpage = tinc; + if (tinc > (newbsize - toff)) + bytesinpage = newbsize - toff; + if ((bp->b_flags & B_CACHE) && + !vm_page_is_valid(m, + (vm_offset_t) ((toff + off) & PAGE_MASK), 
+ bytesinpage)) { + bp->b_flags &= ~B_CACHE; + } + vm_page_wire(m); + } + bp->b_pages[pageindex] = m; + curbpnpages = pageindex + 1; + } + bp->b_data = (caddr_t) trunc_page(bp->b_data); + bp->b_npages = curbpnpages; + pmap_qenter((vm_offset_t) bp->b_data, + bp->b_pages, bp->b_npages); + ((vm_offset_t) bp->b_data) |= off & PAGE_MASK; + } + } + } + if (bp->b_flags & B_VMIO) + vmiospace += bp->b_bufsize; + bufspace += (newbsize - bp->b_bufsize); + bp->b_bufsize = newbsize; + bp->b_bcount = size; + return 1; +} + +/* + * Wait for buffer I/O completion, returning error status. + */ +int +biowait(register struct buf * bp) +{ + int s; + + s = splbio(); + while ((bp->b_flags & B_DONE) == 0) + tsleep(bp, PRIBIO, "biowait", 0); + splx(s); + if (bp->b_flags & B_EINTR) { + bp->b_flags &= ~B_EINTR; + return (EINTR); + } + if (bp->b_flags & B_ERROR) { + return (bp->b_error ? bp->b_error : EIO); + } else { + return (0); + } +} + +/* + * Finish I/O on a buffer, calling an optional function. + * This is usually called from interrupt level, so process blocking + * is not *a good idea*. 
+ */ void -biodone(a1) - struct buf *a1; +biodone(register struct buf * bp) { + int s; + + s = splbio(); + if (!(bp->b_flags & B_BUSY)) + panic("biodone: buffer not busy"); + + if (bp->b_flags & B_DONE) { + splx(s); + printf("biodone: buffer already done\n"); + return; + } + bp->b_flags |= B_DONE; + + if ((bp->b_flags & B_READ) == 0) { + vwakeup(bp); + } +#ifdef BOUNCE_BUFFERS + if (bp->b_flags & B_BOUNCE) + vm_bounce_free(bp); +#endif + + /* call optional completion function if requested */ + if (bp->b_flags & B_CALL) { + bp->b_flags &= ~B_CALL; + (*bp->b_iodone) (bp); + splx(s); + return; + } + if (bp->b_flags & B_VMIO) { + int i, resid; + vm_ooffset_t foff; + vm_page_t m; + vm_object_t obj; + int iosize; + struct vnode *vp = bp->b_vp; + + if (vp->v_type == VBLK) + foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; + else + foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; + obj = vp->v_object; + if (!obj) { + panic("biodone: no object"); + } +#if defined(VFS_BIO_DEBUG) + if (obj->paging_in_progress < bp->b_npages) { + printf("biodone: paging in progress(%d) < bp->b_npages(%d)\n", + obj->paging_in_progress, bp->b_npages); + } +#endif + iosize = bp->b_bufsize; + for (i = 0; i < bp->b_npages; i++) { + int bogusflag = 0; + m = bp->b_pages[i]; + if (m == bogus_page) { + bogusflag = 1; + m = vm_page_lookup(obj, OFF_TO_IDX(foff)); + if (!m) { +#if defined(VFS_BIO_DEBUG) + printf("biodone: page disappeared\n"); +#endif + --obj->paging_in_progress; + continue; + } + bp->b_pages[i] = m; + pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); + } +#if defined(VFS_BIO_DEBUG) + if (OFF_TO_IDX(foff) != m->pindex) { + printf("biodone: foff(%d)/m->pindex(%d) mismatch\n", foff, m->pindex); + } +#endif + resid = IDX_TO_OFF(m->pindex + 1) - foff; + if (resid > iosize) + resid = iosize; + /* + * In the write case, the valid and clean bits are + * already changed correctly, so we only need to do this + * here in the read case. 
+ */ + if ((bp->b_flags & B_READ) && !bogusflag && resid > 0) { + vm_page_set_validclean(m, + (vm_offset_t) (foff & PAGE_MASK), resid); + } + /* + * when debugging new filesystems or buffer I/O methods, this + * is the most common error that pops up. if you see this, you + * have not set the page busy flag correctly!!! + */ + if (m->busy == 0) { + printf("biodone: page busy < 0, " + "pindex: %d, foff: 0x(%x,%x), " + "resid: %d, index: %d\n", + (int) m->pindex, (int)(foff >> 32), + (int) foff & 0xffffffff, resid, i); + if (vp->v_type != VBLK) + printf(" iosize: %ld, lblkno: %d, flags: 0x%lx, npages: %d\n", + bp->b_vp->v_mount->mnt_stat.f_iosize, + (int) bp->b_lblkno, + bp->b_flags, bp->b_npages); + else + printf(" VDEV, lblkno: %d, flags: 0x%lx, npages: %d\n", + (int) bp->b_lblkno, + bp->b_flags, bp->b_npages); + printf(" valid: 0x%x, dirty: 0x%x, wired: %d\n", + m->valid, m->dirty, m->wire_count); + panic("biodone: page busy < 0\n"); + } + --m->busy; + if ((m->busy == 0) && (m->flags & PG_WANTED)) { + m->flags &= ~PG_WANTED; + wakeup(m); + } + --obj->paging_in_progress; + foff += resid; + iosize -= resid; + } + if (obj && obj->paging_in_progress == 0 && + (obj->flags & OBJ_PIPWNT)) { + obj->flags &= ~OBJ_PIPWNT; + wakeup(obj); + } + } /* - * Body deleted. + * For asynchronous completions, release the buffer now. The brelse + * checks for B_WANTED and will do the wakeup there if necessary - so + * no need to do a wakeup here in the async case. */ - return; + + if (bp->b_flags & B_ASYNC) { + if ((bp->b_flags & B_ORDERED) == 0) { + if ((bp->b_flags & (B_NOCACHE | B_INVAL | B_ERROR | B_RELBUF)) != 0) + brelse(bp); + else + bqrelse(bp); + } + } else { + bp->b_flags &= ~B_WANTED; + wakeup(bp); + } + splx(s); } int count_lock_queue() { + int count; + struct buf *bp; - /* - * Body deleted. 
- */ - return (0); + count = 0; + for (bp = TAILQ_FIRST(&bufqueues[QUEUE_LOCKED]); + bp != NULL; + bp = TAILQ_NEXT(bp, b_freelist)) + count++; + return (count); +} + +int vfs_update_interval = 30; + +static void +vfs_update() +{ + while (1) { + tsleep(&vfs_update_wakeup, PUSER, "update", + hz * vfs_update_interval); + vfs_update_wakeup = 0; + sync(curproc, NULL, NULL); + } } -#ifdef DIAGNOSTIC +static int +sysctl_kern_updateinterval SYSCTL_HANDLER_ARGS +{ + int error = sysctl_handle_int(oidp, + oidp->oid_arg1, oidp->oid_arg2, req); + if (!error) + wakeup(&vfs_update_wakeup); + return error; +} + +SYSCTL_PROC(_kern, KERN_UPDATEINTERVAL, update, CTLTYPE_INT|CTLFLAG_RW, + &vfs_update_interval, 0, sysctl_kern_updateinterval, "I", ""); + + /* - * Print out statistics on the current allocation of the buffer pool. - * Can be enabled to print out on every ``sync'' by setting "syncprt" - * in vfs_syscalls.c using sysctl. + * This routine is called in lieu of iodone in the case of + * incomplete I/O. This keeps the busy status for pages + * consistant. 
*/ void -vfs_bufstats() +vfs_unbusy_pages(struct buf * bp) { - int s, i, j, count; - register struct buf *bp; - register struct bqueues *dp; - int counts[MAXBSIZE/CLBYTES+1]; - static char *bname[BQUEUES] = { "LOCKED", "LRU", "AGE", "EMPTY" }; - - for (dp = bufqueues, i = 0; dp < &bufqueues[BQUEUES]; dp++, i++) { - count = 0; - for (j = 0; j <= MAXBSIZE/CLBYTES; j++) - counts[j] = 0; - s = splbio(); - for (bp = dp->tqh_first; bp; bp = bp->b_freelist.tqe_next) { - counts[bp->b_bufsize/CLBYTES]++; - count++; + int i; + + if (bp->b_flags & B_VMIO) { + struct vnode *vp = bp->b_vp; + vm_object_t obj = vp->v_object; + vm_ooffset_t foff; + + foff = (vm_ooffset_t) vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; + + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + + if (m == bogus_page) { + m = vm_page_lookup(obj, OFF_TO_IDX(foff) + i); + if (!m) { + panic("vfs_unbusy_pages: page missing\n"); + } + bp->b_pages[i] = m; + pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); + } + --obj->paging_in_progress; + --m->busy; + if ((m->busy == 0) && (m->flags & PG_WANTED)) { + m->flags &= ~PG_WANTED; + wakeup(m); + } + } + if (obj->paging_in_progress == 0 && + (obj->flags & OBJ_PIPWNT)) { + obj->flags &= ~OBJ_PIPWNT; + wakeup(obj); + } + } +} + +/* + * This routine is called before a device strategy routine. + * It is used to tell the VM system that paging I/O is in + * progress, and treat the pages associated with the buffer + * almost as being PG_BUSY. Also the object paging_in_progress + * flag is handled to make sure that the object doesn't become + * inconsistant. 
+ */ +void +vfs_busy_pages(struct buf * bp, int clear_modify) +{ + int i; + + if (bp->b_flags & B_VMIO) { + vm_object_t obj = bp->b_vp->v_object; + vm_ooffset_t foff; + int iocount = bp->b_bufsize; + + if (bp->b_vp->v_type == VBLK) + foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; + else + foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; + vfs_setdirty(bp); + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + int resid = IDX_TO_OFF(m->pindex + 1) - foff; + + if (resid > iocount) + resid = iocount; + if ((bp->b_flags & B_CLUSTER) == 0) { + obj->paging_in_progress++; + m->busy++; + } + vm_page_protect(m, VM_PROT_NONE); + if (clear_modify) { + vm_page_set_validclean(m, + (vm_offset_t) (foff & PAGE_MASK), resid); + } else if (bp->b_bcount >= PAGE_SIZE) { + if (m->valid && (bp->b_flags & B_CACHE) == 0) { + bp->b_pages[i] = bogus_page; + pmap_qenter(trunc_page(bp->b_data), bp->b_pages, bp->b_npages); + } + } + foff += resid; + iocount -= resid; + } + } +} + +/* + * Tell the VM system that the pages associated with this buffer + * are clean. This is used for delayed writes where the data is + * going to go to disk eventually without additional VM intevention. 
+ */ +void +vfs_clean_pages(struct buf * bp) +{ + int i; + + if (bp->b_flags & B_VMIO) { + vm_ooffset_t foff; + int iocount = bp->b_bufsize; + + if (bp->b_vp->v_type == VBLK) + foff = (vm_ooffset_t) DEV_BSIZE * bp->b_lblkno; + else + foff = (vm_ooffset_t) bp->b_vp->v_mount->mnt_stat.f_iosize * bp->b_lblkno; + + for (i = 0; i < bp->b_npages; i++) { + vm_page_t m = bp->b_pages[i]; + int resid = IDX_TO_OFF(m->pindex + 1) - foff; + + if (resid > iocount) + resid = iocount; + if (resid > 0) { + vm_page_set_validclean(m, + ((vm_offset_t) foff & PAGE_MASK), resid); + } + foff += resid; + iocount -= resid; + } + } +} + +void +vfs_bio_clrbuf(struct buf *bp) { + int i; + if( bp->b_flags & B_VMIO) { + if( (bp->b_npages == 1) && (bp->b_bufsize < PAGE_SIZE)) { + int mask; + mask = 0; + for(i=0;i<bp->b_bufsize;i+=DEV_BSIZE) + mask |= (1 << (i/DEV_BSIZE)); + if( bp->b_pages[0]->valid != mask) { + bzero(bp->b_data, bp->b_bufsize); + } + bp->b_pages[0]->valid = mask; + bp->b_resid = 0; + return; + } + for(i=0;i<bp->b_npages;i++) { + if( bp->b_pages[i]->valid == VM_PAGE_BITS_ALL) + continue; + if( bp->b_pages[i]->valid == 0) { + if ((bp->b_pages[i]->flags & PG_ZERO) == 0) { + bzero(bp->b_data + (i << PAGE_SHIFT), PAGE_SIZE); + } + } else { + int j; + for(j=0;j<PAGE_SIZE/DEV_BSIZE;j++) { + if( (bp->b_pages[i]->valid & (1<<j)) == 0) + bzero(bp->b_data + (i << PAGE_SHIFT) + j * DEV_BSIZE, DEV_BSIZE); + } + } + /* bp->b_pages[i]->valid = VM_PAGE_BITS_ALL; */ + } + bp->b_resid = 0; + } else { + clrbuf(bp); + } +} + +/* + * vm_hold_load_pages and vm_hold_unload pages get pages into + * a buffers address space. The pages are anonymous and are + * not associated with a file object. 
+ */ +void +vm_hold_load_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index; + + to = round_page(to); + from = round_page(from); + index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { + +tryagain: + + p = vm_page_alloc(kernel_object, ((pg - VM_MIN_KERNEL_ADDRESS) >> PAGE_SHIFT), + VM_ALLOC_NORMAL); + if (!p) { + VM_WAIT; + goto tryagain; + } + vm_page_wire(p); + pmap_kenter(pg, VM_PAGE_TO_PHYS(p)); + bp->b_pages[index] = p; + PAGE_WAKEUP(p); + } + bp->b_npages = to >> PAGE_SHIFT; +} + +void +vm_hold_free_pages(struct buf * bp, vm_offset_t from, vm_offset_t to) +{ + vm_offset_t pg; + vm_page_t p; + int index; + + from = round_page(from); + to = round_page(to); + index = (from - trunc_page(bp->b_data)) >> PAGE_SHIFT; + + for (pg = from; pg < to; pg += PAGE_SIZE, index++) { + p = bp->b_pages[index]; + if (p && (index < bp->b_npages)) { + if (p->busy) { + printf("vm_hold_free_pages: blkno: %d, lblkno: %d\n", + bp->b_blkno, bp->b_lblkno); + } + bp->b_pages[index] = NULL; + pmap_kremove(pg); + vm_page_unwire(p); + vm_page_free(p); } - splx(s); - printf("%s: total-%d", bname[i], count); - for (j = 0; j <= MAXBSIZE/CLBYTES; j++) - if (counts[j] != 0) - printf(", %d-%d", j * CLBYTES, counts[j]); - printf("\n"); } + bp->b_npages = from >> PAGE_SHIFT; } -#endif /* DIAGNOSTIC */ diff --git a/sys/kern/vfs_cache.c b/sys/kern/vfs_cache.c index c20966b..ef0f222 100644 --- a/sys/kern/vfs_cache.c +++ b/sys/kern/vfs_cache.c @@ -33,13 +33,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * from: vfs_cache.c,v 1.11 1995/03/12 02:01:20 phk Exp $ - * * @(#)vfs_cache.c 8.5 (Berkeley) 3/22/95 + * $Id: vfs_cache.c,v 1.23 1997/02/22 09:39:31 peter Exp $ */ #include <sys/param.h> #include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/sysctl.h> #include <sys/time.h> #include <sys/mount.h> #include <sys/vnode.h> @@ -47,6 +48,8 @@ #include <sys/errno.h> #include <sys/malloc.h> +#define MAXVNODEUSE 32 + /* * Name caching works as follows: * @@ -72,14 +75,24 @@ * Structures associated with name cacheing. */ #define NCHHASH(dvp, cnp) \ - (&nchashtbl[((dvp)->v_id + (cnp)->cn_hash) & nchash]) -LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ -u_long nchash; /* size of hash table - 1 */ -long numcache; /* number of cache entries allocated */ -TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */ + (&nchashtbl[((dvp)->v_id + (cnp)->cn_hash) % nchash]) +static LIST_HEAD(nchashhead, namecache) *nchashtbl; /* Hash Table */ +static u_long nchash; /* size of hash table */ +static u_long numcache; /* number of cache entries allocated */ +static TAILQ_HEAD(, namecache) nclruhead; /* LRU chain */ struct nchstats nchstats; /* cache effectiveness statistics */ -int doingcache = 1; /* 1 => enable the cache */ +static int doingcache = 1; /* 1 => enable the cache */ +SYSCTL_INT(_debug, OID_AUTO, vfscache, CTLFLAG_RW, &doingcache, 0, ""); + +#ifdef NCH_STATISTICS +u_long nchnbr; +#define NCHNBR(ncp) (ncp)->nc_nbr = ++nchnbr; +#define NCHHIT(ncp) (ncp)->nc_hits++ +#else +#define NCHNBR(ncp) +#define NCHHIT(ncp) +#endif /* * Delete an entry from its hash list and move it to the front @@ -100,13 +113,14 @@ int doingcache = 1; /* 1 => enable the cache */ if (ncp->nc_lru.tqe_next != 0) { \ TAILQ_REMOVE(&nclruhead, ncp, nc_lru); \ TAILQ_INSERT_TAIL(&nclruhead, ncp, nc_lru); \ + NCHNBR(ncp); \ } \ } /* - * Lookup an entry in the cache + * Lookup an entry in the cache * - * We don't do this if the segment name is long, simply so the cache + * We don't do this if the 
segment name is long, simply so the cache * can avoid holding long names (which would either waste space, or * add greatly to the complexity). * @@ -160,18 +174,22 @@ cache_lookup(dvp, vpp, cnp) return (0); } + NCHHIT(ncp); + /* We don't want to have an entry, so dump it */ if ((cnp->cn_flags & MAKEENTRY) == 0) { nchstats.ncs_badhits++; PURGE(ncp); return (0); - } + } /* We found a "positive" match, return the vnode */ if (ncp->nc_vp) { nchstats.ncs_goodhits++; TOUCH(ncp); *vpp = ncp->nc_vp; + if ((*vpp)->v_usage < MAXVNODEUSE) + (*vpp)->v_usage++; return (-1); } @@ -207,10 +225,10 @@ cache_enter(dvp, vp, cnp) if (!doingcache) return; -#ifdef DIAGNOSTIC - if (cnp->cn_namelen > NCHNAMLEN) - panic("cache_enter: name too long"); -#endif + if (cnp->cn_namelen > NCHNAMLEN) { + printf("cache_enter: name too long"); + return; + } /* * We allocate a new entry if we are less than the maximum @@ -244,9 +262,11 @@ cache_enter(dvp, vp, cnp) * otherwise unused. */ ncp->nc_vp = vp; - if (vp) + if (vp) { ncp->nc_vpid = vp->v_id; - else + if (vp->v_usage < MAXVNODEUSE) + ++vp->v_usage; + } else ncp->nc_vpid = cnp->cn_flags & ISWHITEOUT; ncp->nc_dvp = dvp; ncp->nc_dvpid = dvp->v_id; @@ -265,14 +285,14 @@ nchinit() { TAILQ_INIT(&nclruhead); - nchashtbl = hashinit(desiredvnodes, M_CACHE, &nchash); + nchashtbl = phashinit(desiredvnodes, M_CACHE, &nchash); } /* - * Invalidate a all entries to particular vnode. - * - * We actually just increment the v_id, that will do it. The entries will - * be purged by lookup as they get found. If the v_id wraps around, we + * Invalidate all entries to particular vnode. + * + * We actually just increment the v_id, that will do it. The stale entries + * will be purged by lookup as they get found. If the v_id wraps around, we * need to ditch the entire cache, to avoid confusion. No valid vnode will * ever have (v_id == 0). 
*/ @@ -282,11 +302,12 @@ cache_purge(vp) { struct namecache *ncp; struct nchashhead *ncpp; + static u_long nextvnodeid; vp->v_id = ++nextvnodeid; if (nextvnodeid != 0) return; - for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { + for (ncpp = &nchashtbl[nchash - 1]; ncpp >= nchashtbl; ncpp--) { while (ncp = ncpp->lh_first) PURGE(ncp); } @@ -297,7 +318,7 @@ cache_purge(vp) * Flush all entries referencing a particular filesystem. * * Since we need to check it anyway, we will flush all the invalid - * entriess at the same time. + * entries at the same time. */ void cache_purgevfs(mp) @@ -307,7 +328,7 @@ cache_purgevfs(mp) struct namecache *ncp, *nnp; /* Scan hash tables for applicable entries */ - for (ncpp = &nchashtbl[nchash]; ncpp >= nchashtbl; ncpp--) { + for (ncpp = &nchashtbl[nchash - 1]; ncpp >= nchashtbl; ncpp--) { for (ncp = ncpp->lh_first; ncp != 0; ncp = nnp) { nnp = ncp->nc_hash.le_next; if (ncp->nc_dvpid != ncp->nc_dvp->v_id || diff --git a/sys/kern/vfs_cluster.c b/sys/kern/vfs_cluster.c index e01d24f..b00da1f 100644 --- a/sys/kern/vfs_cluster.c +++ b/sys/kern/vfs_cluster.c @@ -1,6 +1,8 @@ /*- * Copyright (c) 1993 * The Regents of the University of California. All rights reserved. + * Modifications/enhancements: + * Copyright (c) 1995 John S. Dyson. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -30,233 +32,281 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)vfs_cluster.c 8.10 (Berkeley) 3/28/95 + * @(#)vfs_cluster.c 8.7 (Berkeley) 2/13/94 + * $Id: vfs_cluster.c,v 1.42 1997/02/22 09:39:31 peter Exp $ */ #include <sys/param.h> +#include <sys/systm.h> #include <sys/proc.h> #include <sys/buf.h> #include <sys/vnode.h> #include <sys/mount.h> -#include <sys/trace.h> #include <sys/malloc.h> #include <sys/resourcevar.h> -#include <libkern/libkern.h> +#include <sys/vmmeter.h> +#include <miscfs/specfs/specdev.h> +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_prot.h> +#include <vm/vm_object.h> +#include <vm/vm_page.h> + +#if defined(CLUSTERDEBUG) +#include <sys/sysctl.h> +#include <sys/kernel.h> +static int rcluster= 0; +SYSCTL_INT(_debug, 14, rcluster, CTLFLAG_RW, &rcluster, 0, ""); +#endif -/* - * Local declarations - */ -struct buf *cluster_newbuf __P((struct vnode *, struct buf *, long, daddr_t, - daddr_t, long, int)); -struct buf *cluster_rbuild __P((struct vnode *, u_quad_t, struct buf *, - daddr_t, daddr_t, long, int, long)); -void cluster_wbuild __P((struct vnode *, struct buf *, long, - daddr_t, int, daddr_t)); -struct cluster_save *cluster_collectbufs __P((struct vnode *, struct buf *)); +#ifdef notyet_block_reallocation_enabled +#ifdef DEBUG +#include <sys/sysctl.h> +#include <sys/kernel.h> -#ifdef DIAGNOSTIC -/* - * Set to 1 if reads of block zero should cause readahead to be done. - * Set to 0 treats a read of block zero as a non-sequential read. - * - * Setting to one assumes that most reads of block zero of files are due to - * sequential passes over the files (e.g. cat, sum) where additional blocks - * will soon be needed. Setting to zero assumes that the majority are - * surgical strikes to get particular info (e.g. size, file) where readahead - * blocks will not be used and, in fact, push out other potentially useful - * blocks from the cache. The former seems intuitive, but some quick tests - * showed that the latter performed better from a system-wide point of view. 
- */ -int doclusterraz = 0; -#define ISSEQREAD(vp, blk) \ - (((blk) != 0 || doclusterraz) && \ - ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) +static int doreallocblks = 0; +SYSCTL_INT(_debug, 13, doreallocblks, CTLFLAG_RW, &doreallocblks, 0, ""); #else -#define ISSEQREAD(vp, blk) \ - ((blk) != 0 && ((blk) == (vp)->v_lastr + 1 || (blk) == (vp)->v_lastr)) +#define doreallocblks 0 #endif +#endif /* notyet_block_reallocation_enabled */ + +#ifdef notyet_block_reallocation_enabled +static struct cluster_save * + cluster_collectbufs __P((struct vnode *vp, struct buf *last_bp)); +#endif +static struct buf * + cluster_rbuild __P((struct vnode *vp, u_quad_t filesize, daddr_t lbn, + daddr_t blkno, long size, int run, struct buf *fbp)); + +extern vm_page_t bogus_page; /* - * This replaces bread. If this is a bread at the beginning of a file and - * lastr is 0, we assume this is the first read and we'll read up to two - * blocks if they are sequential. After that, we'll do regular read ahead - * in clustered chunks. - * - * There are 4 or 5 cases depending on how you count: - * Desired block is in the cache: - * 1 Not sequential access (0 I/Os). - * 2 Access is sequential, do read-ahead (1 ASYNC). - * Desired block is not in cache: - * 3 Not sequential access (1 SYNC). - * 4 Sequential access, next block is contiguous (1 SYNC). - * 5 Sequential access, next block is not contiguous (1 SYNC, 1 ASYNC) - * - * There are potentially two buffers that require I/O. - * bp is the block requested. - * rbp is the read-ahead block. - * If either is NULL, then you don't have to do the I/O. + * Maximum number of blocks for read-ahead. */ -cluster_read(vp, filesize, lblkno, size, cred, bpp) +#define MAXRA 32 + +/* + * This replaces bread. 
+ */ +int +cluster_read(vp, filesize, lblkno, size, cred, totread, seqcount, bpp) struct vnode *vp; u_quad_t filesize; daddr_t lblkno; long size; struct ucred *cred; + long totread; + int seqcount; struct buf **bpp; { - struct buf *bp, *rbp; - daddr_t blkno, ioblkno; - long flags; - int error, num_ra, alreadyincore; - -#ifdef DIAGNOSTIC - if (size == 0) - panic("cluster_read: size = 0"); -#endif + struct buf *bp, *rbp, *reqbp; + daddr_t blkno, rablkno, origblkno; + int error, num_ra; + int i; + int maxra, racluster; + long origtotread; error = 0; - flags = B_READ; - *bpp = bp = getblk(vp, lblkno, size, 0, 0); - if (bp->b_flags & B_CACHE) { - /* - * Desired block is in cache; do any readahead ASYNC. - * Case 1, 2. - */ - trace(TR_BREADHIT, pack(vp, size), lblkno); - flags |= B_ASYNC; - ioblkno = lblkno + (vp->v_ralen ? vp->v_ralen : 1); - alreadyincore = incore(vp, ioblkno) != NULL; - bp = NULL; - } else { - /* Block wasn't in cache, case 3, 4, 5. */ - trace(TR_BREADMISS, pack(vp, size), lblkno); - bp->b_flags |= B_READ; - ioblkno = lblkno; - alreadyincore = 0; - curproc->p_stats->p_ru.ru_inblock++; /* XXX */ - } + /* - * XXX - * Replace 1 with a window size based on some permutation of - * maxcontig and rot_delay. This will let you figure out how - * many blocks you should read-ahead (case 2, 4, 5). - * - * If the access isn't sequential, reset the window to 1. - * Note that a read to the same block is considered sequential. - * This catches the case where the file is being read sequentially, - * but at smaller than the filesystem block size. + * Try to limit the amount of read-ahead by a few + * ad-hoc parameters. This needs work!!! */ - rbp = NULL; - if (!ISSEQREAD(vp, lblkno)) { - vp->v_ralen = 0; - vp->v_maxra = lblkno; - } else if ((ioblkno + 1) * size <= filesize && !alreadyincore && - !(error = VOP_BMAP(vp, ioblkno, NULL, &blkno, &num_ra)) && - blkno != -1) { - /* - * Reading sequentially, and the next block is not in the - * cache. 
We are going to try reading ahead. - */ - if (num_ra) { - /* - * If our desired readahead block had been read - * in a previous readahead but is no longer in - * core, then we may be reading ahead too far - * or are not using our readahead very rapidly. - * In this case we scale back the window. - */ - if (!alreadyincore && ioblkno <= vp->v_maxra) - vp->v_ralen = max(vp->v_ralen >> 1, 1); - /* - * There are more sequential blocks than our current - * window allows, scale up. Ideally we want to get - * in sync with the filesystem maxcontig value. - */ - else if (num_ra > vp->v_ralen && lblkno != vp->v_lastr) - vp->v_ralen = vp->v_ralen ? - min(num_ra, vp->v_ralen << 1) : 1; + racluster = MAXPHYS/size; + maxra = 2 * racluster + (totread / size); + if (maxra > MAXRA) + maxra = MAXRA; + if (maxra > nbuf/8) + maxra = nbuf/8; - if (num_ra > vp->v_ralen) - num_ra = vp->v_ralen; - } + /* + * get the requested block + */ + *bpp = reqbp = bp = getblk(vp, lblkno, size, 0, 0); + origblkno = lblkno; + origtotread = totread; - if (num_ra) /* case 2, 4 */ - rbp = cluster_rbuild(vp, filesize, - bp, ioblkno, blkno, size, num_ra, flags); - else if (ioblkno == lblkno) { - bp->b_blkno = blkno; - /* Case 5: check how many blocks to read ahead */ - ++ioblkno; - if ((ioblkno + 1) * size > filesize || - incore(vp, ioblkno) || (error = VOP_BMAP(vp, - ioblkno, NULL, &blkno, &num_ra)) || blkno == -1) - goto skip_readahead; + /* + * if it is in the cache, then check to see if the reads have been + * sequential. If they have, then try some read-ahead, otherwise + * back-off on prospective read-aheads. + */ + if (bp->b_flags & B_CACHE) { + if (!seqcount) { + return 0; + } else if ((bp->b_flags & B_RAM) == 0) { + return 0; + } else { + int s; + struct buf *tbp; + bp->b_flags &= ~B_RAM; /* - * Adjust readahead as above. - * Don't check alreadyincore, we know it is 0 from - * the previous conditional. 
+ * We do the spl here so that there is no window + * between the incore and the b_usecount increment + * below. We opt to keep the spl out of the loop + * for efficiency. */ - if (num_ra) { - if (ioblkno <= vp->v_maxra) - vp->v_ralen = max(vp->v_ralen >> 1, 1); - else if (num_ra > vp->v_ralen && - lblkno != vp->v_lastr) - vp->v_ralen = vp->v_ralen ? - min(num_ra,vp->v_ralen<<1) : 1; - if (num_ra > vp->v_ralen) - num_ra = vp->v_ralen; + s = splbio(); + for(i=1;i<maxra;i++) { + + if (!(tbp = incore(vp, lblkno+i))) { + break; + } + + /* + * Set another read-ahead mark so we know to check + * again. + */ + if (((i % racluster) == (racluster - 1)) || + (i == (maxra - 1))) + tbp->b_flags |= B_RAM; + +#if 0 + if (tbp->b_usecount == 0) { + /* + * Make sure that the soon-to-be used readaheads + * are still there. The getblk/bqrelse pair will + * boost the priority of the buffer. + */ + tbp = getblk(vp, lblkno+i, size, 0, 0); + bqrelse(tbp); + } +#endif } - flags |= B_ASYNC; - if (num_ra) - rbp = cluster_rbuild(vp, filesize, - NULL, ioblkno, blkno, size, num_ra, flags); - else { - rbp = getblk(vp, ioblkno, size, 0, 0); - rbp->b_flags |= flags; - rbp->b_blkno = blkno; + splx(s); + if (i >= maxra) { + return 0; } + lblkno += i; + } + reqbp = bp = NULL; + } else { + u_quad_t firstread; + firstread = (u_quad_t) lblkno * size; + if (firstread + totread > filesize) + totread = filesize - firstread; + if (totread > size) { + int nblks = 0; + int ncontigafter; + while (totread > 0) { + nblks++; + totread -= size; + } + if (nblks == 1) + goto single_block_read; + if (nblks > racluster) + nblks = racluster; + + error = VOP_BMAP(vp, lblkno, NULL, + &blkno, &ncontigafter, NULL); + if (error) + goto single_block_read; + if (blkno == -1) + goto single_block_read; + if (ncontigafter == 0) + goto single_block_read; + if (ncontigafter + 1 < nblks) + nblks = ncontigafter + 1; + + bp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, nblks, bp); + lblkno += nblks; } else { - /* case 2; 
read ahead single block */ - rbp = getblk(vp, ioblkno, size, 0, 0); - rbp->b_flags |= flags; - rbp->b_blkno = blkno; +single_block_read: + /* + * if it isn't in the cache, then get a chunk from + * disk if sequential, otherwise just get the block. + */ + bp->b_flags |= B_READ | B_RAM; + lblkno += 1; } + } - if (rbp == bp) /* case 4 */ - rbp = NULL; - else if (rbp) { /* case 2, 5 */ - trace(TR_BREADMISSRA, - pack(vp, (num_ra + 1) * size), ioblkno); - curproc->p_stats->p_ru.ru_inblock++; /* XXX */ + /* + * if we have been doing sequential I/O, then do some read-ahead + */ + rbp = NULL; + /* if (seqcount && (lblkno < (origblkno + maxra))) { */ + if (seqcount && (lblkno < (origblkno + seqcount))) { + /* + * we now build the read-ahead buffer if it is desirable. + */ + if (((u_quad_t)(lblkno + 1) * size) <= filesize && + !(error = VOP_BMAP(vp, lblkno, NULL, &blkno, &num_ra, NULL)) && + blkno != -1) { + int nblksread; + int ntoread = num_ra + 1; + nblksread = (origtotread + size - 1) / size; + if (seqcount < nblksread) + seqcount = nblksread; + if (seqcount < ntoread) + ntoread = seqcount; + if (num_ra) { + rbp = cluster_rbuild(vp, filesize, lblkno, + blkno, size, ntoread, NULL); + } else { + rbp = getblk(vp, lblkno, size, 0, 0); + rbp->b_flags |= B_READ | B_ASYNC | B_RAM; + rbp->b_blkno = blkno; + } } } - /* XXX Kirk, do we need to make sure the bp has creds? 
*/ -skip_readahead: - if (bp) - if (bp->b_flags & (B_DONE | B_DELWRI)) + /* + * handle the synchronous read + */ + if (bp) { + if (bp->b_flags & (B_DONE | B_DELWRI)) { panic("cluster_read: DONE bp"); - else + } else { +#if defined(CLUSTERDEBUG) + if (rcluster) + printf("S(%d,%d,%d) ", + bp->b_lblkno, bp->b_bcount, seqcount); +#endif + if ((bp->b_flags & B_CLUSTER) == 0) + vfs_busy_pages(bp, 0); error = VOP_STRATEGY(bp); - - if (rbp) - if (error || rbp->b_flags & (B_DONE | B_DELWRI)) { + curproc->p_stats->p_ru.ru_inblock++; + } + } + /* + * and if we have read-aheads, do them too + */ + if (rbp) { + if (error) { rbp->b_flags &= ~(B_ASYNC | B_READ); brelse(rbp); - } else - (void) VOP_STRATEGY(rbp); + } else if (rbp->b_flags & B_CACHE) { + rbp->b_flags &= ~(B_ASYNC | B_READ); + bqrelse(rbp); + } else { +#if defined(CLUSTERDEBUG) + if (rcluster) { + if (bp) + printf("A+(%d,%d,%d,%d) ", + rbp->b_lblkno, rbp->b_bcount, + rbp->b_lblkno - origblkno, + seqcount); + else + printf("A(%d,%d,%d,%d) ", + rbp->b_lblkno, rbp->b_bcount, + rbp->b_lblkno - origblkno, + seqcount); + } +#endif - /* - * Recalculate our maximum readahead - */ - if (rbp == NULL) - rbp = bp; - if (rbp) - vp->v_maxra = rbp->b_lblkno + (rbp->b_bufsize / size) - 1; - - if (bp) - return(biowait(bp)); - return(error); + if ((rbp->b_flags & B_CLUSTER) == 0) + vfs_busy_pages(rbp, 0); + (void) VOP_STRATEGY(rbp); + curproc->p_stats->p_ru.ru_inblock++; + } + } + if (reqbp) + return (biowait(reqbp)); + else + return (error); } /* @@ -264,145 +314,139 @@ skip_readahead: * read ahead. We will read as many blocks as possible sequentially * and then parcel them up into logical blocks in the buffer hash table. 
*/ -struct buf * -cluster_rbuild(vp, filesize, bp, lbn, blkno, size, run, flags) +static struct buf * +cluster_rbuild(vp, filesize, lbn, blkno, size, run, fbp) struct vnode *vp; u_quad_t filesize; - struct buf *bp; daddr_t lbn; daddr_t blkno; long size; int run; - long flags; + struct buf *fbp; { - struct cluster_save *b_save; - struct buf *tbp; + struct buf *bp, *tbp; daddr_t bn; - int i, inc; + int i, inc, j; #ifdef DIAGNOSTIC if (size != vp->v_mount->mnt_stat.f_iosize) panic("cluster_rbuild: size %d != filesize %d\n", - size, vp->v_mount->mnt_stat.f_iosize); + size, vp->v_mount->mnt_stat.f_iosize); #endif - if (size * (lbn + run + 1) > filesize) + /* + * avoid a division + */ + while ((u_quad_t) size * (lbn + run) > filesize) { --run; - if (run == 0) { - if (!bp) { - bp = getblk(vp, lbn, size, 0, 0); - bp->b_blkno = blkno; - bp->b_flags |= flags; - } - return(bp); } - bp = cluster_newbuf(vp, bp, flags, blkno, lbn, size, run + 1); - if (bp->b_flags & (B_DONE | B_DELWRI)) - return (bp); + if (fbp) { + tbp = fbp; + tbp->b_flags |= B_READ; + } else { + tbp = getblk(vp, lbn, size, 0, 0); + if (tbp->b_flags & B_CACHE) + return tbp; + tbp->b_flags |= B_ASYNC | B_READ | B_RAM; + } + + tbp->b_blkno = blkno; + if( (tbp->b_flags & B_MALLOC) || + ((tbp->b_flags & B_VMIO) == 0) || (run <= 1) ) + return tbp; - b_save = malloc(sizeof(struct buf *) * run + sizeof(struct cluster_save), - M_SEGMENT, M_WAITOK); - b_save->bs_bufsize = b_save->bs_bcount = size; - b_save->bs_nchildren = 0; - b_save->bs_children = (struct buf **)(b_save + 1); - b_save->bs_saveaddr = bp->b_saveaddr; - bp->b_saveaddr = (caddr_t) b_save; + bp = trypbuf(); + if (bp == 0) + return tbp; + + (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK; + bp->b_flags = B_ASYNC | B_READ | B_CALL | B_BUSY | B_CLUSTER | B_VMIO; + bp->b_iodone = cluster_callback; + bp->b_blkno = blkno; + bp->b_lblkno = lbn; + pbgetvp(vp, bp); + + TAILQ_INIT(&bp->b_cluster.cluster_head); + + bp->b_bcount = 0; + 
bp->b_bufsize = 0; + bp->b_npages = 0; inc = btodb(size); - for (bn = blkno + inc, i = 1; i <= run; ++i, bn += inc) { - /* - * A component of the cluster is already in core, - * terminate the cluster early. - */ - if (incore(vp, lbn + i)) - break; - tbp = getblk(vp, lbn + i, 0, 0, 0); - /* - * getblk may return some memory in the buffer if there were - * no empty buffers to shed it to. If there is currently - * memory in the buffer, we move it down size bytes to make - * room for the valid pages that cluster_callback will insert. - * We do this now so we don't have to do it at interrupt time - * in the callback routine. - */ - if (tbp->b_bufsize != 0) { - caddr_t bdata = (char *)tbp->b_data; + for (bn = blkno, i = 0; i < run; ++i, bn += inc) { + if (i != 0) { + if ((bp->b_npages * PAGE_SIZE) + + round_page(size) > MAXPHYS) + break; - /* - * No room in the buffer to add another page, - * terminate the cluster early. - */ - if (tbp->b_bufsize + size > MAXBSIZE) { -#ifdef DIAGNOSTIC - if (tbp->b_bufsize != MAXBSIZE) - panic("cluster_rbuild: too much memory"); -#endif - brelse(tbp); + if (incore(vp, lbn + i)) break; + + tbp = getblk(vp, lbn + i, size, 0, 0); + + if ((tbp->b_flags & B_CACHE) || + (tbp->b_flags & B_VMIO) == 0) { + bqrelse(tbp); + break; + } + + for (j=0;j<tbp->b_npages;j++) { + if (tbp->b_pages[j]->valid) { + break; + } } - if (tbp->b_bufsize > size) { + + if (j != tbp->b_npages) { /* - * XXX if the source and destination regions - * overlap we have to copy backward to avoid - * clobbering any valid pages (i.e. pagemove - * implementations typically can't handle - * overlap). 
+ * force buffer to be re-constituted later */ - bdata += tbp->b_bufsize; - while (bdata > (char *)tbp->b_data) { - bdata -= CLBYTES; - pagemove(bdata, bdata + size, CLBYTES); - } - } else - pagemove(bdata, bdata + size, tbp->b_bufsize); + tbp->b_flags |= B_RELBUF; + brelse(tbp); + break; + } + + if ((fbp && (i == 1)) || (i == (run - 1))) + tbp->b_flags |= B_RAM; + tbp->b_flags |= B_READ | B_ASYNC; + if (tbp->b_blkno == tbp->b_lblkno) { + tbp->b_blkno = bn; + } else if (tbp->b_blkno != bn) { + brelse(tbp); + break; + } } - tbp->b_blkno = bn; - tbp->b_flags |= flags | B_READ | B_ASYNC; - ++b_save->bs_nchildren; - b_save->bs_children[i - 1] = tbp; - } - /* - * The cluster may have been terminated early, adjust the cluster - * buffer size accordingly. If no cluster could be formed, - * deallocate the cluster save info. - */ - if (i <= run) { - if (i == 1) { - bp->b_saveaddr = b_save->bs_saveaddr; - bp->b_flags &= ~B_CALL; - bp->b_iodone = NULL; - free(b_save, M_SEGMENT); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + for (j = 0; j < tbp->b_npages; j += 1) { + vm_page_t m; + m = tbp->b_pages[j]; + ++m->busy; + ++m->object->paging_in_progress; + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages-1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + if ((m->valid & VM_PAGE_BITS_ALL) == VM_PAGE_BITS_ALL) + tbp->b_pages[j] = bogus_page; } - allocbuf(bp, size * i); + bp->b_bcount += tbp->b_bcount; + bp->b_bufsize += tbp->b_bufsize; } - return(bp); -} -/* - * Either get a new buffer or grow the existing one. 
- */ -struct buf * -cluster_newbuf(vp, bp, flags, blkno, lblkno, size, run) - struct vnode *vp; - struct buf *bp; - long flags; - daddr_t blkno; - daddr_t lblkno; - long size; - int run; -{ - if (!bp) { - bp = getblk(vp, lblkno, size, 0, 0); - if (bp->b_flags & (B_DONE | B_DELWRI)) { - bp->b_blkno = blkno; - return(bp); - } + for(j=0;j<bp->b_npages;j++) { + if ((bp->b_pages[j]->valid & VM_PAGE_BITS_ALL) == + VM_PAGE_BITS_ALL) + bp->b_pages[j] = bogus_page; } - allocbuf(bp, run * size); - bp->b_blkno = blkno; - bp->b_iodone = cluster_callback; - bp->b_flags |= flags | B_CALL; - return(bp); + if (bp->b_bufsize > bp->b_kvasize) + panic("cluster_rbuild: b_bufsize(%d) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + bp->b_kvasize = bp->b_bufsize; + + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *)bp->b_pages, bp->b_npages); + return (bp); } /* @@ -415,10 +459,7 @@ void cluster_callback(bp) struct buf *bp; { - struct cluster_save *b_save; - struct buf **bpp, *tbp; - long bsize; - caddr_t cp; + struct buf *nbp, *tbp; int error = 0; /* @@ -427,46 +468,21 @@ cluster_callback(bp) if (bp->b_flags & B_ERROR) error = bp->b_error; - b_save = (struct cluster_save *)(bp->b_saveaddr); - bp->b_saveaddr = b_save->bs_saveaddr; - - bsize = b_save->bs_bufsize; - cp = (char *)bp->b_data + bsize; + pmap_qremove(trunc_page((vm_offset_t) bp->b_data), bp->b_npages); /* * Move memory from the large cluster buffer into the component * buffers and mark IO as done on these. 
*/ - for (bpp = b_save->bs_children; b_save->bs_nchildren--; ++bpp) { - tbp = *bpp; - pagemove(cp, tbp->b_data, bsize); - tbp->b_bufsize += bsize; - tbp->b_bcount = bsize; + for (tbp = TAILQ_FIRST(&bp->b_cluster.cluster_head); + tbp; tbp = nbp) { + nbp = TAILQ_NEXT(&tbp->b_cluster, cluster_entry); if (error) { tbp->b_flags |= B_ERROR; tbp->b_error = error; } biodone(tbp); - bp->b_bufsize -= bsize; - cp += bsize; - } - /* - * If there was excess memory in the cluster buffer, - * slide it up adjacent to the remaining valid data. - */ - if (bp->b_bufsize != bsize) { - if (bp->b_bufsize < bsize) - panic("cluster_callback: too little memory"); - pagemove(cp, (char *)bp->b_data + bsize, bp->b_bufsize - bsize); - } - bp->b_bcount = bsize; - bp->b_iodone = NULL; - free(b_save, M_SEGMENT); - if (bp->b_flags & B_ASYNC) - brelse(bp); - else { - bp->b_flags &= ~B_WANTED; - wakeup((caddr_t)bp); } + relpbuf(bp); } /* @@ -481,38 +497,53 @@ cluster_callback(bp) */ void cluster_write(bp, filesize) - struct buf *bp; + struct buf *bp; u_quad_t filesize; { - struct vnode *vp; - daddr_t lbn; - int maxclen, cursize; + struct vnode *vp; + daddr_t lbn; + int maxclen, cursize; + int lblocksize; + int async; - vp = bp->b_vp; - lbn = bp->b_lblkno; + vp = bp->b_vp; + async = vp->v_mount->mnt_flag & MNT_ASYNC; + lblocksize = vp->v_mount->mnt_stat.f_iosize; + lbn = bp->b_lblkno; /* Initialize vnode to beginning of file. */ if (lbn == 0) vp->v_lasta = vp->v_clen = vp->v_cstart = vp->v_lastw = 0; - if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || - (bp->b_blkno != vp->v_lasta + btodb(bp->b_bcount))) { - maxclen = MAXBSIZE / vp->v_mount->mnt_stat.f_iosize - 1; + if (vp->v_clen == 0 || lbn != vp->v_lastw + 1 || + (bp->b_blkno != vp->v_lasta + btodb(lblocksize))) { + maxclen = MAXPHYS / lblocksize - 1; if (vp->v_clen != 0) { /* * Next block is not sequential. 
* * If we are not writing at end of file, the process - * seeked to another point in the file since its - * last write, or we have reached our maximum - * cluster size, then push the previous cluster. - * Otherwise try reallocating to make it sequential. + * seeked to another point in the file since its last + * write, or we have reached our maximum cluster size, + * then push the previous cluster. Otherwise try + * reallocating to make it sequential. */ cursize = vp->v_lastw - vp->v_cstart + 1; - if ((lbn + 1) * bp->b_bcount != filesize || +#ifndef notyet_block_reallocation_enabled + if (((u_quad_t)(lbn + 1) * lblocksize) != filesize || + lbn != vp->v_lastw + 1 || + vp->v_clen <= cursize) { + if (!async) + cluster_wbuild(vp, lblocksize, + vp->v_cstart, cursize); + } +#else + if (!doreallocblks || + (lbn + 1) * lblocksize != filesize || lbn != vp->v_lastw + 1 || vp->v_clen <= cursize) { - cluster_wbuild(vp, NULL, bp->b_bcount, - vp->v_cstart, cursize, lbn); + if (!async) + cluster_wbuild(vp, lblocksize, + vp->v_cstart, cursize); } else { struct buf **bpp, **endbp; struct cluster_save *buflist; @@ -528,8 +559,8 @@ cluster_write(bp, filesize) bpp < endbp; bpp++) brelse(*bpp); free(buflist, M_SEGMENT); - cluster_wbuild(vp, NULL, bp->b_bcount, - vp->v_cstart, cursize, lbn); + cluster_wbuild(vp, lblocksize, + vp->v_cstart, cursize); } else { /* * Succeeded, keep building cluster. @@ -543,14 +574,16 @@ cluster_write(bp, filesize) return; } } +#endif /* notyet_block_reallocation_enabled */ } /* - * Consider beginning a cluster. - * If at end of file, make cluster as large as possible, - * otherwise find size of existing cluster. + * Consider beginning a cluster. If at end of file, make + * cluster as large as possible, otherwise find size of + * existing cluster. 
*/ - if ((lbn + 1) * bp->b_bcount != filesize && - (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen) || + if (((u_quad_t) (lbn + 1) * lblocksize) != filesize && + (bp->b_blkno == bp->b_lblkno) && + (VOP_BMAP(vp, lbn, NULL, &bp->b_blkno, &maxclen, NULL) || bp->b_blkno == -1)) { bawrite(bp); vp->v_clen = 0; @@ -559,26 +592,25 @@ cluster_write(bp, filesize) vp->v_lastw = lbn; return; } - vp->v_clen = maxclen; - if (maxclen == 0) { /* I/O not contiguous */ + vp->v_clen = maxclen; + if (!async && maxclen == 0) { /* I/O not contiguous */ vp->v_cstart = lbn + 1; - bawrite(bp); - } else { /* Wait for rest of cluster */ + bawrite(bp); + } else { /* Wait for rest of cluster */ vp->v_cstart = lbn; - bdwrite(bp); + bdwrite(bp); } } else if (lbn == vp->v_cstart + vp->v_clen) { /* * At end of cluster, write it out. */ - cluster_wbuild(vp, bp, bp->b_bcount, vp->v_cstart, - vp->v_clen + 1, lbn); + bdwrite(bp); + cluster_wbuild(vp, lblocksize, vp->v_cstart, vp->v_clen + 1); vp->v_clen = 0; vp->v_cstart = lbn + 1; } else /* - * In the middle of a cluster, so just delay the - * I/O for now. + * In the middle of a cluster, so just delay the I/O for now. */ bdwrite(bp); vp->v_lastw = lbn; @@ -592,165 +624,168 @@ cluster_write(bp, filesize) * performed. Check to see that it doesn't fall in the middle of * the current block (if last_bp == NULL). 
*/ -void -cluster_wbuild(vp, last_bp, size, start_lbn, len, lbn) +int +cluster_wbuild(vp, size, start_lbn, len) struct vnode *vp; - struct buf *last_bp; long size; daddr_t start_lbn; int len; - daddr_t lbn; { - struct cluster_save *b_save; struct buf *bp, *tbp; - caddr_t cp; - int i, s; - -#ifdef DIAGNOSTIC - if (size != vp->v_mount->mnt_stat.f_iosize) - panic("cluster_wbuild: size %d != filesize %d\n", - size, vp->v_mount->mnt_stat.f_iosize); -#endif -redo: - while ((!incore(vp, start_lbn) || start_lbn == lbn) && len) { - ++start_lbn; - --len; - } - - /* Get more memory for current buffer */ - if (len <= 1) { - if (last_bp) { - bawrite(last_bp); - } else if (len) { - bp = getblk(vp, start_lbn, size, 0, 0); - bawrite(bp); + int i, j, s; + int totalwritten = 0; + int dbsize = btodb(size); + while (len > 0) { + s = splbio(); + if ( ((tbp = gbincore(vp, start_lbn)) == NULL) || + ((tbp->b_flags & (B_INVAL|B_BUSY|B_DELWRI)) != B_DELWRI)) { + ++start_lbn; + --len; + splx(s); + continue; } - return; - } - - bp = getblk(vp, start_lbn, size, 0, 0); - if (!(bp->b_flags & B_DELWRI)) { - ++start_lbn; - --len; - brelse(bp); - goto redo; - } + bremfree(tbp); + tbp->b_flags |= B_BUSY; + tbp->b_flags &= ~B_DONE; + splx(s); /* - * Extra memory in the buffer, punt on this buffer. - * XXX we could handle this in most cases, but we would have to - * push the extra memory down to after our max possible cluster - * size and then potentially pull it back up if the cluster was - * terminated prematurely--too much hassle. + * Extra memory in the buffer, punt on this buffer. XXX we could + * handle this in most cases, but we would have to push the extra + * memory down to after our max possible cluster size and then + * potentially pull it back up if the cluster was terminated + * prematurely--too much hassle. 
*/ - if (bp->b_bcount != bp->b_bufsize) { - ++start_lbn; - --len; - bawrite(bp); - goto redo; - } + if (((tbp->b_flags & (B_CLUSTEROK|B_MALLOC)) != B_CLUSTEROK) || + (tbp->b_bcount != tbp->b_bufsize) || + (tbp->b_bcount != size) || + len == 1) { + totalwritten += tbp->b_bufsize; + bawrite(tbp); + ++start_lbn; + --len; + continue; + } - --len; - b_save = malloc(sizeof(struct buf *) * len + sizeof(struct cluster_save), - M_SEGMENT, M_WAITOK); - b_save->bs_bcount = bp->b_bcount; - b_save->bs_bufsize = bp->b_bufsize; - b_save->bs_nchildren = 0; - b_save->bs_children = (struct buf **)(b_save + 1); - b_save->bs_saveaddr = bp->b_saveaddr; - bp->b_saveaddr = (caddr_t) b_save; - - bp->b_flags |= B_CALL; - bp->b_iodone = cluster_callback; - cp = (char *)bp->b_data + size; - for (++start_lbn, i = 0; i < len; ++i, ++start_lbn) { - /* - * Block is not in core or the non-sequential block - * ending our cluster was part of the cluster (in which - * case we don't want to write it twice). - */ - if (!incore(vp, start_lbn) || - last_bp == NULL && start_lbn == lbn) - break; + bp = trypbuf(); + if (bp == NULL) { + totalwritten += tbp->b_bufsize; + bawrite(tbp); + ++start_lbn; + --len; + continue; + } - /* - * Get the desired block buffer (unless it is the final - * sequential block whose buffer was passed in explictly - * as last_bp). 
- */ - if (last_bp == NULL || start_lbn != lbn) { - tbp = getblk(vp, start_lbn, size, 0, 0); - if (!(tbp->b_flags & B_DELWRI)) { - brelse(tbp); - break; - } - } else - tbp = last_bp; - - ++b_save->bs_nchildren; - - /* Move memory from children to parent */ - if (tbp->b_blkno != (bp->b_blkno + btodb(bp->b_bufsize))) { - printf("Clustered Block: %d addr %x bufsize: %d\n", - bp->b_lblkno, bp->b_blkno, bp->b_bufsize); - printf("Child Block: %d addr: %x\n", tbp->b_lblkno, - tbp->b_blkno); - panic("Clustered write to wrong blocks"); + TAILQ_INIT(&bp->b_cluster.cluster_head); + bp->b_bcount = 0; + bp->b_bufsize = 0; + bp->b_npages = 0; + if (tbp->b_wcred != NOCRED) { + bp->b_wcred = tbp->b_wcred; + crhold(bp->b_wcred); } - pagemove(tbp->b_data, cp, size); - bp->b_bcount += size; - bp->b_bufsize += size; + bp->b_blkno = tbp->b_blkno; + bp->b_lblkno = tbp->b_lblkno; + (vm_offset_t) bp->b_data |= ((vm_offset_t) tbp->b_data) & PAGE_MASK; + bp->b_flags |= B_CALL | B_BUSY | B_CLUSTER | (tbp->b_flags & (B_VMIO|B_NEEDCOMMIT)); + bp->b_iodone = cluster_callback; + pbgetvp(vp, bp); + + for (i = 0; i < len; ++i, ++start_lbn) { + if (i != 0) { + s = splbio(); + if ((tbp = gbincore(vp, start_lbn)) == NULL) { + splx(s); + break; + } - tbp->b_bufsize -= size; - tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); - tbp->b_flags |= (B_ASYNC | B_AGE); - s = splbio(); - reassignbuf(tbp, tbp->b_vp); /* put on clean list */ - ++tbp->b_vp->v_numoutput; - splx(s); - b_save->bs_children[i] = tbp; + if ((tbp->b_flags & (B_VMIO|B_CLUSTEROK|B_INVAL|B_BUSY|B_DELWRI|B_NEEDCOMMIT)) != (B_DELWRI|B_CLUSTEROK|(bp->b_flags & (B_VMIO|B_NEEDCOMMIT)))) { + splx(s); + break; + } - cp += size; - } + if (tbp->b_wcred != bp->b_wcred) { + splx(s); + break; + } - if (i == 0) { - /* None to cluster */ - bp->b_saveaddr = b_save->bs_saveaddr; - bp->b_flags &= ~B_CALL; - bp->b_iodone = NULL; - free(b_save, M_SEGMENT); - } - bawrite(bp); - if (i < len) { - len -= i + 1; - start_lbn += 1; - goto redo; + if 
((tbp->b_bcount != size) || + ((bp->b_blkno + dbsize * i) != tbp->b_blkno) || + ((tbp->b_npages + bp->b_npages) > (MAXPHYS / PAGE_SIZE))) { + splx(s); + break; + } + bremfree(tbp); + tbp->b_flags |= B_BUSY; + tbp->b_flags &= ~B_DONE; + splx(s); + } + if (tbp->b_flags & B_VMIO) { + for (j = 0; j < tbp->b_npages; j += 1) { + vm_page_t m; + m = tbp->b_pages[j]; + ++m->busy; + ++m->object->paging_in_progress; + if ((bp->b_npages == 0) || + (bp->b_pages[bp->b_npages - 1] != m)) { + bp->b_pages[bp->b_npages] = m; + bp->b_npages++; + } + } + } + bp->b_bcount += size; + bp->b_bufsize += size; + + tbp->b_flags &= ~(B_READ | B_DONE | B_ERROR | B_DELWRI); + tbp->b_flags |= B_ASYNC; + s = splbio(); + reassignbuf(tbp, tbp->b_vp); /* put on clean list */ + ++tbp->b_vp->v_numoutput; + splx(s); + TAILQ_INSERT_TAIL(&bp->b_cluster.cluster_head, + tbp, b_cluster.cluster_entry); + } + pmap_qenter(trunc_page((vm_offset_t) bp->b_data), + (vm_page_t *) bp->b_pages, bp->b_npages); + if (bp->b_bufsize > bp->b_kvasize) + panic("cluster_wbuild: b_bufsize(%d) > b_kvasize(%d)\n", + bp->b_bufsize, bp->b_kvasize); + bp->b_kvasize = bp->b_bufsize; + totalwritten += bp->b_bufsize; + bp->b_dirtyoff = 0; + bp->b_dirtyend = bp->b_bufsize; + bawrite(bp); + + len -= i; } + return totalwritten; } +#ifdef notyet_block_reallocation_enabled /* * Collect together all the buffers in a cluster. * Plus add one additional buffer. 
*/ -struct cluster_save * +static struct cluster_save * cluster_collectbufs(vp, last_bp) struct vnode *vp; struct buf *last_bp; { struct cluster_save *buflist; - daddr_t lbn; + daddr_t lbn; int i, len; len = vp->v_lastw - vp->v_cstart + 1; buflist = malloc(sizeof(struct buf *) * (len + 1) + sizeof(*buflist), M_SEGMENT, M_WAITOK); buflist->bs_nchildren = 0; - buflist->bs_children = (struct buf **)(buflist + 1); + buflist->bs_children = (struct buf **) (buflist + 1); for (lbn = vp->v_cstart, i = 0; i < len; lbn++, i++) - (void)bread(vp, lbn, last_bp->b_bcount, NOCRED, - &buflist->bs_children[i]); + (void) bread(vp, lbn, last_bp->b_bcount, NOCRED, + &buflist->bs_children[i]); buflist->bs_children[i] = last_bp; buflist->bs_nchildren = i + 1; return (buflist); } +#endif /* notyet_block_reallocation_enabled */ diff --git a/sys/kern/vfs_conf.c b/sys/kern/vfs_conf.c index 9b57797..779a1c4 100644 --- a/sys/kern/vfs_conf.c +++ b/sys/kern/vfs_conf.c @@ -1,6 +1,7 @@ /* * Copyright (c) 1989, 1993, 1995 * The Regents of the University of California. All rights reserved. + * Copyright (c) 1995 Artisoft, Inc. All Rights Reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -30,219 +31,123 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)vfs_conf.c 8.11 (Berkeley) 5/10/95 + * @(#)vfs_conf.c 8.8 (Berkeley) 3/31/94 + * $Id$ */ -#include <sys/param.h> -#include <sys/mount.h> -#include <sys/vnode.h> - /* - * These define the root filesystem, device, and root filesystem type. + * PURPOSE: This file abstracts the root mounting interface from + * the per file system semantics for handling mounts, + * the overall intent of which is to move the BSD + * internals dependence out of the FS code, both to + * make the FS code more portable and to free up some + * of the BSD internals so that they may more easily + * be changed. 
+ * + * NOTE1: Code is single entry/single exit to aid debugging + * and conversion for kernel multithreading. + * + * NOTE2: Code notes lock state in headers on entry and exit + * as an aid to conversion for kernel multithreading + * on SMP reentrancy */ -struct mount *rootfs; -struct vnode *rootvnode; -int (*mountroot)() = NULL; +#include <sys/param.h> /* dev_t (types.h)*/ +#include <sys/systm.h> /* rootvp*/ +#include <sys/proc.h> /* curproc*/ +#include <sys/vnode.h> /* NULLVP*/ +#include <sys/mount.h> /* struct mount*/ +#include <sys/malloc.h> /* M_MOUNT*/ /* - * Set up the initial array of known filesystem types. + * GLOBALS */ -extern struct vfsops ufs_vfsops; -extern int ffs_mountroot(); -extern struct vfsops lfs_vfsops; -extern int lfs_mountroot(); -extern struct vfsops mfs_vfsops; -extern int mfs_mountroot(); -extern struct vfsops cd9660_vfsops; -extern int cd9660_mountroot(); -extern struct vfsops msdos_vfsops; -extern struct vfsops adosfs_vfsops; -extern struct vfsops nfs_vfsops; -extern int nfs_mountroot(); -extern struct vfsops afs_vfsops; -extern struct vfsops procfs_vfsops; -extern struct vfsops null_vfsops; -extern struct vfsops union_vfsops; -extern struct vfsops umap_vfsops; -extern struct vfsops portal_vfsops; -extern struct vfsops fdesc_vfsops; -extern struct vfsops kernfs_vfsops; /* - * Set up the filesystem operations for vnodes. + * These define the root filesystem, device, and root filesystem type. 
*/ -static struct vfsconf vfsconflist[] = { - - /* Fast Filesystem */ -#ifdef FFS - { &ufs_vfsops, "ufs", 1, 0, MNT_LOCAL, ffs_mountroot, NULL }, -#endif - - /* Log-based Filesystem */ -#ifdef LFS - { &lfs_vfsops, "lfs", 5, 0, MNT_LOCAL, lfs_mountroot, NULL }, -#endif - - /* Memory-based Filesystem */ -#ifdef MFS - { &mfs_vfsops, "mfs", 3, 0, MNT_LOCAL, mfs_mountroot, NULL }, -#endif - - /* ISO9660 (aka CDROM) Filesystem */ -#ifdef CD9660 - { &cd9660_vfsops, "cd9660", 14, 0, MNT_LOCAL, cd9660_mountroot, NULL }, -#endif - - /* MSDOS Filesystem */ -#ifdef MSDOS - { &msdos_vfsops, "msdos", 4, 0, MNT_LOCAL, NULL, NULL }, -#endif - - /* AmigaDOS Filesystem */ -#ifdef ADOSFS - { &adosfs_vfsops, "adosfs", 16, 0, MNT_LOCAL, NULL, NULL }, -#endif - - /* Sun-compatible Network Filesystem */ -#ifdef NFS - { &nfs_vfsops, "nfs", 2, 0, 0, nfs_mountroot, NULL }, -#endif - - /* Andrew Filesystem */ -#ifdef AFS - { &afs_vfsops, "andrewfs", 13, 0, 0, afs_mountroot, NULL }, -#endif - - /* /proc Filesystem */ -#ifdef PROCFS - { &procfs_vfsops, "procfs", 12, 0, 0, NULL, NULL }, -#endif - - /* Loopback (Minimal) Filesystem Layer */ -#ifdef NULLFS - { &null_vfsops, "loopback", 9, 0, 0, NULL, NULL }, -#endif - - /* Union (translucent) Filesystem */ -#ifdef UNION - { &union_vfsops, "union", 15, 0, 0, NULL, NULL }, -#endif - - /* User/Group Identifer Remapping Filesystem */ -#ifdef UMAPFS - { &umap_vfsops, "umap", 10, 0, 0, NULL, NULL }, -#endif - - /* Portal Filesystem */ -#ifdef PORTAL - { &portal_vfsops, "portal", 8, 0, 0, NULL, NULL }, -#endif - - /* File Descriptor Filesystem */ -#ifdef FDESC - { &fdesc_vfsops, "fdesc", 7, 0, 0, NULL, NULL }, -#endif - - /* Kernel Information Filesystem */ -#ifdef KERNFS - { &kernfs_vfsops, "kernfs", 11, 0, 0, NULL, NULL }, -#endif - -}; +struct mount *rootfs; +struct vnode *rootvnode; +char *mountrootfsname; /* - * Initially the size of the list, vfs_init will set maxvfsconf + * vfs_init() will set maxvfsconf * to the highest defined type number. 
*/ -int maxvfsconf = sizeof(vfsconflist) / sizeof (struct vfsconf); -struct vfsconf *vfsconf = vfsconflist; +int maxvfsconf; +struct vfsconf *vfsconf; /* + * Common root mount code shared by all filesystems + */ +#define ROOTNAME "root_device" + +/* + * vfs_mountrootfs + * + * Common entry point for root mounts + * + * PARAMETERS: + * fsname name of the filesystem + * + * RETURNS: 0 Success + * !0 error number (errno.h) * - * vfs_opv_descs enumerates the list of vnode classes, each with it's own - * vnode operation vector. It is consulted at system boot to build operation - * vectors. It is NULL terminated. + * LOCK STATE: + * ENTRY + * <no locks held> + * EXIT + * <no locks held> * + * NOTES: + * This code is currently supported only for use for + * the FFS file system type. This is a matter of + * fixing the other file systems, not this code! */ -extern struct vnodeopv_desc ffs_vnodeop_opv_desc; -extern struct vnodeopv_desc ffs_specop_opv_desc; -extern struct vnodeopv_desc ffs_fifoop_opv_desc; -extern struct vnodeopv_desc lfs_vnodeop_opv_desc; -extern struct vnodeopv_desc lfs_specop_opv_desc; -extern struct vnodeopv_desc lfs_fifoop_opv_desc; -extern struct vnodeopv_desc mfs_vnodeop_opv_desc; -extern struct vnodeopv_desc dead_vnodeop_opv_desc; -extern struct vnodeopv_desc fifo_vnodeop_opv_desc; -extern struct vnodeopv_desc spec_vnodeop_opv_desc; -extern struct vnodeopv_desc nfsv2_vnodeop_opv_desc; -extern struct vnodeopv_desc spec_nfsv2nodeop_opv_desc; -extern struct vnodeopv_desc fifo_nfsv2nodeop_opv_desc; -extern struct vnodeopv_desc fdesc_vnodeop_opv_desc; -extern struct vnodeopv_desc portal_vnodeop_opv_desc; -extern struct vnodeopv_desc null_vnodeop_opv_desc; -extern struct vnodeopv_desc umap_vnodeop_opv_desc; -extern struct vnodeopv_desc kernfs_vnodeop_opv_desc; -extern struct vnodeopv_desc procfs_vnodeop_opv_desc; -extern struct vnodeopv_desc cd9660_vnodeop_opv_desc; -extern struct vnodeopv_desc cd9660_specop_opv_desc; -extern struct vnodeopv_desc 
cd9660_fifoop_opv_desc; -extern struct vnodeopv_desc union_vnodeop_opv_desc; - -struct vnodeopv_desc *vfs_opv_descs[] = { - &ffs_vnodeop_opv_desc, - &ffs_specop_opv_desc, -#ifdef FIFO - &ffs_fifoop_opv_desc, -#endif - &dead_vnodeop_opv_desc, -#ifdef FIFO - &fifo_vnodeop_opv_desc, -#endif - &spec_vnodeop_opv_desc, -#ifdef LFS - &lfs_vnodeop_opv_desc, - &lfs_specop_opv_desc, -#ifdef FIFO - &lfs_fifoop_opv_desc, -#endif -#endif -#ifdef MFS - &mfs_vnodeop_opv_desc, -#endif -#ifdef NFS - &nfsv2_vnodeop_opv_desc, - &spec_nfsv2nodeop_opv_desc, -#ifdef FIFO - &fifo_nfsv2nodeop_opv_desc, -#endif -#endif -#ifdef FDESC - &fdesc_vnodeop_opv_desc, -#endif -#ifdef PORTAL - &portal_vnodeop_opv_desc, -#endif -#ifdef NULLFS - &null_vnodeop_opv_desc, -#endif -#ifdef UMAPFS - &umap_vnodeop_opv_desc, -#endif -#ifdef KERNFS - &kernfs_vnodeop_opv_desc, -#endif -#ifdef PROCFS - &procfs_vnodeop_opv_desc, -#endif -#ifdef CD9660 - &cd9660_vnodeop_opv_desc, - &cd9660_specop_opv_desc, -#ifdef FIFO - &cd9660_fifoop_opv_desc, -#endif -#endif -#ifdef UNION - &union_vnodeop_opv_desc, -#endif - NULL -}; +int +vfs_mountrootfs(fsname) + char *fsname; +{ + struct mount *mp; + int err = 0; + struct proc *p = curproc; /* XXX */ + + /* + * New root mount structure + */ + err = vfs_rootmountalloc(fsname, ROOTNAME, &mp); + if (err) + return (err); + mp->mnt_flag |= MNT_ROOTFS; + + /* + * Attempt the mount + */ + err = VFS_MOUNT(mp, NULL, NULL, NULL, p); + if (err) + goto error_2; + + simple_lock(&mountlist_slock); + /* Add fs to list of mounted file systems*/ + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + simple_unlock(&mountlist_slock); + + vfs_unbusy(mp, p); + + /* root mount, update system time from FS specific data*/ + inittodr(mp->mnt_time); + + goto success; + + +error_2: /* mount error*/ + + vfs_unbusy(mp, p); + +error_1: /* lock error*/ + + /* free mount struct before failing*/ + free( mp, M_MOUNT); + +success: + return( err); +} diff --git a/sys/kern/vfs_export.c b/sys/kern/vfs_export.c new 
file mode 100644 index 0000000..0b487fd --- /dev/null +++ b/sys/kern/vfs_export.c @@ -0,0 +1,2079 @@ +/* + * Copyright (c) 1989, 1993 + * The Regents of the University of California. All rights reserved. + * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + * $Id: vfs_subr.c,v 1.79 1997/03/04 18:31:56 bde Exp $ + */ + +/* + * External virtual filesystem routines + */ +#include "opt_ddb.h" +#include "opt_devfs.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/file.h> +#include <sys/proc.h> +#include <sys/mount.h> +#include <sys/time.h> +#include <sys/vnode.h> +#include <sys/stat.h> +#include <sys/namei.h> +#include <sys/ucred.h> +#include <sys/buf.h> +#include <sys/errno.h> +#include <sys/malloc.h> +#include <sys/domain.h> +#include <sys/mbuf.h> + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_pager.h> +#include <vm/vnode_pager.h> +#include <sys/sysctl.h> + +#include <miscfs/specfs/specdev.h> + +#ifdef DDB +extern void printlockedvnodes __P((void)); +#endif +static void vclean __P((struct vnode *vp, int flags, struct proc *p)); +extern void vgonel __P((struct vnode *vp, struct proc *p)); +unsigned long numvnodes; +extern void vfs_unmountroot __P((struct mount *rootfs)); +extern void vputrele __P((struct vnode *vp, int put)); + +enum vtype iftovt_tab[16] = { + VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, + VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, +}; +int vttoif_tab[9] = { + 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, + S_IFSOCK, S_IFIFO, S_IFMT, +}; + +/* + * Insq/Remq for the vnode usage lists. 
+ */ +#define bufinsvn(bp, dp) LIST_INSERT_HEAD(dp, bp, b_vnbufs) +#define bufremvn(bp) { \ + LIST_REMOVE(bp, b_vnbufs); \ + (bp)->b_vnbufs.le_next = NOLIST; \ +} +TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ +static u_long freevnodes = 0; + +struct mntlist mountlist; /* mounted filesystem list */ +struct simplelock mountlist_slock; +static struct simplelock mntid_slock; +struct simplelock mntvnode_slock; +struct simplelock vnode_free_list_slock; +static struct simplelock spechash_slock; + +int desiredvnodes; +SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, ""); + +static void vfs_free_addrlist __P((struct netexport *nep)); +static int vfs_free_netcred __P((struct radix_node *rn, void *w)); +static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, + struct export_args *argp)); + +/* + * Initialize the vnode management data structures. + */ +void +vntblinit() +{ + + desiredvnodes = maxproc + vm_object_cache_max; + simple_lock_init(&mntvnode_slock); + simple_lock_init(&mntid_slock); + simple_lock_init(&spechash_slock); + TAILQ_INIT(&vnode_free_list); + simple_lock_init(&vnode_free_list_slock); + CIRCLEQ_INIT(&mountlist); +} + +/* + * Mark a mount point as busy. Used to synchronize access and to delay + * unmounting. Interlock is not released on failure. + */ +int +vfs_busy(mp, flags, interlkp, p) + struct mount *mp; + int flags; + struct simplelock *interlkp; + struct proc *p; +{ + int lkflags; + + if (mp->mnt_flag & MNT_UNMOUNT) { + if (flags & LK_NOWAIT) + return (ENOENT); + mp->mnt_flag |= MNT_MWAIT; + if (interlkp) { + simple_unlock(interlkp); + } + /* + * Since all busy locks are shared except the exclusive + * lock granted when unmounting, the only place that a + * wakeup needs to be done is at the release of the + * exclusive lock at the end of dounmount. 
+ */ + tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); + if (interlkp) { + simple_lock(interlkp); + } + return (ENOENT); + } + lkflags = LK_SHARED; + if (interlkp) + lkflags |= LK_INTERLOCK; + if (lockmgr(&mp->mnt_lock, lkflags, interlkp, p)) + panic("vfs_busy: unexpected lock failure"); + return (0); +} + +/* + * Free a busy filesystem. + */ +void +vfs_unbusy(mp, p) + struct mount *mp; + struct proc *p; +{ + + lockmgr(&mp->mnt_lock, LK_RELEASE, NULL, p); +} + +/* + * Lookup a filesystem type, and if found allocate and initialize + * a mount structure for it. + * + * Devname is usually updated by mount(8) after booting. + */ +int +vfs_rootmountalloc(fstypename, devname, mpp) + char *fstypename; + char *devname; + struct mount **mpp; +{ + struct proc *p = curproc; /* XXX */ + struct vfsconf *vfsp; + struct mount *mp; + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) + return (ENODEV); + mp = malloc((u_long)sizeof(struct mount), M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); + (void)vfs_busy(mp, LK_NOWAIT, 0, p); + LIST_INIT(&mp->mnt_vnodelist); + mp->mnt_vfc = vfsp; + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_flag = MNT_RDONLY; + mp->mnt_vnodecovered = NULLVP; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + mp->mnt_stat.f_mntonname[0] = '/'; + mp->mnt_stat.f_mntonname[1] = 0; + (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); + *mpp = mp; + return (0); +} + +/* + * Find an appropriate filesystem to use for the root. If a filesystem + * has not been preselected, walk through the list of known filesystems + * trying those that have mountroot routines, and try them until one + * works or we have tried them all. 
 */
#ifdef notdef /* XXX JH */
/*
 * Lite2-style root mount: prefer an explicitly registered mountroot
 * hook, otherwise try each configured filesystem's vfc_mountroot in
 * turn until one succeeds.  Returns ENODEV if none can mount root.
 */
int
lite2_vfs_mountroot(void)
{
	struct vfsconf *vfsp;
	extern int (*lite2_mountroot)(void);
	int error;

	if (lite2_mountroot != NULL)
		return ((*lite2_mountroot)());
	for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) {
		if (vfsp->vfc_mountroot == NULL)
			continue;
		if ((error = (*vfsp->vfc_mountroot)()) == 0)
			return (0);
		printf("%s_mountroot failed: %d\n", vfsp->vfc_name, error);
	}
	return (ENODEV);
}
#endif

/*
 * Lookup a mount point by filesystem identifier.
 * Returns the mount point, or NULL if no mounted filesystem
 * matches both words of the fsid.  Takes mountlist_slock itself.
 */
struct mount *
vfs_getvfs(fsid)
	fsid_t *fsid;
{
	register struct mount *mp;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist;
	    mp = mp->mnt_list.cqe_next) {
		if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] &&
		    mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) {
			simple_unlock(&mountlist_slock);
			return (mp);
		}
	}
	simple_unlock(&mountlist_slock);
	return ((struct mount *) 0);
}

/*
 * Get a new unique fsid for mount point mp.
 * val[0] encodes a pseudo device number, val[1] the filesystem type;
 * collisions with existing mounts are resolved by probing with
 * vfs_getvfs().  mntid_slock protects the static counter.
 */
void
vfs_getnewfsid(mp)
	struct mount *mp;
{
	/* Monotonically bumped minor-number source; guarded by mntid_slock. */
	static u_short xxxfs_mntid;

	fsid_t tfsid;
	int mtype;

	simple_lock(&mntid_slock);
	mtype = mp->mnt_vfc->vfc_typenum;
	mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0);
	mp->mnt_stat.f_fsid.val[1] = mtype;
	if (xxxfs_mntid == 0)
		++xxxfs_mntid;
	tfsid.val[0] = makedev(nblkdev + mtype, xxxfs_mntid);
	tfsid.val[1] = mtype;
	if (mountlist.cqh_first != (void *)&mountlist) {
		/* Probe until the candidate fsid is not already in use. */
		while (vfs_getvfs(&tfsid)) {
			tfsid.val[0]++;
			xxxfs_mntid++;
		}
	}
	mp->mnt_stat.f_fsid.val[0] = tfsid.val[0];
	simple_unlock(&mntid_slock);
}

/*
 * Set vnode attributes to VNOVAL (i.e. "unspecified" for a VOP_SETATTR).
 */
void
vattr_null(vap)
	register struct vattr *vap;
{

	vap->va_type = VNON;
	vap->va_size = VNOVAL;
	vap->va_bytes = VNOVAL;
	vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid =
	    vap->va_fsid = vap->va_fileid =
	    vap->va_blocksize = vap->va_rdev =
	    vap->va_atime.tv_sec = vap->va_atime.tv_nsec =
	    vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec =
	    vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec =
	    vap->va_flags = vap->va_gen = VNOVAL;
	vap->va_vaflags = 0;
}

/*
 * Routines having to do with the management of the vnode table.
 */
extern vop_t **dead_vnodeop_p;

/*
 * Return the next vnode from the free list.
 * On success *vpp holds a cleaned vnode with v_usecount == 1,
 * typed VNON and inserted on mount point mp's vnode list.
 */
int
getnewvnode(tag, mp, vops, vpp)
	enum vtagtype tag;
	struct mount *mp;
	vop_t **vops;
	struct vnode **vpp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;

	simple_lock(&vnode_free_list_slock);
retry:
	/*
	 * we allocate a new vnode if
	 *	1. we don't have any free
	 *		Pretty obvious, we actually used to panic, but that
	 *		is a silly thing to do.
	 *	2. we haven't filled our pool yet
	 *		We don't want to trash the incore (VM-)vnodecache.
	 *	3. if less than 1/4th of our vnodes are free.
	 *		We don't want to trash the namei cache either.
	 */
	if (freevnodes < (numvnodes >> 2) ||
	    numvnodes < desiredvnodes ||
	    vnode_free_list.tqh_first == NULL) {
		simple_unlock(&vnode_free_list_slock);
		vp = (struct vnode *) malloc((u_long) sizeof *vp,
		    M_VNODE, M_WAITOK);
		bzero((char *) vp, sizeof *vp);
		numvnodes++;
	} else {
		/* Find the first free vnode whose interlock we can grab. */
		for (vp = vnode_free_list.tqh_first;
		    vp != NULLVP; vp = vp->v_freelist.tqe_next) {
			if (simple_lock_try(&vp->v_interlock))
				break;
		}
		/*
		 * Unless this is a bad time of the month, at most
		 * the first NCPUS items on the free list are
		 * locked, so this is close enough to being empty.
		 */
		if (vp == NULLVP) {
			simple_unlock(&vnode_free_list_slock);
			tablefull("vnode");
			*vpp = 0;
			return (ENFILE);
		}
		if (vp->v_usecount)
			panic("free vnode isn't");
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		if (vp->v_usage > 0) {
			/*
			 * Recently-used vnode: decay its usage count and
			 * rotate it to the tail instead of recycling it.
			 */
			simple_unlock(&vp->v_interlock);
			--vp->v_usage;
			TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist);
			goto retry;
		}
		freevnodes--;

		/* see comment on why 0xdeadb is set at end of vgone (below) */
		vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb;
		simple_unlock(&vnode_free_list_slock);
		vp->v_lease = NULL;
		if (vp->v_type != VBAD)
			vgonel(vp, p);
		else {
			simple_unlock(&vp->v_interlock);
		}

#ifdef DIAGNOSTIC
		{
			int s;

			if (vp->v_data)
				panic("cleaned vnode isn't");
			s = splbio();
			if (vp->v_numoutput)
				panic("Clean vnode has pending I/O's");
			splx(s);
		}
#endif
		/* Scrub the recycled vnode's per-file state. */
		vp->v_flag = 0;
		vp->v_lastr = 0;
		vp->v_lastw = 0;
		vp->v_lasta = 0;
		vp->v_cstart = 0;
		vp->v_clen = 0;
		vp->v_socket = 0;
		vp->v_writecount = 0;	/* XXX */
		vp->v_usage = 0;
	}
	vp->v_type = VNON;
	cache_purge(vp);
	vp->v_tag = tag;
	vp->v_op = vops;
	insmntque(vp, mp);
	*vpp = vp;
	vp->v_usecount = 1;
	vp->v_data = 0;
	return (0);
}

/*
 * Move a vnode from one mount queue to another.
 * A NULL mp simply removes the vnode from its current mount list.
 * Takes mntvnode_slock itself; caller must not hold it.
 */
void
insmntque(vp, mp)
	register struct vnode *vp;
	register struct mount *mp;
{

	simple_lock(&mntvnode_slock);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		LIST_REMOVE(vp, v_mntvnodes);
	/*
	 * Insert into list of vnodes for the new mount point, if available.
	 */
	if ((vp->v_mount = mp) == NULL) {
		simple_unlock(&mntvnode_slock);
		return;
	}
	LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes);
	simple_unlock(&mntvnode_slock);
}

/*
 * Update outstanding I/O count and do wakeup if requested.
 */
void
vwakeup(bp)
	register struct buf *bp;
{
	register struct vnode *vp;

	bp->b_flags &= ~B_WRITEINPROG;
	if ((vp = bp->b_vp)) {
		vp->v_numoutput--;
		if (vp->v_numoutput < 0)
			panic("vwakeup: neg numoutput");
		/* Last write finished: wake anyone in vinvalbuf's VBWAIT loop. */
		if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) {
			vp->v_flag &= ~VBWAIT;
			wakeup((caddr_t) &vp->v_numoutput);
		}
	}
}

/*
 * Flush out and invalidate all buffers associated with a vnode.
 * Called with the underlying object locked.
 *
 * V_SAVE forces dirty data to disk first (via VOP_FSYNC);
 * V_SAVEMETA preserves buffers with negative logical block numbers
 * (indirect-block metadata).  May sleep; slpflag/slptimeo are passed
 * to tsleep() when waiting on a busy buffer.
 */
int
vinvalbuf(vp, flags, cred, p, slpflag, slptimeo)
	register struct vnode *vp;
	int flags;
	struct ucred *cred;
	struct proc *p;
	int slpflag, slptimeo;
{
	register struct buf *bp;
	struct buf *nbp, *blist;
	int s, error;
	vm_object_t object;

	if (flags & V_SAVE) {
		if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p)))
			return (error);
		if (vp->v_dirtyblkhd.lh_first != NULL)
			panic("vinvalbuf: dirty bufs");
	}

	s = splbio();
	for (;;) {
		/* Under V_SAVEMETA, skip leading metadata (b_lblkno < 0). */
		if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist && (blist = vp->v_dirtyblkhd.lh_first) &&
		    (flags & V_SAVEMETA))
			while (blist && blist->b_lblkno < 0)
				blist = blist->b_vnbufs.le_next;
		if (!blist)
			break;

		for (bp = blist; bp; bp = nbp) {
			nbp = bp->b_vnbufs.le_next;
			if ((flags & V_SAVEMETA) && bp->b_lblkno < 0)
				continue;
			if (bp->b_flags & B_BUSY) {
				bp->b_flags |= B_WANTED;
				error = tsleep((caddr_t) bp,
				    slpflag | (PRIBIO + 1), "vinvalbuf",
				    slptimeo);
				if (error) {
					splx(s);
					return (error);
				}
				/* List may have changed while asleep; rescan. */
				break;
			}
			bremfree(bp);
			bp->b_flags |= B_BUSY;
			/*
			 * XXX Since there are no node locks for NFS, I
			 * believe there is a slight chance that a delayed
			 * write will occur while sleeping just above, so
			 * check for it.
			 */
			if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) {
				(void) VOP_BWRITE(bp);
				break;
			}
			bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF);
			brelse(bp);
		}
	}

	/* Wait for all writes in flight to drain (vwakeup signals us). */
	while (vp->v_numoutput > 0) {
		vp->v_flag |= VBWAIT;
		tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0);
	}

	splx(s);

	/*
	 * Destroy the copy in the VM cache, too.
	 */
	object = vp->v_object;
	if (object != NULL) {
		vm_object_page_remove(object, 0, object->size,
		    (flags & V_SAVE) ? TRUE : FALSE);
	}
	if (!(flags & V_SAVEMETA) &&
	    (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first))
		panic("vinvalbuf: flush failed");
	return (0);
}

/*
 * Associate a buffer with a vnode.
 * Takes a hold reference on the vnode (VHOLD) and puts the buffer
 * on the vnode's clean list.
 */
void
bgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
	int s;

	if (bp->b_vp)
		panic("bgetvp: not free");
	VHOLD(vp);
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
	/*
	 * Insert onto list for new vnode.
	 */
	s = splbio();
	bufinsvn(bp, &vp->v_cleanblkhd);
	splx(s);
}

/*
 * Disassociate a buffer from a vnode; drops the hold reference
 * taken by bgetvp().
 */
void
brelvp(bp)
	register struct buf *bp;
{
	struct vnode *vp;
	int s;

	if (bp->b_vp == (struct vnode *) 0)
		panic("brelvp: NULL");
	/*
	 * Delete from old vnode list, if on one.
	 */
	s = splbio();
	if (bp->b_vnbufs.le_next != NOLIST)
		bufremvn(bp);
	splx(s);

	vp = bp->b_vp;
	bp->b_vp = (struct vnode *) 0;
	HOLDRELE(vp);
}

/*
 * Associate a p-buffer with a vnode.
 * Unlike bgetvp(), no hold reference is taken and the buffer is not
 * put on the vnode's buffer lists.
 */
void
pbgetvp(vp, bp)
	register struct vnode *vp;
	register struct buf *bp;
{
#if defined(DIAGNOSTIC)
	if (bp->b_vp)
		panic("pbgetvp: not free");
#endif
	bp->b_vp = vp;
	if (vp->v_type == VBLK || vp->v_type == VCHR)
		bp->b_dev = vp->v_rdev;
	else
		bp->b_dev = NODEV;
}

/*
 * Disassociate a p-buffer from a vnode.
+ */ +void +pbrelvp(bp) + register struct buf *bp; +{ + struct vnode *vp; + +#if defined(DIAGNOSTIC) + if (bp->b_vp == (struct vnode *) 0) + panic("pbrelvp: NULL"); +#endif + + bp->b_vp = (struct vnode *) 0; +} + +/* + * Reassign a buffer from one vnode to another. + * Used to assign file specific control information + * (indirect blocks) to the vnode to which they belong. + */ +void +reassignbuf(bp, newvp) + register struct buf *bp; + register struct vnode *newvp; +{ + int s; + + if (newvp == NULL) { + printf("reassignbuf: NULL"); + return; + } + + s = splbio(); + /* + * Delete from old vnode list, if on one. + */ + if (bp->b_vnbufs.le_next != NOLIST) + bufremvn(bp); + /* + * If dirty, put on list of dirty buffers; otherwise insert onto list + * of clean buffers. + */ + if (bp->b_flags & B_DELWRI) { + struct buf *tbp; + + tbp = newvp->v_dirtyblkhd.lh_first; + if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) { + bufinsvn(bp, &newvp->v_dirtyblkhd); + } else { + while (tbp->b_vnbufs.le_next && + (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) { + tbp = tbp->b_vnbufs.le_next; + } + LIST_INSERT_AFTER(tbp, bp, b_vnbufs); + } + } else { + bufinsvn(bp, &newvp->v_cleanblkhd); + } + splx(s); +} + +#ifndef DEVFS_ROOT +/* + * Create a vnode for a block device. + * Used for root filesystem, argdev, and swap areas. + * Also used for memory file system special devices. 
 */
int
bdevvp(dev, vpp)
	dev_t dev;
	struct vnode **vpp;
{
	register struct vnode *vp;
	struct vnode *nvp;
	int error;

	if (dev == NODEV)
		return (0);
	error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp);
	if (error) {
		*vpp = 0;
		return (error);
	}
	vp = nvp;
	vp->v_type = VBLK;
	/* If an alias already existed, discard the new vnode and use it. */
	if ((nvp = checkalias(vp, dev, (struct mount *) 0))) {
		vput(vp);
		vp = nvp;
	}
	*vpp = vp;
	return (0);
}
#endif /* !DEVFS_ROOT */

/*
 * Check to see if the new vnode represents a special device
 * for which we already have a vnode (either because of
 * bdevvp() or because of a different vnode representing
 * the same block device). If such an alias exists, deallocate
 * the existing contents and return the aliased vnode. The
 * caller is responsible for filling it with its new contents.
 *
 * Returns NULLVP when the caller should keep using nvp (no usable
 * alias existed); in that case nvp has been entered on the speclist.
 */
struct vnode *
checkalias(nvp, nvp_rdev, mp)
	register struct vnode *nvp;
	dev_t nvp_rdev;
	struct mount *mp;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp;
	struct vnode **vpp;

	if (nvp->v_type != VBLK && nvp->v_type != VCHR)
		return (NULLVP);

	vpp = &speclisth[SPECHASH(nvp_rdev)];
loop:
	simple_lock(&spechash_slock);
	for (vp = *vpp; vp; vp = vp->v_specnext) {
		if (nvp_rdev != vp->v_rdev || nvp->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		simple_lock(&vp->v_interlock);
		if (vp->v_usecount == 0) {
			simple_unlock(&spechash_slock);
			vgonel(vp, p);
			goto loop;
		}
		if (vget(vp, LK_EXCLUSIVE | LK_INTERLOCK, p)) {
			simple_unlock(&spechash_slock);
			goto loop;
		}
		break;
	}
	if (vp == NULL || vp->v_tag != VT_NON) {
		/* No live alias: give nvp its specinfo and hash it in. */
		MALLOC(nvp->v_specinfo, struct specinfo *,
		    sizeof(struct specinfo), M_VNODE, M_WAITOK);
		nvp->v_rdev = nvp_rdev;
		nvp->v_hashchain = vpp;
		nvp->v_specnext = *vpp;
		nvp->v_specflags = 0;
		simple_unlock(&spechash_slock);
		*vpp = nvp;
		if (vp != NULLVP) {
			nvp->v_flag |= VALIASED;
			vp->v_flag |= VALIASED;
			vput(vp);
		}
		return (NULLVP);
	}
	simple_unlock(&spechash_slock);
	VOP_UNLOCK(vp, 0, p);
	simple_lock(&vp->v_interlock);
	vclean(vp, 0, p);
	vp->v_op = nvp->v_op;
	vp->v_tag = nvp->v_tag;
	nvp->v_type = VNON;
	insmntque(vp, mp);
	return (vp);
}

/*
 * Grab a particular vnode from the free list, increment its
 * reference count and lock it. The vnode lock bit is set the
 * vnode is being eliminated in vgone. The process is awakened
 * when the transition is completed, and an error returned to
 * indicate that the vnode is no longer usable (possibly having
 * been changed to a new file system type).
 */
int
vget(vp, flags, p)
	register struct vnode *vp;
	int flags;
	struct proc *p;
{
	int error;

	/*
	 * If the vnode is in the process of being cleaned out for
	 * another use, we wait for the cleaning to finish and then
	 * return failure. Cleaning is determined by checking that
	 * the VXLOCK flag is set.
	 */
	if ((flags & LK_INTERLOCK) == 0) {
		simple_lock(&vp->v_interlock);
	}
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vget", 0);
		return (ENOENT);
	}
	/* First reference pulls the vnode off the free list. */
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
		simple_unlock(&vnode_free_list_slock);
		freevnodes--;
	}
	vp->v_usecount++;
	/*
	 * Create the VM object, if needed
	 */
	if ((vp->v_type == VREG) &&
	    ((vp->v_object == NULL) ||
	    (vp->v_object->flags & OBJ_VFS_REF) == 0)) {
		/*
		 * XXX vfs_object_create probably needs the interlock.
		 */
		simple_unlock(&vp->v_interlock);
		vfs_object_create(vp, curproc, curproc->p_ucred, 0);
		simple_lock(&vp->v_interlock);
	}
	if (flags & LK_TYPE_MASK) {
		/* On lock failure the reference just taken is dropped. */
		if (error = vn_lock(vp, flags | LK_INTERLOCK, p))
			vrele(vp);
		return (error);
	}
	simple_unlock(&vp->v_interlock);
	return (0);
}

/*
 * Stubs to use when there is no locking to be done on the underlying object.
 * A minimal shared lock is necessary to ensure that the underlying object
 * is not revoked while an operation is in progress. So, an active shared
 * count is maintained in an auxillary vnode lock structure.
 */
int
vop_nolock(ap)
	struct vop_lock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
#ifdef notyet
	/*
	 * This code cannot be used until all the non-locking filesystems
	 * (notably NFS) are converted to properly lock and release nodes.
	 * Also, certain vnode operations change the locking state within
	 * the operation (create, mknod, remove, link, rename, mkdir, rmdir,
	 * and symlink). Ideally these operations should not change the
	 * lock state, but should be changed to let the caller of the
	 * function unlock them. Otherwise all intermediate vnode layers
	 * (such as union, umapfs, etc) must catch these functions to do
	 * the necessary locking at their layer. Note that the inactive
	 * and lookup operations also change their lock state, but this
	 * cannot be avoided, so these two operations will always need
	 * to be handled in intermediate layers.
	 */
	struct vnode *vp = ap->a_vp;
	int vnflags, flags = ap->a_flags;

	if (vp->v_vnlock == NULL) {
		if ((flags & LK_TYPE_MASK) == LK_DRAIN)
			return (0);
		MALLOC(vp->v_vnlock, struct lock *, sizeof(struct lock),
		    M_VNODE, M_WAITOK);
		lockinit(vp->v_vnlock, PVFS, "vnlock", 0, 0);
	}
	switch (flags & LK_TYPE_MASK) {
	case LK_DRAIN:
		vnflags = LK_DRAIN;
		break;
	case LK_EXCLUSIVE:
	case LK_SHARED:
		vnflags = LK_SHARED;
		break;
	case LK_UPGRADE:
	case LK_EXCLUPGRADE:
	case LK_DOWNGRADE:
		return (0);
	case LK_RELEASE:
	default:
		panic("vop_nolock: bad operation %d", flags & LK_TYPE_MASK);
	}
	if (flags & LK_INTERLOCK)
		vnflags |= LK_INTERLOCK;
	return(lockmgr(vp->v_vnlock, vnflags, &vp->v_interlock, ap->a_p));
#else /* for now */
	/*
	 * Since we are not using the lock manager, we must clear
	 * the interlock here.
	 */
	if (ap->a_flags & LK_INTERLOCK) {
		simple_unlock(&ap->a_vp->v_interlock);
	}
	return (0);
#endif
}

/*
 * Do the inverse of vop_nolock, handling the interlock in a compatible way.
 */
int
vop_nounlock(ap)
	struct vop_unlock_args /* {
		struct vnode *a_vp;
		int a_flags;
		struct proc *a_p;
	} */ *ap;
{
	struct vnode *vp = ap->a_vp;

	if (vp->v_vnlock == NULL) {
		if (ap->a_flags & LK_INTERLOCK)
			simple_unlock(&ap->a_vp->v_interlock);
		return (0);
	}
	return (lockmgr(vp->v_vnlock, LK_RELEASE | ap->a_flags,
	    &ap->a_vp->v_interlock, ap->a_p));
}

/*
 * Return whether or not the node is in use.
+ */ +int +vop_noislocked(ap) + struct vop_islocked_args /* { + struct vnode *a_vp; + } */ *ap; +{ + struct vnode *vp = ap->a_vp; + + if (vp->v_vnlock == NULL) + return (0); + return (lockstatus(vp->v_vnlock)); +} + +/* #ifdef DIAGNOSTIC */ +/* + * Vnode reference, just increment the count + */ +void +vref(vp) + struct vnode *vp; +{ + simple_lock(&vp->v_interlock); + if (vp->v_usecount <= 0) + panic("vref used where vget required"); + + vp->v_usecount++; + + if ((vp->v_type == VREG) && + ((vp->v_object == NULL) || + ((vp->v_object->flags & OBJ_VFS_REF) == 0)) ) { + /* + * We need to lock to VP during the time that + * the object is created. This is necessary to + * keep the system from re-entrantly doing it + * multiple times. + * XXX vfs_object_create probably needs the interlock? + */ + simple_unlock(&vp->v_interlock); + vfs_object_create(vp, curproc, curproc->p_ucred, 0); + return; + } + simple_unlock(&vp->v_interlock); +} + +/* + * Vnode put/release. + * If count drops to zero, call inactive routine and return to freelist. 
+ */ +void +vputrele(vp, put) + struct vnode *vp; + int put; +{ + struct proc *p = curproc; /* XXX */ + +#ifdef DIAGNOSTIC + if (vp == NULL) + panic("vputrele: null vp"); +#endif + simple_lock(&vp->v_interlock); + vp->v_usecount--; + + if ((vp->v_usecount == 1) && + vp->v_object && + (vp->v_object->flags & OBJ_VFS_REF)) { + vp->v_object->flags &= ~OBJ_VFS_REF; + if (put) { + VOP_UNLOCK(vp, LK_INTERLOCK, p); + } else { + simple_unlock(&vp->v_interlock); + } + vm_object_deallocate(vp->v_object); + return; + } + + if (vp->v_usecount > 0) { + if (put) { + VOP_UNLOCK(vp, LK_INTERLOCK, p); + } else { + simple_unlock(&vp->v_interlock); + } + return; + } + + if (vp->v_usecount < 0) { +#ifdef DIAGNOSTIC + vprint("vputrele: negative ref count", vp); +#endif + panic("vputrele: negative ref cnt"); + } + simple_lock(&vnode_free_list_slock); + if (vp->v_flag & VAGE) { + vp->v_flag &= ~VAGE; + vp->v_usage = 0; + if(vp->v_tag != VT_TFS) + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + if(vp->v_tag != VT_TFS) + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + freevnodes++; + simple_unlock(&vnode_free_list_slock); + + /* + * If we are doing a vput, the node is already locked, and we must + * call VOP_INACTIVE with the node locked. So, in the case of + * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. + */ + if (put) { + simple_unlock(&vp->v_interlock); + VOP_INACTIVE(vp, p); + } else if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { + VOP_INACTIVE(vp, p); + } +} + +/* + * vput(), just unlock and vrele() + */ +void +vput(vp) + struct vnode *vp; +{ + vputrele(vp, 1); +} + +void +vrele(vp) + struct vnode *vp; +{ + vputrele(vp, 0); +} + +#ifdef DIAGNOSTIC +/* + * Page or buffer structure gets a reference. + */ +void +vhold(vp) + register struct vnode *vp; +{ + + simple_lock(&vp->v_interlock); + vp->v_holdcnt++; + simple_unlock(&vp->v_interlock); +} + +/* + * Page or buffer structure frees a reference. 
 */
void
holdrele(vp)
	register struct vnode *vp;
{

	simple_lock(&vp->v_interlock);
	if (vp->v_holdcnt <= 0)
		panic("holdrele: holdcnt");
	vp->v_holdcnt--;
	simple_unlock(&vp->v_interlock);
}
#endif /* DIAGNOSTIC */

/*
 * Remove any vnodes in the vnode table belonging to mount point mp.
 *
 * If MNT_NOFORCE is specified, there should not be any active ones,
 * return error if any are found (nb: this is a user error, not a
 * system error). If MNT_FORCE is specified, detach any active vnodes
 * that are found.
 */
#ifdef DIAGNOSTIC
static int busyprt = 0;		/* print out busy vnodes */
SYSCTL_INT(_debug, 1, busyprt, CTLFLAG_RW, &busyprt, 0, "");
#endif

/*
 * Flush vnodes for mount point mp, skipping skipvp if non-NULL.
 * flags: SKIPSYSTEM skips VSYSTEM vnodes, WRITECLOSE only flushes
 * regular files open for writing, FORCECLOSE kills active vnodes.
 * Returns EBUSY if any busy vnodes remain, else 0.
 */
int
vflush(mp, skipvp, flags)
	struct mount *mp;
	struct vnode *skipvp;
	int flags;
{
	struct proc *p = curproc;	/* XXX */
	struct vnode *vp, *nvp;
	int busy = 0;

	simple_lock(&mntvnode_slock);
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) {
		/*
		 * Make sure this vnode wasn't reclaimed in getnewvnode().
		 * Start over if it has (it won't be on the list anymore).
		 */
		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		/*
		 * Skip over a selected vnode.
		 */
		if (vp == skipvp)
			continue;

		simple_lock(&vp->v_interlock);
		/*
		 * Skip over a vnodes marked VSYSTEM.
		 */
		if ((flags & SKIPSYSTEM) && (vp->v_flag & VSYSTEM)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}
		/*
		 * If WRITECLOSE is set, only flush out regular file vnodes
		 * open for writing.
		 */
		if ((flags & WRITECLOSE) &&
		    (vp->v_writecount == 0 || vp->v_type != VREG)) {
			simple_unlock(&vp->v_interlock);
			continue;
		}

		/* Release the filesystem's reference on the VM object. */
		if (vp->v_object && (vp->v_object->flags & OBJ_VFS_REF)) {
			simple_unlock(&vp->v_interlock);
			simple_unlock(&mntvnode_slock);
			vm_object_reference(vp->v_object);
			pager_cache(vp->v_object, FALSE);
			vp->v_object->flags &= ~OBJ_VFS_REF;
			vm_object_deallocate(vp->v_object);
			simple_lock(&mntvnode_slock);
			simple_lock(&vp->v_interlock);
		}

		/*
		 * With v_usecount == 0, all we need to do is clear out the
		 * vnode data structures and we are done.
		 */
		if (vp->v_usecount == 0) {
			simple_unlock(&mntvnode_slock);
			vgonel(vp, p);
			simple_lock(&mntvnode_slock);
			continue;
		}

		/*
		 * If FORCECLOSE is set, forcibly close the vnode. For block
		 * or character devices, revert to an anonymous device. For
		 * all other files, just kill them.
		 */
		if (flags & FORCECLOSE) {
			simple_unlock(&mntvnode_slock);
			if (vp->v_type != VBLK && vp->v_type != VCHR) {
				vgonel(vp, p);
			} else {
				vclean(vp, 0, p);
				vp->v_op = spec_vnodeop_p;
				insmntque(vp, (struct mount *) 0);
			}
			simple_lock(&mntvnode_slock);
			continue;
		}
#ifdef DIAGNOSTIC
		if (busyprt)
			vprint("vflush: busy vnode", vp);
#endif
		simple_unlock(&vp->v_interlock);
		busy++;
	}
	simple_unlock(&mntvnode_slock);
	if (busy)
		return (EBUSY);
	return (0);
}

/*
 * Disassociate the underlying file system from a vnode.
 * Called with the vnode interlock held; DOCLOSE in flags also
 * flushes buffers and closes active vnodes.
 */
static void
vclean(struct vnode *vp, int flags, struct proc *p)
{
	int active;

	/*
	 * Check to see if the vnode is in use. If so we have to reference it
	 * before we clean it out so that its count cannot fall to zero and
	 * generate a race against ourselves to recycle it.
	 */
	if ((active = vp->v_usecount))
		vp->v_usecount++;
	/*
	 * Prevent the vnode from being recycled or brought into use while we
	 * clean it out.
	 */
	if (vp->v_flag & VXLOCK)
		panic("vclean: deadlock");
	vp->v_flag |= VXLOCK;
	/*
	 * Even if the count is zero, the VOP_INACTIVE routine may still
	 * have the object locked while it cleans it out. The VOP_LOCK
	 * ensures that the VOP_INACTIVE routine is done with its work.
	 * For active vnodes, it ensures that no other activity can
	 * occur while the underlying object is being cleaned out.
	 */
	VOP_LOCK(vp, LK_DRAIN | LK_INTERLOCK, p);
	/*
	 * Clean out any buffers associated with the vnode.
	 */
	if (flags & DOCLOSE)
		vinvalbuf(vp, V_SAVE, NOCRED, p, 0, 0);
	/*
	 * If purging an active vnode, it must be closed and
	 * deactivated before being reclaimed. Note that the
	 * VOP_INACTIVE will unlock the vnode.
	 */
	if (active) {
		if (flags & DOCLOSE)
			VOP_CLOSE(vp, IO_NDELAY, NOCRED, p);
		VOP_INACTIVE(vp, p);
	} else {
		/*
		 * Any other processes trying to obtain this lock must first
		 * wait for VXLOCK to clear, then call the new lock operation.
		 */
		VOP_UNLOCK(vp, 0, p);
	}
	/*
	 * Reclaim the vnode.
	 */
	if (VOP_RECLAIM(vp, p))
		panic("vclean: cannot reclaim");
	if (active)
		vrele(vp);
	cache_purge(vp);
	if (vp->v_vnlock) {
		if ((vp->v_vnlock->lk_flags & LK_DRAINED) == 0)
			vprint("vclean: lock not drained", vp);
		FREE(vp->v_vnlock, M_VNODE);
		vp->v_vnlock = NULL;
	}

	/*
	 * Done with purge, notify sleepers of the grim news.
	 */
	vp->v_op = dead_vnodeop_p;
	vp->v_tag = VT_NON;
	vp->v_flag &= ~VXLOCK;
	if (vp->v_flag & VXWANT) {
		vp->v_flag &= ~VXWANT;
		wakeup((caddr_t) vp);
	}
}

/*
 * Eliminate all activity associated with the requested vnode
 * and with all vnodes aliased to the requested vnode.
+ */ +int +vop_revoke(ap) + struct vop_revoke_args /* { + struct vnode *a_vp; + int a_flags; + } */ *ap; +{ + struct vnode *vp, *vq; + struct proc *p = curproc; /* XXX */ + +#ifdef DIAGNOSTIC + if ((ap->a_flags & REVOKEALL) == 0) + panic("vop_revoke"); +#endif + + vp = ap->a_vp; + simple_lock(&vp->v_interlock); + + if (vp->v_flag & VALIASED) { + /* + * If a vgone (or vclean) is already in progress, + * wait until it is done and return. + */ + if (vp->v_flag & VXLOCK) { + vp->v_flag |= VXWANT; + simple_unlock(&vp->v_interlock); + tsleep((caddr_t)vp, PINOD, "vop_revokeall", 0); + return (0); + } + /* + * Ensure that vp will not be vgone'd while we + * are eliminating its aliases. + */ + vp->v_flag |= VXLOCK; + simple_unlock(&vp->v_interlock); + while (vp->v_flag & VALIASED) { + simple_lock(&spechash_slock); + for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) { + if (vq->v_rdev != vp->v_rdev || + vq->v_type != vp->v_type || vp == vq) + continue; + simple_unlock(&spechash_slock); + vgone(vq); + break; + } + if (vq == NULLVP) { + simple_unlock(&spechash_slock); + } + } + /* + * Remove the lock so that vgone below will + * really eliminate the vnode after which time + * vgone will awaken any sleepers. + */ + simple_lock(&vp->v_interlock); + vp->v_flag &= ~VXLOCK; + } + vgonel(vp, p); + return (0); +} + +/* + * Recycle an unused vnode to the front of the free list. + * Release the passed interlock if the vnode will be recycled. + */ +int +vrecycle(vp, inter_lkp, p) + struct vnode *vp; + struct simplelock *inter_lkp; + struct proc *p; +{ + + simple_lock(&vp->v_interlock); + if (vp->v_usecount == 0) { + if (inter_lkp) { + simple_unlock(inter_lkp); + } + vgonel(vp, p); + return (1); + } + simple_unlock(&vp->v_interlock); + return (0); +} + +/* + * Eliminate all activity associated with a vnode + * in preparation for reuse. 
 */
void
vgone(vp)
	register struct vnode *vp;
{
	struct proc *p = curproc;	/* XXX */

	simple_lock(&vp->v_interlock);
	vgonel(vp, p);
}

/*
 * vgone, with the vp interlock held.
 * Leaves the vnode typed VBAD at the head of the free list,
 * ready for immediate reuse by getnewvnode().
 */
void
vgonel(vp, p)
	struct vnode *vp;
	struct proc *p;
{
	struct vnode *vq;
	struct vnode *vx;

	/*
	 * If a vgone (or vclean) is already in progress,
	 * wait until it is done and return.
	 */
	if (vp->v_flag & VXLOCK) {
		vp->v_flag |= VXWANT;
		simple_unlock(&vp->v_interlock);
		tsleep((caddr_t)vp, PINOD, "vgone", 0);
		return;
	}

	/* Tell the VM layer the backing vnode is going away. */
	if (vp->v_object) {
		vp->v_object->flags |= OBJ_VNODE_GONE;
	}

	/*
	 * Clean out the filesystem specific data.
	 */
	vclean(vp, DOCLOSE, p);
	/*
	 * Delete from old mount point vnode list, if on one.
	 */
	if (vp->v_mount != NULL)
		insmntque(vp, (struct mount *)0);
	/*
	 * If special device, remove it from special device alias list
	 * if it is on one.
	 */
	if ((vp->v_type == VBLK || vp->v_type == VCHR) && vp->v_specinfo != 0) {
		simple_lock(&spechash_slock);
		if (*vp->v_hashchain == vp) {
			*vp->v_hashchain = vp->v_specnext;
		} else {
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_specnext != vp)
					continue;
				vq->v_specnext = vp->v_specnext;
				break;
			}
			if (vq == NULL)
				panic("missing bdev");
		}
		if (vp->v_flag & VALIASED) {
			/*
			 * If only one other alias remains (vx found, loop
			 * fell off the end), it is no longer aliased.
			 */
			vx = NULL;
			for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
				if (vq->v_rdev != vp->v_rdev ||
				    vq->v_type != vp->v_type)
					continue;
				if (vx)
					break;
				vx = vq;
			}
			if (vx == NULL)
				panic("missing alias");
			if (vq == NULL)
				vx->v_flag &= ~VALIASED;
			vp->v_flag &= ~VALIASED;
		}
		simple_unlock(&spechash_slock);
		FREE(vp->v_specinfo, M_VNODE);
		vp->v_specinfo = NULL;
	}

	/*
	 * If it is on the freelist and not already at the head,
	 * move it to the head of the list.  The test of the back
	 * pointer and the reference count of zero is because
	 * it will be removed from the free list by getnewvnode,
	 * but will not have its reference count incremented until
	 * after calling vgone. If the reference count were
	 * incremented first, vgone would (incorrectly) try to
	 * close the previous instance of the underlying object.
	 * So, the back pointer is explicitly set to `0xdeadb' in
	 * getnewvnode after removing it from the freelist to ensure
	 * that we do not try to move it here.
	 */
	if (vp->v_usecount == 0) {
		simple_lock(&vnode_free_list_slock);
		if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) &&
		    vnode_free_list.tqh_first != vp) {
			TAILQ_REMOVE(&vnode_free_list, vp, v_freelist);
			TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist);
		}
		simple_unlock(&vnode_free_list_slock);
	}

	vp->v_type = VBAD;
}

/*
 * Lookup a vnode by device number.
 * Returns 1 and sets *vpp on a match, 0 otherwise.
 */
int
vfinddev(dev, type, vpp)
	dev_t dev;
	enum vtype type;
	struct vnode **vpp;
{
	register struct vnode *vp;
	int rc = 0;

	simple_lock(&spechash_slock);
	for (vp = speclisth[SPECHASH(dev)]; vp; vp = vp->v_specnext) {
		if (dev != vp->v_rdev || type != vp->v_type)
			continue;
		*vpp = vp;
		rc = 1;
		break;
	}
	simple_unlock(&spechash_slock);
	return (rc);
}

/*
 * Calculate the total number of references to a special device,
 * summing v_usecount over all aliases of the device.
 */
int
vcount(vp)
	register struct vnode *vp;
{
	struct vnode *vq, *vnext;
	int count;

loop:
	if ((vp->v_flag & VALIASED) == 0)
		return (vp->v_usecount);
	simple_lock(&spechash_slock);
	for (count = 0, vq = *vp->v_hashchain; vq; vq = vnext) {
		vnext = vq->v_specnext;
		if (vq->v_rdev != vp->v_rdev || vq->v_type != vp->v_type)
			continue;
		/*
		 * Alias, but not in use, so flush it out.
		 */
		if (vq->v_usecount == 0 && vq != vp) {
			simple_unlock(&spechash_slock);
			vgone(vq);
			goto loop;
		}
		count += vq->v_usecount;
	}
	simple_unlock(&spechash_slock);
	return (count);
}

/*
 * Print out a description of a vnode.
 */
static char *typename[] =
{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"};

void
vprint(label, vp)
	char *label;
	register struct vnode *vp;
{
	char buf[64];

	if (label != NULL)
		printf("%s: ", label);
	printf("type %s, usecount %d, writecount %d, refcount %ld,",
	    typename[vp->v_type], vp->v_usecount, vp->v_writecount,
	    vp->v_holdcnt);
	buf[0] = '\0';
	/* Collect flag names; &buf[1] below skips the leading '|'. */
	if (vp->v_flag & VROOT)
		strcat(buf, "|VROOT");
	if (vp->v_flag & VTEXT)
		strcat(buf, "|VTEXT");
	if (vp->v_flag & VSYSTEM)
		strcat(buf, "|VSYSTEM");
	if (vp->v_flag & VXLOCK)
		strcat(buf, "|VXLOCK");
	if (vp->v_flag & VXWANT)
		strcat(buf, "|VXWANT");
	if (vp->v_flag & VBWAIT)
		strcat(buf, "|VBWAIT");
	if (vp->v_flag & VALIASED)
		strcat(buf, "|VALIASED");
	if (buf[0] != '\0')
		printf(" flags (%s)", &buf[1]);
	if (vp->v_data == NULL) {
		printf("\n");
	} else {
		printf("\n\t");
		VOP_PRINT(vp);
	}
}

#ifdef DDB
/*
 * List all of the locked vnodes in the system.
 * Called when debugging the kernel.
 */
void
printlockedvnodes()
{
	struct proc *p = curproc;	/* XXX */
	struct mount *mp, *nmp;
	struct vnode *vp;

	printf("Locked vnodes\n");
	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		for (vp = mp->mnt_vnodelist.lh_first;
		    vp != NULL;
		    vp = vp->v_mntvnodes.le_next) {
			if (VOP_ISLOCKED(vp))
				vprint((char *)0, vp);
		}
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
}
#endif

/*
 * Top level filesystem related information gathering.
+ */ +static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); + +static int +vfs_sysctl SYSCTL_HANDLER_ARGS +{ + int *name = (int *)arg1 - 1; /* XXX */ + u_int namelen = arg2 + 1; /* XXX */ + struct vfsconf *vfsp; + +#ifndef NO_COMPAT_PRELITE2 + /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. */ + if (namelen == 1) + return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); +#endif + +#ifdef notyet + /* all sysctl names at this level are at least name and field */ + if (namelen < 2) + return (ENOTDIR); /* overloaded */ + if (name[0] != VFS_GENERIC) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[0]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, + oldp, oldlenp, newp, newlen, p)); + } +#endif + switch (name[1]) { + case VFS_MAXTYPENUM: + if (namelen != 2) + return (ENOTDIR); + return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); + case VFS_CONF: + if (namelen != 3) + return (ENOTDIR); /* overloaded */ + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == name[2]) + break; + if (vfsp == NULL) + return (EOPNOTSUPP); + return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); + } + return (EOPNOTSUPP); +} + +SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, + "Generic filesystem"); + +#ifndef NO_COMPAT_PRELITE2 + +static int +sysctl_ovfs_conf SYSCTL_HANDLER_ARGS +{ + int error; + struct vfsconf *vfsp; + struct ovfsconf ovfs; + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ + strcpy(ovfs.vfc_name, vfsp->vfc_name); + ovfs.vfc_index = vfsp->vfc_typenum; + ovfs.vfc_refcount = vfsp->vfc_refcount; + ovfs.vfc_flags = vfsp->vfc_flags; + error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); + if (error) + return error; + } + return 0; +} + +#endif /* !NO_COMPAT_PRELITE2 */ + +int kinfo_vdebug = 1; +int kinfo_vgetfailed; + +#define KINFO_VNODESLOP 10 +/* + * Dump vnode list (via sysctl). 
+ * Copyout address of vnode followed by vnode. + */ +/* ARGSUSED */ +static int +sysctl_vnode SYSCTL_HANDLER_ARGS +{ + struct proc *p = curproc; /* XXX */ + struct mount *mp, *nmp; + struct vnode *nvp, *vp; + int error; + +#define VPTRSZ sizeof (struct vnode *) +#define VNODESZ sizeof (struct vnode) + + req->lock = 0; + if (!req->oldptr) /* Make an estimate */ + return (SYSCTL_OUT(req, 0, + (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); + + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } +again: + simple_lock(&mntvnode_slock); + for (vp = mp->mnt_vnodelist.lh_first; + vp != NULL; + vp = nvp) { + /* + * Check that the vp is still associated with + * this filesystem. RACE: could have been + * recycled onto the same filesystem. + */ + if (vp->v_mount != mp) { + simple_unlock(&mntvnode_slock); + if (kinfo_vdebug) + printf("kinfo: vp changed\n"); + goto again; + } + nvp = vp->v_mntvnodes.le_next; + simple_unlock(&mntvnode_slock); + if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || + (error = SYSCTL_OUT(req, vp, VNODESZ))) + return (error); + simple_lock(&mntvnode_slock); + } + simple_unlock(&mntvnode_slock); + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + + return (0); +} + +SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_vnode, "S,vnode", ""); + +/* + * Check to see if a filesystem is mounted on a block device. 
 */
/*
 * Returns EBUSY if the device vnode (or any alias of it) already has a
 * filesystem mounted on it, 0 otherwise.  Aliases are scanned under
 * spechash_slock; a matching alias must have the same rdev and type.
 */
int
vfs_mountedon(vp)
	struct vnode *vp;
{
	struct vnode *vq;
	int error = 0;

	if (vp->v_specflags & SI_MOUNTEDON)
		return (EBUSY);
	if (vp->v_flag & VALIASED) {
		simple_lock(&spechash_slock);
		for (vq = *vp->v_hashchain; vq; vq = vq->v_specnext) {
			if (vq->v_rdev != vp->v_rdev ||
			    vq->v_type != vp->v_type)
				continue;
			if (vq->v_specflags & SI_MOUNTEDON) {
				error = EBUSY;
				break;
			}
		}
		simple_unlock(&spechash_slock);
	}
	return (error);
}

/*
 * Unmount all filesystems. The list is traversed in reverse order
 * of mounting to avoid dependencies.
 */
void
vfs_unmountall()
{
	struct mount *mp, *nmp;
	struct proc *p = initproc;	/* XXX XXX should this be proc0? */
	int error;

	/*
	 * Since this only runs when rebooting, it is not interlocked.
	 */
	for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) {
		/* Save the predecessor now: dounmount() frees mp. */
		nmp = mp->mnt_list.cqe_prev;
		error = dounmount(mp, MNT_FORCE, p);
		if (error) {
			/* Best effort only: report the failure and go on. */
			printf("unmount of %s failed (",
			    mp->mnt_stat.f_mntonname);
			if (error == EBUSY)
				printf("BUSY)\n");
			else
				printf("%d)\n", error);
		}
	}
}

/*
 * Build hash lists of net addresses and hang them off the mount point.
 * Called by ufs_mount() to set up the lists of export addresses.
+ */ +static int +vfs_hang_addrlist(struct mount *mp, struct netexport *nep, + struct export_args *argp) +{ + register struct netcred *np; + register struct radix_node_head *rnh; + register int i; + struct radix_node *rn; + struct sockaddr *saddr, *smask = 0; + struct domain *dom; + int error; + + if (argp->ex_addrlen == 0) { + if (mp->mnt_flag & MNT_DEFEXPORTED) + return (EPERM); + np = &nep->ne_defexported; + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + mp->mnt_flag |= MNT_DEFEXPORTED; + return (0); + } + i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; + np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); + bzero((caddr_t) np, i); + saddr = (struct sockaddr *) (np + 1); + if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) + goto out; + if (saddr->sa_len > argp->ex_addrlen) + saddr->sa_len = argp->ex_addrlen; + if (argp->ex_masklen) { + smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); + error = copyin(argp->ex_addr, (caddr_t) smask, argp->ex_masklen); + if (error) + goto out; + if (smask->sa_len > argp->ex_masklen) + smask->sa_len = argp->ex_masklen; + } + i = saddr->sa_family; + if ((rnh = nep->ne_rtable[i]) == 0) { + /* + * Seems silly to initialize every AF when most are not used, + * do so on demand here + */ + for (dom = domains; dom; dom = dom->dom_next) + if (dom->dom_family == i && dom->dom_rtattach) { + dom->dom_rtattach((void **) &nep->ne_rtable[i], + dom->dom_rtoffset); + break; + } + if ((rnh = nep->ne_rtable[i]) == 0) { + error = ENOBUFS; + goto out; + } + } + rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, + np->netc_rnodes); + if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ + error = EPERM; + goto out; + } + np->netc_exflags = argp->ex_flags; + np->netc_anon = argp->ex_anon; + np->netc_anon.cr_ref = 1; + return (0); +out: + free(np, M_NETADDR); + return (error); +} + +/* ARGSUSED */ +static int 
+vfs_free_netcred(struct radix_node *rn, void *w) +{ + register struct radix_node_head *rnh = (struct radix_node_head *) w; + + (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); + free((caddr_t) rn, M_NETADDR); + return (0); +} + +/* + * Free the net address hash lists that are hanging off the mount points. + */ +static void +vfs_free_addrlist(struct netexport *nep) +{ + register int i; + register struct radix_node_head *rnh; + + for (i = 0; i <= AF_MAX; i++) + if ((rnh = nep->ne_rtable[i])) { + (*rnh->rnh_walktree) (rnh, vfs_free_netcred, + (caddr_t) rnh); + free((caddr_t) rnh, M_RTABLE); + nep->ne_rtable[i] = 0; + } +} + +int +vfs_export(mp, nep, argp) + struct mount *mp; + struct netexport *nep; + struct export_args *argp; +{ + int error; + + if (argp->ex_flags & MNT_DELEXPORT) { + vfs_free_addrlist(nep); + mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); + } + if (argp->ex_flags & MNT_EXPORTED) { + if ((error = vfs_hang_addrlist(mp, nep, argp))) + return (error); + mp->mnt_flag |= MNT_EXPORTED; + } + return (0); +} + +struct netcred * +vfs_export_lookup(mp, nep, nam) + register struct mount *mp; + struct netexport *nep; + struct mbuf *nam; +{ + register struct netcred *np; + register struct radix_node_head *rnh; + struct sockaddr *saddr; + + np = NULL; + if (mp->mnt_flag & MNT_EXPORTED) { + /* + * Lookup in the export list first. + */ + if (nam != NULL) { + saddr = mtod(nam, struct sockaddr *); + rnh = nep->ne_rtable[saddr->sa_family]; + if (rnh != NULL) { + np = (struct netcred *) + (*rnh->rnh_matchaddr)((caddr_t)saddr, + rnh); + if (np && np->netc_rnodes->rn_flags & RNF_ROOT) + np = NULL; + } + } + /* + * If no address match, use the default if it exists. + */ + if (np == NULL && mp->mnt_flag & MNT_DEFEXPORTED) + np = &nep->ne_defexported; + } + return (np); +} + +/* + * perform msync on all vnodes under a mount point + * the mount point must be locked. 
 */
/*
 * Push dirty VM pages of every vnode on the mount to their backing
 * store.  With flags != MNT_WAIT, locked vnodes are skipped rather
 * than waited for.  Restarts the scan if a vnode migrates off this
 * mount (same race noted in sysctl_vnode above).
 */
void
vfs_msync(struct mount *mp, int flags) {
	struct vnode *vp, *nvp;
loop:
	for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) {

		if (vp->v_mount != mp)
			goto loop;
		nvp = vp->v_mntvnodes.le_next;
		if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT))
			continue;
		if (vp->v_object &&
		   (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) {
			vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE);
		}
	}
}

/*
 * Create the VM object needed for VMIO and mmap support.  This
 * is done for all VREG files in the system.  Some filesystems might
 * afford the additional metadata buffering capability of the
 * VMIO code by making the device node be VMIO mode also.
 */
int
vfs_object_create(vp, p, cred, waslocked)
	struct vnode *vp;
	struct proc *p;
	struct ucred *cred;
	int waslocked;
{
	struct vattr vat;
	vm_object_t object;
	int error = 0;

retry:
	if ((object = vp->v_object) == NULL) {
		if (vp->v_type == VREG) {
			/* Size the object from the file's current length. */
			if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0)
				goto retn;
			(void) vnode_pager_alloc(vp,
				OFF_TO_IDX(round_page(vat.va_size)), 0, 0);
		} else {
			/*
			 * This simply allocates the biggest object possible
			 * for a VBLK vnode.  This should be fixed, but doesn't
			 * cause any problems (yet).
			 */
			(void) vnode_pager_alloc(vp, INT_MAX, 0, 0);
		}
		vp->v_object->flags |= OBJ_VFS_REF;
	} else {
		if (object->flags & OBJ_DEAD) {
			/*
			 * Object is being torn down: drop the vnode lock (if
			 * we hold it), wait for teardown, and start over.
			 */
			if (waslocked)
				VOP_UNLOCK(vp, 0, p);
			tsleep(object, PVM, "vodead", 0);
			if (waslocked)
				vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p);
			goto retry;
		}
		/* Take the VFS reference exactly once per object. */
		if ((object->flags & OBJ_VFS_REF) == 0) {
			object->flags |= OBJ_VFS_REF;
			vm_object_reference(object);
		}
	}
	if (vp->v_object)
		vp->v_flag |= VVMIO;

retn:
	return error;
}
diff --git a/sys/kern/vfs_extattr.c b/sys/kern/vfs_extattr.c
new file mode 100644
index 0000000..2997fe5
--- /dev/null
+++ b/sys/kern/vfs_extattr.c
@@ -0,0 +1,2756 @@
/*
 * Copyright (c) 1989, 1993
 *	The Regents of the University of California.  All rights reserved.
+ * (c) UNIX System Laboratories, Inc. + * All or some portions of this file are derived from material licensed + * to the University of California by American Telephone and Telegraph + * Co. or Unix System Laboratories, Inc. and are reproduced herein with + * the permission of UNIX System Laboratories, Inc. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + * $Id: vfs_syscalls.c,v 1.60 1997/03/23 03:36:35 bde Exp $ + */ + +/* + * XXX - The following is required because of some magic done + * in getdirentries() below which is only done if the translucent + * filesystem `UNION' is compiled into the kernel. This is broken, + * but I don't have time to study the code deeply enough to understand + * what's going on and determine an appropriate fix. -GAW + */ +#include "opt_union.h" + +#include <sys/param.h> +#include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> +#include <sys/namei.h> +#include <sys/filedesc.h> +#include <sys/kernel.h> +#include <sys/fcntl.h> +#include <sys/file.h> +#include <sys/stat.h> +#include <sys/unistd.h> +#include <sys/vnode.h> +#include <sys/mount.h> +#include <sys/proc.h> +#include <sys/uio.h> +#include <sys/malloc.h> +#include <sys/dirent.h> + +#ifdef UNION +#include <miscfs/union/union.h> +#endif + +#include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <sys/sysctl.h> + +static int change_dir __P((struct nameidata *ndp, struct proc *p)); +static void checkdirs __P((struct vnode *olddp)); + +/* + * Virtual File System System Calls + */ + +/* + * Mount a file system. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif +/* ARGSUSED */ +int +mount(p, uap, retval) + struct proc *p; + register struct mount_args /* { + syscallarg(char *) type; + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(caddr_t) data; + } */ *uap; + register_t *retval; +{ + struct vnode *vp; + struct mount *mp; + struct vfsconf *vfsp; + int error, flag = 0; + struct vattr va; + u_long fstypenum; + struct nameidata nd; + char fstypename[MFSNAMELEN]; + + /* + * Get vnode to be covered + */ + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (SCARG(uap, flags) & MNT_UPDATE) { + if ((vp->v_flag & VROOT) == 0) { + vput(vp); + return (EINVAL); + } + mp = vp->v_mount; + flag = mp->mnt_flag; + /* + * We only allow the filesystem to be reloaded if it + * is currently mounted read-only. + */ + if ((SCARG(uap, flags) & MNT_RELOAD) && + ((mp->mnt_flag & MNT_RDONLY) == 0)) { + vput(vp); + return (EOPNOTSUPP); /* Needs translation */ + } + mp->mnt_flag |= + SCARG(uap, flags) & (MNT_RELOAD | MNT_FORCE | MNT_UPDATE); + /* + * Only root, or the user that did the original mount is + * permitted to update it. + */ + if (mp->mnt_stat.f_owner != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (vfs_busy(mp, LK_NOWAIT, 0, p)) { + vput(vp); + return (EBUSY); + } + VOP_UNLOCK(vp, 0, p); + goto update; + } + /* + * If the user is not root, ensure that they own the directory + * onto which we are attempting to mount. 
+ */ + if ((error = VOP_GETATTR(vp, &va, p->p_ucred, p)) || + (va.va_uid != p->p_ucred->cr_uid && + (error = suser(p->p_ucred, &p->p_acflag)))) { + vput(vp); + return (error); + } + /* + * Do not allow NFS export by non-root users. Silently + * enforce MNT_NOSUID and MNT_NODEV for non-root users. + */ + if (p->p_ucred->cr_uid != 0) { + if (SCARG(uap, flags) & MNT_EXPORTED) { + vput(vp); + return (EPERM); + } + SCARG(uap, flags) |= MNT_NOSUID | MNT_NODEV; + } + if (error = vinvalbuf(vp, V_SAVE, p->p_ucred, p, 0, 0)) + return (error); + if (vp->v_type != VDIR) { + vput(vp); + return (ENOTDIR); + } +#ifdef COMPAT_43 + /* + * Historically filesystem types were identified by number. If we + * get an integer for the filesystem type instead of a string, we + * check to see if it matches one of the historic filesystem types. + */ + fstypenum = (u_long)SCARG(uap, type); + if (fstypenum < maxvfsconf) { + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (vfsp->vfc_typenum == fstypenum) + break; + if (vfsp == NULL) { + vput(vp); + return (ENODEV); + } + strncpy(fstypename, vfsp->vfc_name, MFSNAMELEN); + } else +#endif /* COMPAT_43 */ + if (error = copyinstr(SCARG(uap, type), fstypename, MFSNAMELEN, NULL)) { + vput(vp); + return (error); + } + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) + if (!strcmp(vfsp->vfc_name, fstypename)) + break; + if (vfsp == NULL) { + vput(vp); + return (ENODEV); + } + if (vp->v_mountedhere != NULL) { + vput(vp); + return (EBUSY); + } + + /* + * Allocate and initialize the filesystem. 
+ */ + mp = (struct mount *)malloc((u_long)sizeof(struct mount), + M_MOUNT, M_WAITOK); + bzero((char *)mp, (u_long)sizeof(struct mount)); + lockinit(&mp->mnt_lock, PVFS, "vfslock", 0, 0); + (void)vfs_busy(mp, LK_NOWAIT, 0, p); + mp->mnt_op = vfsp->vfc_vfsops; + mp->mnt_vfc = vfsp; + vfsp->vfc_refcount++; + mp->mnt_stat.f_type = vfsp->vfc_typenum; + mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; + strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); + vp->v_mountedhere = mp; + mp->mnt_vnodecovered = vp; + mp->mnt_stat.f_owner = p->p_ucred->cr_uid; +update: + /* + * Set the mount level flags. + */ + if (SCARG(uap, flags) & MNT_RDONLY) + mp->mnt_flag |= MNT_RDONLY; + else if (mp->mnt_flag & MNT_RDONLY) + mp->mnt_flag |= MNT_WANTRDWR; + mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME); + mp->mnt_flag |= SCARG(uap, flags) & (MNT_NOSUID | MNT_NOEXEC | + MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE | + MNT_NOATIME); + /* + * Mount the filesystem. + */ + error = VFS_MOUNT(mp, SCARG(uap, path), SCARG(uap, data), &nd, p); + if (mp->mnt_flag & MNT_UPDATE) { + vrele(vp); + if (mp->mnt_flag & MNT_WANTRDWR) + mp->mnt_flag &= ~MNT_RDONLY; + mp->mnt_flag &=~ + (MNT_UPDATE | MNT_RELOAD | MNT_FORCE | MNT_WANTRDWR); + if (error) + mp->mnt_flag = flag; + vfs_unbusy(mp, p); + return (error); + } + /* + * Put the new filesystem on the mount list after root. 
+ */ + cache_purge(vp); + if (!error) { + simple_lock(&mountlist_slock); + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + simple_unlock(&mountlist_slock); + checkdirs(vp); + VOP_UNLOCK(vp, 0, p); + vfs_unbusy(mp, p); + if (error = VFS_START(mp, 0, p)) + vrele(vp); + } else { + mp->mnt_vnodecovered->v_mountedhere = (struct mount *)0; + mp->mnt_vfc->vfc_refcount--; + vfs_unbusy(mp, p); + free((caddr_t)mp, M_MOUNT); + vput(vp); + } + return (error); +} + +/* + * Scan all active processes to see if any of them have a current + * or root directory onto which the new filesystem has just been + * mounted. If so, replace them with the new mount point. + */ +static void +checkdirs(olddp) + struct vnode *olddp; +{ + struct filedesc *fdp; + struct vnode *newdp; + struct proc *p; + + if (olddp->v_usecount == 1) + return; + if (VFS_ROOT(olddp->v_mountedhere, &newdp)) + panic("mount: lost mount"); + for (p = allproc.lh_first; p != 0; p = p->p_list.le_next) { + fdp = p->p_fd; + if (fdp->fd_cdir == olddp) { + vrele(fdp->fd_cdir); + VREF(newdp); + fdp->fd_cdir = newdp; + } + if (fdp->fd_rdir == olddp) { + vrele(fdp->fd_rdir); + VREF(newdp); + fdp->fd_rdir = newdp; + } + } + if (rootvnode == olddp) { + vrele(rootvnode); + VREF(newdp); + rootvnode = newdp; + } + vput(newdp); +} + +/* + * Unmount a file system. + * + * Note: unmount takes a path to the vnode mounted on as argument, + * not special file (as before). 
 */
#ifndef _SYS_SYSPROTO_H_
struct unmount_args {
	char	*path;
	int	flags;
};
#endif
/*
 * unmount(2) system call: validate ownership and that the path names
 * the root of a mounted, non-root filesystem, then hand the real work
 * to dounmount().
 */
/* ARGSUSED */
int
unmount(p, uap, retval)
	struct proc *p;
	register struct unmount_args /* {
		syscallarg(char *) path;
		syscallarg(int) flags;
	} */ *uap;
	register_t *retval;
{
	register struct vnode *vp;
	struct mount *mp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE,
	    SCARG(uap, path), p);
	if (error = namei(&nd))
		return (error);
	vp = nd.ni_vp;
	mp = vp->v_mount;

	/*
	 * Only root, or the user that did the original mount is
	 * permitted to unmount this filesystem.
	 */
	if ((mp->mnt_stat.f_owner != p->p_ucred->cr_uid) &&
	    (error = suser(p->p_ucred, &p->p_acflag))) {
		vput(vp);
		return (error);
	}

	/*
	 * Don't allow unmounting the root file system.
	 */
	if (mp->mnt_flag & MNT_ROOTFS) {
		vput(vp);
		return (EINVAL);
	}

	/*
	 * Must be the root of the filesystem
	 */
	if ((vp->v_flag & VROOT) == 0) {
		vput(vp);
		return (EINVAL);
	}
	/* Drop the vnode before dounmount(); mp stays valid via mountlist. */
	vput(vp);
	return (dounmount(mp, SCARG(uap, flags), p));
}

/*
 * Do the actual file system unmount.
 */
/*
 * Core unmount path, shared by unmount(2) and vfs_unmountall().
 *
 * Drains the mount lock (LK_INTERLOCK hands off mountlist_slock to
 * lockmgr), flushes dirty pages and cached vnodes, then calls the
 * filesystem's VFS_UNMOUNT.  On success mp is removed from the mount
 * list and freed; on failure the mount lock is re-enabled and the
 * mount is left usable.  MNT_FORCE attempts the unmount even if the
 * pre-unmount VFS_SYNC failed.
 */
int
dounmount(mp, flags, p)
	register struct mount *mp;
	int flags;
	struct proc *p;
{
	struct vnode *coveredvp;
	int error;

	simple_lock(&mountlist_slock);
	mp->mnt_flag |= MNT_UNMOUNT;
	lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p);
	mp->mnt_flag &=~ MNT_ASYNC;
	vfs_msync(mp, MNT_NOWAIT);
	vnode_pager_umount(mp);	/* release cached vnodes */
	cache_purgevfs(mp);	/* remove cache entries for this file sys */
	if (((mp->mnt_flag & MNT_RDONLY) ||
	     (error = VFS_SYNC(mp, MNT_WAIT, p->p_ucred, p)) == 0) ||
	    (flags & MNT_FORCE))
		error = VFS_UNMOUNT(mp, flags, p);
	simple_lock(&mountlist_slock);
	if (error) {
		/* Unmount failed: re-enable the drained mount lock. */
		mp->mnt_flag &= ~MNT_UNMOUNT;
		lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK | LK_REENABLE,
		    &mountlist_slock, p);
		return (error);
	}
	CIRCLEQ_REMOVE(&mountlist, mp, mnt_list);
	if ((coveredvp = mp->mnt_vnodecovered) != NULLVP) {
		coveredvp->v_mountedhere = (struct mount *)0;
		vrele(coveredvp);
	}
	mp->mnt_vfc->vfc_refcount--;
	if (mp->mnt_vnodelist.lh_first != NULL)
		panic("unmount: dangling vnode");
	lockmgr(&mp->mnt_lock, LK_RELEASE | LK_INTERLOCK, &mountlist_slock, p);
	/* Wake anyone sleeping on the mount before it is freed. */
	if (mp->mnt_flag & MNT_MWAIT)
		wakeup((caddr_t)mp);
	free((caddr_t)mp, M_MOUNT);
	return (0);
}

/*
 * Sync each mounted filesystem.
 */
#ifndef _SYS_SYSPROTO_H_
struct sync_args {
        int     dummy;
};
#endif

#ifdef DEBUG
int syncprt = 0;
SYSCTL_INT(_debug, 0, syncprt, CTLFLAG_RW, &syncprt, 0, "");
#endif

/*
 * sync(2): walk the mount list and push dirty data of every writable
 * filesystem, without waiting for the writes (MNT_NOWAIT).  MNT_ASYNC
 * is temporarily cleared so the sync is issued synchronously-queued.
 */
/* ARGSUSED */
int
sync(p, uap, retval)
	struct proc *p;
	struct sync_args *uap;
	register_t *retval;
{
	register struct mount *mp, *nmp;
	int asyncflag;

	simple_lock(&mountlist_slock);
	for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) {
		if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) {
			nmp = mp->mnt_list.cqe_next;
			continue;
		}
		if ((mp->mnt_flag & MNT_RDONLY) == 0) {
			asyncflag = mp->mnt_flag & MNT_ASYNC;
			mp->mnt_flag &= ~MNT_ASYNC;
			vfs_msync(mp, MNT_NOWAIT);
			VFS_SYNC(mp, MNT_NOWAIT, p != NULL ? p->p_ucred : NOCRED, p);
			if (asyncflag)
				mp->mnt_flag |= MNT_ASYNC;
		}
		simple_lock(&mountlist_slock);
		nmp = mp->mnt_list.cqe_next;
		vfs_unbusy(mp, p);
	}
	simple_unlock(&mountlist_slock);
#if 0
/*
 * XXX don't call vfs_bufstats() yet because that routine
 * was not imported in the Lite2 merge.
 */
#ifdef DIAGNOSTIC
	if (syncprt)
		vfs_bufstats();
#endif /* DIAGNOSTIC */
#endif
	return (0);
}

/*
 * Change filesystem quotas.
 */
#ifndef _SYS_SYSPROTO_H_
struct quotactl_args {
	char *path;
	int cmd;
	int uid;
	caddr_t arg;
};
#endif
/*
 * quotactl(2): resolve the path to its mount point and forward the
 * quota command to the filesystem via VFS_QUOTACTL.
 */
/* ARGSUSED */
int
quotactl(p, uap, retval)
	struct proc *p;
	register struct quotactl_args /* {
		syscallarg(char *) path;
		syscallarg(int) cmd;
		syscallarg(int) uid;
		syscallarg(caddr_t) arg;
	} */ *uap;
	register_t *retval;
{
	register struct mount *mp;
	int error;
	struct nameidata nd;

	NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p);
	if (error = namei(&nd))
		return (error);
	mp = nd.ni_vp->v_mount;
	/* NOTE(review): mp is used after vrele(); assumes the mount cannot
	 * disappear here — historic behavior, unguarded. */
	vrele(nd.ni_vp);
	return (VFS_QUOTACTL(mp, SCARG(uap, cmd), SCARG(uap, uid),
	    SCARG(uap, arg), p));
}

/*
 * Get filesystem statistics.
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +statfs(p, uap, retval) + struct proc *p; + register struct statfs_args /* { + syscallarg(char *) path; + syscallarg(struct statfs *) buf; + } */ *uap; + register_t *retval; +{ + register struct mount *mp; + register struct statfs *sp; + int error; + struct nameidata nd; + struct statfs sb; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + mp = nd.ni_vp->v_mount; + sp = &mp->mnt_stat; + vrele(nd.ni_vp); + error = VFS_STATFS(mp, sp, p); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get filesystem statistics. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif +/* ARGSUSED */ +int +fstatfs(p, uap, retval) + struct proc *p; + register struct fstatfs_args /* { + syscallarg(int) fd; + syscallarg(struct statfs *) buf; + } */ *uap; + register_t *retval; +{ + struct file *fp; + struct mount *mp; + register struct statfs *sp; + int error; + struct statfs sb; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + mp = ((struct vnode *)fp->f_data)->v_mount; + sp = &mp->mnt_stat; + error = VFS_STATFS(mp, sp, p); + if (error) + return (error); + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } + return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); +} + +/* + * Get statistics on all filesystems. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int flags; +}; +#endif +int +getfsstat(p, uap, retval) + struct proc *p; + register struct getfsstat_args /* { + syscallarg(struct statfs *) buf; + syscallarg(long) bufsize; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + register struct mount *mp, *nmp; + register struct statfs *sp; + caddr_t sfsp; + long count, maxcount, error; + + maxcount = SCARG(uap, bufsize) / sizeof(struct statfs); + sfsp = (caddr_t)SCARG(uap, buf); + count = 0; + simple_lock(&mountlist_slock); + for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { + if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { + nmp = mp->mnt_list.cqe_next; + continue; + } + if (sfsp && count < maxcount) { + sp = &mp->mnt_stat; + /* + * If MNT_NOWAIT is specified, do not refresh the + * fsstat cache. MNT_WAIT overrides MNT_NOWAIT. + */ + if (((SCARG(uap, flags) & MNT_NOWAIT) == 0 || + (SCARG(uap, flags) & MNT_WAIT)) && + (error = VFS_STATFS(mp, sp, p))) { + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + continue; + } + sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + error = copyout((caddr_t)sp, sfsp, sizeof(*sp)); + if (error) { + vfs_unbusy(mp, p); + return (error); + } + sfsp += sizeof(*sp); + } + count++; + simple_lock(&mountlist_slock); + nmp = mp->mnt_list.cqe_next; + vfs_unbusy(mp, p); + } + simple_unlock(&mountlist_slock); + if (sfsp && count > maxcount) + *retval = maxcount; + else + *retval = count; + return (0); +} + +/* + * Change current working directory to a given file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fchdir(p, uap, retval) + struct proc *p; + struct fchdir_args /* { + syscallarg(int) fd; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + struct vnode *vp, *tdp; + struct mount *mp; + struct file *fp; + int error; + + if (error = getvnode(fdp, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VREF(vp); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + while (!error && (mp = vp->v_mountedhere) != NULL) { + if (vfs_busy(mp, 0, 0, p)) + continue; + error = VFS_ROOT(mp, &tdp); + vfs_unbusy(mp, p); + if (error) + break; + vput(vp); + vp = tdp; + } + if (error) { + vput(vp); + return (error); + } + VOP_UNLOCK(vp, 0, p); + vrele(fdp->fd_cdir); + fdp->fd_cdir = vp; + return (0); +} + +/* + * Change current working directory (``.''). + */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chdir(p, uap, retval) + struct proc *p; + struct chdir_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + vrele(fdp->fd_cdir); + fdp->fd_cdir = nd.ni_vp; + return (0); +} + +/* + * Change notion of root (``/'') directory. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +chroot(p, uap, retval) + struct proc *p; + struct chroot_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + int error; + struct nameidata nd; + + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = change_dir(&nd, p)) + return (error); + if (fdp->fd_rdir != NULL) + vrele(fdp->fd_rdir); + fdp->fd_rdir = nd.ni_vp; + return (0); +} + +/* + * Common routine for chroot and chdir. + */ +static int +change_dir(ndp, p) + register struct nameidata *ndp; + struct proc *p; +{ + struct vnode *vp; + int error; + + error = namei(ndp); + if (error) + return (error); + vp = ndp->ni_vp; + if (vp->v_type != VDIR) + error = ENOTDIR; + else + error = VOP_ACCESS(vp, VEXEC, p->p_ucred, p); + if (error) + vput(vp); + else + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Check permissions, allocate an open file structure, + * and call the device open routine if any. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif +int +open(p, uap, retval) + struct proc *p; + register struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + register struct vnode *vp; + int flags, cmode; + struct file *nfp; + int type, indx, error; + struct flock lf; + struct nameidata nd; + + error = falloc(p, &nfp, &indx); + if (error) + return (error); + fp = nfp; + flags = FFLAGS(SCARG(uap, flags)); + cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + p->p_dupfd = -indx - 1; /* XXX check for fdopen */ + error = vn_open(&nd, flags, cmode); + if (error) { + ffree(fp); + if ((error == ENODEV || error == ENXIO) && + p->p_dupfd >= 0 && /* XXX from fdopen */ + (error = + dupfdopen(fdp, indx, p->p_dupfd, flags, error)) == 0) { + *retval = indx; + return (0); + } + if (error == ERESTART) + error = EINTR; + fdp->fd_ofiles[indx] = NULL; + return (error); + } + p->p_dupfd = 0; + vp = nd.ni_vp; + + fp->f_flag = flags & FMASK; + fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); + fp->f_ops = &vnops; + fp->f_data = (caddr_t)vp; + if (flags & (O_EXLOCK | O_SHLOCK)) { + lf.l_whence = SEEK_SET; + lf.l_start = 0; + lf.l_len = 0; + if (flags & O_EXLOCK) + lf.l_type = F_WRLCK; + else + lf.l_type = F_RDLCK; + type = F_FLOCK; + if ((flags & FNONBLOCK) == 0) + type |= F_WAIT; + VOP_UNLOCK(vp, 0, p); + if (error = VOP_ADVLOCK(vp, (caddr_t)fp, F_SETLK, &lf, type)) { + (void) vn_close(vp, fp->f_flag, fp->f_cred, p); + ffree(fp); + fdp->fd_ofiles[indx] = NULL; + return (error); + } + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + fp->f_flag |= FHASLOCK; + } + VOP_UNLOCK(vp, 0, p); + *retval = indx; + return (0); +} + +#ifdef COMPAT_43 +/* + * Create a file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif +int +ocreat(p, uap, retval) + struct proc *p; + register struct ocreat_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + struct open_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + syscallarg(int) mode; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, mode) = SCARG(uap, mode); + SCARG(&nuap, flags) = O_WRONLY | O_CREAT | O_TRUNC; + return (open(p, &nuap, retval)); +} +#endif /* COMPAT_43 */ + +/* + * Create a special file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mknod_args { + char *path; + int mode; + int dev; +}; +#endif +/* ARGSUSED */ +int +mknod(p, uap, retval) + struct proc *p; + register struct mknod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + syscallarg(int) dev; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + int whiteout; + struct nameidata nd; + + error = suser(p->p_ucred, &p->p_acflag); + if (error) + return (error); + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) + error = EEXIST; + else { + VATTR_NULL(&vattr); + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + vattr.va_rdev = SCARG(uap, dev); + whiteout = 0; + + switch (SCARG(uap, mode) & S_IFMT) { + case S_IFMT: /* used by badsect to flag bad sectors */ + vattr.va_type = VBAD; + break; + case S_IFCHR: + vattr.va_type = VCHR; + break; + case S_IFBLK: + vattr.va_type = VBLK; + break; + case S_IFWHT: + whiteout = 1; + break; + default: + error = EINVAL; + break; + } + } + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (whiteout) { + error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, CREATE); + if (error) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + } else { + error = VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, + 
&nd.ni_cnd, &vattr); + } + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp) + vrele(vp); + } + return (error); +} + +/* + * Create a named pipe. + */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkfifo(p, uap, retval) + struct proc *p; + register struct mkfifo_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + if (nd.ni_vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VFIFO; + vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + return (VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr)); +} + +/* + * Make a hard file link. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +link(p, uap, retval) + struct proc *p; + register struct link_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct nameidata nd; + int error; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, link), p); + error = namei(&nd); + if (!error) { + if (nd.ni_vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, + LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); + } + } + } + vrele(vp); + return (error); +} + +/* + * Make a symbolic link. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif +/* ARGSUSED */ +int +symlink(p, uap, retval) + struct proc *p; + register struct symlink_args /* { + syscallarg(char *) path; + syscallarg(char *) link; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + char *path; + int error; + struct nameidata nd; + + MALLOC(path, char *, MAXPATHLEN, M_NAMEI, M_WAITOK); + if (error = copyinstr(SCARG(uap, path), path, MAXPATHLEN, NULL)) + goto out; + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, link), p); + if (error = namei(&nd)) + goto out; + if (nd.ni_vp) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(nd.ni_vp); + error = EEXIST; + goto out; + } + VATTR_NULL(&vattr); + vattr.va_mode = ACCESSPERMS &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_SYMLINK(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr, path); +out: + FREE(path, M_NAMEI); + return (error); +} + +/* + * Delete a whiteout from the filesystem. + */ +/* ARGSUSED */ +int +undelete(p, uap, retval) + struct proc *p; + register struct undelete_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT|DOWHITEOUT, UIO_USERSPACE, + SCARG(uap, path), p); + error = namei(&nd); + if (error) + return (error); + + if (nd.ni_vp != NULLVP || !(nd.ni_cnd.cn_flags & ISWHITEOUT)) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == nd.ni_vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (nd.ni_vp) + vrele(nd.ni_vp); + return (EEXIST); + } + + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (error = VOP_WHITEOUT(nd.ni_dvp, &nd.ni_cnd, DELETE)) + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + vput(nd.ni_dvp); + return (error); +} + +/* + * Delete a name from the filesystem. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +unlink(p, uap, retval) + struct proc *p; + struct unlink_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + /* + * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? + */ + if (vp->v_flag & VROOT) + error = EBUSY; + else + (void) vnode_pager_uncache(vp, p); + } + + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_REMOVE(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + if (vp != NULLVP) + vput(vp); + } + return (error); +} + +/* + * Reposition read/write file offset. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif +int +lseek(p, uap, retval) + struct proc *p; + register struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ *uap; + register_t *retval; /* XXX */ +{ + struct ucred *cred = p->p_ucred; + register struct filedesc *fdp = p->p_fd; + register struct file *fp; + struct vattr vattr; + int error; + + if ((u_int)SCARG(uap, fd) >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[SCARG(uap, fd)]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE) + return (ESPIPE); + switch (SCARG(uap, whence)) { + case L_INCR: + fp->f_offset += SCARG(uap, offset); + break; + case L_XTND: + error=VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p); + if (error) + return (error); + fp->f_offset = SCARG(uap, offset) + vattr.va_size; + break; + case L_SET: + fp->f_offset = SCARG(uap, offset); + break; + default: + return (EINVAL); + } + *(off_t *)retval = fp->f_offset; + return (0); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Reposition read/write file offset. + */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif +int +olseek(p, uap, retval) + struct proc *p; + register struct olseek_args /* { + syscallarg(int) fd; + syscallarg(long) offset; + syscallarg(int) whence; + } */ *uap; + register_t *retval; +{ + struct lseek_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) offset; + syscallarg(int) whence; + } */ nuap; + off_t qret; + int error; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, offset) = SCARG(uap, offset); + SCARG(&nuap, whence) = SCARG(uap, whence); + error = lseek(p, &nuap, (register_t *) &qret); + *(long *)retval = qret; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Check access permissions. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int flags; +}; +#endif +int +access(p, uap, retval) + struct proc *p; + register struct access_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + register struct ucred *cred = p->p_ucred; + register struct vnode *vp; + int error, flags, t_gid, t_uid; + struct nameidata nd; + + t_uid = cred->cr_uid; + t_gid = cred->cr_groups[0]; + cred->cr_uid = p->p_cred->p_ruid; + cred->cr_groups[0] = p->p_cred->p_rgid; + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + goto out1; + vp = nd.ni_vp; + + /* Flags == 0 means only check for existence. */ + if (SCARG(uap, flags)) { + flags = 0; + if (SCARG(uap, flags) & R_OK) + flags |= VREAD; + if (SCARG(uap, flags) & W_OK) + flags |= VWRITE; + if (SCARG(uap, flags) & X_OK) + flags |= VEXEC; + if ((flags & VWRITE) == 0 || (error = vn_writechk(vp)) == 0) + error = VOP_ACCESS(vp, flags, cred, p); + } + vput(vp); +out1: + cred->cr_uid = t_uid; + cred->cr_groups[0] = t_gid; + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +ostat(p, uap, retval) + struct proc *p; + register struct ostat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; + register_t *retval; +{ + struct stat sb; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Get file status; this version does not follow links. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif +/* ARGSUSED */ +int +olstat(p, uap, retval) + struct proc *p; + register struct olstat_args /* { + syscallarg(char *) path; + syscallarg(struct ostat *) ub; + } */ *uap; + register_t *retval; +{ + struct vnode *vp, *dvp; + struct stat sb, sb1; + struct ostat osb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKPARENT, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + /* + * For symbolic links, always return the attributes of its + * containing directory, except for mode, size, and links. + */ + vp = nd.ni_vp; + dvp = nd.ni_dvp; + if (vp->v_type != VLNK) { + if (dvp == vp) + vrele(dvp); + else + vput(dvp); + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + } else { + error = vn_stat(dvp, &sb, p); + vput(dvp); + if (error) { + vput(vp); + return (error); + } + error = vn_stat(vp, &sb1, p); + vput(vp); + if (error) + return (error); + sb.st_mode &= ~S_IFDIR; + sb.st_mode |= S_IFLNK; + sb.st_nlink = sb1.st_nlink; + sb.st_size = sb1.st_size; + sb.st_blocks = sb1.st_blocks; + } + cvtstat(&sb, &osb); + error = copyout((caddr_t)&osb, (caddr_t)SCARG(uap, ub), sizeof (osb)); + return (error); +} + +/* + * Convert from an old to a new stat structure. 
+ */ +void +cvtstat(st, ost) + struct stat *st; + struct ostat *ost; +{ + + ost->st_dev = st->st_dev; + ost->st_ino = st->st_ino; + ost->st_mode = st->st_mode; + ost->st_nlink = st->st_nlink; + ost->st_uid = st->st_uid; + ost->st_gid = st->st_gid; + ost->st_rdev = st->st_rdev; + if (st->st_size < (quad_t)1 << 32) + ost->st_size = st->st_size; + else + ost->st_size = -2; + ost->st_atime = st->st_atime; + ost->st_mtime = st->st_mtime; + ost->st_ctime = st->st_ctime; + ost->st_blksize = st->st_blksize; + ost->st_blocks = st->st_blocks; + ost->st_flags = st->st_flags; + ost->st_gen = st->st_gen; +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Get file status; this version follows links. + */ +#ifndef _SYS_SYSPROTO_H_ +struct stat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +stat(p, uap, retval) + struct proc *p; + register struct stat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; + register_t *retval; +{ + struct stat sb; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = vn_stat(nd.ni_vp, &sb, p); + vput(nd.ni_vp); + if (error) + return (error); + error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Get file status; this version does not follow links. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif +/* ARGSUSED */ +int +lstat(p, uap, retval) + struct proc *p; + register struct lstat_args /* { + syscallarg(char *) path; + syscallarg(struct stat *) ub; + } */ *uap; + register_t *retval; +{ + int error; + struct vnode *vp, *dvp; + struct stat sb, sb1; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF | LOCKPARENT, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + /* + * For symbolic links, always return the attributes of its containing + * directory, except for mode, size, inode number, and links. + */ + vp = nd.ni_vp; + dvp = nd.ni_dvp; + if (vp->v_type != VLNK) { + if (dvp == vp) + vrele(dvp); + else + vput(dvp); + error = vn_stat(vp, &sb, p); + vput(vp); + if (error) + return (error); + } else { + error = vn_stat(dvp, &sb, p); + vput(dvp); + if (error) { + vput(vp); + return (error); + } + error = vn_stat(vp, &sb1, p); + vput(vp); + if (error) + return (error); + sb.st_mode &= ~S_IFDIR; + sb.st_mode |= S_IFLNK; + sb.st_nlink = sb1.st_nlink; + sb.st_size = sb1.st_size; + sb.st_blocks = sb1.st_blocks; + sb.st_ino = sb1.st_ino; + } + error = copyout((caddr_t)&sb, (caddr_t)SCARG(uap, ub), sizeof (sb)); + return (error); +} + +/* + * Get configurable pathname variables. + */ +#ifndef _SYS_SYSPROTO_H_ +struct pathconf_args { + char *path; + int name; +}; +#endif +/* ARGSUSED */ +int +pathconf(p, uap, retval) + struct proc *p; + register struct pathconf_args /* { + syscallarg(char *) path; + syscallarg(int) name; + } */ *uap; + register_t *retval; +{ + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + error = VOP_PATHCONF(nd.ni_vp, SCARG(uap, name), retval); + vput(nd.ni_vp); + return (error); +} + +/* + * Return target name of a symbolic link. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct readlink_args { + char *path; + char *buf; + int count; +}; +#endif +/* ARGSUSED */ +int +readlink(p, uap, retval) + struct proc *p; + register struct readlink_args /* { + syscallarg(char *) path; + syscallarg(char *) buf; + syscallarg(int) count; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct iovec aiov; + struct uio auio; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, NOFOLLOW | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VLNK) + error = EINVAL; + else { + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_offset = 0; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + error = VOP_READLINK(vp, &auio, p->p_ucred); + } + vput(vp); + *retval = SCARG(uap, count) - auio.uio_resid; + return (error); +} + +/* + * Change flags of a file given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + char *path; + int flags; +}; +#endif +/* ARGSUSED */ +int +chflags(p, uap, retval) + struct proc *p; + register struct chflags_args /* { + syscallarg(char *) path; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_flags = SCARG(uap, flags); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + vput(vp); + return (error); +} + +/* + * Change flags of a file given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + int flags; +}; +#endif +/* ARGSUSED */ +int +fchflags(p, uap, retval) + struct proc *p; + register struct fchflags_args /* { + syscallarg(int) fd; + syscallarg(int) flags; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_flags = SCARG(uap, flags); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Change mode of a file given path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +chmod(p, uap, retval) + struct proc *p; + register struct chmod_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_mode = SCARG(uap, mode) & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + vput(vp); + return (error); +} + +/* + * Change mode of a file given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif +/* ARGSUSED */ +int +fchmod(p, uap, retval) + struct proc *p; + register struct fchmod_args /* { + syscallarg(int) fd; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_mode = SCARG(uap, mode) & ALLPERMS; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Set ownership given a path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +chown(p, uap, retval) + struct proc *p; + register struct chown_args /* { + syscallarg(char *) path; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_uid = SCARG(uap, uid); + vattr.va_gid = SCARG(uap, gid); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + vput(vp); + return (error); +} + +/* + * Set ownership given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct fchown_args { + int fd; + int uid; + int gid; +}; +#endif +/* ARGSUSED */ +int +fchown(p, uap, retval) + struct proc *p; + register struct fchown_args /* { + syscallarg(int) fd; + syscallarg(int) uid; + syscallarg(int) gid; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + VATTR_NULL(&vattr); + vattr.va_uid = SCARG(uap, uid); + vattr.va_gid = SCARG(uap, gid); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Set the access and modification times of a file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct utimes_args { + char *path; + struct timeval *tptr; +}; +#endif +/* ARGSUSED */ +int +utimes(p, uap, retval) + struct proc *p; + register struct utimes_args /* { + syscallarg(char *) path; + syscallarg(struct timeval *) tptr; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct timeval tv[2]; + struct vattr vattr; + int error; + struct nameidata nd; + + VATTR_NULL(&vattr); + if (SCARG(uap, tptr) == NULL) { + microtime(&tv[0]); + tv[1] = tv[0]; + vattr.va_vaflags |= VA_UTIMES_NULL; + } else if (error = copyin((caddr_t)SCARG(uap, tptr), (caddr_t)tv, + sizeof (tv))) + return (error); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + vattr.va_atime.tv_sec = tv[0].tv_sec; + vattr.va_atime.tv_nsec = tv[0].tv_usec * 1000; + vattr.va_mtime.tv_sec = tv[1].tv_sec; + vattr.va_mtime.tv_nsec = tv[1].tv_usec * 1000; + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + vput(vp); + return (error); +} + +/* + * Truncate a file given its path name. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +truncate(p, uap, retval) + struct proc *p; + register struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + if (uap->length < 0) + return(EINVAL); + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0 && + (error = VOP_ACCESS(vp, VWRITE, p->p_ucred, p)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); + } + vput(vp); + return (error); +} + +/* + * Truncate a file given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +#endif +/* ARGSUSED */ +int +ftruncate(p, uap, retval) + struct proc *p; + register struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ *uap; + register_t *retval; +{ + struct vattr vattr; + struct vnode *vp; + struct file *fp; + int error; + + if (uap->length < 0) + return(EINVAL); + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FWRITE) == 0) + return (EINVAL); + vp = (struct vnode *)fp->f_data; + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_type == VDIR) + error = EISDIR; + else if ((error = vn_writechk(vp)) == 0) { + VATTR_NULL(&vattr); + vattr.va_size = SCARG(uap, length); + error = VOP_SETATTR(vp, &vattr, fp->f_cred, p); + } + VOP_UNLOCK(vp, 0, p); + return (error); +} + +#if defined(COMPAT_43) || defined(COMPAT_SUNOS) +/* + * Truncate a file given its path name. + */ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif +/* ARGSUSED */ +int +otruncate(p, uap, retval) + struct proc *p; + register struct otruncate_args /* { + syscallarg(char *) path; + syscallarg(long) length; + } */ *uap; + register_t *retval; +{ + struct truncate_args /* { + syscallarg(char *) path; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, path) = SCARG(uap, path); + SCARG(&nuap, length) = SCARG(uap, length); + return (truncate(p, &nuap, retval)); +} + +/* + * Truncate a file given a file descriptor. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct oftruncate_args { + int fd; + long length; +}; +#endif +/* ARGSUSED */ +int +oftruncate(p, uap, retval) + struct proc *p; + register struct oftruncate_args /* { + syscallarg(int) fd; + syscallarg(long) length; + } */ *uap; + register_t *retval; +{ + struct ftruncate_args /* { + syscallarg(int) fd; + syscallarg(int) pad; + syscallarg(off_t) length; + } */ nuap; + + SCARG(&nuap, fd) = SCARG(uap, fd); + SCARG(&nuap, length) = SCARG(uap, length); + return (ftruncate(p, &nuap, retval)); +} +#endif /* COMPAT_43 || COMPAT_SUNOS */ + +/* + * Sync an open file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif +/* ARGSUSED */ +int +fsync(p, uap, retval) + struct proc *p; + struct fsync_args /* { + syscallarg(int) fd; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct file *fp; + int error; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + vp = (struct vnode *)fp->f_data; + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + if (vp->v_object) { + vm_object_page_clean(vp->v_object, 0, 0 ,0, FALSE); + } + error = VOP_FSYNC(vp, fp->f_cred, + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC)) ? + MNT_NOWAIT : MNT_WAIT, p); + VOP_UNLOCK(vp, 0, p); + return (error); +} + +/* + * Rename files. Source and destination must either both be directories, + * or both not be directories. If target is a directory, it must be empty. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif +/* ARGSUSED */ +int +rename(p, uap, retval) + struct proc *p; + register struct rename_args /* { + syscallarg(char *) from; + syscallarg(char *) to; + } */ *uap; + register_t *retval; +{ + register struct vnode *tvp, *fvp, *tdvp; + struct nameidata fromnd, tond; + int error; + + NDINIT(&fromnd, DELETE, WANTPARENT | SAVESTART, UIO_USERSPACE, + SCARG(uap, from), p); + if (error = namei(&fromnd)) + return (error); + fvp = fromnd.ni_vp; + NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART, + UIO_USERSPACE, SCARG(uap, to), p); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; + if (error = namei(&tond)) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + goto out1; + } + tdvp = tond.ni_dvp; + tvp = tond.ni_vp; + if (tvp != NULL) { + if (fvp->v_type == VDIR && tvp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } else if (fvp->v_type != VDIR && tvp->v_type == VDIR) { + error = EISDIR; + goto out; + } + } + if (fvp == tdvp) + error = EINVAL; + /* + * If source is the same as the destination (that is the + * same inode number with the same name in the same directory), + * then there is nothing to do. 
+ */ + if (fvp == tvp && fromnd.ni_dvp == tdvp && + fromnd.ni_cnd.cn_namelen == tond.ni_cnd.cn_namelen && + !bcmp(fromnd.ni_cnd.cn_nameptr, tond.ni_cnd.cn_nameptr, + fromnd.ni_cnd.cn_namelen)) + error = -1; +out: + if (!error) { + VOP_LEASE(tdvp, p, p->p_ucred, LEASE_WRITE); + if (fromnd.ni_dvp != tdvp) + VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + if (tvp) { + VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE); + (void) vnode_pager_uncache(tvp, p); + } + error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, + tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); + } else { + VOP_ABORTOP(tond.ni_dvp, &tond.ni_cnd); + if (tdvp == tvp) + vrele(tdvp); + else + vput(tdvp); + if (tvp) + vput(tvp); + VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); + vrele(fromnd.ni_dvp); + vrele(fvp); + } + vrele(tond.ni_startdir); + FREE(tond.ni_cnd.cn_pnbuf, M_NAMEI); +out1: + if (fromnd.ni_startdir) + vrele(fromnd.ni_startdir); + FREE(fromnd.ni_cnd.cn_pnbuf, M_NAMEI); + if (error == -1) + return (0); + return (error); +} + +/* + * Make a directory file. 
+ */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif +/* ARGSUSED */ +int +mkdir(p, uap, retval) + struct proc *p; + register struct mkdir_args /* { + syscallarg(char *) path; + syscallarg(int) mode; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + nd.ni_cnd.cn_flags |= WILLBEDIR; + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp != NULL) { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vrele(vp); + return (EEXIST); + } + VATTR_NULL(&vattr); + vattr.va_type = VDIR; + vattr.va_mode = (SCARG(uap, mode) & ACCESSPERMS) &~ p->p_fd->fd_cmask; + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + error = VOP_MKDIR(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr); + if (!error) + vput(nd.ni_vp); + return (error); +} + +/* + * Remove a directory file. + */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +rmdir(p, uap, retval) + struct proc *p; + struct rmdir_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + int error; + struct nameidata nd; + + NDINIT(&nd, DELETE, LOCKPARENT | LOCKLEAF, UIO_USERSPACE, + SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (vp->v_type != VDIR) { + error = ENOTDIR; + goto out; + } + /* + * No rmdir "." please. + */ + if (nd.ni_dvp == vp) { + error = EINVAL; + goto out; + } + /* + * The root of a mounted filesystem cannot be deleted. 
+ */ + if (vp->v_flag & VROOT) + error = EBUSY; +out: + if (!error) { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_RMDIR(nd.ni_dvp, nd.ni_vp, &nd.ni_cnd); + } else { + VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); + if (nd.ni_dvp == vp) + vrele(nd.ni_dvp); + else + vput(nd.ni_dvp); + vput(vp); + } + return (error); +} + +#ifdef COMPAT_43 +/* + * Read a block of directory entries in a file system independent format. + */ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +ogetdirentries(p, uap, retval) + struct proc *p; + register struct ogetdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct file *fp; + struct uio auio, kuio; + struct iovec aiov, kiov; + struct dirent *dp, *edp; + caddr_t dirbuf; + int error, eofflag, readcnt; + long loff; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; +# if (BYTE_ORDER != LITTLE_ENDIAN) + if (vp->v_mount->mnt_maxsymlinklen <= 0) { + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = auio.uio_offset; + } else +# endif + { + kuio = auio; + kuio.uio_iov = &kiov; + kuio.uio_segflg = UIO_SYSSPACE; + kiov.iov_len = SCARG(uap, count); + MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); + kiov.iov_base = dirbuf; + error = 
VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, + NULL, NULL); + fp->f_offset = kuio.uio_offset; + if (error == 0) { + readcnt = SCARG(uap, count) - kuio.uio_resid; + edp = (struct dirent *)&dirbuf[readcnt]; + for (dp = (struct dirent *)dirbuf; dp < edp; ) { +# if (BYTE_ORDER == LITTLE_ENDIAN) + /* + * The expected low byte of + * dp->d_namlen is our dp->d_type. + * The high MBZ byte of dp->d_namlen + * is our dp->d_namlen. + */ + dp->d_type = dp->d_namlen; + dp->d_namlen = 0; +# else + /* + * The dp->d_type is the high byte + * of the expected dp->d_namlen, + * so must be zero'ed. + */ + dp->d_type = 0; +# endif + if (dp->d_reclen > 0) { + dp = (struct dirent *) + ((char *)dp + dp->d_reclen); + } else { + error = EIO; + break; + } + } + if (dp >= edp) + error = uiomove(dirbuf, readcnt, &auio); + } + FREE(dirbuf, M_TEMP); + } + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + +#ifdef UNION +{ + if ((SCARG(uap, count) == auio.uio_resid) && + (vp->v_op == union_vnodeop_p)) { + struct vnode *lvp; + + lvp = union_dircache(vp, p); + if (lvp != NULLVP) { + struct vattr va; + + /* + * If the directory is opaque, + * then don't show lower entries + */ + error = VOP_GETATTR(vp, &va, fp->f_cred, p); + if (va.va_flags & OPAQUE) { + vput(lvp); + lvp = NULL; + } + } + + if (lvp != NULLVP) { + error = VOP_OPEN(lvp, FREAD, fp->f_cred, p); + if (error) { + vput(lvp); + return (error); + } + VOP_UNLOCK(lvp, 0, p); + fp->f_data = (caddr_t) lvp; + fp->f_offset = 0; + error = vn_close(vp, FREAD, fp->f_cred, p); + if (error) + return (error); + vp = lvp; + goto unionread; + } + } +} +#endif /* UNION */ + + if ((SCARG(uap, count) == auio.uio_resid) && + (vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = (caddr_t) vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + *retval = SCARG(uap, count) - 
auio.uio_resid; + return (error); +} +#endif /* COMPAT_43 */ + +/* + * Read a block of directory entries in a file system independent format. + */ +#ifndef _SYS_SYSPROTO_H_ +struct getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif +int +getdirentries(p, uap, retval) + struct proc *p; + register struct getdirentries_args /* { + syscallarg(int) fd; + syscallarg(char *) buf; + syscallarg(u_int) count; + syscallarg(long *) basep; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct file *fp; + struct uio auio; + struct iovec aiov; + long loff; + int error, eofflag; + + if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) + return (error); + if ((fp->f_flag & FREAD) == 0) + return (EBADF); + vp = (struct vnode *)fp->f_data; +unionread: + if (vp->v_type != VDIR) + return (EINVAL); + aiov.iov_base = SCARG(uap, buf); + aiov.iov_len = SCARG(uap, count); + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + auio.uio_rw = UIO_READ; + auio.uio_segflg = UIO_USERSPACE; + auio.uio_procp = p; + auio.uio_resid = SCARG(uap, count); + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + loff = auio.uio_offset = fp->f_offset; + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); + fp->f_offset = auio.uio_offset; + VOP_UNLOCK(vp, 0, p); + if (error) + return (error); + +#ifdef UNION +{ + if ((SCARG(uap, count) == auio.uio_resid) && + (vp->v_op == union_vnodeop_p)) { + struct vnode *lvp; + + lvp = union_dircache(vp, p); + if (lvp != NULLVP) { + struct vattr va; + + /* + * If the directory is opaque, + * then don't show lower entries + */ + error = VOP_GETATTR(vp, &va, fp->f_cred, p); + if (va.va_flags & OPAQUE) { + vput(lvp); + lvp = NULL; + } + } + + if (lvp != NULLVP) { + error = VOP_OPEN(lvp, FREAD, fp->f_cred, p); + if (error) { + vput(lvp); + return (error); + } + VOP_UNLOCK(lvp, 0, p); + fp->f_data = (caddr_t) lvp; + fp->f_offset = 0; + error = vn_close(vp, FREAD, fp->f_cred, p); + if (error) + return (error); + vp = lvp; + 
goto unionread; + } + } +} +#endif /* UNION */ + + if ((SCARG(uap, count) == auio.uio_resid) && + (vp->v_flag & VROOT) && + (vp->v_mount->mnt_flag & MNT_UNION)) { + struct vnode *tvp = vp; + vp = vp->v_mount->mnt_vnodecovered; + VREF(vp); + fp->f_data = (caddr_t) vp; + fp->f_offset = 0; + vrele(tvp); + goto unionread; + } + error = copyout((caddr_t)&loff, (caddr_t)SCARG(uap, basep), + sizeof(long)); + *retval = SCARG(uap, count) - auio.uio_resid; + return (error); +} + +/* + * Set the mode mask for creation of filesystem nodes. + */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif +int +umask(p, uap, retval) + struct proc *p; + struct umask_args /* { + syscallarg(int) newmask; + } */ *uap; + int *retval; /* XXX */ +{ + register struct filedesc *fdp; + + fdp = p->p_fd; + *retval = fdp->fd_cmask; + fdp->fd_cmask = SCARG(uap, newmask) & ALLPERMS; + return (0); +} + +/* + * Void all references to file by ripping underlying filesystem + * away from vnode. + */ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif +/* ARGSUSED */ +int +revoke(p, uap, retval) + struct proc *p; + register struct revoke_args /* { + syscallarg(char *) path; + } */ *uap; + register_t *retval; +{ + register struct vnode *vp; + struct vattr vattr; + int error; + struct nameidata nd; + + NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); + if (error = namei(&nd)) + return (error); + vp = nd.ni_vp; + if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p)) + goto out; + if (p->p_ucred->cr_uid != vattr.va_uid && + (error = suser(p->p_ucred, &p->p_acflag))) + goto out; + if (vp->v_usecount > 1 || (vp->v_flag & VALIASED)) + VOP_REVOKE(vp, REVOKEALL); +out: + vrele(vp); + return (error); +} + +/* + * Convert a user file descriptor to a kernel file entry. 
+ */ +int +getvnode(fdp, fd, fpp) + struct filedesc *fdp; + int fd; + struct file **fpp; +{ + struct file *fp; + + if ((u_int)fd >= fdp->fd_nfiles || + (fp = fdp->fd_ofiles[fd]) == NULL) + return (EBADF); + if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) + return (EINVAL); + *fpp = fp; + return (0); +} diff --git a/sys/kern/vfs_init.c b/sys/kern/vfs_init.c index b5abe58..21061e8 100644 --- a/sys/kern/vfs_init.c +++ b/sys/kern/vfs_init.c @@ -35,11 +35,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)vfs_init.c 8.5 (Berkeley) 5/11/95 + * @(#)vfs_init.c 8.3 (Berkeley) 1/4/94 + * $Id: vfs_init.c,v 1.24 1997/02/22 09:39:32 peter Exp $ */ #include <sys/param.h> +#include <sys/systm.h> +#include <sys/kernel.h> #include <sys/mount.h> #include <sys/time.h> #include <sys/vnode.h> @@ -49,6 +52,12 @@ #include <sys/buf.h> #include <sys/errno.h> #include <sys/malloc.h> +#include <sys/proc.h> + +static void vfs_op_init __P((void)); + +static void vfsinit __P((void *)); +SYSINIT(vfs, SI_SUB_VFS, SI_ORDER_FIRST, vfsinit, NULL) /* * Sigh, such primitive tools are these... @@ -59,8 +68,13 @@ #define DODEBUG(A) #endif -extern struct vnodeopv_desc *vfs_opv_descs[]; - /* a list of lists of vnodeops defns */ +struct vfsconf void_vfsconf; + +extern struct linker_set vfs_opv_descs_; +#define vfs_opv_descs ((struct vnodeopv_desc **)vfs_opv_descs_.ls_items) + +extern struct linker_set vfs_set; + extern struct vnodeop_desc *vfs_op_descs[]; /* and the operations they perform */ /* @@ -69,9 +83,7 @@ extern struct vnodeop_desc *vfs_op_descs[]; * extra level of indirection for arrays. It's an interesting * "feature" of C. */ -int vfs_opv_numops; - -typedef (*PFI)(); /* the standard Pointer to a Function returning an Int */ +static int vfs_opv_numops; /* * A miscellaneous routine. @@ -101,33 +113,35 @@ vn_default_error() * that is a(whole)nother story.) This is a feature. 
*/ void -vfs_opv_init() +vfs_opv_init(struct vnodeopv_desc **them) { int i, j, k; - int (***opv_desc_vector_p)(); - int (**opv_desc_vector)(); + vop_t ***opv_desc_vector_p; + vop_t **opv_desc_vector; struct vnodeopv_entry_desc *opve_descp; /* * Allocate the dynamic vectors and fill them in. */ - for (i=0; vfs_opv_descs[i]; i++) { - opv_desc_vector_p = vfs_opv_descs[i]->opv_desc_vector_p; + for (i=0; them[i]; i++) { + opv_desc_vector_p = them[i]->opv_desc_vector_p; /* * Allocate and init the vector, if it needs it. * Also handle backwards compatibility. */ if (*opv_desc_vector_p == NULL) { /* XXX - shouldn't be M_VNODE */ - MALLOC(*opv_desc_vector_p, PFI*, - vfs_opv_numops*sizeof(PFI), M_VNODE, M_WAITOK); - bzero (*opv_desc_vector_p, vfs_opv_numops*sizeof(PFI)); + MALLOC(*opv_desc_vector_p, vop_t **, + vfs_opv_numops * sizeof(vop_t *), M_VNODE, + M_WAITOK); + bzero(*opv_desc_vector_p, + vfs_opv_numops * sizeof(vop_t *)); DODEBUG(printf("vector at %x allocated\n", opv_desc_vector_p)); } opv_desc_vector = *opv_desc_vector_p; - for (j=0; vfs_opv_descs[i]->opv_desc_ops[j].opve_op; j++) { - opve_descp = &(vfs_opv_descs[i]->opv_desc_ops[j]); + for (j=0; them[i]->opv_desc_ops[j].opve_op; j++) { + opve_descp = &(them[i]->opv_desc_ops[j]); /* * Sanity check: is this operation listed @@ -166,8 +180,8 @@ vfs_opv_init() * with their default. (Sigh, an O(n^3) algorithm. I * could make it better, but that'd be work, and n is small.) */ - for (i = 0; vfs_opv_descs[i]; i++) { - opv_desc_vector = *(vfs_opv_descs[i]->opv_desc_vector_p); + for (i = 0; them[i]; i++) { + opv_desc_vector = *(them[i]->opv_desc_vector_p); /* * Force every operations vector to have a default routine. */ @@ -176,7 +190,7 @@ vfs_opv_init() } for (k = 0; k<vfs_opv_numops; k++) if (opv_desc_vector[k] == NULL) - opv_desc_vector[k] = + opv_desc_vector[k] = opv_desc_vector[VOFFSET(vop_default)]; } } @@ -184,7 +198,7 @@ vfs_opv_init() /* * Initialize known vnode operations vectors. 
*/ -void +static void vfs_op_init() { int i; @@ -216,10 +230,13 @@ struct vattr va_null; /* * Initialize the vnode structures and initialize each file system type. */ -vfsinit() +/* ARGSUSED*/ +static void +vfsinit(dummy) + void *dummy; { - struct vfsconf *vfsp; - int i, maxtypenum; + struct vfsconf **vfc; + int maxtypenum; /* * Initialize the vnode table @@ -233,15 +250,19 @@ vfsinit() * Build vnode operation vectors. */ vfs_op_init(); - vfs_opv_init(); /* finish the job */ + vfs_opv_init(vfs_opv_descs); /* finish the job */ /* * Initialize each file system type. */ vattr_null(&va_null); maxtypenum = 0; - for (vfsp = vfsconf, i = 1; i <= maxvfsconf; i++, vfsp++) { - if (i < maxvfsconf) - vfsp->vfc_next = vfsp + 1; + vfc = (struct vfsconf **)vfs_set.ls_items; + vfsconf = *vfc; /* simulate Lite2 vfsconf array */ + while (*vfc) { + struct vfsconf *vfsp = *vfc; + + vfc++; + vfsp->vfc_next = *vfc; if (maxtypenum <= vfsp->vfc_typenum) maxtypenum = vfsp->vfc_typenum + 1; (*vfsp->vfc_vfsops->vfs_init)(vfsp); @@ -249,3 +270,30 @@ vfsinit() /* next vfc_typenum to be used */ maxvfsconf = maxtypenum; } + +/* + * kernel related system variables. + */ + +/* + * This goop is here to support a loadable NFS module... grumble... + */ +int (*lease_check_hook) __P((struct vop_lease_args *)) + = 0; +void (*lease_updatetime) __P((int)) + = 0; + +int +lease_check(ap) + struct vop_lease_args /* { + struct vnode *a_vp; + struct proc *a_p; + struct ucred *a_cred; + int a_flag; + } */ *ap; +{ + if (lease_check_hook) + return (*lease_check_hook)(ap); + else + return 0; +} diff --git a/sys/kern/vfs_lookup.c b/sys/kern/vfs_lookup.c index 826fbfe..0c04b01 100644 --- a/sys/kern/vfs_lookup.c +++ b/sys/kern/vfs_lookup.c @@ -35,10 +35,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
* - * @(#)vfs_lookup.c 8.10 (Berkeley) 5/27/95 + * @(#)vfs_lookup.c 8.4 (Berkeley) 2/16/94 + * $Id$ */ +#include "opt_ktrace.h" + #include <sys/param.h> +#include <sys/systm.h> #include <sys/syslimits.h> #include <sys/time.h> #include <sys/namei.h> @@ -105,10 +109,17 @@ namei(ndp) MALLOC(cnp->cn_pnbuf, caddr_t, MAXPATHLEN, M_NAMEI, M_WAITOK); if (ndp->ni_segflg == UIO_SYSSPACE) error = copystr(ndp->ni_dirp, cnp->cn_pnbuf, - MAXPATHLEN, &ndp->ni_pathlen); + MAXPATHLEN, (u_int *)&ndp->ni_pathlen); else error = copyinstr(ndp->ni_dirp, cnp->cn_pnbuf, - MAXPATHLEN, &ndp->ni_pathlen); + MAXPATHLEN, (u_int *)&ndp->ni_pathlen); + + /* + * Don't allow empty pathnames. + */ + if (!error && *cnp->cn_pnbuf == '\0') + error = ENOENT; + if (error) { free(cnp->cn_pnbuf, M_NAMEI); ndp->ni_vp = NULL; @@ -143,7 +154,8 @@ namei(ndp) VREF(dp); } ndp->ni_startdir = dp; - if (error = lookup(ndp)) { + error = lookup(ndp); + if (error) { FREE(cnp->cn_pnbuf, M_NAMEI); return (error); } @@ -176,7 +188,8 @@ namei(ndp) auio.uio_segflg = UIO_SYSSPACE; auio.uio_procp = (struct proc *)0; auio.uio_resid = MAXPATHLEN; - if (error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred)) { + error = VOP_READLINK(ndp->ni_vp, &auio, cnp->cn_cred); + if (error) { if (ndp->ni_pathlen > 1) free(cp, M_NAMEI); break; @@ -226,7 +239,7 @@ namei(ndp) * the target is returned locked, otherwise it is returned unlocked. * When creating or renaming and LOCKPARENT is specified, the target may not * be ".". When deleting and LOCKPARENT is specified, the target may be ".". 
- * + * * Overall outline of lookup: * * dirloop: @@ -254,6 +267,7 @@ lookup(ndp) int docache; /* == 0 do not cache last component */ int wantparent; /* 1 => wantparent or lockparent flag */ int rdonly; /* lookup read-only flag bit */ + int trailing_slash; int error = 0; struct componentname *cnp = &ndp->ni_cnd; struct proc *p = cnp->cn_proc; @@ -264,7 +278,8 @@ lookup(ndp) wantparent = cnp->cn_flags & (LOCKPARENT | WANTPARENT); docache = (cnp->cn_flags & NOCACHE) ^ NOCACHE; if (cnp->cn_nameiop == DELETE || - (wantparent && cnp->cn_nameiop != CREATE)) + (wantparent && cnp->cn_nameiop != CREATE && + cnp->cn_nameiop != LOOKUP)) docache = 0; rdonly = cnp->cn_flags & RDONLY; ndp->ni_dvp = NULL; @@ -300,6 +315,25 @@ dirloop: #endif ndp->ni_pathlen -= cnp->cn_namelen; ndp->ni_next = cp; + + /* + * Replace multiple slashes by a single slash and trailing slashes + * by a null. This must be done before VOP_LOOKUP() because some + * fs's don't know about trailing slashes. Remember if there were + * trailing slashes to handle symlinks, existing non-directories + * and non-existing files that won't be directories specially later. + */ + trailing_slash = 0; + while (*cp == '/' && (cp[1] == '/' || cp[1] == '\0')) { + cp++; + ndp->ni_pathlen--; + if (*cp == '\0') { + trailing_slash = 1; + *ndp->ni_next = '\0'; /* XXX for direnter() ... */ + } + } + ndp->ni_next = cp; + cnp->cn_flags |= MAKEENTRY; if (*cp == '\0' && docache == 0) cnp->cn_flags &= ~MAKEENTRY; @@ -404,6 +438,11 @@ unionlookup: error = EROFS; goto bad; } + if (*cp == '\0' && trailing_slash && + !(cnp->cn_flags & WILLBEDIR)) { + error = ENOENT; + goto bad; + } /* * We return with ni_vp NULL to indicate that the entry * doesn't currently exist, leaving a pointer to the @@ -431,6 +470,7 @@ unionlookup: } dp = ndp->ni_vp; + /* * Check to see if the vnode has been mounted on; * if so find the root of the mounted file system. 
@@ -451,11 +491,20 @@ unionlookup: * Check for symbolic link */ if ((dp->v_type == VLNK) && - ((cnp->cn_flags & FOLLOW) || *ndp->ni_next == '/')) { + ((cnp->cn_flags & FOLLOW) || trailing_slash || + *ndp->ni_next == '/')) { cnp->cn_flags |= ISSYMLINK; return (0); } + /* + * Check for bogus trailing slashes. + */ + if (trailing_slash && dp->v_type != VDIR) { + error = ENOTDIR; + goto bad2; + } + nextname: /* * Not a symbolic link. If more pathname, diff --git a/sys/kern/vfs_mount.c b/sys/kern/vfs_mount.c new file mode 100644 index 0000000..779a1c4 --- /dev/null +++ b/sys/kern/vfs_mount.c @@ -0,0 +1,153 @@ +/* + * Copyright (c) 1989, 1993, 1995 + * The Regents of the University of California. All rights reserved. + * Copyright (c) 1995 Artisoft, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. All advertising materials mentioning features or use of this software + * must display the following acknowledgement: + * This product includes software developed by the University of + * California, Berkeley and its contributors. + * 4. Neither the name of the University nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * @(#)vfs_conf.c 8.8 (Berkeley) 3/31/94 + * $Id$ + */ + +/* + * PURPOSE: This file abstracts the root mounting interface from + * the per file system semantics for handling mounts, + * the overall intent of which is to move the BSD + * internals dependence out of the FS code, both to + * make the FS code more portable and to free up some + * of the BSD internals so that they may more easily + * be changed. + * + * NOTE1: Code is single entry/single exit to aid debugging + * and conversion for kernel multithreading. + * + * NOTE2: Code notes lock state in headers on entry and exit + * as an aid to conversion for kernel multithreading + * on SMP reentrancy + */ +#include <sys/param.h> /* dev_t (types.h)*/ +#include <sys/systm.h> /* rootvp*/ +#include <sys/proc.h> /* curproc*/ +#include <sys/vnode.h> /* NULLVP*/ +#include <sys/mount.h> /* struct mount*/ +#include <sys/malloc.h> /* M_MOUNT*/ + +/* + * GLOBALS + */ + +/* + * These define the root filesystem, device, and root filesystem type. + */ +struct mount *rootfs; +struct vnode *rootvnode; +char *mountrootfsname; + +/* + * vfs_init() will set maxvfsconf + * to the highest defined type number. 
+ */ +int maxvfsconf; +struct vfsconf *vfsconf; + +/* + * Common root mount code shared by all filesystems + */ +#define ROOTNAME "root_device" + +/* + * vfs_mountrootfs + * + * Common entry point for root mounts + * + * PARAMETERS: + * fsname name of the filesystem + * + * RETURNS: 0 Success + * !0 error number (errno.h) + * + * LOCK STATE: + * ENTRY + * <no locks held> + * EXIT + * <no locks held> + * + * NOTES: + * This code is currently supported only for use for + * the FFS file system type. This is a matter of + * fixing the other file systems, not this code! + */ +int +vfs_mountrootfs(fsname) + char *fsname; +{ + struct mount *mp; + int err = 0; + struct proc *p = curproc; /* XXX */ + + /* + * New root mount structure + */ + err = vfs_rootmountalloc(fsname, ROOTNAME, &mp); + if (err) + return (err); + mp->mnt_flag |= MNT_ROOTFS; + + /* + * Attempt the mount + */ + err = VFS_MOUNT(mp, NULL, NULL, NULL, p); + if (err) + goto error_2; + + simple_lock(&mountlist_slock); + /* Add fs to list of mounted file systems*/ + CIRCLEQ_INSERT_TAIL(&mountlist, mp, mnt_list); + simple_unlock(&mountlist_slock); + + vfs_unbusy(mp, p); + + /* root mount, update system time from FS specific data*/ + inittodr(mp->mnt_time); + + goto success; + + +error_2: /* mount error*/ + + vfs_unbusy(mp, p); + +error_1: /* lock error*/ + + /* free mount struct before failing*/ + free( mp, M_MOUNT); + +success: + return( err); +} diff --git a/sys/kern/vfs_subr.c b/sys/kern/vfs_subr.c index f891e02..0b487fd 100644 --- a/sys/kern/vfs_subr.c +++ b/sys/kern/vfs_subr.c @@ -36,14 +36,19 @@ * SUCH DAMAGE. 
* * @(#)vfs_subr.c 8.31 (Berkeley) 5/26/95 + * $Id: vfs_subr.c,v 1.79 1997/03/04 18:31:56 bde Exp $ */ /* * External virtual filesystem routines */ +#include "opt_ddb.h" +#include "opt_devfs.h" #include <sys/param.h> #include <sys/systm.h> +#include <sys/kernel.h> +#include <sys/file.h> #include <sys/proc.h> #include <sys/mount.h> #include <sys/time.h> @@ -58,15 +63,29 @@ #include <sys/mbuf.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> +#include <vm/vm_pager.h> +#include <vm/vnode_pager.h> #include <sys/sysctl.h> #include <miscfs/specfs/specdev.h> +#ifdef DDB +extern void printlockedvnodes __P((void)); +#endif +static void vclean __P((struct vnode *vp, int flags, struct proc *p)); +extern void vgonel __P((struct vnode *vp, struct proc *p)); +unsigned long numvnodes; +extern void vfs_unmountroot __P((struct mount *rootfs)); +extern void vputrele __P((struct vnode *vp, int put)); + enum vtype iftovt_tab[16] = { VNON, VFIFO, VCHR, VNON, VDIR, VNON, VBLK, VNON, VREG, VNON, VLNK, VNON, VSOCK, VNON, VNON, VBAD, }; -int vttoif_tab[9] = { +int vttoif_tab[9] = { 0, S_IFREG, S_IFDIR, S_IFBLK, S_IFCHR, S_IFLNK, S_IFSOCK, S_IFIFO, S_IFMT, }; @@ -80,13 +99,23 @@ int vttoif_tab[9] = { (bp)->b_vnbufs.le_next = NOLIST; \ } TAILQ_HEAD(freelst, vnode) vnode_free_list; /* vnode free list */ -struct mntlist mountlist; /* mounted filesystem list */ +static u_long freevnodes = 0; + +struct mntlist mountlist; /* mounted filesystem list */ struct simplelock mountlist_slock; static struct simplelock mntid_slock; struct simplelock mntvnode_slock; struct simplelock vnode_free_list_slock; static struct simplelock spechash_slock; +int desiredvnodes; +SYSCTL_INT(_kern, KERN_MAXVNODES, maxvnodes, CTLFLAG_RW, &desiredvnodes, 0, ""); + +static void vfs_free_addrlist __P((struct netexport *nep)); +static int vfs_free_netcred __P((struct radix_node *rn, void *w)); +static int vfs_hang_addrlist __P((struct mount *mp, struct netexport *nep, + 
struct export_args *argp)); + /* * Initialize the vnode management data structures. */ @@ -94,6 +123,7 @@ void vntblinit() { + desiredvnodes = maxproc + vm_object_cache_max; simple_lock_init(&mntvnode_slock); simple_lock_init(&mntid_slock); simple_lock_init(&spechash_slock); @@ -119,17 +149,19 @@ vfs_busy(mp, flags, interlkp, p) if (flags & LK_NOWAIT) return (ENOENT); mp->mnt_flag |= MNT_MWAIT; - if (interlkp) + if (interlkp) { simple_unlock(interlkp); + } /* * Since all busy locks are shared except the exclusive * lock granted when unmounting, the only place that a * wakeup needs to be done is at the release of the * exclusive lock at the end of dounmount. */ - sleep((caddr_t)mp, PVFS); - if (interlkp) + tsleep((caddr_t)mp, PVFS, "vfs_busy", 0); + if (interlkp) { simple_lock(interlkp); + } return (ENOENT); } lkflags = LK_SHARED; @@ -187,6 +219,7 @@ vfs_rootmountalloc(fstypename, devname, mpp) mp->mnt_flag |= vfsp->vfc_flags & MNT_VISFLAGMASK; strncpy(mp->mnt_stat.f_fstypename, vfsp->vfc_name, MFSNAMELEN); mp->mnt_stat.f_mntonname[0] = '/'; + mp->mnt_stat.f_mntonname[1] = 0; (void) copystr(devname, mp->mnt_stat.f_mntfromname, MNAMELEN - 1, 0); *mpp = mp; return (0); @@ -198,15 +231,16 @@ vfs_rootmountalloc(fstypename, devname, mpp) * trying those that have mountroot routines, and try them until one * works or we have tried them all. */ +#ifdef notdef /* XXX JH */ int -vfs_mountroot() +lite2_vfs_mountroot(void) { struct vfsconf *vfsp; - extern int (*mountroot)(void); + extern int (*lite2_mountroot)(void); int error; - if (mountroot != NULL) - return ((*mountroot)()); + if (lite2_mountroot != NULL) + return ((*lite2_mountroot)()); for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { if (vfsp->vfc_mountroot == NULL) continue; @@ -216,6 +250,7 @@ vfs_mountroot() } return (ENODEV); } +#endif /* * Lookup a mount point by filesystem identifier. 
@@ -228,15 +263,15 @@ vfs_getvfs(fsid) simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; - mp = mp->mnt_list.cqe_next) { + mp = mp->mnt_list.cqe_next) { if (mp->mnt_stat.f_fsid.val[0] == fsid->val[0] && mp->mnt_stat.f_fsid.val[1] == fsid->val[1]) { simple_unlock(&mountlist_slock); return (mp); - } + } } simple_unlock(&mountlist_slock); - return ((struct mount *)0); + return ((struct mount *) 0); } /* @@ -246,12 +281,12 @@ void vfs_getnewfsid(mp) struct mount *mp; { -static u_short xxxfs_mntid; + static u_short xxxfs_mntid; fsid_t tfsid; int mtype; - simple_lock(&mntid_slock); + simple_lock(&mntid_slock); mtype = mp->mnt_vfc->vfc_typenum; mp->mnt_stat.f_fsid.val[0] = makedev(nblkdev + mtype, 0); mp->mnt_stat.f_fsid.val[1] = mtype; @@ -278,25 +313,22 @@ vattr_null(vap) { vap->va_type = VNON; - vap->va_size = vap->va_bytes = VNOVAL; + vap->va_size = VNOVAL; + vap->va_bytes = VNOVAL; vap->va_mode = vap->va_nlink = vap->va_uid = vap->va_gid = - vap->va_fsid = vap->va_fileid = - vap->va_blocksize = vap->va_rdev = - vap->va_atime.ts_sec = vap->va_atime.ts_nsec = - vap->va_mtime.ts_sec = vap->va_mtime.ts_nsec = - vap->va_ctime.ts_sec = vap->va_ctime.ts_nsec = - vap->va_flags = vap->va_gen = VNOVAL; + vap->va_fsid = vap->va_fileid = + vap->va_blocksize = vap->va_rdev = + vap->va_atime.tv_sec = vap->va_atime.tv_nsec = + vap->va_mtime.tv_sec = vap->va_mtime.tv_nsec = + vap->va_ctime.tv_sec = vap->va_ctime.tv_nsec = + vap->va_flags = vap->va_gen = VNOVAL; vap->va_vaflags = 0; } /* * Routines having to do with the management of the vnode table. */ -extern int (**dead_vnodeop_p)(); -static void vclean __P((struct vnode *vp, int flag, struct proc *p)); -extern void vgonel __P((struct vnode *vp, struct proc *p)); -long numvnodes; -extern struct vattr va_null; +extern vop_t **dead_vnodeop_p; /* * Return the next vnode from the free list. 
@@ -305,23 +337,31 @@ int getnewvnode(tag, mp, vops, vpp) enum vtagtype tag; struct mount *mp; - int (**vops)(); + vop_t **vops; struct vnode **vpp; { struct proc *p = curproc; /* XXX */ struct vnode *vp; - int s; - int cnt; -top: simple_lock(&vnode_free_list_slock); - if ((vnode_free_list.tqh_first == NULL && - numvnodes < 2 * desiredvnodes) || - numvnodes < desiredvnodes) { +retry: + /* + * we allocate a new vnode if + * 1. we don't have any free + * Pretty obvious, we actually used to panic, but that + * is a silly thing to do. + * 2. we havn't filled our pool yet + * We don't want to trash the incore (VM-)vnodecache. + * 3. if less that 1/4th of our vnodes are free. + * We don't want to trash the namei cache either. + */ + if (freevnodes < (numvnodes >> 2) || + numvnodes < desiredvnodes || + vnode_free_list.tqh_first == NULL) { simple_unlock(&vnode_free_list_slock); - vp = (struct vnode *)malloc((u_long)sizeof *vp, + vp = (struct vnode *) malloc((u_long) sizeof *vp, M_VNODE, M_WAITOK); - bzero((char *)vp, sizeof *vp); + bzero((char *) vp, sizeof *vp); numvnodes++; } else { for (vp = vnode_free_list.tqh_first; @@ -343,31 +383,45 @@ top: if (vp->v_usecount) panic("free vnode isn't"); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); + if (vp->v_usage > 0) { + simple_unlock(&vp->v_interlock); + --vp->v_usage; + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + goto retry; + } + freevnodes--; + /* see comment on why 0xdeadb is set at end of vgone (below) */ - vp->v_freelist.tqe_prev = (struct vnode **)0xdeadb; + vp->v_freelist.tqe_prev = (struct vnode **) 0xdeadb; simple_unlock(&vnode_free_list_slock); vp->v_lease = NULL; if (vp->v_type != VBAD) vgonel(vp, p); - else + else { simple_unlock(&vp->v_interlock); + } + #ifdef DIAGNOSTIC - if (vp->v_data) - panic("cleaned vnode isn't"); - s = splbio(); - if (vp->v_numoutput) - panic("Clean vnode has pending I/O's"); - splx(s); + { + int s; + + if (vp->v_data) + panic("cleaned vnode isn't"); + s = splbio(); + if 
(vp->v_numoutput) + panic("Clean vnode has pending I/O's"); + splx(s); + } #endif vp->v_flag = 0; vp->v_lastr = 0; - vp->v_ralen = 0; - vp->v_maxra = 0; vp->v_lastw = 0; vp->v_lasta = 0; vp->v_cstart = 0; vp->v_clen = 0; vp->v_socket = 0; + vp->v_writecount = 0; /* XXX */ + vp->v_usage = 0; } vp->v_type = VNON; cache_purge(vp); @@ -385,8 +439,8 @@ top: */ void insmntque(vp, mp) - struct vnode *vp; - struct mount *mp; + register struct vnode *vp; + register struct mount *mp; { simple_lock(&mntvnode_slock); @@ -398,8 +452,11 @@ insmntque(vp, mp) /* * Insert into list of vnodes for the new mount point, if available. */ - if ((vp->v_mount = mp) != NULL) - LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); + if ((vp->v_mount = mp) == NULL) { + simple_unlock(&mntvnode_slock); + return; + } + LIST_INSERT_HEAD(&mp->mnt_vnodelist, vp, v_mntvnodes); simple_unlock(&mntvnode_slock); } @@ -413,14 +470,13 @@ vwakeup(bp) register struct vnode *vp; bp->b_flags &= ~B_WRITEINPROG; - if (vp = bp->b_vp) { - if (--vp->v_numoutput < 0) + if ((vp = bp->b_vp)) { + vp->v_numoutput--; + if (vp->v_numoutput < 0) panic("vwakeup: neg numoutput"); - if ((vp->v_flag & VBWAIT) && vp->v_numoutput <= 0) { - if (vp->v_numoutput < 0) - panic("vwakeup: neg numoutput 2"); + if ((vp->v_numoutput == 0) && (vp->v_flag & VBWAIT)) { vp->v_flag &= ~VBWAIT; - wakeup((caddr_t)&vp->v_numoutput); + wakeup((caddr_t) &vp->v_numoutput); } } } @@ -440,15 +496,18 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) register struct buf *bp; struct buf *nbp, *blist; int s, error; + vm_object_t object; if (flags & V_SAVE) { - if (error = VOP_FSYNC(vp, cred, MNT_WAIT, p)) + if ((error = VOP_FSYNC(vp, cred, MNT_WAIT, p))) return (error); if (vp->v_dirtyblkhd.lh_first != NULL) panic("vinvalbuf: dirty bufs"); } + + s = splbio(); for (;;) { - if ((blist = vp->v_cleanblkhd.lh_first) && flags & V_SAVEMETA) + if ((blist = vp->v_cleanblkhd.lh_first) && (flags & V_SAVEMETA)) while (blist && blist->b_lblkno < 0) blist = 
blist->b_vnbufs.le_next; if (!blist && (blist = vp->v_dirtyblkhd.lh_first) && @@ -460,35 +519,51 @@ vinvalbuf(vp, flags, cred, p, slpflag, slptimeo) for (bp = blist; bp; bp = nbp) { nbp = bp->b_vnbufs.le_next; - if (flags & V_SAVEMETA && bp->b_lblkno < 0) + if ((flags & V_SAVEMETA) && bp->b_lblkno < 0) continue; - s = splbio(); if (bp->b_flags & B_BUSY) { bp->b_flags |= B_WANTED; - error = tsleep((caddr_t)bp, - slpflag | (PRIBIO + 1), "vinvalbuf", - slptimeo); - splx(s); - if (error) + error = tsleep((caddr_t) bp, + slpflag | (PRIBIO + 1), "vinvalbuf", + slptimeo); + if (error) { + splx(s); return (error); + } break; } bremfree(bp); bp->b_flags |= B_BUSY; - splx(s); /* - * XXX Since there are no node locks for NFS, I believe - * there is a slight chance that a delayed write will - * occur while sleeping just above, so check for it. + * XXX Since there are no node locks for NFS, I + * believe there is a slight chance that a delayed + * write will occur while sleeping just above, so + * check for it. */ if ((bp->b_flags & B_DELWRI) && (flags & V_SAVE)) { (void) VOP_BWRITE(bp); break; } - bp->b_flags |= B_INVAL; + bp->b_flags |= (B_INVAL|B_NOCACHE|B_RELBUF); brelse(bp); } } + + while (vp->v_numoutput > 0) { + vp->v_flag |= VBWAIT; + tsleep(&vp->v_numoutput, PVM, "vnvlbv", 0); + } + + splx(s); + + /* + * Destroy the copy in the VM cache, too. + */ + object = vp->v_object; + if (object != NULL) { + vm_object_page_remove(object, 0, object->size, + (flags & V_SAVE) ? TRUE : FALSE); + } if (!(flags & V_SAVEMETA) && (vp->v_dirtyblkhd.lh_first || vp->v_cleanblkhd.lh_first)) panic("vinvalbuf: flush failed"); @@ -503,6 +578,7 @@ bgetvp(vp, bp) register struct vnode *vp; register struct buf *bp; { + int s; if (bp->b_vp) panic("bgetvp: not free"); @@ -515,7 +591,9 @@ bgetvp(vp, bp) /* * Insert onto list for new vnode. 
*/ + s = splbio(); bufinsvn(bp, &vp->v_cleanblkhd); + splx(s); } /* @@ -526,20 +604,60 @@ brelvp(bp) register struct buf *bp; { struct vnode *vp; + int s; if (bp->b_vp == (struct vnode *) 0) panic("brelvp: NULL"); /* * Delete from old vnode list, if on one. */ + s = splbio(); if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); + splx(s); + vp = bp->b_vp; bp->b_vp = (struct vnode *) 0; HOLDRELE(vp); } /* + * Associate a p-buffer with a vnode. + */ +void +pbgetvp(vp, bp) + register struct vnode *vp; + register struct buf *bp; +{ +#if defined(DIAGNOSTIC) + if (bp->b_vp) + panic("pbgetvp: not free"); +#endif + bp->b_vp = vp; + if (vp->v_type == VBLK || vp->v_type == VCHR) + bp->b_dev = vp->v_rdev; + else + bp->b_dev = NODEV; +} + +/* + * Disassociate a p-buffer from a vnode. + */ +void +pbrelvp(bp) + register struct buf *bp; +{ + struct vnode *vp; + +#if defined(DIAGNOSTIC) + if (bp->b_vp == (struct vnode *) 0) + panic("pbrelvp: NULL"); +#endif + + bp->b_vp = (struct vnode *) 0; +} + +/* * Reassign a buffer from one vnode to another. * Used to assign file specific control information * (indirect blocks) to the vnode to which they belong. @@ -549,28 +667,43 @@ reassignbuf(bp, newvp) register struct buf *bp; register struct vnode *newvp; { - register struct buflists *listheadp; + int s; if (newvp == NULL) { printf("reassignbuf: NULL"); return; } + + s = splbio(); /* * Delete from old vnode list, if on one. */ if (bp->b_vnbufs.le_next != NOLIST) bufremvn(bp); /* - * If dirty, put on list of dirty buffers; - * otherwise insert onto list of clean buffers. + * If dirty, put on list of dirty buffers; otherwise insert onto list + * of clean buffers. 
*/ - if (bp->b_flags & B_DELWRI) - listheadp = &newvp->v_dirtyblkhd; - else - listheadp = &newvp->v_cleanblkhd; - bufinsvn(bp, listheadp); + if (bp->b_flags & B_DELWRI) { + struct buf *tbp; + + tbp = newvp->v_dirtyblkhd.lh_first; + if (!tbp || (tbp->b_lblkno > bp->b_lblkno)) { + bufinsvn(bp, &newvp->v_dirtyblkhd); + } else { + while (tbp->b_vnbufs.le_next && + (tbp->b_vnbufs.le_next->b_lblkno < bp->b_lblkno)) { + tbp = tbp->b_vnbufs.le_next; + } + LIST_INSERT_AFTER(tbp, bp, b_vnbufs); + } + } else { + bufinsvn(bp, &newvp->v_cleanblkhd); + } + splx(s); } +#ifndef DEVFS_ROOT /* * Create a vnode for a block device. * Used for root filesystem, argdev, and swap areas. @@ -585,24 +718,23 @@ bdevvp(dev, vpp) struct vnode *nvp; int error; - if (dev == NODEV) { - *vpp = NULLVP; - return (ENODEV); - } - error = getnewvnode(VT_NON, (struct mount *)0, spec_vnodeop_p, &nvp); + if (dev == NODEV) + return (0); + error = getnewvnode(VT_NON, (struct mount *) 0, spec_vnodeop_p, &nvp); if (error) { - *vpp = NULLVP; + *vpp = 0; return (error); } vp = nvp; vp->v_type = VBLK; - if (nvp = checkalias(vp, dev, (struct mount *)0)) { + if ((nvp = checkalias(vp, dev, (struct mount *) 0))) { vput(vp); vp = nvp; } *vpp = vp; return (0); } +#endif /* !DEVFS_ROOT */ /* * Check to see if the new vnode represents a special device @@ -648,7 +780,7 @@ loop: } if (vp == NULL || vp->v_tag != VT_NON) { MALLOC(nvp->v_specinfo, struct specinfo *, - sizeof(struct specinfo), M_VNODE, M_WAITOK); + sizeof(struct specinfo), M_VNODE, M_WAITOK); nvp->v_rdev = nvp_rdev; nvp->v_hashchain = vpp; nvp->v_specnext = *vpp; @@ -683,7 +815,7 @@ loop: */ int vget(vp, flags, p) - struct vnode *vp; + register struct vnode *vp; int flags; struct proc *p; { @@ -695,8 +827,9 @@ vget(vp, flags, p) * return failure. Cleaning is determined by checking that * the VXLOCK flag is set. 
*/ - if ((flags & LK_INTERLOCK) == 0) + if ((flags & LK_INTERLOCK) == 0) { simple_lock(&vp->v_interlock); + } if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); @@ -707,8 +840,22 @@ vget(vp, flags, p) simple_lock(&vnode_free_list_slock); TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); simple_unlock(&vnode_free_list_slock); + freevnodes--; } vp->v_usecount++; + /* + * Create the VM object, if needed + */ + if ((vp->v_type == VREG) && + ((vp->v_object == NULL) || + (vp->v_object->flags & OBJ_VFS_REF) == 0)) { + /* + * XXX vfs_object_create probably needs the interlock. + */ + simple_unlock(&vp->v_interlock); + vfs_object_create(vp, curproc, curproc->p_ucred, 0); + simple_lock(&vp->v_interlock); + } if (flags & LK_TYPE_MASK) { if (error = vn_lock(vp, flags | LK_INTERLOCK, p)) vrele(vp); @@ -781,14 +928,15 @@ vop_nolock(ap) * Since we are not using the lock manager, we must clear * the interlock here. */ - if (ap->a_flags & LK_INTERLOCK) + if (ap->a_flags & LK_INTERLOCK) { simple_unlock(&ap->a_vp->v_interlock); + } return (0); #endif } /* - * Decrement the active use count. + * Do the inverse of vop_nolock, handling the interlock in a compatible way. */ int vop_nounlock(ap) @@ -800,9 +948,13 @@ vop_nounlock(ap) { struct vnode *vp = ap->a_vp; - if (vp->v_vnlock == NULL) + if (vp->v_vnlock == NULL) { + if (ap->a_flags & LK_INTERLOCK) + simple_unlock(&ap->a_vp->v_interlock); return (0); - return (lockmgr(vp->v_vnlock, LK_RELEASE, NULL, ap->a_p)); + } + return (lockmgr(vp->v_vnlock, LK_RELEASE | ap->a_flags, + &ap->a_vp->v_interlock, ap->a_p)); } /* @@ -821,91 +973,124 @@ vop_noislocked(ap) return (lockstatus(vp->v_vnlock)); } +/* #ifdef DIAGNOSTIC */ /* - * Vnode reference. 
+ * Vnode reference, just increment the count */ void vref(vp) struct vnode *vp; { - simple_lock(&vp->v_interlock); if (vp->v_usecount <= 0) panic("vref used where vget required"); + vp->v_usecount++; + + if ((vp->v_type == VREG) && + ((vp->v_object == NULL) || + ((vp->v_object->flags & OBJ_VFS_REF) == 0)) ) { + /* + * We need to lock to VP during the time that + * the object is created. This is necessary to + * keep the system from re-entrantly doing it + * multiple times. + * XXX vfs_object_create probably needs the interlock? + */ + simple_unlock(&vp->v_interlock); + vfs_object_create(vp, curproc, curproc->p_ucred, 0); + return; + } simple_unlock(&vp->v_interlock); } /* - * vput(), just unlock and vrele() + * Vnode put/release. + * If count drops to zero, call inactive routine and return to freelist. */ void -vput(vp) +vputrele(vp, put) struct vnode *vp; + int put; { struct proc *p = curproc; /* XXX */ -#ifdef DIGANOSTIC +#ifdef DIAGNOSTIC if (vp == NULL) - panic("vput: null vp"); + panic("vputrele: null vp"); #endif simple_lock(&vp->v_interlock); vp->v_usecount--; + + if ((vp->v_usecount == 1) && + vp->v_object && + (vp->v_object->flags & OBJ_VFS_REF)) { + vp->v_object->flags &= ~OBJ_VFS_REF; + if (put) { + VOP_UNLOCK(vp, LK_INTERLOCK, p); + } else { + simple_unlock(&vp->v_interlock); + } + vm_object_deallocate(vp->v_object); + return; + } + if (vp->v_usecount > 0) { - simple_unlock(&vp->v_interlock); - VOP_UNLOCK(vp, 0, p); + if (put) { + VOP_UNLOCK(vp, LK_INTERLOCK, p); + } else { + simple_unlock(&vp->v_interlock); + } return; } + + if (vp->v_usecount < 0) { #ifdef DIAGNOSTIC - if (vp->v_usecount < 0 || vp->v_writecount != 0) { - vprint("vput: bad ref count", vp); - panic("vput: ref cnt"); - } + vprint("vputrele: negative ref count", vp); #endif - /* - * insert at tail of LRU list - */ + panic("vputrele: negative ref cnt"); + } simple_lock(&vnode_free_list_slock); - TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + if (vp->v_flag & VAGE) { + vp->v_flag 
&= ~VAGE; + vp->v_usage = 0; + if(vp->v_tag != VT_TFS) + TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); + } else { + if(vp->v_tag != VT_TFS) + TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); + } + freevnodes++; simple_unlock(&vnode_free_list_slock); - simple_unlock(&vp->v_interlock); - VOP_INACTIVE(vp, p); + + /* + * If we are doing a vput, the node is already locked, and we must + * call VOP_INACTIVE with the node locked. So, in the case of + * vrele, we explicitly lock the vnode before calling VOP_INACTIVE. + */ + if (put) { + simple_unlock(&vp->v_interlock); + VOP_INACTIVE(vp, p); + } else if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) { + VOP_INACTIVE(vp, p); + } } /* - * Vnode release. - * If count drops to zero, call inactive routine and return to freelist. + * vput(), just unlock and vrele() */ void -vrele(vp) +vput(vp) struct vnode *vp; { - struct proc *p = curproc; /* XXX */ + vputrele(vp, 1); +} -#ifdef DIAGNOSTIC - if (vp == NULL) - panic("vrele: null vp"); -#endif - simple_lock(&vp->v_interlock); - vp->v_usecount--; - if (vp->v_usecount > 0) { - simple_unlock(&vp->v_interlock); - return; - } -#ifdef DIAGNOSTIC - if (vp->v_usecount < 0 || vp->v_writecount != 0) { - vprint("vrele: bad ref count", vp); - panic("vrele: ref cnt"); - } -#endif - /* - * insert at tail of LRU list - */ - simple_lock(&vnode_free_list_slock); - TAILQ_INSERT_TAIL(&vnode_free_list, vp, v_freelist); - simple_unlock(&vnode_free_list_slock); - if (vn_lock(vp, LK_EXCLUSIVE | LK_INTERLOCK, p) == 0) - VOP_INACTIVE(vp, p); +void +vrele(vp) + struct vnode *vp; +{ + vputrele(vp, 0); } #ifdef DIAGNOSTIC @@ -947,8 +1132,8 @@ holdrele(vp) * that are found. 
*/ #ifdef DIAGNOSTIC -int busyprt = 0; /* print out busy vnodes */ -struct ctldebug debug1 = { "busyprt", &busyprt }; +static int busyprt = 0; /* print out busy vnodes */ +SYSCTL_INT(_debug, 1, busyprt, CTLFLAG_RW, &busyprt, 0, ""); #endif int @@ -964,6 +1149,10 @@ vflush(mp, skipvp, flags) simple_lock(&mntvnode_slock); loop: for (vp = mp->mnt_vnodelist.lh_first; vp; vp = nvp) { + /* + * Make sure this vnode wasn't reclaimed in getnewvnode(). + * Start over if it has (it won't be on the list anymore). + */ if (vp->v_mount != mp) goto loop; nvp = vp->v_mntvnodes.le_next; @@ -982,17 +1171,29 @@ loop: continue; } /* - * If WRITECLOSE is set, only flush out regular file - * vnodes open for writing. + * If WRITECLOSE is set, only flush out regular file vnodes + * open for writing. */ if ((flags & WRITECLOSE) && (vp->v_writecount == 0 || vp->v_type != VREG)) { simple_unlock(&vp->v_interlock); continue; } + + if (vp->v_object && (vp->v_object->flags & OBJ_VFS_REF)) { + simple_unlock(&vp->v_interlock); + simple_unlock(&mntvnode_slock); + vm_object_reference(vp->v_object); + pager_cache(vp->v_object, FALSE); + vp->v_object->flags &= ~OBJ_VFS_REF; + vm_object_deallocate(vp->v_object); + simple_lock(&mntvnode_slock); + simple_lock(&vp->v_interlock); + } + /* - * With v_usecount == 0, all we need to do is clear - * out the vnode data structures and we are done. + * With v_usecount == 0, all we need to do is clear out the + * vnode data structures and we are done. */ if (vp->v_usecount == 0) { simple_unlock(&mntvnode_slock); @@ -1000,10 +1201,11 @@ loop: simple_lock(&mntvnode_slock); continue; } + /* - * If FORCECLOSE is set, forcibly close the vnode. - * For block or character devices, revert to an - * anonymous device. For all other files, just kill them. + * If FORCECLOSE is set, forcibly close the vnode. For block + * or character devices, revert to an anonymous device. For + * all other files, just kill them. 
*/ if (flags & FORCECLOSE) { simple_unlock(&mntvnode_slock); @@ -1012,7 +1214,7 @@ loop: } else { vclean(vp, 0, p); vp->v_op = spec_vnodeop_p; - insmntque(vp, (struct mount *)0); + insmntque(vp, (struct mount *) 0); } simple_lock(&mntvnode_slock); continue; @@ -1032,27 +1234,22 @@ loop: /* * Disassociate the underlying file system from a vnode. - * The vnode interlock is held on entry. */ static void -vclean(vp, flags, p) - struct vnode *vp; - int flags; - struct proc *p; +vclean(struct vnode *vp, int flags, struct proc *p) { int active; /* - * Check to see if the vnode is in use. - * If so we have to reference it before we clean it out - * so that its count cannot fall to zero and generate a - * race against ourselves to recycle it. + * Check to see if the vnode is in use. If so we have to reference it + * before we clean it out so that its count cannot fall to zero and + * generate a race against ourselves to recycle it. */ - if (active = vp->v_usecount) + if ((active = vp->v_usecount)) vp->v_usecount++; /* - * Prevent the vnode from being recycled or - * brought into use while we clean it out. + * Prevent the vnode from being recycled or brought into use while we + * clean it out. */ if (vp->v_flag & VXLOCK) panic("vclean: deadlock"); @@ -1109,12 +1306,12 @@ vclean(vp, flags, p) vp->v_flag &= ~VXLOCK; if (vp->v_flag & VXWANT) { vp->v_flag &= ~VXWANT; - wakeup((caddr_t)vp); + wakeup((caddr_t) vp); } } /* - * Eliminate all activity associated with the requested vnode + * Eliminate all activity associated with the requested vnode * and with all vnodes aliased to the requested vnode. 
*/ int @@ -1162,8 +1359,9 @@ vop_revoke(ap) vgone(vq); break; } - if (vq == NULLVP) + if (vq == NULLVP) { simple_unlock(&spechash_slock); + } } /* * Remove the lock so that vgone below will @@ -1190,8 +1388,9 @@ vrecycle(vp, inter_lkp, p) simple_lock(&vp->v_interlock); if (vp->v_usecount == 0) { - if (inter_lkp) + if (inter_lkp) { simple_unlock(inter_lkp); + } vgonel(vp, p); return (1); } @@ -1205,7 +1404,7 @@ vrecycle(vp, inter_lkp, p) */ void vgone(vp) - struct vnode *vp; + register struct vnode *vp; { struct proc *p = curproc; /* XXX */ @@ -1234,6 +1433,11 @@ vgonel(vp, p) tsleep((caddr_t)vp, PINOD, "vgone", 0); return; } + + if (vp->v_object) { + vp->v_object->flags |= OBJ_VNODE_GONE; + } + /* * Clean out the filesystem specific data. */ @@ -1281,6 +1485,7 @@ vgonel(vp, p) FREE(vp->v_specinfo, M_VNODE); vp->v_specinfo = NULL; } + /* * If it is on the freelist and not already at the head, * move it to the head of the list. The test of the back @@ -1297,12 +1502,13 @@ vgonel(vp, p) if (vp->v_usecount == 0) { simple_lock(&vnode_free_list_slock); if ((vp->v_freelist.tqe_prev != (struct vnode **)0xdeadb) && - vnode_free_list.tqh_first != vp) { + vnode_free_list.tqh_first != vp) { TAILQ_REMOVE(&vnode_free_list, vp, v_freelist); TAILQ_INSERT_HEAD(&vnode_free_list, vp, v_freelist); } simple_unlock(&vnode_free_list_slock); } + vp->v_type = VBAD; } @@ -1315,7 +1521,7 @@ vfinddev(dev, type, vpp) enum vtype type; struct vnode **vpp; { - struct vnode *vp; + register struct vnode *vp; int rc = 0; simple_lock(&spechash_slock); @@ -1335,7 +1541,7 @@ vfinddev(dev, type, vpp) */ int vcount(vp) - struct vnode *vp; + register struct vnode *vp; { struct vnode *vq, *vnext; int count; @@ -1366,7 +1572,7 @@ loop: * Print out a description of a vnode. 
*/ static char *typename[] = - { "VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD" }; +{"VNON", "VREG", "VDIR", "VBLK", "VCHR", "VLNK", "VSOCK", "VFIFO", "VBAD"}; void vprint(label, vp) @@ -1377,9 +1583,9 @@ vprint(label, vp) if (label != NULL) printf("%s: ", label); - printf("type %s, usecount %d, writecount %d, refcount %d,", - typename[vp->v_type], vp->v_usecount, vp->v_writecount, - vp->v_holdcnt); + printf("type %s, usecount %d, writecount %d, refcount %ld,", + typename[vp->v_type], vp->v_usecount, vp->v_writecount, + vp->v_holdcnt); buf[0] = '\0'; if (vp->v_flag & VROOT) strcat(buf, "|VROOT"); @@ -1405,7 +1611,7 @@ vprint(label, vp) } } -#ifdef DEBUG +#ifdef DDB /* * List all of the locked vnodes in the system. * Called when debugging the kernel. @@ -1441,19 +1647,22 @@ printlockedvnodes() /* * Top level filesystem related information gathering. */ -int -vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) - int *name; - u_int namelen; - void *oldp; - size_t *oldlenp; - void *newp; - size_t newlen; - struct proc *p; +static int sysctl_ovfs_conf __P(SYSCTL_HANDLER_ARGS); + +static int +vfs_sysctl SYSCTL_HANDLER_ARGS { - struct ctldebug *cdp; + int *name = (int *)arg1 - 1; /* XXX */ + u_int namelen = arg2 + 1; /* XXX */ struct vfsconf *vfsp; +#ifndef NO_COMPAT_PRELITE2 + /* Resolve ambiguity between VFS_VFSCONF and VFS_GENERIC. 
*/ + if (namelen == 1) + return (sysctl_ovfs_conf(oidp, arg1, arg2, req)); +#endif + +#ifdef notyet /* all sysctl names at this level are at least name and field */ if (namelen < 2) return (ENOTDIR); /* overloaded */ @@ -1466,58 +1675,83 @@ vfs_sysctl(name, namelen, oldp, oldlenp, newp, newlen, p) return ((*vfsp->vfc_vfsops->vfs_sysctl)(&name[1], namelen - 1, oldp, oldlenp, newp, newlen, p)); } +#endif switch (name[1]) { case VFS_MAXTYPENUM: - return (sysctl_rdint(oldp, oldlenp, newp, maxvfsconf)); + if (namelen != 2) + return (ENOTDIR); + return (SYSCTL_OUT(req, &maxvfsconf, sizeof(int))); case VFS_CONF: - if (namelen < 3) + if (namelen != 3) return (ENOTDIR); /* overloaded */ for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) if (vfsp->vfc_typenum == name[2]) break; if (vfsp == NULL) return (EOPNOTSUPP); - return (sysctl_rdstruct(oldp, oldlenp, newp, vfsp, - sizeof(struct vfsconf))); + return (SYSCTL_OUT(req, vfsp, sizeof *vfsp)); } return (EOPNOTSUPP); } +SYSCTL_NODE(_vfs, VFS_GENERIC, generic, CTLFLAG_RD, vfs_sysctl, + "Generic filesystem"); + +#ifndef NO_COMPAT_PRELITE2 + +static int +sysctl_ovfs_conf SYSCTL_HANDLER_ARGS +{ + int error; + struct vfsconf *vfsp; + struct ovfsconf ovfs; + + for (vfsp = vfsconf; vfsp; vfsp = vfsp->vfc_next) { + ovfs.vfc_vfsops = vfsp->vfc_vfsops; /* XXX used as flag */ + strcpy(ovfs.vfc_name, vfsp->vfc_name); + ovfs.vfc_index = vfsp->vfc_typenum; + ovfs.vfc_refcount = vfsp->vfc_refcount; + ovfs.vfc_flags = vfsp->vfc_flags; + error = SYSCTL_OUT(req, &ovfs, sizeof ovfs); + if (error) + return error; + } + return 0; +} + +#endif /* !NO_COMPAT_PRELITE2 */ + int kinfo_vdebug = 1; int kinfo_vgetfailed; + #define KINFO_VNODESLOP 10 /* * Dump vnode list (via sysctl). * Copyout address of vnode followed by vnode. 
*/ /* ARGSUSED */ -int -sysctl_vnode(where, sizep, p) - char *where; - size_t *sizep; - struct proc *p; +static int +sysctl_vnode SYSCTL_HANDLER_ARGS { + struct proc *p = curproc; /* XXX */ struct mount *mp, *nmp; struct vnode *nvp, *vp; - char *bp = where, *savebp; - char *ewhere; int error; #define VPTRSZ sizeof (struct vnode *) #define VNODESZ sizeof (struct vnode) - if (where == NULL) { - *sizep = (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ); - return (0); - } - ewhere = where + *sizep; - + + req->lock = 0; + if (!req->oldptr) /* Make an estimate */ + return (SYSCTL_OUT(req, 0, + (numvnodes + KINFO_VNODESLOP) * (VPTRSZ + VNODESZ))); + simple_lock(&mountlist_slock); for (mp = mountlist.cqh_first; mp != (void *)&mountlist; mp = nmp) { if (vfs_busy(mp, LK_NOWAIT, &mountlist_slock, p)) { nmp = mp->mnt_list.cqe_next; continue; } - savebp = bp; again: simple_lock(&mntvnode_slock); for (vp = mp->mnt_vnodelist.lh_first; @@ -1532,20 +1766,13 @@ again: simple_unlock(&mntvnode_slock); if (kinfo_vdebug) printf("kinfo: vp changed\n"); - bp = savebp; goto again; } nvp = vp->v_mntvnodes.le_next; - if (bp + VPTRSZ + VNODESZ > ewhere) { - simple_unlock(&mntvnode_slock); - *sizep = bp - where; - return (ENOMEM); - } simple_unlock(&mntvnode_slock); - if ((error = copyout((caddr_t)&vp, bp, VPTRSZ)) || - (error = copyout((caddr_t)vp, bp + VPTRSZ, VNODESZ))) + if ((error = SYSCTL_OUT(req, &vp, VPTRSZ)) || + (error = SYSCTL_OUT(req, vp, VNODESZ))) return (error); - bp += VPTRSZ + VNODESZ; simple_lock(&mntvnode_slock); } simple_unlock(&mntvnode_slock); @@ -1555,10 +1782,12 @@ again: } simple_unlock(&mountlist_slock); - *sizep = bp - where; return (0); } +SYSCTL_PROC(_kern, KERN_VNODE, vnode, CTLTYPE_OPAQUE|CTLFLAG_RD, + 0, 0, sysctl_vnode, "S,vnode", ""); + /* * Check to see if a filesystem is mounted on a block device. 
*/ @@ -1595,14 +1824,23 @@ void vfs_unmountall() { struct mount *mp, *nmp; - struct proc *p = curproc; /* XXX */ + struct proc *p = initproc; /* XXX XXX should this be proc0? */ + int error; /* * Since this only runs when rebooting, it is not interlocked. */ for (mp = mountlist.cqh_last; mp != (void *)&mountlist; mp = nmp) { nmp = mp->mnt_list.cqe_prev; - (void) dounmount(mp, MNT_FORCE, p); + error = dounmount(mp, MNT_FORCE, p); + if (error) { + printf("unmount of %s failed (", + mp->mnt_stat.f_mntonname); + if (error == EBUSY) + printf("BUSY)\n"); + else + printf("%d)\n", error); + } } } @@ -1611,10 +1849,8 @@ vfs_unmountall() * Called by ufs_mount() to set up the lists of export addresses. */ static int -vfs_hang_addrlist(mp, nep, argp) - struct mount *mp; - struct netexport *nep; - struct export_args *argp; +vfs_hang_addrlist(struct mount *mp, struct netexport *nep, + struct export_args *argp) { register struct netcred *np; register struct radix_node_head *rnh; @@ -1635,16 +1871,16 @@ vfs_hang_addrlist(mp, nep, argp) return (0); } i = sizeof(struct netcred) + argp->ex_addrlen + argp->ex_masklen; - np = (struct netcred *)malloc(i, M_NETADDR, M_WAITOK); - bzero((caddr_t)np, i); - saddr = (struct sockaddr *)(np + 1); - if (error = copyin(argp->ex_addr, (caddr_t)saddr, argp->ex_addrlen)) + np = (struct netcred *) malloc(i, M_NETADDR, M_WAITOK); + bzero((caddr_t) np, i); + saddr = (struct sockaddr *) (np + 1); + if ((error = copyin(argp->ex_addr, (caddr_t) saddr, argp->ex_addrlen))) goto out; if (saddr->sa_len > argp->ex_addrlen) saddr->sa_len = argp->ex_addrlen; if (argp->ex_masklen) { - smask = (struct sockaddr *)((caddr_t)saddr + argp->ex_addrlen); - error = copyin(argp->ex_addr, (caddr_t)smask, argp->ex_masklen); + smask = (struct sockaddr *) ((caddr_t) saddr + argp->ex_addrlen); + error = copyin(argp->ex_addr, (caddr_t) smask, argp->ex_masklen); if (error) goto out; if (smask->sa_len > argp->ex_masklen) @@ -1653,13 +1889,13 @@ vfs_hang_addrlist(mp, nep, argp) i 
= saddr->sa_family; if ((rnh = nep->ne_rtable[i]) == 0) { /* - * Seems silly to initialize every AF when most are not - * used, do so on demand here + * Seems silly to initialize every AF when most are not used, + * do so on demand here */ for (dom = domains; dom; dom = dom->dom_next) if (dom->dom_family == i && dom->dom_rtattach) { - dom->dom_rtattach((void **)&nep->ne_rtable[i], - dom->dom_rtoffset); + dom->dom_rtattach((void **) &nep->ne_rtable[i], + dom->dom_rtoffset); break; } if ((rnh = nep->ne_rtable[i]) == 0) { @@ -1667,23 +1903,11 @@ vfs_hang_addrlist(mp, nep, argp) goto out; } } - rn = (*rnh->rnh_addaddr)((caddr_t)saddr, (caddr_t)smask, rnh, - np->netc_rnodes); - if (rn == 0) { - /* - * One of the reasons that rnh_addaddr may fail is that - * the entry already exists. To check for this case, we - * look up the entry to see if it is there. If so, we - * do not need to make a new entry but do return success. - */ - free(np, M_NETADDR); - rn = (*rnh->rnh_matchaddr)((caddr_t)saddr, rnh); - if (rn != 0 && (rn->rn_flags & RNF_ROOT) == 0 && - ((struct netcred *)rn)->netc_exflags == argp->ex_flags && - !bcmp((caddr_t)&((struct netcred *)rn)->netc_anon, - (caddr_t)&argp->ex_anon, sizeof(struct ucred))) - return (0); - return (EPERM); + rn = (*rnh->rnh_addaddr) ((caddr_t) saddr, (caddr_t) smask, rnh, + np->netc_rnodes); + if (rn == 0 || np != (struct netcred *) rn) { /* already exists */ + error = EPERM; + goto out; } np->netc_exflags = argp->ex_flags; np->netc_anon = argp->ex_anon; @@ -1696,14 +1920,12 @@ out: /* ARGSUSED */ static int -vfs_free_netcred(rn, w) - struct radix_node *rn; - caddr_t w; +vfs_free_netcred(struct radix_node *rn, void *w) { - register struct radix_node_head *rnh = (struct radix_node_head *)w; + register struct radix_node_head *rnh = (struct radix_node_head *) w; - (*rnh->rnh_deladdr)(rn->rn_key, rn->rn_mask, rnh); - free((caddr_t)rn, M_NETADDR); + (*rnh->rnh_deladdr) (rn->rn_key, rn->rn_mask, rnh); + free((caddr_t) rn, M_NETADDR); return 
(0); } @@ -1711,17 +1933,16 @@ vfs_free_netcred(rn, w) * Free the net address hash lists that are hanging off the mount points. */ static void -vfs_free_addrlist(nep) - struct netexport *nep; +vfs_free_addrlist(struct netexport *nep) { register int i; register struct radix_node_head *rnh; for (i = 0; i <= AF_MAX; i++) - if (rnh = nep->ne_rtable[i]) { - (*rnh->rnh_walktree)(rnh, vfs_free_netcred, - (caddr_t)rnh); - free((caddr_t)rnh, M_RTABLE); + if ((rnh = nep->ne_rtable[i])) { + (*rnh->rnh_walktree) (rnh, vfs_free_netcred, + (caddr_t) rnh); + free((caddr_t) rnh, M_RTABLE); nep->ne_rtable[i] = 0; } } @@ -1739,7 +1960,7 @@ vfs_export(mp, nep, argp) mp->mnt_flag &= ~(MNT_EXPORTED | MNT_DEFEXPORTED); } if (argp->ex_flags & MNT_EXPORTED) { - if (error = vfs_hang_addrlist(mp, nep, argp)) + if ((error = vfs_hang_addrlist(mp, nep, argp))) return (error); mp->mnt_flag |= MNT_EXPORTED; } @@ -1780,3 +2001,79 @@ vfs_export_lookup(mp, nep, nam) } return (np); } + +/* + * perform msync on all vnodes under a mount point + * the mount point must be locked. + */ +void +vfs_msync(struct mount *mp, int flags) { + struct vnode *vp, *nvp; +loop: + for (vp = mp->mnt_vnodelist.lh_first; vp != NULL; vp = nvp) { + + if (vp->v_mount != mp) + goto loop; + nvp = vp->v_mntvnodes.le_next; + if (VOP_ISLOCKED(vp) && (flags != MNT_WAIT)) + continue; + if (vp->v_object && + (vp->v_object->flags & OBJ_MIGHTBEDIRTY)) { + vm_object_page_clean(vp->v_object, 0, 0, TRUE, TRUE); + } + } +} + +/* + * Create the VM object needed for VMIO and mmap support. This + * is done for all VREG files in the system. Some filesystems might + * afford the additional metadata buffering capability of the + * VMIO code by making the device node be VMIO mode also. 
+ */ +int +vfs_object_create(vp, p, cred, waslocked) + struct vnode *vp; + struct proc *p; + struct ucred *cred; + int waslocked; +{ + struct vattr vat; + vm_object_t object; + int error = 0; + +retry: + if ((object = vp->v_object) == NULL) { + if (vp->v_type == VREG) { + if ((error = VOP_GETATTR(vp, &vat, cred, p)) != 0) + goto retn; + (void) vnode_pager_alloc(vp, + OFF_TO_IDX(round_page(vat.va_size)), 0, 0); + } else { + /* + * This simply allocates the biggest object possible + * for a VBLK vnode. This should be fixed, but doesn't + * cause any problems (yet). + */ + (void) vnode_pager_alloc(vp, INT_MAX, 0, 0); + } + vp->v_object->flags |= OBJ_VFS_REF; + } else { + if (object->flags & OBJ_DEAD) { + if (waslocked) + VOP_UNLOCK(vp, 0, p); + tsleep(object, PVM, "vodead", 0); + if (waslocked) + vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); + goto retry; + } + if ((object->flags & OBJ_VFS_REF) == 0) { + object->flags |= OBJ_VFS_REF; + vm_object_reference(object); + } + } + if (vp->v_object) + vp->v_flag |= VVMIO; + +retn: + return error; +} diff --git a/sys/kern/vfs_syscalls.c b/sys/kern/vfs_syscalls.c index 0cf7680..2997fe5 100644 --- a/sys/kern/vfs_syscalls.c +++ b/sys/kern/vfs_syscalls.c @@ -35,16 +35,30 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)vfs_syscalls.c 8.41 (Berkeley) 6/15/95 + * @(#)vfs_syscalls.c 8.13 (Berkeley) 4/15/94 + * $Id: vfs_syscalls.c,v 1.60 1997/03/23 03:36:35 bde Exp $ */ +/* + * XXX - The following is required because of some magic done + * in getdirentries() below which is only done if the translucent + * filesystem `UNION' is compiled into the kernel. This is broken, + * but I don't have time to study the code deeply enough to understand + * what's going on and determine an appropriate fix. 
-GAW + */ +#include "opt_union.h" + #include <sys/param.h> #include <sys/systm.h> +#include <sys/sysent.h> +#include <sys/sysproto.h> #include <sys/namei.h> #include <sys/filedesc.h> #include <sys/kernel.h> +#include <sys/fcntl.h> #include <sys/file.h> #include <sys/stat.h> +#include <sys/unistd.h> #include <sys/vnode.h> #include <sys/mount.h> #include <sys/proc.h> @@ -52,9 +66,14 @@ #include <sys/malloc.h> #include <sys/dirent.h> -#include <sys/syscallargs.h> +#ifdef UNION +#include <miscfs/union/union.h> +#endif #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vm_extern.h> #include <sys/sysctl.h> static int change_dir __P((struct nameidata *ndp, struct proc *p)); @@ -67,6 +86,14 @@ static void checkdirs __P((struct vnode *olddp)); /* * Mount a file system. */ +#ifndef _SYS_SYSPROTO_H_ +struct mount_args { + char *type; + char *path; + int flags; + caddr_t data; +}; +#endif /* ARGSUSED */ int mount(p, uap, retval) @@ -82,7 +109,7 @@ mount(p, uap, retval) struct vnode *vp; struct mount *mp; struct vfsconf *vfsp; - int error, flag; + int error, flag = 0; struct vattr va; u_long fstypenum; struct nameidata nd; @@ -228,9 +255,10 @@ update: else if (mp->mnt_flag & MNT_RDONLY) mp->mnt_flag |= MNT_WANTRDWR; mp->mnt_flag &=~ (MNT_NOSUID | MNT_NOEXEC | MNT_NODEV | - MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC); + MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_NOATIME); mp->mnt_flag |= SCARG(uap, flags) & (MNT_NOSUID | MNT_NOEXEC | - MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC); + MNT_NODEV | MNT_SYNCHRONOUS | MNT_UNION | MNT_ASYNC | MNT_FORCE | + MNT_NOATIME); /* * Mount the filesystem. */ @@ -313,6 +341,12 @@ checkdirs(olddp) * Note: unmount takes a path to the vnode mounted on as argument, * not special file (as before). 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct unmount_args { + char *path; + int flags; +}; +#endif /* ARGSUSED */ int unmount(p, uap, retval) @@ -380,6 +414,7 @@ dounmount(mp, flags, p) mp->mnt_flag |= MNT_UNMOUNT; lockmgr(&mp->mnt_lock, LK_DRAIN | LK_INTERLOCK, &mountlist_slock, p); mp->mnt_flag &=~ MNT_ASYNC; + vfs_msync(mp, MNT_NOWAIT); vnode_pager_umount(mp); /* release cached vnodes */ cache_purgevfs(mp); /* remove cache entries for this file sys */ if (((mp->mnt_flag & MNT_RDONLY) || @@ -411,16 +446,22 @@ dounmount(mp, flags, p) /* * Sync each mounted filesystem. */ +#ifndef _SYS_SYSPROTO_H_ +struct sync_args { + int dummy; +}; +#endif + #ifdef DEBUG int syncprt = 0; -struct ctldebug debug0 = { "syncprt", &syncprt }; +SYSCTL_INT(_debug, 0, syncprt, CTLFLAG_RW, &syncprt, 0, ""); #endif /* ARGSUSED */ int sync(p, uap, retval) struct proc *p; - void *uap; + struct sync_args *uap; register_t *retval; { register struct mount *mp, *nmp; @@ -435,7 +476,8 @@ sync(p, uap, retval) if ((mp->mnt_flag & MNT_RDONLY) == 0) { asyncflag = mp->mnt_flag & MNT_ASYNC; mp->mnt_flag &= ~MNT_ASYNC; - VFS_SYNC(mp, MNT_NOWAIT, p->p_ucred, p); + vfs_msync(mp, MNT_NOWAIT); + VFS_SYNC(mp, MNT_NOWAIT, p != NULL ? p->p_ucred : NOCRED, p); if (asyncflag) mp->mnt_flag |= MNT_ASYNC; } @@ -444,16 +486,30 @@ sync(p, uap, retval) vfs_unbusy(mp, p); } simple_unlock(&mountlist_slock); +#if 0 +/* + * XXX don't call vfs_bufstats() yet because that routine + * was not imported in the Lite2 merge. + */ #ifdef DIAGNOSTIC if (syncprt) vfs_bufstats(); #endif /* DIAGNOSTIC */ +#endif return (0); } /* * Change filesystem quotas. */ +#ifndef _SYS_SYSPROTO_H_ +struct quotactl_args { + char *path; + int cmd; + int uid; + caddr_t arg; +}; +#endif /* ARGSUSED */ int quotactl(p, uap, retval) @@ -482,6 +538,12 @@ quotactl(p, uap, retval) /* * Get filesystem statistics. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct statfs_args { + char *path; + struct statfs *buf; +}; +#endif /* ARGSUSED */ int statfs(p, uap, retval) @@ -496,6 +558,7 @@ statfs(p, uap, retval) register struct statfs *sp; int error; struct nameidata nd; + struct statfs sb; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); if (error = namei(&nd)) @@ -503,15 +566,27 @@ statfs(p, uap, retval) mp = nd.ni_vp->v_mount; sp = &mp->mnt_stat; vrele(nd.ni_vp); - if (error = VFS_STATFS(mp, sp, p)) + error = VFS_STATFS(mp, sp, p); + if (error) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); } /* * Get filesystem statistics. */ +#ifndef _SYS_SYSPROTO_H_ +struct fstatfs_args { + int fd; + struct statfs *buf; +}; +#endif /* ARGSUSED */ int fstatfs(p, uap, retval) @@ -526,20 +601,34 @@ fstatfs(p, uap, retval) struct mount *mp; register struct statfs *sp; int error; + struct statfs sb; if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) return (error); mp = ((struct vnode *)fp->f_data)->v_mount; sp = &mp->mnt_stat; - if (error = VFS_STATFS(mp, sp, p)) + error = VFS_STATFS(mp, sp, p); + if (error) return (error); sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; + if (p->p_ucred->cr_uid != 0) { + bcopy((caddr_t)sp, (caddr_t)&sb, sizeof(sb)); + sb.f_fsid.val[0] = sb.f_fsid.val[1] = 0; + sp = &sb; + } return (copyout((caddr_t)sp, (caddr_t)SCARG(uap, buf), sizeof(*sp))); } /* * Get statistics on all filesystems. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct getfsstat_args { + struct statfs *buf; + long bufsize; + int flags; +}; +#endif int getfsstat(p, uap, retval) struct proc *p; @@ -579,8 +668,11 @@ getfsstat(p, uap, retval) continue; } sp->f_flags = mp->mnt_flag & MNT_VISFLAGMASK; - if (error = copyout((caddr_t)sp, sfsp, sizeof(*sp))) + error = copyout((caddr_t)sp, sfsp, sizeof(*sp)); + if (error) { + vfs_unbusy(mp, p); return (error); + } sfsp += sizeof(*sp); } count++; @@ -599,6 +691,11 @@ getfsstat(p, uap, retval) /* * Change current working directory to a given file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct fchdir_args { + int fd; +}; +#endif /* ARGSUSED */ int fchdir(p, uap, retval) @@ -646,6 +743,11 @@ fchdir(p, uap, retval) /* * Change current working directory (``.''). */ +#ifndef _SYS_SYSPROTO_H_ +struct chdir_args { + char *path; +}; +#endif /* ARGSUSED */ int chdir(p, uap, retval) @@ -671,6 +773,11 @@ chdir(p, uap, retval) /* * Change notion of root (``/'') directory. */ +#ifndef _SYS_SYSPROTO_H_ +struct chroot_args { + char *path; +}; +#endif /* ARGSUSED */ int chroot(p, uap, retval) @@ -684,7 +791,8 @@ chroot(p, uap, retval) int error; struct nameidata nd; - if (error = suser(p->p_ucred, &p->p_acflag)) + error = suser(p->p_ucred, &p->p_acflag); + if (error) return (error); NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_USERSPACE, SCARG(uap, path), p); @@ -707,7 +815,8 @@ change_dir(ndp, p) struct vnode *vp; int error; - if (error = namei(ndp)) + error = namei(ndp); + if (error) return (error); vp = ndp->ni_vp; if (vp->v_type != VDIR) @@ -725,6 +834,13 @@ change_dir(ndp, p) * Check permissions, allocate an open file structure, * and call the device open routine if any. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct open_args { + char *path; + int flags; + int mode; +}; +#endif int open(p, uap, retval) struct proc *p; @@ -743,16 +859,17 @@ open(p, uap, retval) int type, indx, error; struct flock lf; struct nameidata nd; - extern struct fileops vnops; - if (error = falloc(p, &nfp, &indx)) + error = falloc(p, &nfp, &indx); + if (error) return (error); fp = nfp; flags = FFLAGS(SCARG(uap, flags)); cmode = ((SCARG(uap, mode) &~ fdp->fd_cmask) & ALLPERMS) &~ S_ISTXT; NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); p->p_dupfd = -indx - 1; /* XXX check for fdopen */ - if (error = vn_open(&nd, flags, cmode)) { + error = vn_open(&nd, flags, cmode); + if (error) { ffree(fp); if ((error == ENODEV || error == ENXIO) && p->p_dupfd >= 0 && /* XXX from fdopen */ @@ -768,8 +885,9 @@ open(p, uap, retval) } p->p_dupfd = 0; vp = nd.ni_vp; + fp->f_flag = flags & FMASK; - fp->f_type = DTYPE_VNODE; + fp->f_type = (vp->v_type == VFIFO ? DTYPE_FIFO : DTYPE_VNODE); fp->f_ops = &vnops; fp->f_data = (caddr_t)vp; if (flags & (O_EXLOCK | O_SHLOCK)) { @@ -802,10 +920,16 @@ open(p, uap, retval) /* * Create a file. */ +#ifndef _SYS_SYSPROTO_H_ +struct ocreat_args { + char *path; + int mode; +}; +#endif int -compat_43_creat(p, uap, retval) +ocreat(p, uap, retval) struct proc *p; - register struct compat_43_creat_args /* { + register struct ocreat_args /* { syscallarg(char *) path; syscallarg(int) mode; } */ *uap; @@ -827,6 +951,13 @@ compat_43_creat(p, uap, retval) /* * Create a special file. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct mknod_args { + char *path; + int mode; + int dev; +}; +#endif /* ARGSUSED */ int mknod(p, uap, retval) @@ -844,7 +975,8 @@ mknod(p, uap, retval) int whiteout; struct nameidata nd; - if (error = suser(p->p_ucred, &p->p_acflag)) + error = suser(p->p_ucred, &p->p_acflag); + if (error) return (error); NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); if (error = namei(&nd)) @@ -902,6 +1034,12 @@ mknod(p, uap, retval) /* * Create a named pipe. */ +#ifndef _SYS_SYSPROTO_H_ +struct mkfifo_args { + char *path; + int mode; +}; +#endif /* ARGSUSED */ int mkfifo(p, uap, retval) @@ -916,9 +1054,6 @@ mkfifo(p, uap, retval) int error; struct nameidata nd; -#ifndef FIFO - return (EOPNOTSUPP); -#else NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); if (error = namei(&nd)) return (error); @@ -936,12 +1071,17 @@ mkfifo(p, uap, retval) vattr.va_mode = (SCARG(uap, mode) & ALLPERMS) &~ p->p_fd->fd_cmask; VOP_LEASE(nd.ni_dvp, p, p->p_ucred, LEASE_WRITE); return (VOP_MKNOD(nd.ni_dvp, &nd.ni_vp, &nd.ni_cnd, &vattr)); -#endif /* FIFO */ } /* * Make a hard file link. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct link_args { + char *path; + char *link; +}; +#endif /* ARGSUSED */ int link(p, uap, retval) @@ -960,20 +1100,13 @@ link(p, uap, retval) if (error = namei(&nd)) return (error); vp = nd.ni_vp; - if (vp->v_type != VDIR || - (error = suser(p->p_ucred, &p->p_acflag)) == 0) { - nd.ni_cnd.cn_nameiop = CREATE; - nd.ni_cnd.cn_flags = LOCKPARENT; - nd.ni_dirp = SCARG(uap, link); - if ((error = namei(&nd)) == 0) { - if (nd.ni_vp != NULL) - error = EEXIST; - if (!error) { - VOP_LEASE(nd.ni_dvp, p, p->p_ucred, - LEASE_WRITE); - VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); - error = VOP_LINK(vp, nd.ni_dvp, &nd.ni_cnd); - } else { + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { + NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, link), p); + error = namei(&nd); + if (!error) { + if (nd.ni_vp != NULL) { VOP_ABORTOP(nd.ni_dvp, &nd.ni_cnd); if (nd.ni_dvp == nd.ni_vp) vrele(nd.ni_dvp); @@ -981,6 +1114,12 @@ link(p, uap, retval) vput(nd.ni_dvp); if (nd.ni_vp) vrele(nd.ni_vp); + error = EEXIST; + } else { + VOP_LEASE(nd.ni_dvp, p, p->p_ucred, + LEASE_WRITE); + VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); + error = VOP_LINK(nd.ni_dvp, vp, &nd.ni_cnd); } } } @@ -991,6 +1130,12 @@ link(p, uap, retval) /* * Make a symbolic link. */ +#ifndef _SYS_SYSPROTO_H_ +struct symlink_args { + char *path; + char *link; +}; +#endif /* ARGSUSED */ int symlink(p, uap, retval) @@ -1073,6 +1218,11 @@ undelete(p, uap, retval) /* * Delete a name from the filesystem. */ +#ifndef _SYS_SYSPROTO_H_ +struct unlink_args { + char *path; +}; +#endif /* ARGSUSED */ int unlink(p, uap, retval) @@ -1093,15 +1243,18 @@ unlink(p, uap, retval) VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - if (vp->v_type != VDIR || - (error = suser(p->p_ucred, &p->p_acflag)) == 0) { + if (vp->v_type == VDIR) + error = EPERM; /* POSIX */ + else { /* * The root of a mounted filesystem cannot be deleted. + * + * XXX: can this only be a VDIR case? 
*/ if (vp->v_flag & VROOT) error = EBUSY; else - (void)vnode_pager_uncache(vp); + (void) vnode_pager_uncache(vp, p); } if (!error) { @@ -1122,6 +1275,14 @@ unlink(p, uap, retval) /* * Reposition read/write file offset. */ +#ifndef _SYS_SYSPROTO_H_ +struct lseek_args { + int fd; + int pad; + off_t offset; + int whence; +}; +#endif int lseek(p, uap, retval) struct proc *p; @@ -1131,7 +1292,7 @@ lseek(p, uap, retval) syscallarg(off_t) offset; syscallarg(int) whence; } */ *uap; - register_t *retval; + register_t *retval; /* XXX */ { struct ucred *cred = p->p_ucred; register struct filedesc *fdp = p->p_fd; @@ -1149,8 +1310,8 @@ lseek(p, uap, retval) fp->f_offset += SCARG(uap, offset); break; case L_XTND: - if (error = - VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p)) + error=VOP_GETATTR((struct vnode *)fp->f_data, &vattr, cred, p); + if (error) return (error); fp->f_offset = SCARG(uap, offset) + vattr.va_size; break; @@ -1168,10 +1329,17 @@ lseek(p, uap, retval) /* * Reposition read/write file offset. */ +#ifndef _SYS_SYSPROTO_H_ +struct olseek_args { + int fd; + long offset; + int whence; +}; +#endif int -compat_43_lseek(p, uap, retval) +olseek(p, uap, retval) struct proc *p; - register struct compat_43_lseek_args /* { + register struct olseek_args /* { syscallarg(int) fd; syscallarg(long) offset; syscallarg(int) whence; @@ -1190,7 +1358,7 @@ compat_43_lseek(p, uap, retval) SCARG(&nuap, fd) = SCARG(uap, fd); SCARG(&nuap, offset) = SCARG(uap, offset); SCARG(&nuap, whence) = SCARG(uap, whence); - error = lseek(p, &nuap, &qret); + error = lseek(p, &nuap, (register_t *) &qret); *(long *)retval = qret; return (error); } @@ -1199,6 +1367,12 @@ compat_43_lseek(p, uap, retval) /* * Check access permissions. */ +#ifndef _SYS_SYSPROTO_H_ +struct access_args { + char *path; + int flags; +}; +#endif int access(p, uap, retval) struct proc *p; @@ -1246,11 +1420,17 @@ out1: /* * Get file status; this version follows links. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct ostat_args { + char *path; + struct ostat *ub; +}; +#endif /* ARGSUSED */ int -compat_43_stat(p, uap, retval) +ostat(p, uap, retval) struct proc *p; - register struct compat_43_stat_args /* { + register struct ostat_args /* { syscallarg(char *) path; syscallarg(struct ostat *) ub; } */ *uap; @@ -1277,11 +1457,17 @@ compat_43_stat(p, uap, retval) /* * Get file status; this version does not follow links. */ +#ifndef _SYS_SYSPROTO_H_ +struct olstat_args { + char *path; + struct ostat *ub; +}; +#endif /* ARGSUSED */ int -compat_43_lstat(p, uap, retval) +olstat(p, uap, retval) struct proc *p; - register struct compat_43_lstat_args /* { + register struct olstat_args /* { syscallarg(char *) path; syscallarg(struct ostat *) ub; } */ *uap; @@ -1367,6 +1553,12 @@ cvtstat(st, ost) /* * Get file status; this version follows links. */ +#ifndef _SYS_SYSPROTO_H_ +struct stat_args { + char *path; + struct stat *ub; +}; +#endif /* ARGSUSED */ int stat(p, uap, retval) @@ -1396,6 +1588,12 @@ stat(p, uap, retval) /* * Get file status; this version does not follow links. */ +#ifndef _SYS_SYSPROTO_H_ +struct lstat_args { + char *path; + struct stat *ub; +}; +#endif /* ARGSUSED */ int lstat(p, uap, retval) @@ -1455,6 +1653,12 @@ lstat(p, uap, retval) /* * Get configurable pathname variables. */ +#ifndef _SYS_SYSPROTO_H_ +struct pathconf_args { + char *path; + int name; +}; +#endif /* ARGSUSED */ int pathconf(p, uap, retval) @@ -1480,6 +1684,13 @@ pathconf(p, uap, retval) /* * Return target name of a symbolic link. */ +#ifndef _SYS_SYSPROTO_H_ +struct readlink_args { + char *path; + char *buf; + int count; +}; +#endif /* ARGSUSED */ int readlink(p, uap, retval) @@ -1524,6 +1735,12 @@ readlink(p, uap, retval) /* * Change flags of a file given a path name. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct chflags_args { + char *path; + int flags; +}; +#endif /* ARGSUSED */ int chflags(p, uap, retval) @@ -1555,6 +1772,12 @@ chflags(p, uap, retval) /* * Change flags of a file given a file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct fchflags_args { + int fd; + int flags; +}; +#endif /* ARGSUSED */ int fchflags(p, uap, retval) @@ -1585,6 +1808,12 @@ fchflags(p, uap, retval) /* * Change mode of a file given path name. */ +#ifndef _SYS_SYSPROTO_H_ +struct chmod_args { + char *path; + int mode; +}; +#endif /* ARGSUSED */ int chmod(p, uap, retval) @@ -1616,6 +1845,12 @@ chmod(p, uap, retval) /* * Change mode of a file given a file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct fchmod_args { + int fd; + int mode; +}; +#endif /* ARGSUSED */ int fchmod(p, uap, retval) @@ -1646,6 +1881,13 @@ fchmod(p, uap, retval) /* * Set ownership given a path name. */ +#ifndef _SYS_SYSPROTO_H_ +struct chown_args { + char *path; + int uid; + int gid; +}; +#endif /* ARGSUSED */ int chown(p, uap, retval) @@ -1679,6 +1921,13 @@ chown(p, uap, retval) /* * Set ownership given a file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct fchown_args { + int fd; + int uid; + int gid; +}; +#endif /* ARGSUSED */ int fchown(p, uap, retval) @@ -1711,6 +1960,12 @@ fchown(p, uap, retval) /* * Set the access and modification times of a file. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct utimes_args { + char *path; + struct timeval *tptr; +}; +#endif /* ARGSUSED */ int utimes(p, uap, retval) @@ -1741,10 +1996,10 @@ utimes(p, uap, retval) vp = nd.ni_vp; VOP_LEASE(vp, p, p->p_ucred, LEASE_WRITE); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - vattr.va_atime.ts_sec = tv[0].tv_sec; - vattr.va_atime.ts_nsec = tv[0].tv_usec * 1000; - vattr.va_mtime.ts_sec = tv[1].tv_sec; - vattr.va_mtime.ts_nsec = tv[1].tv_usec * 1000; + vattr.va_atime.tv_sec = tv[0].tv_sec; + vattr.va_atime.tv_nsec = tv[0].tv_usec * 1000; + vattr.va_mtime.tv_sec = tv[1].tv_sec; + vattr.va_mtime.tv_nsec = tv[1].tv_usec * 1000; error = VOP_SETATTR(vp, &vattr, p->p_ucred, p); vput(vp); return (error); @@ -1753,6 +2008,13 @@ utimes(p, uap, retval) /* * Truncate a file given its path name. */ +#ifndef _SYS_SYSPROTO_H_ +struct truncate_args { + char *path; + int pad; + off_t length; +}; +#endif /* ARGSUSED */ int truncate(p, uap, retval) @@ -1769,6 +2031,8 @@ truncate(p, uap, retval) int error; struct nameidata nd; + if (uap->length < 0) + return(EINVAL); NDINIT(&nd, LOOKUP, FOLLOW, UIO_USERSPACE, SCARG(uap, path), p); if (error = namei(&nd)) return (error); @@ -1790,6 +2054,13 @@ truncate(p, uap, retval) /* * Truncate a file given a file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct ftruncate_args { + int fd; + int pad; + off_t length; +}; +#endif /* ARGSUSED */ int ftruncate(p, uap, retval) @@ -1806,6 +2077,8 @@ ftruncate(p, uap, retval) struct file *fp; int error; + if (uap->length < 0) + return(EINVAL); if (error = getvnode(p->p_fd, SCARG(uap, fd), &fp)) return (error); if ((fp->f_flag & FWRITE) == 0) @@ -1828,11 +2101,17 @@ ftruncate(p, uap, retval) /* * Truncate a file given its path name. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct otruncate_args { + char *path; + long length; +}; +#endif /* ARGSUSED */ int -compat_43_truncate(p, uap, retval) +otruncate(p, uap, retval) struct proc *p; - register struct compat_43_truncate_args /* { + register struct otruncate_args /* { syscallarg(char *) path; syscallarg(long) length; } */ *uap; @@ -1852,11 +2131,17 @@ compat_43_truncate(p, uap, retval) /* * Truncate a file given a file descriptor. */ +#ifndef _SYS_SYSPROTO_H_ +struct oftruncate_args { + int fd; + long length; +}; +#endif /* ARGSUSED */ int -compat_43_ftruncate(p, uap, retval) +oftruncate(p, uap, retval) struct proc *p; - register struct compat_43_ftruncate_args /* { + register struct oftruncate_args /* { syscallarg(int) fd; syscallarg(long) length; } */ *uap; @@ -1877,6 +2162,11 @@ compat_43_ftruncate(p, uap, retval) /* * Sync an open file. */ +#ifndef _SYS_SYSPROTO_H_ +struct fsync_args { + int fd; +}; +#endif /* ARGSUSED */ int fsync(p, uap, retval) @@ -1894,7 +2184,12 @@ fsync(p, uap, retval) return (error); vp = (struct vnode *)fp->f_data; vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); - error = VOP_FSYNC(vp, fp->f_cred, MNT_WAIT, p); + if (vp->v_object) { + vm_object_page_clean(vp->v_object, 0, 0 ,0, FALSE); + } + error = VOP_FSYNC(vp, fp->f_cred, + (vp->v_mount && (vp->v_mount->mnt_flag & MNT_ASYNC)) ? + MNT_NOWAIT : MNT_WAIT, p); VOP_UNLOCK(vp, 0, p); return (error); } @@ -1903,6 +2198,12 @@ fsync(p, uap, retval) * Rename files. Source and destination must either both be directories, * or both not be directories. If target is a directory, it must be empty. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct rename_args { + char *from; + char *to; +}; +#endif /* ARGSUSED */ int rename(p, uap, retval) @@ -1924,7 +2225,12 @@ rename(p, uap, retval) fvp = fromnd.ni_vp; NDINIT(&tond, RENAME, LOCKPARENT | LOCKLEAF | NOCACHE | SAVESTART, UIO_USERSPACE, SCARG(uap, to), p); + if (fromnd.ni_vp->v_type == VDIR) + tond.ni_cnd.cn_flags |= WILLBEDIR; if (error = namei(&tond)) { + /* Translate error code for rename("dir1", "dir2/."). */ + if (error == EISDIR && fvp->v_type == VDIR) + error = EINVAL; VOP_ABORTOP(fromnd.ni_dvp, &fromnd.ni_cnd); vrele(fromnd.ni_dvp); vrele(fvp); @@ -1958,8 +2264,10 @@ out: VOP_LEASE(tdvp, p, p->p_ucred, LEASE_WRITE); if (fromnd.ni_dvp != tdvp) VOP_LEASE(fromnd.ni_dvp, p, p->p_ucred, LEASE_WRITE); - if (tvp) + if (tvp) { VOP_LEASE(tvp, p, p->p_ucred, LEASE_WRITE); + (void) vnode_pager_uncache(tvp, p); + } error = VOP_RENAME(fromnd.ni_dvp, fromnd.ni_vp, &fromnd.ni_cnd, tond.ni_dvp, tond.ni_vp, &tond.ni_cnd); } else { @@ -1988,6 +2296,12 @@ out1: /* * Make a directory file. */ +#ifndef _SYS_SYSPROTO_H_ +struct mkdir_args { + char *path; + int mode; +}; +#endif /* ARGSUSED */ int mkdir(p, uap, retval) @@ -2004,6 +2318,7 @@ mkdir(p, uap, retval) struct nameidata nd; NDINIT(&nd, CREATE, LOCKPARENT, UIO_USERSPACE, SCARG(uap, path), p); + nd.ni_cnd.cn_flags |= WILLBEDIR; if (error = namei(&nd)) return (error); vp = nd.ni_vp; @@ -2029,6 +2344,11 @@ mkdir(p, uap, retval) /* * Remove a directory file. */ +#ifndef _SYS_SYSPROTO_H_ +struct rmdir_args { + char *path; +}; +#endif /* ARGSUSED */ int rmdir(p, uap, retval) @@ -2083,10 +2403,18 @@ out: /* * Read a block of directory entries in a file system independent format. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct ogetdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif int -compat_43_getdirentries(p, uap, retval) +ogetdirentries(p, uap, retval) struct proc *p; - register struct compat_43_getdirentries_args /* { + register struct ogetdirentries_args /* { syscallarg(int) fd; syscallarg(char *) buf; syscallarg(u_int) count; @@ -2124,7 +2452,7 @@ unionread: # if (BYTE_ORDER != LITTLE_ENDIAN) if (vp->v_mount->mnt_maxsymlinklen <= 0) { error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, - (int *)0, (u_long *)0); + NULL, NULL); fp->f_offset = auio.uio_offset; } else # endif @@ -2136,7 +2464,7 @@ unionread: MALLOC(dirbuf, caddr_t, SCARG(uap, count), M_TEMP, M_WAITOK); kiov.iov_base = dirbuf; error = VOP_READDIR(vp, &kuio, fp->f_cred, &eofflag, - (int *)0, (u_long *)0); + NULL, NULL); fp->f_offset = kuio.uio_offset; if (error == 0) { readcnt = SCARG(uap, count) - kuio.uio_resid; @@ -2178,9 +2506,6 @@ unionread: #ifdef UNION { - extern int (**union_vnodeop_p)(); - extern struct vnode *union_dircache __P((struct vnode*, struct proc*)); - if ((SCARG(uap, count) == auio.uio_resid) && (vp->v_op == union_vnodeop_p)) { struct vnode *lvp; @@ -2240,6 +2565,14 @@ unionread: /* * Read a block of directory entries in a file system independent format. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct getdirentries_args { + int fd; + char *buf; + u_int count; + long *basep; +}; +#endif int getdirentries(p, uap, retval) struct proc *p; @@ -2276,8 +2609,7 @@ unionread: auio.uio_resid = SCARG(uap, count); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); loff = auio.uio_offset = fp->f_offset; - error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, - (int *)0, (u_long *)0); + error = VOP_READDIR(vp, &auio, fp->f_cred, &eofflag, NULL, NULL); fp->f_offset = auio.uio_offset; VOP_UNLOCK(vp, 0, p); if (error) @@ -2285,9 +2617,6 @@ unionread: #ifdef UNION { - extern int (**union_vnodeop_p)(); - extern struct vnode *union_dircache __P((struct vnode*, struct proc*)); - if ((SCARG(uap, count) == auio.uio_resid) && (vp->v_op == union_vnodeop_p)) { struct vnode *lvp; @@ -2346,13 +2675,18 @@ unionread: /* * Set the mode mask for creation of filesystem nodes. */ +#ifndef _SYS_SYSPROTO_H_ +struct umask_args { + int newmask; +}; +#endif int umask(p, uap, retval) struct proc *p; struct umask_args /* { syscallarg(int) newmask; } */ *uap; - register_t *retval; + int *retval; /* XXX */ { register struct filedesc *fdp; @@ -2366,6 +2700,11 @@ umask(p, uap, retval) * Void all references to file by ripping underlying filesystem * away from vnode. 
*/ +#ifndef _SYS_SYSPROTO_H_ +struct revoke_args { + char *path; +}; +#endif /* ARGSUSED */ int revoke(p, uap, retval) @@ -2402,15 +2741,15 @@ out: int getvnode(fdp, fd, fpp) struct filedesc *fdp; - struct file **fpp; int fd; + struct file **fpp; { struct file *fp; if ((u_int)fd >= fdp->fd_nfiles || (fp = fdp->fd_ofiles[fd]) == NULL) return (EBADF); - if (fp->f_type != DTYPE_VNODE) + if (fp->f_type != DTYPE_VNODE && fp->f_type != DTYPE_FIFO) return (EINVAL); *fpp = fp; return (0); diff --git a/sys/kern/vfs_vnops.c b/sys/kern/vfs_vnops.c index 3cfc6fd..cb6c932 100644 --- a/sys/kern/vfs_vnops.c +++ b/sys/kern/vfs_vnops.c @@ -35,12 +35,14 @@ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * - * @(#)vfs_vnops.c 8.14 (Berkeley) 6/15/95 + * @(#)vfs_vnops.c 8.2 (Berkeley) 1/21/94 + * $Id: vfs_vnops.c,v 1.33 1997/03/23 03:36:38 bde Exp $ */ #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> +#include <sys/fcntl.h> #include <sys/file.h> #include <sys/stat.h> #include <sys/buf.h> @@ -48,10 +50,22 @@ #include <sys/mount.h> #include <sys/namei.h> #include <sys/vnode.h> -#include <sys/ioctl.h> -#include <sys/tty.h> +#include <sys/filio.h> +#include <sys/ttycom.h> #include <vm/vm.h> +#include <vm/vm_param.h> +#include <vm/vm_object.h> +#include <vm/vnode_pager.h> + +static int vn_closefile __P((struct file *fp, struct proc *p)); +static int vn_ioctl __P((struct file *fp, int com, caddr_t data, + struct proc *p)); +static int vn_read __P((struct file *fp, struct uio *uio, + struct ucred *cred)); +static int vn_select __P((struct file *fp, int which, struct proc *p)); +static int vn_write __P((struct file *fp, struct uio *uio, + struct ucred *cred)); struct fileops vnops = { vn_read, vn_write, vn_ioctl, vn_select, vn_closefile }; @@ -60,6 +74,7 @@ struct fileops vnops = * Common code for vnode open operations. * Check permissions, and call the VOP_OPEN or VOP_CREATE routine. 
*/ +int vn_open(ndp, fmode, cmode) register struct nameidata *ndp; int fmode, cmode; @@ -76,7 +91,8 @@ vn_open(ndp, fmode, cmode) ndp->ni_cnd.cn_flags = LOCKPARENT | LOCKLEAF; if ((fmode & O_EXCL) == 0) ndp->ni_cnd.cn_flags |= FOLLOW; - if (error = namei(ndp)) + error = namei(ndp); + if (error) return (error); if (ndp->ni_vp == NULL) { VATTR_NULL(vap); @@ -107,7 +123,8 @@ vn_open(ndp, fmode, cmode) } else { ndp->ni_cnd.cn_nameiop = LOOKUP; ndp->ni_cnd.cn_flags = FOLLOW | LOCKLEAF; - if (error = namei(ndp)) + error = namei(ndp); + if (error) return (error); vp = ndp->ni_vp; } @@ -117,7 +134,8 @@ vn_open(ndp, fmode, cmode) } if ((fmode & O_CREAT) == 0) { if (fmode & FREAD) { - if (error = VOP_ACCESS(vp, VREAD, cred, p)) + error = VOP_ACCESS(vp, VREAD, cred, p); + if (error) goto bad; } if (fmode & (FWRITE | O_TRUNC)) { @@ -125,8 +143,11 @@ vn_open(ndp, fmode, cmode) error = EISDIR; goto bad; } - if ((error = vn_writechk(vp)) || - (error = VOP_ACCESS(vp, VWRITE, cred, p))) + error = vn_writechk(vp); + if (error) + goto bad; + error = VOP_ACCESS(vp, VWRITE, cred, p); + if (error) goto bad; } } @@ -136,11 +157,21 @@ vn_open(ndp, fmode, cmode) vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); /* XXX */ VATTR_NULL(vap); vap->va_size = 0; - if (error = VOP_SETATTR(vp, vap, cred, p)) + error = VOP_SETATTR(vp, vap, cred, p); + if (error) goto bad; } - if (error = VOP_OPEN(vp, fmode, cred, p)) + error = VOP_OPEN(vp, fmode, cred, p); + if (error) goto bad; + /* + * Make sure that a VM object is created for VMIO support. + */ + if (vp->v_type == VREG) { + if ((error = vfs_object_create(vp, p, cred, 1)) != 0) + goto bad; + } + if (fmode & FWRITE) vp->v_writecount++; return (0); @@ -153,6 +184,7 @@ bad: * Check for write permissions on the specified vnode. * Prototype text segments cannot be written. */ +int vn_writechk(vp) register struct vnode *vp; { @@ -162,7 +194,7 @@ vn_writechk(vp) * the vnode, try to free it up once. If * we fail, we can't allow writing. 
*/ - if ((vp->v_flag & VTEXT) && !vnode_pager_uncache(vp)) + if (vp->v_flag & VTEXT) return (ETXTBSY); return (0); } @@ -170,6 +202,7 @@ vn_writechk(vp) /* * Vnode close call */ +int vn_close(vp, flags, cred, p) register struct vnode *vp; int flags; @@ -188,6 +221,7 @@ vn_close(vp, flags, cred, p) /* * Package up an I/O request on a vnode into a uio and do it. */ +int vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) enum uio_rw rw; struct vnode *vp; @@ -233,6 +267,7 @@ vn_rdwr(rw, vp, base, len, offset, segflg, ioflg, cred, aresid, p) /* * File table vnode read routine. */ +static int vn_read(fp, uio, cred) struct file *fp; struct uio *uio; @@ -241,14 +276,46 @@ vn_read(fp, uio, cred) struct vnode *vp = (struct vnode *)fp->f_data; struct proc *p = uio->uio_procp; int count, error; + int flag, seq; VOP_LEASE(vp, p, cred, LEASE_READ); vn_lock(vp, LK_EXCLUSIVE | LK_RETRY, p); uio->uio_offset = fp->f_offset; count = uio->uio_resid; - error = VOP_READ(vp, uio, (fp->f_flag & FNONBLOCK) ? IO_NDELAY : 0, - cred); + flag = 0; + if (fp->f_flag & FNONBLOCK) + flag |= IO_NDELAY; + + /* + * Sequential read heuristic. + * If we have been doing sequential input, + * a rewind operation doesn't turn off + * sequential input mode. + */ + if (((fp->f_offset == 0) && (fp->f_seqcount > 0)) || + (fp->f_offset == fp->f_nextread)) { + int tmpseq = fp->f_seqcount; + /* + * XXX we assume that the filesystem block size is + * the default. Not true, but still gives us a pretty + * good indicator of how sequential the read operations + * are. 
+ */ + tmpseq += ((count + BKVASIZE - 1) / BKVASIZE); + if (tmpseq >= CHAR_MAX) + tmpseq = CHAR_MAX; + fp->f_seqcount = tmpseq; + flag |= (fp->f_seqcount << 16); + } else { + if (fp->f_seqcount > 1) + fp->f_seqcount = 1; + else + fp->f_seqcount = 0; + } + + error = VOP_READ(vp, uio, flag, cred); fp->f_offset += count - uio->uio_resid; + fp->f_nextread = fp->f_offset; VOP_UNLOCK(vp, 0, p); return (error); } @@ -256,6 +323,7 @@ vn_read(fp, uio, cred) /* * File table vnode write routine. */ +static int vn_write(fp, uio, cred) struct file *fp; struct uio *uio; @@ -288,6 +356,7 @@ vn_write(fp, uio, cred) /* * File table vnode stat routine. */ +int vn_stat(vp, sb, p) struct vnode *vp; register struct stat *sb; @@ -344,17 +413,27 @@ vn_stat(vp, sb, p) sb->st_ctimespec = vap->va_ctime; sb->st_blksize = vap->va_blocksize; sb->st_flags = vap->va_flags; - sb->st_gen = vap->va_gen; + if (p->p_ucred->cr_uid != 0) + sb->st_gen = 0; + else + sb->st_gen = vap->va_gen; + +#if (S_BLKSIZE == 512) + /* Optimize this case */ + sb->st_blocks = vap->va_bytes >> 9; +#else sb->st_blocks = vap->va_bytes / S_BLKSIZE; +#endif return (0); } /* * File table vnode ioctl routine. 
*/ +static int vn_ioctl(fp, com, data, p) struct file *fp; - u_long com; + int com; caddr_t data; struct proc *p; { @@ -367,7 +446,8 @@ vn_ioctl(fp, com, data, p) case VREG: case VDIR: if (com == FIONREAD) { - if (error = VOP_GETATTR(vp, &vattr, p->p_ucred, p)) + error = VOP_GETATTR(vp, &vattr, p->p_ucred, p); + if (error) return (error); *(int *)data = vattr.va_size - fp->f_offset; return (0); @@ -384,8 +464,15 @@ vn_ioctl(fp, com, data, p) case VBLK: error = VOP_IOCTL(vp, com, data, fp->f_flag, p->p_ucred, p); if (error == 0 && com == TIOCSCTTY) { + + /* Do nothing if reassigning same control tty */ + if (p->p_session->s_ttyvp == vp) + return (0); + + /* Get rid of reference to old control tty */ if (p->p_session->s_ttyvp) vrele(p->p_session->s_ttyvp); + p->p_session->s_ttyvp = vp; VREF(vp); } @@ -396,6 +483,7 @@ vn_ioctl(fp, com, data, p) /* * File table vnode select routine. */ +static int vn_select(fp, which, p) struct file *fp; int which; @@ -407,6 +495,19 @@ vn_select(fp, which, p) } /* + * File table vnode close routine. + */ +static int +vn_closefile(fp, p) + struct file *fp; + struct proc *p; +{ + + return (vn_close(((struct vnode *)fp->f_data), fp->f_flag, + fp->f_cred, p)); +} + +/* * Check that the vnode is still valid, and if so * acquire requested lock. */ @@ -419,8 +520,9 @@ vn_lock(vp, flags, p) int error; do { - if ((flags & LK_INTERLOCK) == 0) + if ((flags & LK_INTERLOCK) == 0) { simple_lock(&vp->v_interlock); + } if (vp->v_flag & VXLOCK) { vp->v_flag |= VXWANT; simple_unlock(&vp->v_interlock); @@ -435,15 +537,3 @@ vn_lock(vp, flags, p) } while (flags & LK_RETRY); return (error); } - -/* - * File table vnode close routine. 
- */ -vn_closefile(fp, p) - struct file *fp; - struct proc *p; -{ - - return (vn_close(((struct vnode *)fp->f_data), fp->f_flag, - fp->f_cred, p)); -} diff --git a/sys/kern/vnode_if.pl b/sys/kern/vnode_if.pl new file mode 100644 index 0000000..75f49a7 --- /dev/null +++ b/sys/kern/vnode_if.pl @@ -0,0 +1,459 @@ +#!/bin/sh - +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# 1. Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# 2. Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution. +# 3. All advertising materials mentioning features or use of this software +# must display the following acknowledgement: +# This product includes software developed by the University of +# California, Berkeley and its contributors. +# 4. Neither the name of the University nor the names of its contributors +# may be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND +# ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE +# ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE +# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +# DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +# OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY +# OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF +# SUCH DAMAGE. +# +# @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 +# $Id$ +# + +# Script to produce VFS front-end sugar. +# +# usage: vnode_if.sh srcfile +# (where srcfile is currently /sys/kern/vnode_if.src) +# +# These awk scripts are not particularly well written, specifically they +# don't use arrays well and figure out the same information repeatedly. +# Please rewrite them if you actually understand how to use awk. Note, +# they use nawk extensions and gawk's toupper. + +if [ $# -ne 1 ] ; then + echo 'usage: vnode_if.sh srcfile' + exit 1 +fi + +# Name of the source file. +SRC=$1 + +# Names of the created files. +CFILE=vnode_if.c +HEADER=vnode_if.h + +# Awk program (must support nawk extensions and gawk's "toupper") +# Use "awk" at Berkeley, "gawk" elsewhere. +AWK=awk + +# Print out header information for vnode_if.h. +cat << END_OF_LEADING_COMMENT > $HEADER +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +extern struct vnodeop_desc vop_default_desc; +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.h. +$AWK ' + NF == 0 || $0 ~ "^#" { + next; + } + { + # Get the function name. + name = $1; + uname = toupper(name); + + # Get the function arguments. + for (c1 = 0;; ++c1) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + a[c1] = $0; + } + + # Print out the vop_F_args structure. 
+ printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n", + name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%sa_%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("};\n"); + + # Print out extern declaration. + printf("extern struct vnodeop_desc %s_desc;\n", name); + + # Print out prototype. + printf("static int %s __P((\n", uname); + sep = ",\n"; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = "));\n"; + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep); + } + + # Print out inline struct. + printf("static inline int %s(", uname); + sep = ", "; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ")\n"; + c3 = split(a[c2], t); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s", substr(t[c3], beg, end - beg), sep); + } + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%s%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("{\n\tstruct %s_args a;\n\n", name); + printf("\ta.a_desc = VDESC(%s);\n", name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("a.a_%s = %s\n", + substr(t[c3], beg, end - beg), substr(t[c3], beg)); + } + c1 = split(a[0], t); + beg = match(t[c1], "[^*]"); + end = match(t[c1], ";"); + printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n", + substr(t[c1], beg, end - beg), name); + }' < $SRC >> $HEADER + +# Print out header 
information for vnode_if.c. +cat << END_OF_LEADING_COMMENT > $CFILE +/* + * This file is produced automatically. + * Do not modify anything in here by hand. + * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ + +#include <sys/param.h> +#include <sys/mount.h> +#include <sys/vnode.h> + +struct vnodeop_desc vop_default_desc = { + 0, + "default", + 0, + NULL, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; + +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.c. +$AWK 'function kill_surrounding_ws (s) { + sub (/^[ \t]*/, "", s); + sub (/[ \t]*$/, "", s); + return s; + } + + function read_args() { + numargs = 0; + while (getline ln) { + if (ln ~ /}/) { + break; + }; + + # Delete comments, if any. + gsub (/\/\*.*\*\//, "", ln); + + # Delete leading/trailing space. + ln = kill_surrounding_ws(ln); + + # Pick off direction. + if (1 == sub(/^INOUT[ \t]+/, "", ln)) + dir = "INOUT"; + else if (1 == sub(/^IN[ \t]+/, "", ln)) + dir = "IN"; + else if (1 == sub(/^OUT[ \t]+/, "", ln)) + dir = "OUT"; + else + bail("No IN/OUT direction for \"" ln "\"."); + + # check for "WILLRELE" + if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) { + rele = "WILLRELE"; + } else { + rele = "WONTRELE"; + }; + + # kill trailing ; + if (1 != sub (/;$/, "", ln)) { + bail("Missing end-of-line ; in \"" ln "\"."); + }; + + # pick off variable name + if (!(i = match(ln, /[A-Za-z0-9_]+$/))) { + bail("Missing var name \"a_foo\" in \"" ln "\"."); + }; + arg = substr (ln, i); + # Want to <<substr(ln, i) = "";>>, but nawk cannot. + # Hack around this. 
+ ln = substr(ln, 1, i-1); + + # what is left must be type + # (put clean it up some) + type = ln; + gsub (/[ \t]+/, " ", type); # condense whitespace + type = kill_surrounding_ws(type); + + # (boy this was easier in Perl) + + numargs++; + dirs[numargs] = dir; + reles[numargs] = rele; + types[numargs] = type; + args[numargs] = arg; + }; + } + + function generate_operation_vp_offsets() { + printf ("static int %s_vp_offsets[] = {\n", name); + # as a side effect, figure out the releflags + releflags = ""; + vpnum = 0; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode *") { + printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n", + name, args[i]); + if (reles[i] == "WILLRELE") { + releflags = releflags "|VDESC_VP" vpnum "_WILLRELE"; + }; + vpnum++; + }; + }; + sub (/^\|/, "", releflags); + print "\tVDESC_NO_OFFSET"; + print "};"; + } + + function find_arg_with_type (type) { + for (i=1; i<=numargs; i++) { + if (types[i] == type) { + return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")"; + }; + }; + return "VDESC_NO_OFFSET"; + } + + function generate_operation_desc() { + printf ("struct vnodeop_desc %s_desc = {\n", name); + # offset + printf ("\t0,\n"); + # printable name + printf ("\t\"%s\",\n", name); + # flags + vppwillrele = ""; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode **" && + (reles[i] == "WILLRELE")) { + vppwillrele = "|VDESC_VPP_WILLRELE"; + }; + }; + if (releflags == "") { + printf ("\t0%s,\n", vppwillrele); + } else { + printf ("\t%s%s,\n", releflags, vppwillrele); + }; + # vp offsets + printf ("\t%s_vp_offsets,\n", name); + # vpp (if any) + printf ("\t%s,\n", find_arg_with_type("struct vnode **")); + # cred (if any) + printf ("\t%s,\n", find_arg_with_type("struct ucred *")); + # proc (if any) + printf ("\t%s,\n", find_arg_with_type("struct proc *")); + # componentname + printf ("\t%s,\n", find_arg_with_type("struct componentname *")); + # transport layer information + printf ("\tNULL,\n};\n"); + } + + NF == 0 || $0 ~ 
"^#" { + next; + } + { + # get the function name + name = $1; + + # get the function arguments + read_args(); + + # Print out the vop_F_vp_offsets structure. This all depends + # on naming conventions and nothing else. + generate_operation_vp_offsets(); + + # Print out the vnodeop_desc structure. + generate_operation_desc(); + + printf "\n"; + + }' < $SRC >> $CFILE +# THINGS THAT DON'T WORK RIGHT YET. +# +# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as +# arguments. This means that these operations can't function successfully +# through a bypass routine. +# +# Bwrite and strategy will be replaced when the VM page/buffer cache +# integration happens. +# +# To get around this problem for now we handle these ops as special cases. + +cat << END_OF_SPECIAL_CASES >> $HEADER +#include <sys/buf.h> +struct vop_strategy_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_strategy_desc; +static int VOP_STRATEGY __P(( + struct buf *bp)); +static inline int VOP_STRATEGY(bp) + struct buf *bp; +{ + struct vop_strategy_args a; + + a.a_desc = VDESC(vop_strategy); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_strategy), &a)); +} + +struct vop_bwrite_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_bwrite_desc; +static int VOP_BWRITE __P(( + struct buf *bp)); +static inline int VOP_BWRITE(bp) + struct buf *bp; +{ + struct vop_bwrite_args a; + + a.a_desc = VDESC(vop_bwrite); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a)); +} +END_OF_SPECIAL_CASES + +cat << END_OF_SPECIAL_CASES >> $CFILE +static int vop_strategy_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_strategy_desc = { + 0, + "vop_strategy", + 0, + vop_strategy_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +static int vop_bwrite_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_bwrite_desc = { + 0, + 
"vop_bwrite", + 0, + vop_bwrite_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +END_OF_SPECIAL_CASES + +# Add the vfs_op_descs array to the C file. +$AWK ' + BEGIN { + printf("\nstruct vnodeop_desc *vfs_op_descs[] = {\n"); + printf("\t&vop_default_desc, /* MUST BE FIRST */\n"); + printf("\t&vop_strategy_desc, /* XXX: SPECIAL CASE */\n"); + printf("\t&vop_bwrite_desc, /* XXX: SPECIAL CASE */\n"); + } + END { + printf("\tNULL\n};\n"); + } + NF == 0 || $0 ~ "^#" { + next; + } + { + # Get the function name. + printf("\t&%s_desc,\n", $1); + + # Skip the function arguments. + for (;;) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + } + }' < $SRC >> $CFILE + diff --git a/sys/kern/vnode_if.sh b/sys/kern/vnode_if.sh index 8b74d83..75f49a7 100644 --- a/sys/kern/vnode_if.sh +++ b/sys/kern/vnode_if.sh @@ -1,9 +1,8 @@ #!/bin/sh - -copyright=' -/* - * Copyright (c) 1992, 1993, 1994, 1995 - * The Regents of the University of California. All rights reserved. - * +# +# Copyright (c) 1992, 1993 +# The Regents of the University of California. All rights reserved. +# # Redistribution and use in source and binary forms, with or without # modification, are permitted provided that the following conditions # are met: @@ -31,17 +30,20 @@ copyright=' # LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY # OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF # SUCH DAMAGE. - * - * from: NetBSD: vnode_if.sh,v 1.7 1994/08/25 03:04:28 cgd Exp $ - */ -' -SCRIPT_ID='@(#)vnode_if.sh 8.7 (Berkeley) 5/11/95' +# +# @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 +# $Id$ +# # Script to produce VFS front-end sugar. # # usage: vnode_if.sh srcfile # (where srcfile is currently /sys/kern/vnode_if.src) # +# These awk scripts are not particularly well written, specifically they +# don't use arrays well and figure out the same information repeatedly. +# Please rewrite them if you actually understand how to use awk. 
Note, +# they use nawk extensions and gawk's toupper. if [ $# -ne 1 ] ; then echo 'usage: vnode_if.sh srcfile' @@ -49,180 +51,139 @@ if [ $# -ne 1 ] ; then fi # Name of the source file. -src=$1 +SRC=$1 # Names of the created files. -out_c=vnode_if.c -out_h=vnode_if.h - -# Awk program (must support nawk extensions) -# Use "awk" at Berkeley, "nawk" or "gawk" elsewhere. -awk=${AWK:-awk} - -# Does this awk have a "toupper" function? (i.e. is it GNU awk) -isgawk=`$awk 'BEGIN { print toupper("true"); exit; }' 2>/dev/null` - -# If this awk does not define "toupper" then define our own. -if [ "$isgawk" = TRUE ] ; then - # GNU awk provides it. - toupper= -else - # Provide our own toupper() - toupper=' -function toupper(str) { - _toupper_cmd = "echo "str" |tr a-z A-Z" - _toupper_cmd | getline _toupper_str; - close(_toupper_cmd); - return _toupper_str; -}' -fi +CFILE=vnode_if.c +HEADER=vnode_if.h -# -# This is the common part of all awk programs that read $src -# This parses the input for one function into the arrays: -# argdir, argtype, argname, willrele -# and calls "doit()" to generate output for the function. -# -# Input to this parser is pre-processed slightly by sed -# so this awk parser doesn't have to work so hard. The -# changes done by the sed pre-processing step are: -# insert a space beween * and pointer name -# replace semicolons with spaces -# -sed_prep='s:\*\([^\*/]\):\* \1:g -s/;/ /' -awk_parser=' -# Comment line -/^#/ { next; } -# First line of description -/^vop_/ { - name=$1; - argc=0; - next; -} -# Last line of description -/^}/ { - doit(); - next; -} -# Middle lines of description -{ - argdir[argc] = $1; i=2; - if ($2 == "WILLRELE") { - willrele[argc] = 1; - i++; - } else - willrele[argc] = 0; - argtype[argc] = $i; i++; - while (i < NF) { - argtype[argc] = argtype[argc]" "$i; - i++; - } - argname[argc] = $i; - argc++; - next; -} -' +# Awk program (must support nawk extensions and gawk's "toupper") +# Use "awk" at Berkeley, "gawk" elsewhere. 
+AWK=awk -# This is put after the copyright on each generated file. -warning=" +# Print out header information for vnode_if.h. +cat << END_OF_LEADING_COMMENT > $HEADER /* - * Warning: This file is generated automatically. - * (Modifications made here may easily be lost!) + * This file is produced automatically. + * Do not modify anything in here by hand. * - * Created by the script: - * ${SCRIPT_ID} + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 */ -" - -# Get rid of ugly spaces -space_elim='s:\([^/]\*\) :\1:g' - -# -# Redirect stdout to the H file. -# -echo "$0: Creating $out_h" 1>&2 -exec > $out_h -# Begin stuff -echo "$copyright" -echo "$warning" -echo ' extern struct vnodeop_desc vop_default_desc; -' - -# Body stuff -# This awk program needs toupper() so define it if necessary. -sed -e "$sed_prep" $src | $awk "$toupper"' -function doit() { - # Declare arg struct, descriptor. - printf("\nstruct %s_args {\n", name); - printf("\tstruct vnodeop_desc * a_desc;\n"); - for (i=0; i<argc; i++) { - printf("\t%s a_%s;\n", argtype[i], argname[i]); - } - printf("};\n"); - printf("extern struct vnodeop_desc %s_desc;\n", name); - # Define inline function. - printf("#define %s(", toupper(name)); - for (i=0; i<argc; i++) { - printf("%s", argname[i]); - if (i < (argc-1)) printf(", "); - } - printf(") _%s(", toupper(name)); - for (i=0; i<argc; i++) { - printf("%s", argname[i]); - if (i < (argc-1)) printf(", "); - } - printf(")\n"); - printf("static __inline int _%s(", toupper(name)); - for (i=0; i<argc; i++) { - printf("%s", argname[i]); - if (i < (argc-1)) printf(", "); - } - printf(")\n"); - for (i=0; i<argc; i++) { - printf("\t%s %s;\n", argtype[i], argname[i]); - } - printf("{\n\tstruct %s_args a;\n", name); - printf("\ta.a_desc = VDESC(%s);\n", name); - for (i=0; i<argc; i++) { - printf("\ta.a_%s = %s;\n", argname[i], argname[i]); +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.h. 
+$AWK ' + NF == 0 || $0 ~ "^#" { + next; } - printf("\treturn (VCALL(%s%s, VOFFSET(%s), &a));\n}\n", - argname[0], arg0special, name); -} -BEGIN { - arg0special=""; -} -END { - printf("\n/* Special cases: */\n#include <sys/buf.h>\n"); - argc=1; - argtype[0]="struct buf *"; - argname[0]="bp"; - arg0special="->b_vp"; - name="vop_strategy"; - doit(); - name="vop_bwrite"; - doit(); -} -'"$awk_parser" | sed -e "$space_elim" + { + # Get the function name. + name = $1; + uname = toupper(name); -# End stuff -echo ' -/* End of special cases. */' + # Get the function arguments. + for (c1 = 0;; ++c1) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + a[c1] = $0; + } + # Print out the vop_F_args structure. + printf("struct %s_args {\n\tstruct vnodeop_desc *a_desc;\n", + name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%sa_%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("};\n"); -# -# Redirect stdout to the C file. -# -echo "$0: Creating $out_c" 1>&2 -exec > $out_c + # Print out extern declaration. + printf("extern struct vnodeop_desc %s_desc;\n", name); + + # Print out prototype. + printf("static int %s __P((\n", uname); + sep = ",\n"; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = "));\n"; + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s%s", + substr(t[c4], 0, beg - 1), + substr(t[c4], beg, end - beg), sep); + } + + # Print out inline struct. 
+ printf("static inline int %s(", uname); + sep = ", "; + for (c2 = 0; c2 < c1; ++c2) { + if (c2 == c1 - 1) + sep = ")\n"; + c3 = split(a[c2], t); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("%s%s", substr(t[c3], beg, end - beg), sep); + } + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + if (t[2] ~ "WILLRELE") + c4 = 3; + else + c4 = 2; + for (; c4 < c3; ++c4) + printf("%s ", t[c4]); + beg = match(t[c3], "[^*]"); + printf("%s%s\n", + substr(t[c4], 0, beg - 1), substr(t[c4], beg)); + } + printf("{\n\tstruct %s_args a;\n\n", name); + printf("\ta.a_desc = VDESC(%s);\n", name); + for (c2 = 0; c2 < c1; ++c2) { + c3 = split(a[c2], t); + printf("\t"); + beg = match(t[c3], "[^*]"); + end = match(t[c3], ";"); + printf("a.a_%s = %s\n", + substr(t[c3], beg, end - beg), substr(t[c3], beg)); + } + c1 = split(a[0], t); + beg = match(t[c1], "[^*]"); + end = match(t[c1], ";"); + printf("\treturn (VCALL(%s, VOFFSET(%s), &a));\n}\n", + substr(t[c1], beg, end - beg), name); + }' < $SRC >> $HEADER + +# Print out header information for vnode_if.c. +cat << END_OF_LEADING_COMMENT > $CFILE +/* + * This file is produced automatically. + * Do not modify anything in here by hand. 
+ * + * Created from @(#)vnode_if.sh 8.1 (Berkeley) 6/10/93 + */ -# Begin stuff -echo "$copyright" -echo "$warning" -echo ' #include <sys/param.h> #include <sys/mount.h> #include <sys/vnode.h> @@ -238,107 +199,261 @@ struct vnodeop_desc vop_default_desc = { VDESC_NO_OFFSET, NULL, }; -' - -# Body stuff -sed -e "$sed_prep" $src | $awk ' -function do_offset(typematch) { - for (i=0; i<argc; i++) { - if (argtype[i] == typematch) { - printf("\tVOPARG_OFFSETOF(struct %s_args, a_%s),\n", - name, argname[i]); - return i; - }; - }; - print "\tVDESC_NO_OFFSET,"; - return -1; -} -function doit() { - # Define offsets array - printf("\nint %s_vp_offsets[] = {\n", name); - for (i=0; i<argc; i++) { - if (argtype[i] == "struct vnode *") { - printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n", - name, argname[i]); - } +END_OF_LEADING_COMMENT + +# Awk script to take vnode_if.src and turn it into vnode_if.c. +$AWK 'function kill_surrounding_ws (s) { + sub (/^[ \t]*/, "", s); + sub (/[ \t]*$/, "", s); + return s; } - print "\tVDESC_NO_OFFSET"; - print "};"; - # Define F_desc - printf("struct vnodeop_desc %s_desc = {\n", name); - # offset - printf ("\t0,\n"); - # printable name - printf ("\t\"%s\",\n", name); - # flags - printf("\t0"); - vpnum = 0; - for (i=0; i<argc; i++) { - if (willrele[i]) { - if (argdir[i] ~ /OUT/) { - printf(" | VDESC_VPP_WILLRELE"); + + function read_args() { + numargs = 0; + while (getline ln) { + if (ln ~ /}/) { + break; + }; + + # Delete comments, if any. + gsub (/\/\*.*\*\//, "", ln); + + # Delete leading/trailing space. + ln = kill_surrounding_ws(ln); + + # Pick off direction. 
+ if (1 == sub(/^INOUT[ \t]+/, "", ln)) + dir = "INOUT"; + else if (1 == sub(/^IN[ \t]+/, "", ln)) + dir = "IN"; + else if (1 == sub(/^OUT[ \t]+/, "", ln)) + dir = "OUT"; + else + bail("No IN/OUT direction for \"" ln "\"."); + + # check for "WILLRELE" + if (1 == sub(/^WILLRELE[ \t]+/, "", ln)) { + rele = "WILLRELE"; } else { - printf(" | VDESC_VP%s_WILLRELE", vpnum); + rele = "WONTRELE"; }; - vpnum++; - } + + # kill trailing ; + if (1 != sub (/;$/, "", ln)) { + bail("Missing end-of-line ; in \"" ln "\"."); + }; + + # pick off variable name + if (!(i = match(ln, /[A-Za-z0-9_]+$/))) { + bail("Missing var name \"a_foo\" in \"" ln "\"."); + }; + arg = substr (ln, i); + # Want to <<substr(ln, i) = "";>>, but nawk cannot. + # Hack around this. + ln = substr(ln, 1, i-1); + + # what is left must be type + # (put clean it up some) + type = ln; + gsub (/[ \t]+/, " ", type); # condense whitespace + type = kill_surrounding_ws(type); + + # (boy this was easier in Perl) + + numargs++; + dirs[numargs] = dir; + reles[numargs] = rele; + types[numargs] = type; + args[numargs] = arg; + }; } - print ","; - # vp offsets - printf ("\t%s_vp_offsets,\n", name); - # vpp (if any) - do_offset("struct vnode **"); - # cred (if any) - do_offset("struct ucred *"); - # proc (if any) - do_offset("struct proc *"); - # componentname - do_offset("struct componentname *"); - # transport layer information - printf ("\tNULL,\n};\n"); -} -END { - printf("\n/* Special cases: */\n"); - argc=1; - argdir[0]="IN"; - argtype[0]="struct buf *"; - argname[0]="bp"; - willrele[0]=0; - name="vop_strategy"; - doit(); - name="vop_bwrite"; - doit(); + + function generate_operation_vp_offsets() { + printf ("static int %s_vp_offsets[] = {\n", name); + # as a side effect, figure out the releflags + releflags = ""; + vpnum = 0; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode *") { + printf ("\tVOPARG_OFFSETOF(struct %s_args,a_%s),\n", + name, args[i]); + if (reles[i] == "WILLRELE") { + releflags = releflags 
"|VDESC_VP" vpnum "_WILLRELE"; + }; + vpnum++; + }; + }; + sub (/^\|/, "", releflags); + print "\tVDESC_NO_OFFSET"; + print "};"; + } + + function find_arg_with_type (type) { + for (i=1; i<=numargs; i++) { + if (types[i] == type) { + return "VOPARG_OFFSETOF(struct " name "_args,a_" args[i] ")"; + }; + }; + return "VDESC_NO_OFFSET"; + } + + function generate_operation_desc() { + printf ("struct vnodeop_desc %s_desc = {\n", name); + # offset + printf ("\t0,\n"); + # printable name + printf ("\t\"%s\",\n", name); + # flags + vppwillrele = ""; + for (i=1; i<=numargs; i++) { + if (types[i] == "struct vnode **" && + (reles[i] == "WILLRELE")) { + vppwillrele = "|VDESC_VPP_WILLRELE"; + }; + }; + if (releflags == "") { + printf ("\t0%s,\n", vppwillrele); + } else { + printf ("\t%s%s,\n", releflags, vppwillrele); + }; + # vp offsets + printf ("\t%s_vp_offsets,\n", name); + # vpp (if any) + printf ("\t%s,\n", find_arg_with_type("struct vnode **")); + # cred (if any) + printf ("\t%s,\n", find_arg_with_type("struct ucred *")); + # proc (if any) + printf ("\t%s,\n", find_arg_with_type("struct proc *")); + # componentname + printf ("\t%s,\n", find_arg_with_type("struct componentname *")); + # transport layer information + printf ("\tNULL,\n};\n"); + } + + NF == 0 || $0 ~ "^#" { + next; + } + { + # get the function name + name = $1; + + # get the function arguments + read_args(); + + # Print out the vop_F_vp_offsets structure. This all depends + # on naming conventions and nothing else. + generate_operation_vp_offsets(); + + # Print out the vnodeop_desc structure. + generate_operation_desc(); + + printf "\n"; + + }' < $SRC >> $CFILE +# THINGS THAT DON'T WORK RIGHT YET. +# +# Two existing BSD vnodeops (bwrite and strategy) don't take any vnodes as +# arguments. This means that these operations can't function successfully +# through a bypass routine. +# +# Bwrite and strategy will be replaced when the VM page/buffer cache +# integration happens. 
+# +# To get around this problem for now we handle these ops as special cases. + +cat << END_OF_SPECIAL_CASES >> $HEADER +#include <sys/buf.h> +struct vop_strategy_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_strategy_desc; +static int VOP_STRATEGY __P(( + struct buf *bp)); +static inline int VOP_STRATEGY(bp) + struct buf *bp; +{ + struct vop_strategy_args a; + + a.a_desc = VDESC(vop_strategy); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_strategy), &a)); } -'"$awk_parser" | sed -e "$space_elim" -# End stuff -echo ' -/* End of special cases. */' +struct vop_bwrite_args { + struct vnodeop_desc *a_desc; + struct buf *a_bp; +}; +extern struct vnodeop_desc vop_bwrite_desc; +static int VOP_BWRITE __P(( + struct buf *bp)); +static inline int VOP_BWRITE(bp) + struct buf *bp; +{ + struct vop_bwrite_args a; -# Add the vfs_op_descs array to the C file. -# Begin stuff -echo ' -struct vnodeop_desc *vfs_op_descs[] = { - &vop_default_desc, /* MUST BE FIRST */ - &vop_strategy_desc, /* XXX: SPECIAL CASE */ - &vop_bwrite_desc, /* XXX: SPECIAL CASE */ -' - -# Body stuff -sed -e "$sed_prep" $src | $awk ' -function doit() { - printf("\t&%s_desc,\n", name); + a.a_desc = VDESC(vop_bwrite); + a.a_bp = bp; + return (VCALL((bp)->b_vp, VOFFSET(vop_bwrite), &a)); } -'"$awk_parser" +END_OF_SPECIAL_CASES -# End stuff -echo ' NULL +cat << END_OF_SPECIAL_CASES >> $CFILE +static int vop_strategy_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_strategy_desc = { + 0, + "vop_strategy", + 0, + vop_strategy_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, +}; +static int vop_bwrite_vp_offsets[] = { + VDESC_NO_OFFSET +}; +struct vnodeop_desc vop_bwrite_desc = { + 0, + "vop_bwrite", + 0, + vop_bwrite_vp_offsets, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + VDESC_NO_OFFSET, + NULL, }; -' +END_OF_SPECIAL_CASES -exit 0 +# Add the vfs_op_descs array to the C file. 
+$AWK ' + BEGIN { + printf("\nstruct vnodeop_desc *vfs_op_descs[] = {\n"); + printf("\t&vop_default_desc, /* MUST BE FIRST */\n"); + printf("\t&vop_strategy_desc, /* XXX: SPECIAL CASE */\n"); + printf("\t&vop_bwrite_desc, /* XXX: SPECIAL CASE */\n"); + } + END { + printf("\tNULL\n};\n"); + } + NF == 0 || $0 ~ "^#" { + next; + } + { + # Get the function name. + printf("\t&%s_desc,\n", $1); + + # Skip the function arguments. + for (;;) { + if (getline <= 0) + exit + if ($0 ~ "^};") + break; + } + }' < $SRC >> $CFILE -# Local Variables: -# tab-width: 4 -# End: diff --git a/sys/kern/vnode_if.src b/sys/kern/vnode_if.src index 1e32f29..7e3338f 100644 --- a/sys/kern/vnode_if.src +++ b/sys/kern/vnode_if.src @@ -31,6 +31,7 @@ # SUCH DAMAGE. # # @(#)vnode_if.src 8.12 (Berkeley) 5/14/95 +# $Id: vnode_if.src,v 1.9.2000.1 1996/09/17 14:32:01 peter Exp $ # # @@ -255,8 +256,8 @@ vop_remove { #% link tdvp L U U # vop_link { - IN WILLRELE struct vnode *vp; - IN struct vnode *tdvp; + IN WILLRELE struct vnode *tdvp; + IN struct vnode *vp; IN struct componentname *cnp; }; @@ -385,6 +386,7 @@ vop_bmap { OUT struct vnode **vpp; IN daddr_t *bnp; OUT int *runp; + OUT int *runb; }; # @@ -486,6 +488,23 @@ vop_update { IN int waitfor; }; +vop_getpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int reqpage; + IN vm_ooffset_t offset; +}; + +vop_putpages { + IN struct vnode *vp; + IN vm_page_t *m; + IN int count; + IN int sync; + IN int *rtvals; + IN vm_ooffset_t offset; +}; + # # Needs work: no vp? # |